Merge branch 'main' into openhands-fix-issue-4127

Handle errors when starting session (#4134)
chore(deps): bump boto3 from 1.35.29 to 1.35.30 (#4144)
2026-04-29 03:00:45 -04:00 · 2024-10-01 12:44:52 -04:00 · 2024-10-01 12:40:09 -04:00 · 2024-10-01 16:19:07 +00:00 · 2024-10-01 18:18:49 +02:00 · 2024-10-01 18:08:57 +02:00
186 changed files with 7035 additions and 4569 deletions
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,4 +1,6 @@
-**Short description of the problem this fixes or functionality that this introduces. This may be used for the CHANGELOG**
+- [ ] Include this change in the Release Notes. If checked, you must provide an **end-user friendly** description for your change below
+
+**End-user friendly description of the problem this fixes or functionality that this introduces**



--- a/.github/workflows/dummy-agent-test.yml
+++ b/.github/workflows/dummy-agent-test.yml
@@ -14,20 +14,38 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Install poetry via pipx
+        run: pipx install poetry
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
-      - name: Set up environment
-        run: |
-          curl -sSL https://install.python-poetry.org | python3 -
-          poetry install --without evaluation,llama-index
-          poetry run playwright install --with-deps chromium
-          wget https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json -P /tmp/llama_index/models--BAAI--bge-small-en-v1.5/snapshots/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/1_Pooling/
+          cache: 'poetry'
+      - name: Install Python dependencies using Poetry
+        run: poetry install --without evaluation,llama-index
+      - name: Build Environment
+        run: make build
      - name: Run tests
        run: |
          set -e
-          poetry run python openhands/core/main.py -t "do a flip" -d ./workspace/ -c DummyAgent
+          poetry run python3 openhands/core/main.py -t "do a flip" -d ./workspace/ -c DummyAgent
      - name: Check exit code
        run: |
          if [ $? -ne 0 ]; then
--- a/.github/workflows/ghcr_runtime.yml
+++ b/.github/workflows/ghcr_runtime.yml
@@ -25,7 +25,71 @@ on:
        required: true
        default: ''

+env:
+  BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST: nikolaik/python-nodejs:python3.11-nodejs22
+
 jobs:
+  # Builds the OpenHands Docker images
+  ghcr_build_app:
+    name: Build App Image
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    outputs:
+      hash_from_app_image: ${{ steps.get_hash_in_app_image.outputs.hash_from_app_image }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3.0.0
+        with:
+          image: tonistiigi/binfmt:latest
+      - name: Login to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Build and push app image
+        if: "!github.event.pull_request.head.repo.fork"
+        run: |
+          ./containers/build.sh -i openhands -o ${{ github.repository_owner }} --push
+      - name: Build app image
+        if: "github.event.pull_request.head.repo.fork"
+        run: |
+          ./containers/build.sh -i openhands -o ${{ github.repository_owner }} --load
+      - name: Get hash in App Image
+        id: get_hash_in_app_image
+        run: |
+          # Lowercase the repository owner
+          export REPO_OWNER=${{ github.repository_owner }}
+          REPO_OWNER=$(echo $REPO_OWNER | tr '[:upper:]' '[:lower:]')
+          # Run the build script in the app image
+          docker run -e SANDBOX_USER_ID=0 -v /var/run/docker.sock:/var/run/docker.sock ghcr.io/${REPO_OWNER}/openhands:${{ github.sha }} /bin/bash -c "mkdir -p containers/runtime; python3 openhands/runtime/utils/runtime_build.py --base_image ${{ env.BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST }} --build_folder containers/runtime --force_rebuild" 2>&1 | tee docker-outputs.txt
+          # Get the hash from the build script
+          hash_from_app_image=$(cat docker-outputs.txt | grep "Hash for docker build directory" | awk -F "): " '{print $2}' | uniq | head -n1)
+          echo "hash_from_app_image=$hash_from_app_image" >> $GITHUB_OUTPUT
+          echo "Hash from app image: $hash_from_app_image"
+
+
  # Builds the runtime Docker images
  ghcr_build_runtime:
    name: Build Image
@@ -56,7 +120,9 @@ jobs:
          docker-images: false
          swap-storage: true
      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
+        uses: docker/setup-qemu-action@v3.0.0
+        with:
+          image: tonistiigi/binfmt:latest
      - name: Login to GHCR
        uses: docker/login-action@v3
        with:
@@ -88,7 +154,7 @@ jobs:
      - name: Build and push runtime image ${{ matrix.base_image.image }}
        if: github.event.pull_request.head.repo.fork != true
        run: |
-          ./containers/build.sh runtime ${{ github.repository_owner }} --push ${{ matrix.base_image.tag }}
+          ./containers/build.sh -i runtime -o ${{ github.repository_owner }} --push -t ${{ matrix.base_image.tag }}
      # Forked repos can't push to GHCR, so we need to upload the image as an artifact
      - name: Build runtime image ${{ matrix.base_image.image }} for fork
        if: github.event.pull_request.head.repo.fork
@@ -104,6 +170,56 @@ jobs:
          name: runtime-${{ matrix.base_image.tag }}
          path: /tmp/runtime-${{ matrix.base_image.tag }}.tar

+  verify_hash_equivalence_in_runtime_and_app:
+    name: Verify Hash Equivalence in Runtime and Docker images
+    runs-on: ubuntu-latest
+    needs: [ghcr_build_runtime, ghcr_build_app]
+    strategy:
+      fail-fast: false
+      matrix:
+        base_image: ['nikolaik']
+    steps:
+      - uses: actions/checkout@v4
+      - name: Cache Poetry dependencies
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/pypoetry
+            ~/.virtualenvs
+          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-poetry-
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install poetry via pipx
+        run: pipx install poetry
+      - name: Install Python dependencies using Poetry
+        run: make install-python-dependencies
+      - name: Get hash in App Image
+        run: |
+          echo "Hash from app image: ${{ needs.ghcr_build_app.outputs.hash_from_app_image }}"
+          echo "hash_from_app_image=${{ needs.ghcr_build_app.outputs.hash_from_app_image }}" >> $GITHUB_ENV
+
+      - name: Get hash using code (development mode)
+        run: |
+          mkdir -p containers/runtime
+          poetry run python3 openhands/runtime/utils/runtime_build.py --base_image ${{ env.BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST }} --build_folder containers/runtime --force_rebuild > output.txt 2>&1
+          hash_from_code=$(cat output.txt | grep "Hash for docker build directory" | awk -F "): " '{print $2}' | uniq | head -n1)
+          echo "hash_from_code=$hash_from_code" >> $GITHUB_ENV
+
+      - name: Compare hashes
+        run: |
+          echo "Hash from App Image: ${{ env.hash_from_app_image }}"
+          echo "Hash from Code: ${{ env.hash_from_code }}"
+          if [ "${{ env.hash_from_app_image }}" = "${{ env.hash_from_code }}" ]; then
+            echo "Hashes match!"
+          else
+            echo "Hashes do not match!"
+            exit 1
+          fi
+
  # Run unit tests with the EventStream runtime Docker images as root
  test_runtime_root:
    name: RT Unit Tests (Root)
@@ -115,6 +231,23 @@ jobs:
        base_image: ['nikolaik']
    steps:
      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
      # Forked repos can't push to GHCR, so we need to download the image as an artifact
      - name: Download runtime image for fork
        if: github.event.pull_request.head.repo.fork
@@ -145,8 +278,7 @@ jobs:
        run: make install-python-dependencies
      - name: Run runtime tests
        run: |
-          # We install pytest-xdist in order to run tests across CPUs. However, tests start to fail when we run
-          # then across more than 2 CPUs for some reason
+          # We install pytest-xdist in order to run tests across CPUs
          poetry run pip install pytest-xdist

          # Install to be able to retry on failures for flaky tests
@@ -158,10 +290,10 @@ jobs:
          SKIP_CONTAINER_LOGS=true \
          TEST_RUNTIME=eventstream \
          SANDBOX_USER_ID=$(id -u) \
-          SANDBOX_BASE_CONTAINER_IMAGE=$image_name \
+          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
          RUN_AS_OPENHANDS=false \
-          poetry run pytest -n 3 --reruns 1 --reruns-delay 3 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
+          poetry run pytest -n 3 -raR --reruns 1 --reruns-delay 3 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
@@ -177,6 +309,23 @@ jobs:
        base_image: ['nikolaik']
    steps:
      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
      # Forked repos can't push to GHCR, so we need to download the image as an artifact
      - name: Download runtime image for fork
        if: github.event.pull_request.head.repo.fork
@@ -207,8 +356,7 @@ jobs:
        run: make install-python-dependencies
      - name: Run runtime tests
        run: |
-          # We install pytest-xdist in order to run tests across CPUs. However, tests start to fail when we run
-          # then across more than 2 CPUs for some reason
+          # We install pytest-xdist in order to run tests across CPUs
          poetry run pip install pytest-xdist

          # Install to be able to retry on failures for flaky tests
@@ -220,10 +368,10 @@ jobs:
          SKIP_CONTAINER_LOGS=true \
          TEST_RUNTIME=eventstream \
          SANDBOX_USER_ID=$(id -u) \
-          SANDBOX_BASE_CONTAINER_IMAGE=$image_name \
+          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
          RUN_AS_OPENHANDS=true \
-          poetry run pytest -n 3 --reruns 1 --reruns-delay 3 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
+          poetry run pytest -n 3 -raR --reruns 1 --reruns-delay 3 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
@@ -240,6 +388,23 @@ jobs:
        base_image: ['nikolaik']
    steps:
      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
      # Forked repos can't push to GHCR, so we need to download the image as an artifact
      - name: Download runtime image for fork
        if: github.event.pull_request.head.repo.fork
@@ -275,7 +440,7 @@ jobs:

          TEST_RUNTIME=eventstream \
          SANDBOX_USER_ID=$(id -u) \
-          SANDBOX_BASE_CONTAINER_IMAGE=$image_name \
+          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
          TEST_ONLY=true \
          ./tests/integration/regenerate.sh
@@ -292,7 +457,7 @@ jobs:
    name: All Runtime Tests Passed
    if: ${{ !cancelled() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}
    runs-on: ubuntu-latest
-    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux]
+    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux, verify_hash_equivalence_in_runtime_and_app]
    steps:
      - name: All tests passed
        run: echo "All runtime tests have passed successfully!"
@@ -301,7 +466,7 @@ jobs:
    name: All Runtime Tests Passed
    if: ${{ cancelled() || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') }}
    runs-on: ubuntu-latest
-    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux]
+    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux, verify_hash_equivalence_in_runtime_and_app]
    steps:
      - name: Some tests failed
        run: |
--- a/.github/workflows/ghcr_app.yml
+++ b/.github/workflows/ghcr_app.yml
@@ -1,65 +0,0 @@
-# Workflow that builds, tests and then pushes the app docker images to the ghcr.io repository
-name: Build and Publish App Image
-
-# Always run on "main"
-# Always run on tags
-# Always run on PRs
-# Can also be triggered manually
-on:
-  push:
-    branches:
-      - main
-    tags:
-      - '*'
-  pull_request:
-  workflow_dispatch:
-    inputs:
-      reason:
-        description: 'Reason for manual trigger'
-        required: true
-        default: ''
-
-jobs:
-  # Builds the OpenHands Docker images
-  ghcr_build:
-    name: Build App Image
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      packages: write
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Free Disk Space (Ubuntu)
-        uses: jlumbroso/free-disk-space@main
-        with:
-          # this might remove tools that are actually needed,
-          # if set to "true" but frees about 6 GB
-          tool-cache: true
-          # all of these default to true, but feel free to set to
-          # "false" if necessary for your workflow
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: true
-          docker-images: false
-          swap-storage: true
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
-      - name: Login to GHCR
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Build and push app image
-        if: "!github.event.pull_request.head.repo.fork"
-        run: |
-          ./containers/build.sh openhands ${{ github.repository_owner }} --push
-      - name: Build app image
-        if: "github.event.pull_request.head.repo.fork"
-        run: |
-          ./containers/build.sh openhands image ${{ github.repository_owner }}
--- a/.github/workflows/openhands-resolver.yml
+++ b/.github/workflows/openhands-resolver.yml
@@ -0,0 +1,13 @@
+name: Resolve Issues with OpenHands
+
+on:
+  issues:
+    types: [labeled]
+
+jobs:
+  call-openhands-resolver:
+    uses: All-Hands-AI/openhands-resolver/.github/workflows/openhands-resolver.yml@main
+    if: github.event.label.name == 'fix-me'
+    with:
+      issue_number: ${{ github.event.issue.number }}
+    secrets: inherit
--- a/.github/workflows/py-unit-tests.yml
+++ b/.github/workflows/py-unit-tests.yml
@@ -89,6 +89,9 @@ jobs:
          sudo ln -sf $HOME/.colima/default/docker.sock /var/run/docker.sock
      - name: Build Environment
        run: make build
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
      - name: Run Tests
        run: poetry run pytest --forked --cov=agenthub --cov=openhands --cov-report=xml ./tests/unit
      - name: Upload coverage to Codecov
@@ -107,6 +110,9 @@ jobs:
        python-version: ['3.11']
    steps:
      - uses: actions/checkout@v4
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
      - name: Install poetry via pipx
        run: pipx install poetry
      - name: Set up Python
--- a/.github/workflows/regenerate_integration_tests.yml
+++ b/.github/workflows/regenerate_integration_tests.yml
@@ -29,6 +29,9 @@ jobs:
    steps:
    - name: Checkout repository
      uses: actions/checkout@v4
+    - name: Set up Docker Buildx
+      id: buildx
+      uses: docker/setup-buildx-action@v3
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
--- a/.github/workflows/review-pr.yml
+++ b/.github/workflows/review-pr.yml
@@ -15,6 +15,9 @@ jobs:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v4
+    - name: Set up Docker Buildx
+      id: buildx
+      uses: docker/setup-buildx-action@v3
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
--- a/.github/workflows/solve-issue.yml
+++ b/.github/workflows/solve-issue.yml
@@ -1,113 +0,0 @@
-# Workflow that uses OpenHands to resolve a GitHub issue. Issue must be labeled 'solve-this'
-name: Use OpenHands to Resolve GitHub Issue
-
-on:
-  issues:
-    types: [labeled]
-
-permissions:
-  contents: write
-  pull-requests: write
-  issues: write
-
-jobs:
-  dogfood:
-    if: github.event.label.name == 'solve-this'
-    runs-on: ubuntu-latest
-    container:
-      image: ghcr.io/all-hands-ai/openhands
-      volumes:
-        - /var/run/docker.sock:/var/run/docker.sock
-    steps:
-    - name: install git, github cli
-      run: apt-get install -y git gh
-    - name: Checkout Repository
-      uses: actions/checkout@v4
-    - name: Write Task File
-      env:
-        ISSUE_TITLE: ${{ github.event.issue.title }}
-        ISSUE_BODY: ${{ github.event.issue.body }}
-      run: |
-        echo "TITLE:" > task.txt
-        echo "${ISSUE_TITLE}" >> task.txt
-        echo "" >> task.txt
-        echo "BODY:" >> task.txt
-        echo "${ISSUE_BODY}" >> task.txt
-    - name: Set up environment
-      run: |
-        curl -sSL https://install.python-poetry.org | python3 -
-        export PATH="/github/home/.local/bin:$PATH"
-        poetry install --without evaluation,llama-index
-        poetry run playwright install --with-deps chromium
-    - name: Run OpenHands
-      env:
-        ISSUE_TITLE: ${{ github.event.issue.title }}
-        ISSUE_BODY: ${{ github.event.issue.body }}
-        LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-      run: |
-        # Append path to launch poetry
-        export PATH="/github/home/.local/bin:$PATH"
-        # Append path to correctly import package, note: must set pwd at first
-        export PYTHONPATH=$(pwd):$PYTHONPATH
-        WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE poetry run python ./openhands/core/main.py -i 50 -f task.txt -d $GITHUB_WORKSPACE
-        rm task.txt
-    - name: Setup Git, Create Branch, and Commit Changes
-      run: |
-        # Setup Git configuration
-        git config --global --add safe.directory $PWD
-        git config --global user.name 'OpenHands'
-        git config --global user.email 'OpenHands@users.noreply.github.com'
-
-        # Create a unique branch name with a timestamp
-        BRANCH_NAME="fix/${{ github.event.issue.number }}-$(date +%Y%m%d%H%M%S)"
-
-        # Checkout new branch
-        git checkout -b $BRANCH_NAME
-
-        # Add all changes to staging, except task.txt
-        git add --all -- ':!task.txt'
-
-        # Commit the changes, if any
-        git commit -m "OpenHands: Resolve Issue #${{ github.event.issue.number }}"
-        if [ $? -ne 0 ]; then
-          echo "No changes to commit."
-          exit 0
-        fi
-
-        # Push changes
-        git push --set-upstream origin $BRANCH_NAME
-    - name: Fetch Default Branch
-      env:
-        GH_TOKEN: ${{ github.token }}
-      run: |
-        # Fetch the default branch using gh cli
-        DEFAULT_BRANCH=$(gh repo view --json defaultBranchRef --jq .defaultBranchRef.name)
-        echo "Default branch is $DEFAULT_BRANCH"
-        echo "DEFAULT_BRANCH=$DEFAULT_BRANCH" >> $GITHUB_ENV
-    - name: Generate PR
-      env:
-        GH_TOKEN: ${{ github.token }}
-      run: |
-        # Create PR and capture URL
-        PR_URL=$(gh pr create \
-          --title "OpenHands: Resolve Issue #2" \
-          --body "This PR was generated by OpenHands to resolve issue #2" \
-          --repo "foragerr/OpenHands" \
-          --head "${{ github.head_ref }}" \
-          --base "${{ env.DEFAULT_BRANCH }}" \
-          | grep -o 'https://github.com/[^ ]*')
-
-        # Extract PR number from URL
-        PR_NUMBER=$(echo "$PR_URL" | grep -o '[0-9]\+$')
-
-        # Set environment vars
-        echo "PR_URL=$PR_URL" >> $GITHUB_ENV
-        echo "PR_NUMBER=$PR_NUMBER" >> $GITHUB_ENV
-
-    - name: Post Comment
-      env:
-        GH_TOKEN: ${{ github.token }}
-      run: |
-        gh issue comment ${{ github.event.issue.number }} \
-          -b "OpenHands raised [PR #${{ env.PR_NUMBER }}](${{ env.PR_URL }}) to resolve this issue."
--- a/.gitignore
+++ b/.gitignore
@@ -217,8 +217,6 @@ config.toml
 config.toml_
 config.toml.bak

-containers/agnostic_sandbox
-
 # swe-bench-eval
 image_build_logs
 run_instance_logs
@@ -228,3 +226,4 @@ runtime_*.tar
 # docker build
 containers/runtime/Dockerfile
 containers/runtime/project.tar.gz
+containers/runtime/code
--- a/.openhands_instructions
+++ b/.openhands_instructions
@@ -0,0 +1,27 @@
+OpenHands is an automated AI software engineer. It is a repo with a Python backend
+(in the `openhands` directory) and TypeScript frontend (in the `frontend` directory).
+
+General Setup:
+- To set up the entire repo, including frontend and backend, run `make build`
+
+Backend:
+- Located in the `openhands` directory
+- Testing:
+  - All tests are in `tests/unit/test_*.py`
+  - To test new code, run `poetry run pytest tests/unit/test_xxx.py` where `xxx` is the appropriate file for the current functionality
+  - Write all tests with pytest
+
+Frontend:
+- Located in the `frontend` directory
+- Prerequisites: A recent version of NodeJS / NPM
+- Setup: Run `npm install` in the frontend directory
+- Testing:
+  - Run tests: `npm run test`
+  - To run specific tests: `npm run test -- -t "TestName"`
+- Building:
+  - Build for production: `npm run build`
+- Environment Variables:
+  - Set in `frontend/.env` or as environment variables
+  - Available variables: VITE_BACKEND_HOST, VITE_USE_TLS, VITE_INSECURE_SKIP_VERIFY, VITE_FRONTEND_PORT
+- Internationalization:
+  - Generate i18n declaration file: `npm run make-i18n`
--- a/2
+++ b/2
@@ -190,7 +190,7 @@ build-frontend:
 # Start backend
 start-backend:
 	@echo "$(YELLOW)Starting backend...$(RESET)"
-	@poetry run uvicorn openhands.server.listen:app --host $(BACKEND_HOST) --port $(BACKEND_PORT) --reload --reload-exclude "workspace/*"
+	@poetry run uvicorn openhands.server.listen:app --host $(BACKEND_HOST) --port $(BACKEND_PORT) --reload --reload-exclude "$(shell pwd)/workspace"

 # Start frontend
 start-frontend:
--- a/README.md
+++ b/README.md
@@ -42,6 +42,8 @@ system requirements and more information.
 ```bash
 export WORKSPACE_BASE=$(pwd)/workspace

+docker pull ghcr.io/all-hands-ai/runtime:0.9-nikolaik
+
 docker run -it --pull=always \
    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.9-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
@@ -56,6 +58,10 @@ docker run -it --pull=always \

 You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)!

+You'll need a model provider and API key. One option that works well: [Claude 3.5 Sonnet](https://www.anthropic.com/api), but you have [many options](https://docs.all-hands.dev/modules/usage/llms).
+
+---
+
 You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode),
 or as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode).

--- a/agenthub/browsing_agent/browsing_agent.py
+++ b/agenthub/browsing_agent/browsing_agent.py
@@ -216,11 +216,8 @@ class BrowsingAgent(Agent):
        prompt = get_prompt(error_prefix, cur_url, cur_axtree_txt, prev_action_str)
        messages.append(Message(role='user', content=[TextContent(text=prompt)]))

-        flat_messages = self.llm.format_messages_for_llm(messages)
-
        response = self.llm.completion(
-            messages=flat_messages,
-            temperature=0.0,
+            messages=self.llm.format_messages_for_llm(messages),
            stop=[')```', ')\n```'],
        )
        return self.response_parser.parse(response)
--- a/agenthub/browsing_agent/prompt.py
+++ b/agenthub/browsing_agent/prompt.py
@@ -57,7 +57,7 @@ class Flags:

    @classmethod
    def from_dict(self, flags_dict):
-        """Helper for JSON serializble requirement."""
+        """Helper for JSON serializable requirement."""
        if isinstance(flags_dict, Flags):
            return flags_dict

--- a/agenthub/codeact_agent/README.md
+++ b/agenthub/codeact_agent/README.md
@@ -10,20 +10,3 @@ The conceptual idea is illustrated below. At each turn, the agent can:
   - Execute any valid `Python` code with [an interactive Python interpreter](https://ipython.org/). This is simulated through `bash` command, see plugin system below for more details.

 ![image](https://github.com/All-Hands-AI/OpenHands/assets/38853559/92b622e3-72ad-4a61-8f41-8c040b6d5fb3)
-
-## Plugin System
-
-To make the CodeAct agent more powerful with only access to `bash` action space, CodeAct agent leverages OpenHands's plugin system:
- [Jupyter plugin](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/runtime/plugins/jupyter): for IPython execution via bash command
- [Agent Skills plugin](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/runtime/plugins/agent_skills): Powerful bash command line tools for software development tasks introduced by [swe-agent](https://github.com/princeton-nlp/swe-agent).
-
-## Demo
-
-https://github.com/All-Hands-AI/OpenHands/assets/38853559/f592a192-e86c-4f48-ad31-d69282d5f6ac
-
-*Example of CodeActAgent with `gpt-4-turbo-2024-04-09` performing a data science task (linear regression)*
-
-## Work-in-progress & Next step
-
-[] Support web-browsing
-[] Complete the workflow for CodeAct agent to submit Github PRs
--- a/agenthub/codeact_agent/action_parser.py
+++ b/agenthub/codeact_agent/action_parser.py
@@ -40,6 +40,10 @@ class CodeActResponseParser(ResponseParser):
        if action is None:
            return ''
        for lang in ['bash', 'ipython', 'browse']:
+            # Special handling for DeepSeek: it has a stop-word bug and returns '</execute_ipython' (missing the closing '>') instead of '</execute_ipython>'
+            if f'</execute_{lang}' in action and f'</execute_{lang}>' not in action:
+                action = action.replace(f'</execute_{lang}', f'</execute_{lang}>')
+
            if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
                action += f'</execute_{lang}>'
        return action
--- a/agenthub/codeact_agent/codeact_agent.py
+++ b/agenthub/codeact_agent/codeact_agent.py
@@ -5,7 +5,6 @@ from agenthub.codeact_agent.action_parser import CodeActResponseParser
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
-from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import ImageContent, Message, TextContent
 from openhands.events.action import (
    Action,
@@ -153,7 +152,10 @@ class CodeActAgent(Agent):
            text = truncate_content(text, max_message_chars)
            return Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, AgentDelegateObservation):
-            text = obs_prefix + truncate_content(str(obs.outputs), max_message_chars)
+            text = obs_prefix + truncate_content(
+                obs.outputs['content'] if 'content' in obs.outputs else '',
+                max_message_chars,
+            )
            return Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, ErrorObservation):
            text = obs_prefix + truncate_content(obs.content, max_message_chars)
@@ -200,22 +202,9 @@ class CodeActAgent(Agent):
                '</execute_bash>',
                '</execute_browse>',
            ],
-            'temperature': 0.0,
        }

-        if self.llm.is_caching_prompt_active():
-            params['extra_headers'] = {
-                'anthropic-beta': 'prompt-caching-2024-07-31',
-            }
-
-        try:
-            response = self.llm.completion(**params)
-        except Exception as e:
-            logger.error(f'{e}')
-            error_message = '{}: {}'.format(type(e).__name__, str(e).split('\n')[0])
-            return AgentFinishAction(
-                thought=f'Agent encountered an error while processing the last action.\nError: {error_message}\nPlease try again.'
-            )
+        response = self.llm.completion(**params)

        return self.action_parser.parse(response)

--- a/agenthub/codeact_swe_agent/codeact_swe_agent.py
+++ b/agenthub/codeact_swe_agent/codeact_swe_agent.py
@@ -166,7 +166,6 @@ class CodeActSWEAgent(Agent):
                '</execute_ipython>',
                '</execute_bash>',
            ],
-            temperature=0.0,
        )

        return self.response_parser.parse(response)
--- a/agenthub/micro/agent.py
+++ b/agenthub/micro/agent.py
@@ -78,7 +78,6 @@ class MicroAgent(Agent):
        message = Message(role='user', content=content)
        resp = self.llm.completion(
            messages=self.llm.format_messages_for_llm(message),
-            temperature=0.0,
        )
        action_resp = resp['choices'][0]['message']['content']
        action = parse_response(action_resp)
--- a/config.template.toml
+++ b/config.template.toml
@@ -112,7 +112,7 @@ api_key = "your-api-key"
 #embedding_deployment_name = ""

 # Embedding model to use
-embedding_model = ""
+embedding_model = "local"

 # Maximum number of characters in an observation's content
 #max_message_chars = 10000
@@ -146,8 +146,8 @@ model = "gpt-4o"
 # Drop any unmapped (unsupported) params without causing an exception
 #drop_params = false

-# Using the prompt caching feature provided by the LLM
-#caching_prompt = false
+# Using the prompt caching feature if provided by the LLM and supported
+#caching_prompt = true

 # Base URL for the OLLAMA API
 #ollama_base_url = ""
@@ -159,17 +159,17 @@ model = "gpt-4o"
 #timeout = 0

 # Top p for the API
-#top_p = 0.5
+#top_p = 1.0

 # If model is vision capable, this option allows to disable image processing (useful for cost reduction).
 #disable_vision = true

-[llm.gpt3]
+[llm.gpt4o-mini]
 # API key to use
 api_key = "your-api-key"

 # Model to use
-model = "gpt-3.5"
+model = "gpt-4o-mini"

 #################################### Agent ###################################
 # Configuration for agents (group name starts with 'agent')
@@ -188,7 +188,7 @@ model = "gpt-3.5"
 #memory_max_threads = 2

 # LLM config group to use
-#llm_config = 'llm'
+#llm_config = 'your-llm-config-group'

 [agent.RepoExplorerAgent]
 # Example: use a cheaper model for RepoExplorerAgent to reduce cost, especially
@@ -232,7 +232,7 @@ llm_config = 'gpt3'
 [security]

 # Enable confirmation mode
-#confirmation_mode = true
+#confirmation_mode = false

 # The security analyzer to use
 #security_analyzer = ""
--- a/containers/app/Dockerfile
+++ b/containers/app/Dockerfile
@@ -37,7 +37,7 @@ ARG OPENHANDS_BUILD_VERSION #re-declare for this section
 ENV RUN_AS_OPENHANDS=true
 # A random number--we need this to be different from the user's UID on the host machine
 ENV OPENHANDS_USER_ID=42420
-ENV SANDBOX_API_HOSTNAME=host.docker.internal
+ENV SANDBOX_LOCAL_RUNTIME_URL=http://host.docker.internal
 ENV USE_HOST_NETWORK=false
 ENV WORKSPACE_BASE=/opt/workspace_base
 ENV OPENHANDS_BUILD_VERSION=$OPENHANDS_BUILD_VERSION
@@ -70,10 +70,11 @@ RUN playwright install --with-deps chromium
 COPY --chown=openhands:app --chmod=770 ./openhands ./openhands
 COPY --chown=openhands:app --chmod=777 ./openhands/runtime/plugins ./openhands/runtime/plugins
 COPY --chown=openhands:app --chmod=770 ./agenthub ./agenthub
-COPY --chown=openhands:app --chmod=770 ./pyproject.toml ./pyproject.toml
-COPY --chown=openhands:app --chmod=770 ./poetry.lock ./poetry.lock
-COPY --chown=openhands:app --chmod=770 ./README.md ./README.md
-COPY --chown=openhands:app --chmod=770 ./MANIFEST.in ./MANIFEST.in
+COPY --chown=openhands:app ./pyproject.toml ./pyproject.toml
+COPY --chown=openhands:app ./poetry.lock ./poetry.lock
+COPY --chown=openhands:app ./README.md ./README.md
+COPY --chown=openhands:app ./MANIFEST.in ./MANIFEST.in
+COPY --chown=openhands:app ./LICENSE ./LICENSE

 # This is run as "openhands" user, and will create __pycache__ with openhands:openhands ownership
 RUN python openhands/core/download.py # No-op to download assets
--- a/containers/build.sh
+++ b/containers/build.sh
@@ -1,13 +1,40 @@
 #!/bin/bash
 set -eo pipefail

-image_name=$1
-org_name=$2
+# Initialize variables with default values
+image_name=""
+org_name=""
 push=0
-if [[ $3 == "--push" ]]; then
-  push=1
+load=0
+tag_suffix=""
+
+# Function to display usage information
+usage() {
+    echo "Usage: $0 -i <image_name> [-o <org_name>] [--push] [--load] [-t <tag_suffix>]"
+    echo "  -i: Image name (required)"
+    echo "  -o: Organization name"
+    echo "  --push: Push the image"
+    echo "  --load: Load the image"
+    echo "  -t: Tag suffix"
+    exit 1
+}
+
+# Parse command-line options
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -i) image_name="$2"; shift 2 ;;
+        -o) org_name="$2"; shift 2 ;;
+        --push) push=1; shift ;;
+        --load) load=1; shift ;;
+        -t) tag_suffix="$2"; shift 2 ;;
+        *) usage ;;
+    esac
+done
+# Check if required arguments are provided
+if [[ -z "$image_name" ]]; then
+    echo "Error: Image name is required."
+    usage
 fi
-tag_suffix=$4

 echo "Building: $image_name"
 tags=()
@@ -95,14 +122,35 @@ if [[ $push -eq 1 ]]; then
  args+=" --cache-to=type=registry,ref=$DOCKER_REPOSITORY:$cache_tag,mode=max"
 fi

+if [[ $load -eq 1 ]]; then
+  args+=" --load"
+fi
+
 echo "Args: $args"

+# Modify the platform selection based on --load flag
+if [[ $load -eq 1 ]]; then
+  # When loading, build only for the current platform
+  platform=$(docker version -f '{{.Server.Os}}/{{.Server.Arch}}')
+else
+  # For push or without load, build for multiple platforms
+  platform="linux/amd64,linux/arm64"
+fi
+
+echo "Building for platform(s): $platform"
+
 docker buildx build \
  $args \
  --build-arg OPENHANDS_BUILD_VERSION="$OPENHANDS_BUILD_VERSION" \
  --cache-from=type=registry,ref=$DOCKER_REPOSITORY:$cache_tag \
  --cache-from=type=registry,ref=$DOCKER_REPOSITORY:$cache_tag_base-main \
-  --platform linux/amd64,linux/arm64 \
+  --platform $platform \
  --provenance=false \
  -f "$dir/Dockerfile" \
  "$DOCKER_BASE_DIR"
+
+# If load was requested, print the loaded images
+if [[ $load -eq 1 ]]; then
+  echo "Local images built:"
+  docker images "$DOCKER_REPOSITORY" --format "{{.Repository}}:{{.Tag}}"
+fi
--- a/containers/runtime/README.md
+++ b/containers/runtime/README.md
@@ -1,11 +1,12 @@
-# Dynamic constructed Dockerfile
+# Dynamically constructed Dockerfile

-This folder builds runtime image (sandbox), which will use a `Dockerfile` that is dynamically generated depends on the `base_image` AND a [Python source distribution](https://docs.python.org/3.10/distutils/sourcedist.html) that's based on the current commit of `openhands`.
+This folder builds a runtime image (sandbox), which will use a dynamically generated `Dockerfile`
+that depends on the `base_image` **AND** a [Python source distribution](https://docs.python.org/3.10/distutils/sourcedist.html) that is based on the current commit of `openhands`.

-The following command will generate Dockerfile for `ubuntu:22.04` and the source distribution `.tar` into `containers/runtime`.
+The following command will generate a `Dockerfile` file for `nikolaik/python-nodejs:python3.11-nodejs22` (the default base image), an updated `config.sh` and the runtime source distribution files/folders into `containers/runtime`:

 ```bash
 poetry run python3 openhands/runtime/utils/runtime_build.py \
-    --base_image ubuntu:22.04 \
+    --base_image nikolaik/python-nodejs:python3.11-nodejs22 \
    --build_folder containers/runtime
 ```
--- a/containers/sandbox/Dockerfile
+++ b/containers/sandbox/Dockerfile
@@ -1,44 +0,0 @@
-FROM ubuntu:22.04
-
-# install basic packages
-RUN apt-get update && apt-get install -y \
-    curl \
-    wget \
-    git \
-    vim \
-    nano \
-    unzip \
-    zip \
-    python3 \
-    python3-pip \
-    python3-venv \
-    python3-dev \
-    build-essential \
-    openssh-server \
-    sudo \
-    gcc \
-    jq \
-    g++ \
-    make \
-    iproute2 \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN mkdir -p -m0755 /var/run/sshd
-
-# symlink python3 to python
-RUN ln -s /usr/bin/python3 /usr/bin/python
-
-# ==== OpenHands Runtime Client ====
-RUN mkdir -p /openhands && mkdir -p /openhands/logs && chmod 777 /openhands/logs
-RUN wget --progress=bar:force -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
-RUN bash Miniforge3.sh -b -p /openhands/miniforge3
-RUN chmod -R g+w /openhands/miniforge3
-RUN bash -c ". /openhands/miniforge3/etc/profile.d/conda.sh && conda config --set changeps1 False && conda config --append channels conda-forge"
-RUN echo "" > /openhands/bash.bashrc
-RUN rm -f Miniforge3.sh
-
-# - agentskills dependencies
-RUN /openhands/miniforge3/bin/pip install --upgrade pip
-RUN /openhands/miniforge3/bin/pip install jupyterlab notebook jupyter_kernel_gateway flake8
-RUN /openhands/miniforge3/bin/pip install python-docx PyPDF2 python-pptx pylatexenc openai
-RUN /openhands/miniforge3/bin/pip install python-dotenv toml termcolor pydantic python-docx pyyaml docker pexpect tenacity e2b browsergym minio
--- a/containers/sandbox/config.sh
+++ b/containers/sandbox/config.sh
@@ -1,4 +0,0 @@
-DOCKER_REGISTRY=ghcr.io
-DOCKER_ORG=all-hands-ai
-DOCKER_IMAGE=sandbox
-DOCKER_BASE_DIR="."
--- a/docs/modules/usage/getting-started.mdx
+++ b/docs/modules/usage/getting-started.mdx
@@ -18,6 +18,8 @@ existing code that you'd like to modify.
 ```bash
 export WORKSPACE_BASE=$(pwd)/workspace

+docker pull ghcr.io/all-hands-ai/runtime:0.9-nikolaik
+
 docker run -it --pull=always \
    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.9-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
@@ -30,8 +32,7 @@ docker run -it --pull=always \
    ghcr.io/all-hands-ai/openhands:0.9
 ```

-You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode),
-or as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode).
+You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode), as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode), or using the [OpenHands GitHub Action](https://docs.all-hands.dev/modules/usage/how-to/github-action).

 ## Setup

@@ -54,7 +55,7 @@ The `Advanced Options` also allow you to specify a `Base URL` if required.

 ## Versions

-The command above pulls the `0.9` tag, which represents the most recent stable release of OpenHands. You have other options as well:
+The command above pulls the most recent stable release of OpenHands. You have other options as well:
 - For a specific release, use `ghcr.io/all-hands-ai/openhands:$VERSION`, replacing $VERSION with the version number.
 - We use semver, and release major, minor, and patch tags. So `0.9` will automatically point to the latest `0.9.x` release, and `0` will point to the latest `0.x.x` release.
 - For the most up-to-date development version, you can use `ghcr.io/all-hands-ai/openhands:main`. This version is unstable and is recommended for testing or development purposes only.
--- a/docs/modules/usage/how-to/debugging.md
+++ b/docs/modules/usage/how-to/debugging.md
@@ -0,0 +1,71 @@
+# Debugging
+
+The following is intended as a primer on debugging OpenHands for development purposes.
+
+## Server / VSCode
+
+The following `launch.json` will allow debugging the agent, controller and server elements, but not the sandbox (which runs inside Docker). It will ignore any changes inside the `workspace/` directory:
+
+```
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "OpenHands CLI",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "openhands.core.cli",
+            "justMyCode": false
+        },
+        {
+            "name": "OpenHands WebApp",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "uvicorn",
+            "args": [
+                "openhands.server.listen:app",
+                "--reload",
+                "--reload-exclude",
+                "${workspaceFolder}/workspace",
+                "--port",
+                "3000"
+            ],
+            "justMyCode": false
+        }
+    ]
+}
+```
+
+More specific debugging configurations which include more parameters may be specified:
+
+```
+    ...
+    {
+      "name": "Debug CodeAct",
+      "type": "debugpy",
+      "request": "launch",
+      "module": "openhands.core.main",
+      "args": [
+        "-t",
+        "Ask me what your task is.",
+        "-d",
+        "${workspaceFolder}/workspace",
+        "-c",
+        "CodeActAgent",
+        "-l",
+        "llm.o1",
+        "-n",
+        "prompts"
+      ],
+      "justMyCode": false
+    }
+    ...
+```
+
+Values in the snippet above can be updated such that:
+
+* `-t`: the task
+* `-d`: the OpenHands workspace directory
+* `-c`: the agent
+* `-l`: the LLM config (pre-defined in `config.toml`)
+* `-n`: the session name (e.g. the event stream name)
--- a/docs/modules/usage/how-to/github-action.md
+++ b/docs/modules/usage/how-to/github-action.md
@@ -0,0 +1,15 @@
+# Using the OpenHands GitHub Action
+
+This guide explains how to use the OpenHands GitHub Action, both within the OpenHands repository and in your own projects.
+
+## Using the Action in the OpenHands Repository
+
+To use the OpenHands GitHub Action in the OpenHands repository, an OpenHands maintainer can:
+
+1. Create an issue in the repository.
+2. Add the `fix-me` label to the issue.
+3. The action will automatically trigger and attempt to resolve the issue.
+
+## Installing the Action in a New Repository
+
+To install the OpenHands GitHub Action in your own repository, follow the [directions in the OpenHands Resolver repo](https://github.com/All-Hands-AI/OpenHands-resolver?tab=readme-ov-file#using-the-github-actions-workflow).
--- a/docs/modules/usage/how-to/openshift-example.md
+++ b/docs/modules/usage/how-to/openshift-example.md
@@ -177,6 +177,7 @@ spec:
      claimName: docker-pvc
 ```

+
 ```bash
 # create the pod
 $ oc create -f pod.yaml
@@ -262,3 +263,167 @@ Events:                   <none>
 6. Connect to OpenHands UI, configure the Agent, then test:

 ![image](https://github.com/user-attachments/assets/12f94804-a0c7-4744-b873-e003c9caf40e)
+
+
+
+## GCP GKE OpenHands deployment
+
+**Warning**: this deployment grants the OpenHands application access to the Kubernetes docker socket, which creates security risk. Use at your own discretion.
+- Step 1: Create policy for privileged access
+- Step 2: Create GKE credentials (optional)
+- Step 3: Create OpenHands deployment
+- Step 4: Verification and UI access commands
+- Step 5: Troubleshoot pod to verify the internal container
+
+1. Create policy for privileged access
+```yaml
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: privileged-role
+rules:
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["create", "get", "list", "watch", "delete"]
+- apiGroups: ["apps"]
+  resources: ["deployments"]
+  verbs: ["create", "get", "list", "watch", "delete"]
+- apiGroups: [""]
+  resources: ["pods/exec"]
+  verbs: ["create"]
+- apiGroups: [""]
+  resources: ["pods/log"]
+  verbs: ["get"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: privileged-role-binding
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: privileged-role
+subjects:
+- kind: ServiceAccount
+  name: default  # Change to your service account name
+  namespace: default
+```
+2. Create GKE credentials (optional)
+```bash
+kubectl create secret generic google-cloud-key \
+  --from-file=key.json=/path/to/your/google-cloud-key.json
+  ```
+3. Create the OpenHands deployment
+   **Note**: this was tested with a single worker node. If you have multiple worker nodes, specify the flag for the single worker.
+
+```yaml
+kind: Deployment
+metadata:
+  name: openhands-app-2024
+  labels:
+    app: openhands-app-2024
+spec:
+  replicas: 1  # You can increase this number for multiple replicas
+  selector:
+    matchLabels:
+      app: openhands-app-2024
+  template:
+    metadata:
+      labels:
+        app: openhands-app-2024
+    spec:
+      containers:
+      - name: openhands-app-2024
+        image: ghcr.io/all-hands-ai/openhands:main
+        env:
+        - name: SANDBOX_USER_ID
+          value: "1000"
+        - name: SANDBOX_API_HOSTNAME
+          value: '10.164.0.4'
+        - name: WORKSPACE_MOUNT_PATH
+          value: "/tmp/workspace_base"
+        - name: GOOGLE_APPLICATION_CREDENTIALS
+          value: "/tmp/workspace_base/google-cloud-key.json"
+        volumeMounts:
+        - name: workspace-volume
+          mountPath: /tmp/workspace_base
+        - name: docker-sock
+          mountPath: /var/run/docker.sock
+        - name: google-credentials
+          mountPath: "/tmp/workspace_base/google-cloud-key.json"
+        securityContext:
+          privileged: true  # Add this to allow privileged access
+        ports:
+        - containerPort: 3000
+      - name: openhands-sandbox-2024
+        image: ghcr.io/opendevin/sandbox:main
+    #    securityContext:
+    #      privileged: true  # Add this to allow privileged access
+        ports:
+        - containerPort: 51963
+        command: ["/usr/sbin/sshd", "-D", "-p 51963", "-o", "PermitRootLogin=yes"]
+      volumes:
+      #- name: workspace-volume
+      #  persistentVolumeClaim:
+      #    claimName: workspace-pvc
+      - name: workspace-volume
+        emptyDir: {}
+      - name: docker-sock
+        hostPath:
+          path: /var/run/docker.sock       # Use host's Docker socket
+          type: Socket
+      - name: google-credentials
+        secret:
+          secretName: google-cloud-key
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: openhands-app-2024-svc
+spec:
+  selector:
+    app: openhands-app-2024
+  ports:
+  - name: http
+    protocol: TCP
+    port: 80
+    targetPort: 3000
+  - name: ssh
+    protocol: TCP
+    port: 51963
+    targetPort: 51963
+  type: LoadBalancer
+  ```
+
+5. Troubleshoot pod to verify the internal container
+   If you want to know more about the internal container runtime, use the pod deployment below: run `kubectl exec -it` to enter the container, then inspect the container runtime with normal Docker commands such as `docker ps -a`.
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: docker-in-docker
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: docker-in-docker
+  template:
+    metadata:
+      labels:
+        app: docker-in-docker
+    spec:
+      containers:
+      - name: dind
+        image: docker:20.10-dind
+        securityContext:
+          privileged: true
+        volumeMounts:
+        - name: docker-sock
+          mountPath: /var/run/docker.sock
+      volumes:
+      - name: docker-sock
+        hostPath:
+          path: /var/run/docker.sock
+          type: Socket
+```
--- a/docs/modules/usage/llms/azure-llms.md
+++ b/docs/modules/usage/llms/azure-llms.md
@@ -1,6 +1,6 @@
 # Azure

-OpenHands uses LiteLLM for completion calls. You can find their documentation on Azure [here](https://docs.litellm.ai/docs/providers/azure).
+OpenHands uses LiteLLM to make calls to Azure's chat models. You can find their documentation on using Azure as a provider [here](https://docs.litellm.ai/docs/providers/azure).

 ## Azure OpenAI Configuration

@@ -27,7 +27,7 @@ You will need your ChatGPT deployment name which can be found on the deployments

 * Enable `Advanced Options`
 * `Custom Model` to azure/&lt;deployment-name&gt;
-* `Base URL` to your Azure API Base URL (Example: `https://example-endpoint.openai.azure.com`)
+* `Base URL` to your Azure API Base URL (e.g. `https://example-endpoint.openai.azure.com`)
 * `API Key` to your Azure API key

 ## Embeddings
--- a/docs/modules/usage/llms/google-llms.md
+++ b/docs/modules/usage/llms/google-llms.md
@@ -1,6 +1,6 @@
 # Google Gemini/Vertex

-OpenHands uses LiteLLM for completion calls. The following resources are relevant for using OpenHands with Google's LLMs:
+OpenHands uses LiteLLM to make calls to Google's chat models. You can find their documentation on using Google as a provider:

 - [Gemini - Google AI Studio](https://docs.litellm.ai/docs/providers/gemini)
 - [VertexAI - Google Cloud Platform](https://docs.litellm.ai/docs/providers/vertex)
@@ -10,7 +10,7 @@ OpenHands uses LiteLLM for completion calls. The following resources are relevan
 When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
 * `LLM Provider` to `Gemini`
 * `LLM Model` to the model you will be using.
-If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (i.e. gemini/&lt;model-name&gt;).
+If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. gemini/&lt;model-name&gt; like `gemini/gemini-1.5-pro`).
 * `API Key` to your Gemini API key

 ## VertexAI - Google Cloud Platform Configs
@@ -27,4 +27,4 @@ VERTEXAI_LOCATION="<your-gcp-location>"
 Then set the following in the OpenHands UI through the Settings:
 * `LLM Provider` to `VertexAI`
 * `LLM Model` to the model you will be using.
-If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (i.e. vertex_ai/&lt;model-name&gt;).
+If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. vertex_ai/&lt;model-name&gt;).
--- a/docs/modules/usage/llms/groq.md
+++ b/docs/modules/usage/llms/groq.md
@@ -1,15 +1,15 @@
 # Groq

-OpenHands uses LiteLLM to make calls to chat models on Groq. You can find their full documentation on using Groq as provider [here](https://docs.litellm.ai/docs/providers/groq).
+OpenHands uses LiteLLM to make calls to chat models on Groq. You can find their documentation on using Groq as a provider [here](https://docs.litellm.ai/docs/providers/groq).

 ## Configuration

 When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
 * `LLM Provider` to `Groq`
-* `LLM Model` to the model you will be using. [Visit **here** to see the list of
+* `LLM Model` to the model you will be using. [Visit here to see the list of
 models that Groq hosts](https://console.groq.com/docs/models). If the model is not in the list, toggle
-`Advanced Options`, and enter it in `Custom Model` (i.e. groq/&lt;model-name&gt;)
-* `API key` to your Groq API key. To find or create your Groq API Key, [see **here**](https://console.groq.com/keys)
+`Advanced Options`, and enter it in `Custom Model` (e.g. groq/&lt;model-name&gt; like `groq/llama3-70b-8192`)
+* `API key` to your Groq API key. To find or create your Groq API Key, [see here](https://console.groq.com/keys)



@@ -18,6 +18,6 @@ models that Groq hosts](https://console.groq.com/docs/models). If the model is n
 The Groq endpoint for chat completion is [mostly OpenAI-compatible](https://console.groq.com/docs/openai). Therefore, you can access Groq models as you
 would access any OpenAI-compatible endpoint. You can set the following in the OpenHands UI through the Settings:
 * Enable `Advanced Options`
-* `Custom Model` to the prefix `openai/` + the model you will be using (Example: `openai/llama3-8b-8192`)
+* `Custom Model` to the prefix `openai/` + the model you will be using (e.g. `openai/llama3-70b-8192`)
 * `Base URL` to `https://api.groq.com/openai/v1`
 * `API Key` to your Groq API key
--- a/docs/modules/usage/llms/llms.md
+++ b/docs/modules/usage/llms/llms.md
@@ -54,14 +54,26 @@ We have a few guides for running OpenHands with specific model providers:
 * [Azure](llms/azure-llms)
 * [Google](llms/google-llms)
 * [Groq](llms/groq)
-* [ollama](llms/local-llms)
 * [OpenAI](llms/openai-llms)
+* [OpenRouter](llms/openrouter)

 ### API retries and rate limits

-Some LLMs have rate limits and may require retries. OpenHands will automatically retry requests if it receives a 429 error or API connection error.
-You can set the following environment variables to control the number of retries and the time between retries:
+LLM providers typically have rate limits, sometimes very low, and may require retries. OpenHands will automatically retry requests if it receives a Rate Limit Error (429 error code), API connection error, or other transient errors.
+
+You can customize these options as you need for the provider you're using. Check their documentation, and set the following environment variables to control the number of retries and the time between retries:

 * `LLM_NUM_RETRIES` (Default of 8)
 * `LLM_RETRY_MIN_WAIT` (Default of 15 seconds)
 * `LLM_RETRY_MAX_WAIT` (Default of 120 seconds)
+* `LLM_RETRY_MULTIPLIER` (Default of 2)
+
+If you are running `openhands` in development mode, you can also set these options to the values you need in the `config.toml` file:
+
+```toml
+[llm]
+num_retries = 8
+retry_min_wait = 15
+retry_max_wait = 120
+retry_multiplier = 2
+```
--- a/docs/modules/usage/llms/local-llms.md
+++ b/docs/modules/usage/llms/local-llms.md
@@ -28,17 +28,14 @@ mistral:7b-instruct-v0.2-q4_K_M eb14864c7427    4.4 GB  2 weeks ago
 starcoder2:latest               f67ae0f64584    1.7 GB  19 hours ago
 ```

-## Start OpenHands
-
-### Docker
+## Run OpenHands with Docker

+### Start OpenHands
 Use the instructions [here](../getting-started) to start OpenHands using Docker.
 But when running `docker run`, you'll need to add a few more arguments:

 ```bash
 --add-host host.docker.internal:host-gateway \
-e LLM_API_KEY="ollama" \
-e LLM_BASE_URL="http://host.docker.internal:11434" \
 -e LLM_OLLAMA_BASE_URL="http://host.docker.internal:11434" \
 ```

@@ -55,8 +52,6 @@ docker run \
    --pull=always \
    --add-host host.docker.internal:host-gateway \
    -e SANDBOX_USER_ID=$(id -u) \
-    -e LLM_API_KEY="ollama" \
-    -e LLM_BASE_URL="http://host.docker.internal:11434" \
    -e LLM_OLLAMA_BASE_URL="http://host.docker.internal:11434" \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -v $WORKSPACE_BASE:/opt/workspace_base \
@@ -67,6 +62,16 @@ docker run \

 You should now be able to connect to `http://localhost:3000/`

+### Configure the Web Application
+
+When running `openhands`, you'll need to set the following in the OpenHands UI through the Settings:
+- the model to "ollama/&lt;model-name&gt;"
+- the base url to `http://host.docker.internal:11434`
+- the API key is optional, you can use any string, such as `ollama`.
+
+
+## Run OpenHands in Development Mode
+
 ### Build from Source

 Use the instructions in [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) to build OpenHands.
@@ -77,23 +82,22 @@ Make sure `config.toml` is there by running `make setup-config` which will creat
 workspace_base="./workspace"

 [llm]
-model="ollama/codellama:7b"
-api_key="ollama"
 embedding_model="local"
-base_url="http://localhost:11434"
 ollama_base_url="http://localhost:11434"

 ```

-Replace `LLM_MODEL` of your choice if you need to.
+Done! Now you can start OpenHands with `make run`. You should now be able to connect to `http://localhost:3000/`

-Done! Now you can start OpenHands by: `make run` without Docker. You now should be able to connect to `http://localhost:3000/`
-
-## Select your Model
+### Configure the Web Application

 In the OpenHands UI, click on the Settings wheel in the bottom-left corner.
 Then in the `Model` input, enter `ollama/codellama:7b`, or the name of the model you pulled earlier.
-If it doesn’t show up in a dropdown, that’s fine, just type it in. Click Save when you’re done.
+If it doesn’t show up in the dropdown, enable `Advanced Settings` and type it in. Please note: you need the model name as listed by `ollama list`, with the prefix `ollama/`.
+
+In the API Key field, enter `ollama` or any value, since you don't need a particular key.
+
+In the Base URL field, enter `http://localhost:11434`.

 And now you're ready to go!

--- a/docs/modules/usage/llms/openai-llms.md
+++ b/docs/modules/usage/llms/openai-llms.md
@@ -1,15 +1,15 @@
 # OpenAI

-OpenHands uses LiteLLM to make calls to OpenAI's chat models. You can find their full documentation on OpenAI chat calls [here](https://docs.litellm.ai/docs/providers/openai).
+OpenHands uses LiteLLM to make calls to OpenAI's chat models. You can find their documentation on using OpenAI as a provider [here](https://docs.litellm.ai/docs/providers/openai).

 ## Configuration

 When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
 * `LLM Provider` to `OpenAI`
 * `LLM Model` to the model you will be using.
-[Visit **here** to see a full list of OpenAI models that LiteLLM supports.](https://docs.litellm.ai/docs/providers/openai#openai-chat-completion-models)
-If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (i.e. openai/&lt;model-name&gt;).
-* `API Key` to your OpenAI API key. To find or create your OpenAI Project API Key, [see **here**](https://platform.openai.com/api-keys).
+[Visit here to see a full list of OpenAI models that LiteLLM supports.](https://docs.litellm.ai/docs/providers/openai#openai-chat-completion-models)
+If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. openai/&lt;model-name&gt; like `openai/gpt-4o`).
+* `API Key` to your OpenAI API key. To find or create your OpenAI Project API Key, [see here](https://platform.openai.com/api-keys).

 ## Using OpenAI-Compatible Endpoints

@@ -19,6 +19,6 @@ Just as for OpenAI Chat completions, we use LiteLLM for OpenAI-compatible endpoi

 If you're using an OpenAI proxy, you'll need to set the following in the OpenHands UI through the Settings:
 * Enable `Advanced Options`
-* `Custom Model` to openai/&lt;model-name&gt; (i.e.: `openai/gpt-4o` or openai/&lt;proxy-prefix&gt;/&lt;model-name&gt;)
+* `Custom Model` to openai/&lt;model-name&gt; (e.g. `openai/gpt-4o` or openai/&lt;proxy-prefix&gt;/&lt;model-name&gt;)
 * `Base URL` to the URL of your OpenAI proxy
 * `API Key` to your OpenAI API key
--- a/docs/modules/usage/llms/openrouter.md
+++ b/docs/modules/usage/llms/openrouter.md
@@ -0,0 +1,12 @@
+# OpenRouter
+
+OpenHands uses LiteLLM to make calls to chat models on OpenRouter. You can find their documentation on using OpenRouter as a provider [here](https://docs.litellm.ai/docs/providers/openrouter).
+
+## Configuration
+
+When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
+* `LLM Provider` to `OpenRouter`
+* `LLM Model` to the model you will be using.
+[Visit here to see a full list of OpenRouter models](https://openrouter.ai/models).
+If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. openrouter/&lt;model-name&gt; like `openrouter/anthropic/claude-3.5-sonnet`).
+* `API Key` to your OpenRouter API key.
--- a/docs/modules/usage/troubleshooting/troubleshooting.md
+++ b/docs/modules/usage/troubleshooting/troubleshooting.md
@@ -17,7 +17,6 @@ Check out [Notes for WSL on Windows Users](troubleshooting/windows) for some tro
 ## Common Issues

 * [Unable to connect to Docker](#unable-to-connect-to-docker)
-* [Unable to connect to LLM](#unable-to-connect-to-llm)
 * [404 Resource not found](#404-resource-not-found)
 * [`make build` getting stuck on package installations](#make-build-getting-stuck-on-package-installations)
 * [Sessions are not restored](#sessions-are-not-restored)
@@ -47,33 +46,6 @@ OpenHands uses a Docker container to do its work safely, without potentially bre
 * If you are on a Mac, check the [permissions requirements](https://docs.docker.com/desktop/mac/permission-requirements/) and in particular consider enabling the `Allow the default Docker socket to be used` under `Settings > Advanced` in Docker Desktop.
 * In addition, upgrade your Docker to the latest version under `Check for Updates`

---
-### Unable to connect to LLM
-
-[GitHub Issue](https://github.com/All-Hands-AI/OpenHands/issues/1208)
-
-**Symptoms**
-
-```python
-  File "/app/.venv/lib/python3.12/site-packages/openai/_exceptions.py", line 81, in __init__
-    super().__init__(message, response.request, body=body)
-                              ^^^^^^^^^^^^^^^^
-AttributeError: 'NoneType' object has no attribute 'request'
-```
-
-**Details**
-
-[GitHub Issues](https://github.com/All-Hands-AI/OpenHands/issues?q=is%3Aissue+is%3Aopen+404)
-
-This usually happens with *local* LLM setups, when OpenHands can't connect to the LLM server.
-See our guide for [local LLMs](llms/local-llms) for more information.
-
-**Workarounds**
-
-* Check your `base_url` in your config.toml (if it exists) under the "llm" section
-* Check that ollama (or whatever LLM you're using) is running OK
-* Make sure you're using `--add-host host.docker.internal:host-gateway` when running in Docker
-
 ---
 ### `404 Resource not found`

@@ -115,7 +87,6 @@ the API endpoint you're trying to connect to. Most often this happens for Azure
  * If you're running inside the UI, be sure to set the `model` in the settings modal
  * If you're running headless (via main.py) be sure to set `LLM_MODEL` in your env/config
 * Make sure you've followed any special instructions for your LLM provider
-  * [ollama](/modules/usage/llms/local-llms)
  * [Azure](/modules/usage/llms/azure-llms)
  * [Google](/modules/usage/llms/google-llms)
 * Make sure your API key is correct
--- a/docs/sidebars.ts
+++ b/docs/sidebars.ts
@@ -21,11 +21,6 @@ const sidebars: SidebarsConfig = {
          type: 'category',
          label: 'Providers',
          items: [
-            {
-              type: 'doc',
-              label: 'OpenAI',
-              id: 'usage/llms/openai-llms',
-            },
            {
              type: 'doc',
              label: 'Azure',
@@ -43,9 +38,14 @@ const sidebars: SidebarsConfig = {
            },
            {
              type: 'doc',
-              label: 'Local/ollama',
-              id: 'usage/llms/local-llms',
-            }
+              label: 'OpenAI',
+              id: 'usage/llms/openai-llms',
+            },
+            {
+              type: 'doc',
+              label: 'OpenRouter',
+              id: 'usage/llms/openrouter',
+            },
          ],
        },
      ],
@@ -72,6 +72,10 @@ const sidebars: SidebarsConfig = {
          type: 'doc',
          id: 'usage/how-to/headless-mode',
        },
+        {
+          type: 'doc',
+          id: 'usage/how-to/github-action',
+        },
        {
          type: 'doc',
          id: 'usage/how-to/custom-sandbox-guide',
@@ -83,6 +87,10 @@ const sidebars: SidebarsConfig = {
        {
          type: 'doc',
          id: 'usage/how-to/openshift-example',
+        },
+        {
+          type: 'doc',
+          id: 'usage/how-to/debugging',
        }
      ]
    },
--- a/docs/src/css/custom.css
+++ b/docs/src/css/custom.css
@@ -28,6 +28,6 @@
  --secondary-light: #ccc;
 }

-p a, .a {
+article a, .a {
  text-decoration: underline;
 }
--- a/evaluation/regression/README.md
+++ b/evaluation/regression/README.md
@@ -14,9 +14,9 @@ To run the tests for OpenHands project, you can use the provided test runner scr
 3. Navigate to the root directory of the project.
 4. Run the test suite using the test runner script with the required arguments:
   ```
-   python evaluation/regression/run_tests.py --OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxx --model=gpt-3.5-turbo
+   python evaluation/regression/run_tests.py --OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxx --model=gpt-4o
   ```
-   Replace `sk-xxxxxxxxxxxxxxxxxxxxxx` with your actual OpenAI API key. The default model is `gpt-3.5-turbo`, but you can specify a different model if needed.
+   Replace `sk-xxxxxxxxxxxxxxxxxxxxxx` with your actual OpenAI API key. The default model is `gpt-4o`, but you can specify a different model if needed.

 The test runner will discover and execute all the test cases in the `cases/` directory, and display the results of the test suite, including the status of each individual test case and the overall summary.

--- a/evaluation/swe_bench/README.md
+++ b/evaluation/swe_bench/README.md
@@ -24,7 +24,7 @@ This is now the default behavior.

 Make sure your Docker daemon is running, and you have ample disk space (at least 200-500GB, depends on the SWE-Bench set you are running on) for the [instance-level docker image](#openhands-swe-bench-instance-level-docker-support).

-When the `run_infer.sh` script is started, it will automatically pull the relavant SWE-Bench images. For example, for instance ID `django_django-11011`, it will try to pull our pre-build docker image `sweb.eval.x86_64.django_s_django-11011` from DockerHub. This image will be used create an OpenHands runtime image where the agent will operate on.
+When the `run_infer.sh` script is started, it will automatically pull the relevant SWE-Bench images. For example, for instance ID `django_django-11011`, it will try to pull our pre-built docker image `sweb.eval.x86_64.django_s_django-11011` from DockerHub. This image will be used to create an OpenHands runtime image in which the agent will operate.

 ```bash
 ./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
@@ -63,13 +63,13 @@ then your command would be:
 ./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10
 ```

-### Run Inference on `RemoteRuntime`
+### Run Inference on `RemoteRuntime` (experimental)

 This is in limited beta. Contact Xingyao over slack if you want to try this out!

 ```bash
 # ./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
-ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" \
+ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
 ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test
 # This example runs evaluation on CodeActAgent for 300 instances on "princeton-nlp/SWE-bench_Lite"'s test set, with max 30 iteration per instances, with 16 number of workers running in parallel
 ```
@@ -157,6 +157,24 @@ The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_be
 - `report.json`: a JSON file that contains keys like `"resolved_ids"` pointing to instance IDs that are resolved by the agent.
 - `logs/`: a directory of test logs

+### Run evaluation with `RemoteRuntime` (experimental)
+
+This is in limited beta. Contact Xingyao over slack if you want to try this out!
+
+```bash
+# ./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers] [dataset] [dataset_split]
+ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
+evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/swe_bench_lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
+# This example evaluates patches generated by CodeActAgent with Llama-3.1-70B-Instruct-Turbo on the test set of "princeton-nlp/SWE-bench_Lite", using 16 workers running in parallel
+```
+
+To clean up all existing runtimes that you've already started, run:
+
+```bash
+ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
+```
+
+
 ## Visualize Results

 First you need to clone `https://huggingface.co/spaces/OpenHands/evaluation` and add your own running results from openhands into the `outputs` of the cloned repo.
--- a/evaluation/swe_bench/eval_infer.py
+++ b/evaluation/swe_bench/eval_infer.py
@@ -0,0 +1,377 @@
+import os
+import tempfile
+import time
+
+import pandas as pd
+from swebench.harness.grading import get_eval_report
+from swebench.harness.run_evaluation import (
+    APPLY_PATCH_FAIL,
+    APPLY_PATCH_PASS,
+)
+from swebench.harness.test_spec import SWEbenchInstance, TestSpec, make_test_spec
+from swebench.harness.utils import load_swebench_dataset
+
+from evaluation.swe_bench.run_infer import get_instance_docker_image
+from evaluation.utils.shared import (
+    EvalMetadata,
+    EvalOutput,
+    prepare_dataset,
+    reset_logger_for_multiprocessing,
+    run_evaluation,
+)
+from openhands.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_parser,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.main import create_runtime
+from openhands.events.action import CmdRunAction
+from openhands.events.observation import CmdOutputObservation
+
+# TODO: migrate all swe-bench docker to ghcr.io/openhands
+DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/')
+logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')
+
+
+def process_git_patch(patch):
+    """Normalize a raw git patch string so it can be applied.
+
+    Non-string or whitespace-only input yields an empty string. Otherwise
+    CRLF line endings are converted to LF, everything before the first line
+    starting with `diff --git` is discarded (command echoes and terminal
+    control sequences from the inference run can end up in front of the
+    real diff), and the result is made to end with exactly one newline.
+    If no `diff --git` line exists, the text is kept and only the trailing
+    whitespace is normalized.
+    """
+    if not isinstance(patch, str) or not patch.strip():
+        # nothing usable: treat as an empty patch
+        return ''
+
+    normalized = patch.replace('\r\n', '\n')
+    rows = normalized.split('\n')
+
+    # Index of the first real diff header, or None when there is none.
+    first_header = next(
+        (idx for idx, row in enumerate(rows) if row.startswith('diff --git')),
+        None,
+    )
+    if first_header is not None:
+        normalized = '\n'.join(rows[first_header:])
+
+    # Guarantee a single trailing newline
+    return normalized.rstrip() + '\n'
+
+
+def get_config(instance: pd.Series) -> AppConfig:
+    """Build the `AppConfig` used to evaluate a single SWE-bench instance.
+
+    The sandbox is based on the instance-specific docker image resolved from
+    `instance['instance_id']`. The runtime type, API key and remote runtime
+    URL are read from the `RUNTIME`, `ALLHANDS_API_KEY` and
+    `SANDBOX_REMOTE_RUNTIME_API_URL` environment variables.
+    """
+    # Each SWE-bench instance is evaluated on its own instance image
+    instance_image = get_instance_docker_image(instance['instance_id'])
+    logger.info(
+        f'Using instance container image: {instance_image}. '
+        f'Please make sure this image exists. '
+        f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
+    )
+
+    sandbox_config = SandboxConfig(
+        base_container_image=instance_image,
+        use_host_network=False,
+        # generous timeout: some test cases take a very long time to run
+        timeout=1800,
+        api_key=os.environ.get('ALLHANDS_API_KEY', None),
+        remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+    )
+    return AppConfig(
+        run_as_openhands=False,
+        runtime=os.environ.get('RUNTIME', 'eventstream'),
+        sandbox=sandbox_config,
+        # the workspace is intentionally not mounted
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+
+
+def process_instance(
+    instance: pd.Series,
+    metadata: EvalMetadata | None = None,
+    reset_logger: bool = True,
+) -> EvalOutput:
+    """Apply one model patch in a fresh runtime, run the eval script and grade it.
+
+    Args:
+        instance: Row providing `instance_id`, `model_patch` and `test_spec`;
+            an existing `test_result` dict is reused if present.
+        metadata: Not used by this function.
+        reset_logger: When true, redirect logging to a per-instance log under
+            a directory derived from the module-level `output_file`.
+
+    Returns:
+        An `EvalOutput` holding `instance_id` and `test_result`. The
+        `test_result['report']` dict carries the boolean flags
+        `empty_generation`, `resolved`, `failed_apply_patch`, `error_eval`
+        and `test_timeout`.
+
+    Raises:
+        RuntimeError: If the patch-application output contains neither
+            `APPLY_PATCH_PASS` nor `APPLY_PATCH_FAIL`.
+        AssertionError: If `chmod` fails or an observation has an unexpected type.
+    """
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        global output_file
+        log_dir = output_file.replace('.jsonl', '.logs')
+        os.makedirs(log_dir, exist_ok=True)
+        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance.instance_id}.')
+
+    config = get_config(instance)
+    instance_id = instance.instance_id
+    model_patch = instance['model_patch']
+    test_spec: TestSpec = instance['test_spec']
+    logger.info(f'Starting evaluation for instance {instance_id}.')
+
+    if 'test_result' not in instance.keys():
+        instance['test_result'] = {}
+    instance['test_result']['report'] = {
+        'empty_generation': False,
+        'resolved': False,
+        'failed_apply_patch': False,
+        'error_eval': False,
+        'test_timeout': False,
+    }
+
+    # An empty patch needs no runtime at all: report it and stop early.
+    if model_patch == '':
+        instance['test_result']['report']['empty_generation'] = True
+        return EvalOutput(
+            instance_id=instance_id,
+            test_result=instance['test_result'],
+        )
+
+    runtime = create_runtime(config, sid=instance_id)
+
+    # Everything that touches the runtime is inside try/finally so that the
+    # runtime is closed even when copying files, chmod or patching fails.
+    try:
+        # Get patch and save it to /tmp/patch.diff
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Patch file
+            patch_file_path = os.path.join(temp_dir, 'patch.diff')
+            with open(patch_file_path, 'w') as f:
+                f.write(model_patch)
+            runtime.copy_to(patch_file_path, '/tmp')
+            # Eval script
+            eval_script_path = os.path.join(temp_dir, 'eval.sh')
+            with open(eval_script_path, 'w') as f:
+                f.write(test_spec.eval_script)
+            runtime.copy_to(eval_script_path, '/tmp')
+
+        # Set +x
+        action = CmdRunAction(command='chmod +x /tmp/eval.sh')
+        action.timeout = 600
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.exit_code == 0
+
+        # Apply patch: try `git apply` first, fall back to `patch`; the echoed
+        # marker strings are what the branches below look for.
+        exec_command = (
+            'cd /testbed && '
+            "(git apply -v /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
+            "(echo 'Failed to apply patch with git apply, trying with patch command...' && "
+            "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
+            "echo 'APPLY_PATCH_FAIL')))"
+        )
+        action = CmdRunAction(command=exec_command, keep_prompt=False)
+        action.timeout = 600
+        obs = runtime.run_action(action)
+        assert isinstance(obs, CmdOutputObservation)
+        apply_patch_output = obs.content
+        assert isinstance(apply_patch_output, str)
+        instance['test_result']['apply_patch_output'] = apply_patch_output
+
+        if 'APPLY_PATCH_FAIL' in apply_patch_output:
+            logger.info(f'[{instance_id}] {APPLY_PATCH_FAIL}:\n{apply_patch_output}')
+            instance['test_result']['report']['failed_apply_patch'] = True
+
+            return EvalOutput(
+                instance_id=instance_id,
+                test_result=instance['test_result'],
+            )
+        elif 'APPLY_PATCH_PASS' in apply_patch_output:
+            logger.info(f'[{instance_id}] {APPLY_PATCH_PASS}:\n{apply_patch_output}')
+
+            # Run eval script in background and save output to log file
+            log_file = '/tmp/eval_output.log'
+            action = CmdRunAction(
+                command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!', keep_prompt=False
+            )
+            action.timeout = 60  # Short timeout just to get the process ID
+            obs = runtime.run_action(action)
+
+            if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0:
+                # `echo $!` prints the background PID as the last token
+                pid = obs.content.split()[-1].strip()
+                logger.info(
+                    f'[{instance_id}] Evaluation process started with PID: {pid}'
+                )
+
+                # Poll for completion
+                start_time = time.time()
+                timeout = 1800  # 30 minutes
+                while True:
+                    seconds_elapsed = time.time() - start_time
+                    if seconds_elapsed > timeout:
+                        logger.info(
+                            f'[{instance_id}] Evaluation timed out after {timeout} seconds'
+                        )
+                        instance['test_result']['report']['test_timeout'] = True
+                        break
+                    # `ps -p` exits with 1 once the process no longer exists
+                    check_action = CmdRunAction(
+                        command=f'ps -p {pid} > /dev/null; echo $?', keep_prompt=False
+                    )
+                    check_action.timeout = 60
+                    check_obs = runtime.run_action(check_action)
+                    if (
+                        isinstance(check_obs, CmdOutputObservation)
+                        and check_obs.content.split()[-1].strip() == '1'
+                    ):
+                        logger.info(
+                            f'[{instance_id}] Evaluation process completed after {seconds_elapsed} seconds'
+                        )
+                        break
+                    logger.info(
+                        f'[{instance_id}] [{seconds_elapsed:.0f}s] Evaluation still running, waiting...'
+                    )
+                    time.sleep(30)  # Wait for 30 seconds before checking again
+
+                # Read the log file
+                cat_action = CmdRunAction(command=f'cat {log_file}', keep_prompt=False)
+                cat_action.timeout = 300
+                cat_obs = runtime.run_action(cat_action)
+
+                # Grade answer
+                if isinstance(cat_obs, CmdOutputObservation) and cat_obs.exit_code == 0:
+                    test_output = cat_obs.content
+                    assert isinstance(test_output, str)
+                    instance['test_result']['test_output'] = test_output
+
+                    # Get report from test output
+                    logger.info(f'[{instance_id}] Grading answer...')
+                    with tempfile.TemporaryDirectory() as temp_dir:
+                        # Create a directory structure that matches the expected format
+                        # NOTE: this is a hack to make the eval report format consistent
+                        # with the original SWE-Bench eval script
+                        log_dir = os.path.join(temp_dir, 'logs', instance_id)
+                        os.makedirs(log_dir, exist_ok=True)
+                        test_output_path = os.path.join(log_dir, 'test_output.txt')
+                        with open(test_output_path, 'w') as f:
+                            f.write(test_output)
+
+                        _report = get_eval_report(
+                            test_spec=test_spec,
+                            prediction={
+                                'model_patch': model_patch,
+                                'instance_id': instance_id,
+                            },
+                            log_path=test_output_path,
+                            include_tests_status=True,
+                        )
+                        report = _report[instance_id]
+                        logger.info(
+                            f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}"
+                        )
+                        instance['test_result']['report']['resolved'] = report[
+                            'resolved'
+                        ]
+            else:
+                logger.info(f'[{instance_id}] Error when starting eval:\n{obs.content}')
+                instance['test_result']['report']['error_eval'] = True
+
+            return EvalOutput(
+                instance_id=instance_id,
+                test_result=instance['test_result'],
+            )
+        else:
+            logger.info(
+                f'[{instance_id}] Unexpected output when applying patch:\n{apply_patch_output}'
+            )
+            raise RuntimeError(
+                instance_id,
+                f'Unexpected output when applying patch:\n{apply_patch_output}',
+                logger,
+            )
+    finally:
+        runtime.close()
+
+
+# Script entry point: load predictions, evaluate each patch, print a summary.
+if __name__ == '__main__':
+    parser = get_parser()
+    parser.add_argument(
+        '--input-file',
+        type=str,
+        help='Path to input predictions file',
+        required=True,
+    )
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        default='princeton-nlp/SWE-bench',
+        help='data set to evaluate on, either full-test or lite-test',
+    )
+    parser.add_argument(
+        '--split',
+        type=str,
+        default='test',
+        help='split to evaluate on',
+    )
+    args, _ = parser.parse_known_args()
+
+    # Load SWE-Bench dataset
+    full_dataset: list[SWEbenchInstance] = load_swebench_dataset(
+        args.dataset, args.split
+    )
+    instance_id_to_instance = {
+        instance['instance_id']: instance for instance in full_dataset
+    }
+    logger.info(
+        f'Loaded dataset {args.dataset} with split {args.split} to run inference on.'
+    )
+
+    # Load predictions
+    assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.'
+    predictions = pd.read_json(args.input_file, lines=True)
+    assert (
+        'instance_id' in predictions.columns
+    ), 'Input file must contain instance_id column.'
+
+    # The patch may come either from a top-level `model_patch` column or from
+    # the `git_patch` field nested in `test_result` (read further below).
+    # Reject the input only when neither source is available.
+    has_nested_git_patch = (
+        'test_result' in predictions.columns
+        and 'git_patch' in predictions['test_result'].iloc[0]
+    )
+    if 'model_patch' not in predictions.columns and not has_nested_git_patch:
+        raise ValueError(
+            'Input file must contain model_patch column OR test_result column with git_patch field.'
+        )
+    assert len(predictions['instance_id'].unique()) == len(
+        predictions
+    ), 'instance_id column must be unique.'
+
+    if 'model_patch' not in predictions.columns:
+        predictions['model_patch'] = predictions['test_result'].apply(
+            lambda x: x['git_patch']
+        )
+    assert {'instance_id', 'model_patch'}.issubset(
+        set(predictions.columns)
+    ), 'Input file must contain instance_id and model_patch columns.'
+
+    # Process model_patch
+    predictions['model_patch'] = predictions['model_patch'].apply(process_git_patch)
+
+    # Merge predictions with dataset
+    predictions['instance'] = predictions['instance_id'].apply(
+        lambda x: instance_id_to_instance[x]
+    )
+    predictions['test_spec'] = predictions['instance'].apply(make_test_spec)
+
+    # Prepare dataset
+    # NOTE: `output_file` is module-level here and is read by `process_instance`.
+    output_file = args.input_file.replace('.jsonl', '.swebench_eval.jsonl')
+    instances = prepare_dataset(predictions, output_file, args.eval_n_limit)
+
+    run_evaluation(
+        instances,
+        metadata=None,
+        output_file=output_file,
+        num_workers=args.eval_num_workers,
+        process_instance_func=process_instance,
+    )
+
+    # Load evaluated predictions & print number of resolved predictions
+    evaluated_predictions = pd.read_json(output_file, lines=True)
+    fields = ['resolved', 'failed_apply_patch', 'error_eval', 'empty_generation']
+
+    def count_report_field(row, field):
+        # Pull a single boolean flag out of the nested report dict
+        return row['test_result']['report'][field]
+
+    for field in fields:
+        count = evaluated_predictions.apply(
+            count_report_field, args=(field,), axis=1
+        ).sum()
+        logger.info(
+            f'# {field}: {count} / {len(evaluated_predictions)}. ({count / len(evaluated_predictions):.2%})'
+        )
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -2,7 +2,6 @@ import asyncio
 import json
 import os
 import tempfile
-import time
 from typing import Any

 import pandas as pd
@@ -31,7 +30,9 @@ from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import CmdRunAction
 from openhands.events.observation import CmdOutputObservation, ErrorObservation
+from openhands.events.serialization.event import event_to_dict
 from openhands.runtime.runtime import Runtime
+from openhands.runtime.utils.shutdown_listener import sleep_if_should_continue

 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
 USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
@@ -130,6 +131,7 @@ def get_config(
            # large enough timeout, since some testcases take very long to run
            timeout=300,
            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
        ),
        # do not mount workspace
        workspace_base=None,
@@ -316,10 +318,10 @@ def complete_runtime(
                break
            else:
                logger.info('Failed to get git diff, retrying...')
-                time.sleep(10)
+                sleep_if_should_continue(10)
        elif isinstance(obs, ErrorObservation):
            logger.error(f'Error occurred: {obs.content}. Retrying...')
-            time.sleep(10)
+            sleep_if_should_continue(10)
        else:
            raise ValueError(f'Unexpected observation type: {type(obs)}')

@@ -383,10 +385,7 @@ def process_instance(
    if state is None:
        raise ValueError('State should not be None.')

-    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
-    # for compatibility with the existing output format, we can remake the pairs here
-    # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = [event_to_dict(event) for event in state.history.get_events()]
    metrics = state.metrics.get() if state.metrics else None

    # Save the output
@@ -398,6 +397,7 @@ def process_instance(
        metadata=metadata,
        history=histories,
        metrics=metrics,
+        llm_completions=state.extra_data.get('llm_completions', []),
        error=state.last_error if state and state.last_error else None,
    )
    return output
--- a/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
+++ b/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
@@ -2,20 +2,26 @@


 # API base URL
-BASE_URL="https://api.all-hands.dev/v0"
+BASE_URL="https://runtime.eval.all-hands.dev"

 # Get the list of runtimes
-runtimes=$(curl --silent --location --request GET "${BASE_URL}/runtime/list" \
-  --header "X-API-Key: ${ALLHANDS_API_KEY}" | jq -r '.runtimes | .[].runtime_id')
+response=$(curl --silent --location --request GET "${BASE_URL}/list" \
+  --header "X-API-Key: ${ALLHANDS_API_KEY}")

+n_runtimes=$(echo $response | jq -r '.total')
+echo "Found ${n_runtimes} runtimes. Stopping them..."
+
+runtime_ids=$(echo $response | jq -r '.runtimes | .[].runtime_id')
 # Loop through each runtime and stop it
-for runtime_id in $runtimes; do
-  echo "Stopping runtime: ${runtime_id}"
-  curl --silent --location --request POST "${BASE_URL}/runtime/stop" \
+counter=1
+for runtime_id in $runtime_ids; do
+  echo "Stopping runtime ${counter}/${n_runtimes}: ${runtime_id}"
+  curl --silent --location --request POST "${BASE_URL}/stop" \
    --header "X-API-Key: ${ALLHANDS_API_KEY}" \
    --header "Content-Type: application/json" \
    --data-raw "{\"runtime_id\": \"${runtime_id}\"}"
  echo
+  ((counter++))
 done

 echo "All runtimes have been stopped."
--- a/evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py
+++ b/evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py
@@ -3,6 +3,8 @@ import os

 import pandas as pd

+from evaluation.swe_bench.eval_infer import process_git_patch
+
 parser = argparse.ArgumentParser()
 parser.add_argument('oh_output_file', type=str)
 args = parser.parse_args()
@@ -14,36 +16,6 @@ oh_format = pd.read_json(args.oh_output_file, orient='records', lines=True)
 model_name = os.path.basename(os.path.dirname(args.oh_output_file))


-def process_git_patch(patch):
-    if not isinstance(patch, str):
-        return ''
-
-    if not patch.strip():
-        # skip empty patches
-        return ''
-
-    patch = patch.replace('\r\n', '\n')
-    # There might be some weird characters at the beginning of the patch
-    # due to some OpenHands inference command outputs
-
-    # FOR EXAMPLE:
-    # git diff --no-color --cached 895f28f9cbed817c00ab68770433170d83132d90
-    # [A[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[K0
-    # diff --git a/django/db/models/sql/.backup.query.py b/django/db/models/sql/.backup.query.py
-    # new file mode 100644
-    # index 0000000000..fc13db5948
-
-    # We "find" the first line that starts with "diff" and then we remove lines before it
-    lines = patch.split('\n')
-    for i, line in enumerate(lines):
-        if line.startswith('diff --git'):
-            patch = '\n'.join(lines[i:])
-            break
-
-    patch = patch.rstrip() + '\n'  # Make sure the last line ends with a newline
-    return patch
-
-
 def convert_row_to_swebench_format(row):
    if 'git_patch' in row:
        model_patch = row['git_patch']
--- a/evaluation/swe_bench/scripts/eval/download_gold_patch.py
+++ b/evaluation/swe_bench/scripts/eval/download_gold_patch.py
@@ -0,0 +1,27 @@
+import argparse
+
+import pandas as pd
+from datasets import load_dataset
+
+# Command line: output path (required), dataset name and split (optional).
+parser = argparse.ArgumentParser()
+parser.add_argument('output_filepath', type=str, help='Path to save the output file')
+parser.add_argument(
+    '--dataset_name',
+    type=str,
+    default='princeton-nlp/SWE-bench_Lite',
+    help='Name of the dataset to download',
+)
+parser.add_argument('--split', type=str, default='test', help='Split to download')
+args = parser.parse_args()
+
+dataset = load_dataset(args.dataset_name, split=args.split)
+output_filepath = args.output_filepath
+print(
+    f'Downloading gold patches from {args.dataset_name} (split: {args.split}) to {output_filepath}'
+)
+
+# Keep only the instance id and the reference patch of every row, storing the
+# patch under the `model_patch` key.
+patches = []
+for row in dataset:
+    patches.append({'instance_id': row['instance_id'], 'model_patch': row['patch']})
+print(f'{len(patches)} gold patches loaded')
+
+# Write one JSON record per line
+gold_frame = pd.DataFrame(patches)
+gold_frame.to_json(output_filepath, lines=True, orient='records')
+print(f'Patches saved to {output_filepath}')
--- a/evaluation/swe_bench/scripts/eval_infer.sh
+++ b/evaluation/swe_bench/scripts/eval_infer.sh
@@ -106,7 +106,7 @@ if [ -z "$INSTANCE_ID" ]; then
        rm -rf $RESULT_OUTPUT_DIR/eval_outputs
    fi

-    mv run_instance_logs/$RUN_ID/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR
+    mv logs/run_evaluation/$RUN_ID/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR
    mv $RESULT_OUTPUT_DIR/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR/eval_outputs
    echo "RUN_ID: $RUN_ID" > $RESULT_OUTPUT_DIR/run_id.txt

--- a/evaluation/swe_bench/scripts/eval_infer_remote.sh
+++ b/evaluation/swe_bench/scripts/eval_infer_remote.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Run evaluation/swe_bench/eval_infer.py on a predictions file.
+#
+# Usage: eval_infer_remote.sh INPUT_FILE [NUM_WORKERS] [DATASET] [SPLIT]
+# Optional environment: EVAL_LIMIT (forwarded as --eval-n-limit when set).
+set -eo pipefail
+
+INPUT_FILE=$1
+NUM_WORKERS=$2
+DATASET=$3
+SPLIT=$4
+
+if [ -z "$INPUT_FILE" ]; then
+  echo "INPUT_FILE not specified (should be a path to a jsonl file)"
+  exit 1
+fi
+
+if [ -z "$DATASET" ]; then
+  echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
+  DATASET="princeton-nlp/SWE-bench_Lite"
+fi
+
+if [ -z "$SPLIT" ]; then
+  echo "SPLIT not specified, use default test"
+  SPLIT="test"
+fi
+
+if [ -z "$NUM_WORKERS" ]; then
+  echo "NUM_WORKERS not specified, use default 1"
+  NUM_WORKERS=1
+fi
+
+echo "... Evaluating on $INPUT_FILE ..."
+
+# Build the command as an array (not a string passed to `eval`) so that
+# arguments containing spaces or shell metacharacters are passed through
+# verbatim instead of being re-split or re-interpreted by the shell.
+COMMAND=(
+  poetry run python evaluation/swe_bench/eval_infer.py
+  --eval-num-workers "$NUM_WORKERS"
+  --input-file "$INPUT_FILE"
+  --dataset "$DATASET"
+  --split "$SPLIT"
+)
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND+=(--eval-n-limit "$EVAL_LIMIT")
+fi
+
+# Run the command
+"${COMMAND[@]}"
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -6,7 +6,6 @@ import pathlib
 import subprocess
 import time
 import traceback
-from concurrent.futures import ProcessPoolExecutor, as_completed
 from typing import Any, Awaitable, Callable, TextIO

 import pandas as pd
@@ -41,24 +40,29 @@ class EvalMetadata(BaseModel):
    def model_dump_json(self, *args, **kwargs):
        dumped = super().model_dump_json(*args, **kwargs)
        dumped_dict = json.loads(dumped)
-        logger.debug(f'Dumped metadata: {dumped_dict}')
        # avoid leaking sensitive information
        dumped_dict['llm_config'] = self.llm_config.to_safe_dict()
+        logger.debug(f'Dumped metadata: {dumped_dict}')
        return json.dumps(dumped_dict)


 class EvalOutput(BaseModel):
    # NOTE: User-specified
    instance_id: str
-    instruction: str
    # output of the evaluation
    # store anything that is needed for the score calculation
    test_result: dict[str, Any]

+    instruction: str | None = None
+
    # Interaction info
-    metadata: EvalMetadata
-    history: list[tuple[dict[str, Any], dict[str, Any]]]
-    metrics: dict[str, Any]
+    metadata: EvalMetadata | None = None
+    # list[tuple[dict[str, Any], dict[str, Any]]] - for compatibility with the old format
+    history: (
+        list[dict[str, Any]] | list[tuple[dict[str, Any], dict[str, Any]]] | None
+    ) = None
+    llm_completions: list[dict[str, Any]]
+    metrics: dict[str, Any] | None = None
    error: str | None = None

    # Optionally save the input test instance
@@ -66,15 +70,19 @@ class EvalOutput(BaseModel):

    def model_dump(self, *args, **kwargs):
        dumped_dict = super().model_dump(*args, **kwargs)
+        # Remove None values
+        dumped_dict = {k: v for k, v in dumped_dict.items() if v is not None}
        # Apply custom serialization for metadata (to avoid leaking sensitive information)
-        dumped_dict['metadata'] = self.metadata.model_dump()
+        if self.metadata is not None:
+            dumped_dict['metadata'] = self.metadata.model_dump()
        return dumped_dict

    def model_dump_json(self, *args, **kwargs):
        dumped = super().model_dump_json(*args, **kwargs)
        dumped_dict = json.loads(dumped)
        # Apply custom serialization for metadata (to avoid leaking sensitive information)
-        dumped_dict['metadata'] = json.loads(self.metadata.model_dump_json())
+        if 'metadata' in dumped_dict:
+            dumped_dict['metadata'] = json.loads(self.metadata.model_dump_json())
        return json.dumps(dumped_dict)


@@ -260,32 +268,47 @@ def _process_instance_wrapper(
            result = process_instance_func(instance, metadata, use_mp)
            return result
        except Exception as e:
+            error = str(e)
+            stacktrace = traceback.format_exc()
            if attempt == max_retries:
+                logger.exception(e)
+                msg = (
+                    '-' * 10
+                    + '\n'
+                    + f'Error in instance [{instance.instance_id}]: {error}. Stacktrace:\n{stacktrace}'
+                    + '\n'
+                    + f'[Encountered after {max_retries} retries. Please check the logs and report the issue.]'
+                    + '-' * 10
+                )
                # Raise an error after all retries & stop the evaluation
+                logger.exception(e)
                raise RuntimeError(
                    f'Maximum error retries reached for instance {instance.instance_id}'
                ) from e
-            error = str(e)
-            stacktrace = traceback.format_exc()
            msg = (
                '-' * 10
                + '\n'
                + f'Error in instance [{instance.instance_id}]: {error}. Stacktrace:\n{stacktrace}'
                + '\n'
                + '-' * 10
-                + '[This error occurred after maximum retries]'
+                + f'[The above error occurred. Retrying... (attempt {attempt + 1} of {max_retries})]'
                + '-' * 10
                + '\n'
            )
            logger.error(msg)
            if use_mp:
                print(msg)  # use print to directly print to console
-            time.sleep(1)  # Add a small delay before retrying
+            time.sleep(5)
+
+
+def _process_instance_wrapper_mp(args: tuple):
+    """Call `_process_instance_wrapper` with an unpacked args tuple (for `imap_unordered`)."""
+    return _process_instance_wrapper(*args)


 def run_evaluation(
    dataset: pd.DataFrame,
-    metadata: EvalMetadata,
+    metadata: EvalMetadata | None,
    output_file: str,
    num_workers: int,
    process_instance_func: Callable[
@@ -294,10 +317,14 @@ def run_evaluation(
    max_retries: int = 5,  # number of retries for each instance
 ):
    use_multiprocessing = num_workers > 1
-    logger.info(
-        f'Evaluation started with Agent {metadata.agent_class}:\n'
-        f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.\n'
-    )
+
+    if metadata is not None:
+        logger.info(
+            f'Evaluation started with Agent {metadata.agent_class}:\n'
+            f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.\n'
+        )
+    else:
+        logger.info(f'Evaluation started with {num_workers} workers.')

    total_instances = len(dataset)
    pbar = tqdm(total=total_instances, desc='Instances processed')
@@ -305,20 +332,13 @@ def run_evaluation(

    try:
        if use_multiprocessing:
-            with ProcessPoolExecutor(num_workers) as executor:
-                futures = [
-                    executor.submit(
-                        _process_instance_wrapper,
-                        process_instance_func=process_instance_func,
-                        instance=instance,
-                        metadata=metadata,
-                        use_mp=True,
-                        max_retries=max_retries,
-                    )
+            with mp.Pool(num_workers) as pool:
+                args_iter = (
+                    (process_instance_func, instance, metadata, True, max_retries)
                    for _, instance in dataset.iterrows()
-                ]
-                for future in as_completed(futures):
-                    result = future.result()
+                )
+                results = pool.imap_unordered(_process_instance_wrapper_mp, args_iter)
+                for result in results:
                    update_progress(result, pbar, output_fp)
        else:
            for _, instance in dataset.iterrows():
@@ -355,18 +375,27 @@ def reset_logger_for_multiprocessing(
    # Remove all existing handlers from logger
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
-    # add back the console handler to print ONE line
-    logger.addHandler(get_console_handler())
+
+    # add console handler to print ONE line
+    console_handler = get_console_handler(log_level=logging.INFO)
+    console_handler.setFormatter(
+        logging.Formatter(
+            f'Instance {instance_id} - ' + '%(asctime)s - %(levelname)s - %(message)s'
+        )
+    )
+    logger.addHandler(console_handler)
    logger.info(
        f'Starting evaluation for instance {instance_id}.\n'
        f'Hint: run "tail -f {log_file}" to see live logs in a separate shell'
    )
-    # Remove all existing handlers from logger
-    for handler in logger.handlers[:]:
-        logger.removeHandler(handler)
+    # Only log WARNING or higher to console
+    console_handler.setLevel(logging.WARNING)
+
+    # Log INFO and above to file
    os.makedirs(os.path.dirname(log_file), exist_ok=True)
    file_handler = logging.FileHandler(log_file)
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    )
+    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)
--- a/frontend/package-lock.json
+++ b/frontend/package-lock.json
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -1,6 +1,6 @@
 {
  "name": "openhands-frontend",
-  "version": "0.9.3",
+  "version": "0.9.7",
  "private": true,
  "type": "module",
  "engines": {
@@ -8,7 +8,7 @@
  },
  "dependencies": {
    "@monaco-editor/react": "^4.6.0",
-    "@nextui-org/react": "^2.4.6",
+    "@nextui-org/react": "^2.4.8",
    "@react-types/shared": "^3.24.1",
    "@reduxjs/toolkit": "^2.2.7",
    "@vitejs/plugin-react": "^4.3.1",
@@ -19,8 +19,8 @@
    "i18next": "^23.15.1",
    "i18next-browser-languagedetector": "^8.0.0",
    "i18next-http-backend": "^2.6.1",
-    "jose": "^5.9.2",
-    "monaco-editor": "^0.51.0",
+    "jose": "^5.9.3",
+    "monaco-editor": "^0.52.0",
    "react": "^18.3.1",
    "react-dom": "^18.3.1",
    "react-highlight": "^0.15.0",
@@ -32,7 +32,7 @@
    "react-syntax-highlighter": "^15.5.0",
    "remark-gfm": "^4.0.0",
    "tailwind-merge": "^2.5.2",
-    "vite": "^5.4.6",
+    "vite": "^5.4.8",
    "web-vitals": "^3.5.2"
  },
  "scripts": {
@@ -64,8 +64,8 @@
    "@testing-library/jest-dom": "^6.5.0",
    "@testing-library/react": "^16.0.1",
    "@testing-library/user-event": "^14.5.2",
-    "@types/node": "^22.5.5",
-    "@types/react": "^18.3.7",
+    "@types/node": "^22.7.3",
+    "@types/react": "^18.3.10",
    "@types/react-dom": "^18.3.0",
    "@types/react-highlight": "^0.12.8",
    "@types/react-syntax-highlighter": "^15.5.13",
@@ -83,11 +83,11 @@
    "eslint-plugin-react": "^7.35.0",
    "eslint-plugin-react-hooks": "^4.6.2",
    "husky": "^9.1.6",
-    "jsdom": "^25.0.0",
+    "jsdom": "^25.0.1",
    "lint-staged": "^15.2.10",
    "postcss": "^8.4.47",
    "prettier": "^3.3.3",
-    "tailwindcss": "^3.4.11",
+    "tailwindcss": "^3.4.13",
    "typescript": "^5.6.2",
    "vite-tsconfig-paths": "^5.0.1",
    "vitest": "^1.6.0"
--- a/frontend/src/components/AgentStatusBar.tsx
+++ b/frontend/src/components/AgentStatusBar.tsx
@@ -18,6 +18,7 @@ enum IndicatorColor {
 function AgentStatusBar() {
  const { t } = useTranslation();
  const { curAgentState } = useSelector((state: RootState) => state.agent);
+  const { curStatusMessage } = useSelector((state: RootState) => state.status);

  const AgentStatusMap: {
    [k: string]: { message: string; indicator: IndicatorColor };
@@ -90,14 +91,25 @@ function AgentStatusBar() {
    }
  }, [curAgentState]);

+  const [statusMessage, setStatusMessage] = React.useState<string>("");
+
+  React.useEffect(() => {
+    const trimmedCustomMessage = curStatusMessage.status.trim();
+    if (trimmedCustomMessage) {
+      setStatusMessage(t(trimmedCustomMessage));
+    } else {
+      setStatusMessage(AgentStatusMap[curAgentState].message);
+    }
+  }, [curAgentState, curStatusMessage.status]);
+
  return (
-    <div className="flex items-center">
-      <div
-        className={`w-3 h-3 mr-2 rounded-full animate-pulse ${AgentStatusMap[curAgentState].indicator}`}
-      />
-      <span className="text-sm text-stone-400">
-        {AgentStatusMap[curAgentState].message}
-      </span>
+    <div className="flex flex-col items-center">
+      <div className="flex items-center">
+        <div
+          className={`w-3 h-3 mr-2 rounded-full animate-pulse ${AgentStatusMap[curAgentState].indicator}`}
+        />
+        <span className="text-sm text-stone-400">{statusMessage}</span>
+      </div>
    </div>
  );
 }
--- a/frontend/src/components/chat/ChatMessage.tsx
+++ b/frontend/src/components/chat/ChatMessage.tsx
@@ -60,6 +60,10 @@ function ChatMessage({
    }
  };

+  const copyButtonTitle = message.timestamp
+    ? `${t(I18nKey.CHAT_INTERFACE$TOOLTIP_COPY_MESSAGE)} - ${formatTimestamp(message.timestamp)}`
+    : t(I18nKey.CHAT_INTERFACE$TOOLTIP_COPY_MESSAGE);
+
  return (
    <article
      data-testid="article"
@@ -78,7 +82,8 @@ function ChatMessage({
          data-testid="copy-button"
          onClick={copyToClipboard}
          className="absolute top-1 right-1 p-1 bg-neutral-600 rounded hover:bg-neutral-700"
-          aria-label={t(I18nKey.CHAT_INTERFACE$TOOLTIP_COPY_MESSAGE)}
+          aria-label={copyButtonTitle}
+          title={copyButtonTitle}
          type="button"
        >
          {isCopy ? <FaClipboardCheck /> : <FaClipboard />}
@@ -99,9 +104,6 @@ function ChatMessage({
          ))}
        </div>
      )}
-      <div className="text-xs text-neutral-400 mt-2">
-        {formatTimestamp(message.timestamp)}
-      </div>
      {isLastMessage &&
        message.sender === "assistant" &&
        awaitingUserConfirmation && <ConfirmationButtons />}
--- a/frontend/src/components/modals/settings/ModelSelector.tsx
+++ b/frontend/src/components/modals/settings/ModelSelector.tsx
@@ -112,7 +112,7 @@ export function ModelSelector({
            {models[selectedProvider || ""]?.models
              .filter((model) => VERIFIED_MODELS.includes(model))
              .map((model) => (
-                <AutocompleteItem key={model} value={model}>
+                <AutocompleteItem key={model} value={model} title={model}>
                  {model}
                </AutocompleteItem>
              ))}
@@ -121,7 +121,7 @@ export function ModelSelector({
            {models[selectedProvider || ""]?.models
              .filter((model) => !VERIFIED_MODELS.includes(model))
              .map((model) => (
-                <AutocompleteItem key={model} value={model}>
+                <AutocompleteItem key={model} value={model} title={model}>
                  {model}
                </AutocompleteItem>
              ))}
--- a/frontend/src/components/modals/settings/SettingsForm.tsx
+++ b/frontend/src/components/modals/settings/SettingsForm.tsx
@@ -7,6 +7,7 @@ import { I18nKey } from "../../../i18n/declaration";
 import { AutocompleteCombobox } from "./AutocompleteCombobox";
 import { Settings } from "#/services/settings";
 import { organizeModelsAndProviders } from "#/utils/organizeModelsAndProviders";
+import { extractModelAndProvider } from "#/utils/extractModelAndProvider";
 import { ModelSelector } from "./ModelSelector";

 interface SettingsFormProps {
@@ -41,24 +42,40 @@ function SettingsForm({
 }: SettingsFormProps) {
  const { t } = useTranslation();
  const { isOpen: isVisible, onOpenChange: onVisibleChange } = useDisclosure();
-  const advancedAlreadyInUse = React.useMemo(
-    () =>
+  const advancedAlreadyInUse = React.useMemo(() => {
+    const organizedModels = organizeModelsAndProviders(models);
+    const { provider, model } = extractModelAndProvider(
+      settings.LLM_MODEL || "",
+    );
+    const isKnownModel =
+      provider in organizedModels &&
+      organizedModels[provider].models.includes(model);
+
+    return (
      !!settings.SECURITY_ANALYZER ||
      !!settings.CONFIRMATION_MODE ||
      !!settings.LLM_BASE_URL ||
-      (!!settings.LLM_MODEL && !models.includes(settings.LLM_MODEL)),
-    [],
-  );
+      (!!settings.LLM_MODEL && !isKnownModel)
+    );
+  }, [settings, models]);
  const [enableAdvanced, setEnableAdvanced] =
    React.useState(advancedAlreadyInUse);

+  React.useEffect(() => {
+    setEnableAdvanced(advancedAlreadyInUse);
+  }, [advancedAlreadyInUse]);
+
+  const handleAdvancedChange = (value: boolean) => {
+    setEnableAdvanced(value);
+  };
+
  return (
    <>
      <Switch
        data-testid="advanced-options-toggle"
        aria-checked={enableAdvanced}
        isSelected={enableAdvanced}
-        onValueChange={(value) => setEnableAdvanced(value)}
+        onValueChange={handleAdvancedChange}
      >
        Advanced Options
      </Switch>
--- a/frontend/src/i18n/translation.json
+++ b/frontend/src/i18n/translation.json
--- a/frontend/src/services/actions.ts
+++ b/frontend/src/services/actions.ts
@@ -6,10 +6,11 @@ import {
  ActionSecurityRisk,
  appendSecurityAnalyzerInput,
 } from "#/state/securityAnalyzerSlice";
+import { setCurStatusMessage } from "#/state/statusSlice";
 import { setRootTask } from "#/state/taskSlice";
 import store from "#/store";
 import ActionType from "#/types/ActionType";
-import { ActionMessage } from "#/types/Message";
+import { ActionMessage, StatusMessage } from "#/types/Message";
 import { SocketMessage } from "#/types/ResponseType";
 import { handleObservationMessage } from "./observations";
 import { getRootTask } from "./taskService";
@@ -138,6 +139,16 @@ export function handleActionMessage(message: ActionMessage) {
  }
 }

+/**
+ * Store a status message in Redux, with its text trimmed.
+ * A null/undefined `status` is stored as the empty string.
+ */
+export function handleStatusMessage(message: StatusMessage) {
+  const status = message.status?.trim() ?? "";
+
+  store.dispatch(setCurStatusMessage({ ...message, status }));
+}
+
 export function handleAssistantMessage(data: string | SocketMessage) {
  let socketMessage: SocketMessage;

@@ -149,6 +160,8 @@ export function handleAssistantMessage(data: string | SocketMessage) {

  if ("action" in socketMessage) {
    handleActionMessage(socketMessage);
+  } else if ("status" in socketMessage) {
+    handleStatusMessage(socketMessage);
  } else {
    handleObservationMessage(socketMessage);
  }
--- a/frontend/src/services/session.ts
+++ b/frontend/src/services/session.ts
@@ -8,11 +8,19 @@ import { I18nKey } from "#/i18n/declaration";

 const translate = (key: I18nKey) => i18next.t(key);

+// Define a type for the messages
+type Message = {
+  action: ActionType;
+  args: Record<string, unknown>;
+};
+
 class Session {
  private static _socket: WebSocket | null = null;

  private static _latest_event_id: number = -1;

+  private static _messageQueue: Message[] = [];
+
  public static _history: Record<string, unknown>[] = [];

  // callbacks contain a list of callable functions
@@ -83,6 +91,7 @@ class Session {
      toast.success("ws", translate(I18nKey.SESSION$SERVER_CONNECTED_MESSAGE));
      Session._connecting = false;
      Session._initializeAgent();
+      Session._flushQueue();
      Session.callbacks.open?.forEach((callback) => {
        callback(e);
      });
@@ -94,7 +103,6 @@ class Session {
        data = JSON.parse(e.data);
        Session._history.push(data);
      } catch (err) {
-        // TODO: report the error
        toast.error(
          "ws",
          translate(I18nKey.SESSION$SESSION_HANDLING_ERROR_MESSAGE),
@@ -115,6 +123,7 @@ class Session {
    };

    Session._socket.onerror = () => {
+      // TODO report error
      toast.error(
        "ws",
        translate(I18nKey.SESSION$SESSION_CONNECTION_ERROR_MESSAGE),
@@ -145,9 +154,20 @@ class Session {
    Session._socket = null;
  }

+  // Drains the queue in FIFO order, re-sending each message after a 1s delay.
+  private static _flushQueue(): void {
+    const pending = Session._messageQueue.splice(0);
+    pending.forEach((message) => {
+      if (!message) return;
+      setTimeout(() => Session.send(JSON.stringify(message)), 1000);
+    });
+  }
+
  static send(message: string): void {
+    const messageObject: Message = JSON.parse(message);
+
    if (Session._connecting) {
-      setTimeout(() => Session.send(message), 1000);
+      Session._messageQueue.push(messageObject);
      return;
    }
    if (!Session.isConnected()) {
--- a/frontend/src/services/settings.ts
+++ b/frontend/src/services/settings.ts
@@ -87,10 +87,10 @@ export const getSettings = (): Settings => {
 export const saveSettings = (settings: Partial<Settings>) => {
  Object.keys(settings).forEach((key) => {
    const isValid = validKeys.includes(key as keyof Settings);
-    const value = settings[key as keyof Settings];
-
-    if (isValid && typeof value !== "undefined")
-      localStorage.setItem(key, value.toString());
+    if (!isValid) return;
+    let value = settings[key as keyof Settings];
+    if (value === undefined || value === null) value = "";
+    localStorage.setItem(key, value.toString());
  });
  localStorage.setItem("SETTINGS_VERSION", LATEST_SETTINGS_VERSION.toString());
 };
--- a/frontend/src/state/statusSlice.ts
+++ b/frontend/src/state/statusSlice.ts
@@ -0,0 +1,21 @@
+import { createSlice, PayloadAction } from "@reduxjs/toolkit";
+import { StatusMessage } from "#/types/Message";
+
+// Initial status: empty text, not flagged as an error.
+const EMPTY_STATUS: StatusMessage = { status: "", is_error: false };
+
+// Redux slice holding the most recently dispatched status message.
+export const statusSlice = createSlice({
+  name: "status",
+  initialState: { curStatusMessage: EMPTY_STATUS },
+  reducers: {
+    // Replaces the stored status message with the dispatched payload.
+    setCurStatusMessage(state, { payload }: PayloadAction<StatusMessage>) {
+      state.curStatusMessage = payload;
+    },
+  },
+});
+
+export const { setCurStatusMessage } = statusSlice.actions;
+
+export default statusSlice.reducer;
--- a/frontend/src/store.ts
+++ b/frontend/src/store.ts
@@ -8,6 +8,7 @@ import errorsReducer from "./state/errorsSlice";
 import taskReducer from "./state/taskSlice";
 import jupyterReducer from "./state/jupyterSlice";
 import securityAnalyzerReducer from "./state/securityAnalyzerSlice";
+import statusReducer from "./state/statusSlice";

 export const rootReducer = combineReducers({
  browser: browserReducer,
@@ -19,6 +20,7 @@ export const rootReducer = combineReducers({
  agent: agentReducer,
  jupyter: jupyterReducer,
  securityAnalyzer: securityAnalyzerReducer,
+  status: statusReducer,
 });

 const store = configureStore({
--- a/frontend/src/types/Message.tsx
+++ b/frontend/src/types/Message.tsx
@@ -31,3 +31,12 @@ export interface ObservationMessage {
  // The timestamp of the message
  timestamp: string;
 }
+
+// Socket message carrying a status update (recognized by its `status` key).
+export interface StatusMessage {
+  // Whether the status is an error, default is false (TODO: not implemented yet)
+  is_error: boolean;
+
+  // A status message to display to the user
+  status: string;
+}
--- a/frontend/src/types/ResponseType.tsx
+++ b/frontend/src/types/ResponseType.tsx
@@ -1,5 +1,5 @@
-import { ActionMessage, ObservationMessage } from "./Message";
+import { ActionMessage, ObservationMessage, StatusMessage } from "./Message";

-type SocketMessage = ActionMessage | ObservationMessage;
+type SocketMessage = ActionMessage | ObservationMessage | StatusMessage;

 export { type SocketMessage };
--- a/openhands/__init__.py
+++ b/openhands/__init__.py
@@ -0,0 +1,57 @@
+import os
+
+
+def get_version():
+    """Return the version string of the ``openhands-ai`` distribution.
+
+    Sources are tried in order and the first one that yields a value wins:
+
+    1. ``importlib.metadata.version('openhands-ai')``
+    2. ``pkg_resources.get_distribution('openhands-ai').version``
+    3. the first line starting with ``version =`` in the ``pyproject.toml``
+       found in the parent of the directory containing this file
+
+    Returns:
+        The version string, or ``'unknown'`` when none of the sources works.
+    """
+    try:
+        from importlib.metadata import PackageNotFoundError, version
+
+        try:
+            return version('openhands-ai')
+        except PackageNotFoundError:
+            pass
+    except ImportError:
+        pass
+
+    try:
+        from pkg_resources import DistributionNotFound, get_distribution
+
+        try:
+            return get_distribution('openhands-ai').version
+        except DistributionNotFound:
+            pass
+    except ImportError:
+        pass
+
+    # Try getting the version from pyproject.toml
+    try:
+        root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        with open(os.path.join(root_dir, 'pyproject.toml'), 'r') as f:
+            for line in f:
+                if line.startswith('version ='):
+                    # Keep only the value: drop a trailing "# comment" and
+                    # either kind of TOML string quote.
+                    value = line.partition('=')[2].split('#', 1)[0]
+                    return value.strip().strip('"\'')
+    except OSError:
+        # Missing, unreadable or otherwise inaccessible file: fall through
+        pass
+
+    return 'unknown'
+
+
+try:
+    __version__ = get_version()
+except Exception:
+    __version__ = 'unknown'
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -1,4 +1,5 @@
 import asyncio
+import copy
 import traceback
 from typing import Type

@@ -36,7 +37,9 @@ from openhands.events.observation import (
    ErrorObservation,
    Observation,
 )
+from openhands.events.serialization.event import truncate_content
 from openhands.llm.llm import LLM
+from openhands.runtime.utils.shutdown_listener import should_continue

 # note: RESUME is only available on web GUI
 TRAFFIC_CONTROL_REMINDER = (
@@ -53,7 +56,7 @@ class AgentController:
    confirmation_mode: bool
    agent_to_llm_config: dict[str, LLMConfig]
    agent_configs: dict[str, AgentConfig]
-    agent_task: asyncio.Task | None = None
+    agent_task: asyncio.Future | None = None
    parent: 'AgentController | None' = None
    delegate: 'AgentController | None' = None
    _pending_action: Action | None = None
@@ -114,13 +117,8 @@ class AgentController:
        # stuck helper
        self._stuck_detector = StuckDetector(self.state)

-        if not is_delegate:
-            self.agent_task = asyncio.create_task(self._start_step_loop())
-
    async def close(self):
        """Closes the agent controller, canceling any ongoing tasks and unsubscribing from the event stream."""
-        if self.agent_task is not None:
-            self.agent_task.cancel()
        await self.set_agent_state_to(AgentState.STOPPED)
        self.event_stream.unsubscribe(EventStreamSubscriber.AGENT_CONTROLLER)

@@ -131,6 +129,10 @@ class AgentController:
    async def update_state_after_step(self):
        # update metrics especially for cost
        self.state.local_metrics = self.agent.llm.metrics
+        if 'llm_completions' not in self.state.extra_data:
+            self.state.extra_data['llm_completions'] = []
+        self.state.extra_data['llm_completions'].extend(self.agent.llm.llm_completions)
+        self.agent.llm.llm_completions.clear()

    async def report_error(self, message: str, exception: Exception | None = None):
        """Reports an error to the user and sends the exception to the LLM next step, in the hope it can self-correct.
@@ -144,11 +146,11 @@ class AgentController:
            self.state.last_error += f': {exception}'
        self.event_stream.add_event(ErrorObservation(message), EventSource.AGENT)

-    async def _start_step_loop(self):
+    async def start_step_loop(self):
        """The main loop for the agent's step-by-step execution."""

        logger.info(f'[Agent Controller {self.id}] Starting step loop...')
-        while True:
+        while should_continue():
            try:
                await self._step()
            except asyncio.CancelledError:
@@ -218,7 +220,13 @@ class AgentController:
        ):
            return

-        logger.info(observation, extra={'msg_type': 'OBSERVATION'})
+        # Make sure we print the observation in the same way as the LLM sees it
+        observation_to_print = copy.deepcopy(observation)
+        if len(observation_to_print.content) > self.agent.llm.config.max_message_chars:
+            observation_to_print.content = truncate_content(
+                observation_to_print.content, self.agent.llm.config.max_message_chars
+            )
+        logger.info(observation_to_print, extra={'msg_type': 'OBSERVATION'})
        if self._pending_action and self._pending_action.id == observation.cause:
            self._pending_action = None
            if self.state.agent_state == AgentState.USER_CONFIRMED:
@@ -382,7 +390,10 @@ class AgentController:

        if self.delegate is not None:
            assert self.delegate != self
-            await self._delegate_step()
+            if self.delegate.get_agent_state() == AgentState.PAUSED:
+                await asyncio.sleep(1)
+            else:
+                await self._delegate_step()
            return

        logger.info(
@@ -457,7 +468,7 @@ class AgentController:
            self.delegate = None
            self.delegateAction = None

-            await self.report_error('Delegator agent encounters an error')
+            await self.report_error('Delegator agent encountered an error')
        elif delegate_state in (AgentState.FINISHED, AgentState.REJECTED):
            logger.info(
                f'[Agent Controller {self.id}] Delegate agent has finished execution'
--- a/openhands/core/cli.py
+++ b/openhands/core/cli.py
@@ -1,3 +1,4 @@
+import argparse
 import asyncio
 import logging
 from typing import Type
@@ -5,6 +6,7 @@ from typing import Type
 from termcolor import colored

 import agenthub  # noqa F401 (we import this to get the agents registered)
+from openhands import __version__
 from openhands.controller import AgentController
 from openhands.controller.agent import Agent
 from openhands.core.config import (
@@ -61,8 +63,33 @@ def display_event(event: Event):
        display_command_output(event.content)


+def get_parser() -> argparse.ArgumentParser:
+    """Build the command line parser; its only option is -v/--version."""
+    cli_parser = argparse.ArgumentParser(
+        description='Run an agent with a specific task'
+    )
+    # default=None keeps a `version` attribute on the parsed namespace,
+    # which main() reads after parse_args().
+    version_options = {
+        'action': 'version',
+        'version': f'{__version__}',
+        'help': 'Show the version number and exit',
+        'default': None,
+    }
+    cli_parser.add_argument('-v', '--version', **version_options)
+    return cli_parser
+
+
 async def main():
    """Runs the agent in CLI mode"""
+
+    parser = get_parser()
+    args = parser.parse_args()
+
+    if args.version:
+        print(f'OpenHands version: {__version__}')
+        return
+
    logger.setLevel(logging.WARNING)
    config = load_app_config()
    sid = 'cli'
@@ -94,6 +121,9 @@ async def main():
        event_stream=event_stream,
    )

+    if controller is not None:
+        controller.agent_task = asyncio.create_task(controller.start_step_loop())
+
    async def prompt_for_next_task():
        next_message = input('How can I help? >> ')
        if next_message == 'exit':
--- a/openhands/core/config.py
+++ b/openhands/core/config.py
@@ -1,781 +0,0 @@
-import argparse
-import os
-import pathlib
-import platform
-import uuid
-from dataclasses import dataclass, field, fields, is_dataclass
-from enum import Enum
-from types import UnionType
-from typing import Any, ClassVar, MutableMapping, get_args, get_origin
-
-import toml
-from dotenv import load_dotenv
-
-from openhands.core import logger
-
-load_dotenv()
-
-
-LLM_SENSITIVE_FIELDS = ['api_key', 'aws_access_key_id', 'aws_secret_access_key']
-_DEFAULT_AGENT = 'CodeActAgent'
-_MAX_ITERATIONS = 100
-
-
-@dataclass
-class LLMConfig:
-    """Configuration for the LLM model.
-
-    Attributes:
-        model: The model to use.
-        api_key: The API key to use.
-        base_url: The base URL for the API. This is necessary for local LLMs. It is also used for Azure embeddings.
-        api_version: The version of the API.
-        embedding_model: The embedding model to use.
-        embedding_base_url: The base URL for the embedding API.
-        embedding_deployment_name: The name of the deployment for the embedding API. This is used for Azure OpenAI.
-        aws_access_key_id: The AWS access key ID.
-        aws_secret_access_key: The AWS secret access key.
-        aws_region_name: The AWS region name.
-        num_retries: The number of retries to attempt.
-        retry_multiplier: The multiplier for the exponential backoff.
-        retry_min_wait: The minimum time to wait between retries, in seconds. This is exponential backoff minimum. For models with very low limits, this can be set to 15-20.
-        retry_max_wait: The maximum time to wait between retries, in seconds. This is exponential backoff maximum.
-        timeout: The timeout for the API.
-        max_message_chars: The approximate max number of characters in the content of an event included in the prompt to the LLM. Larger observations are truncated.
-        temperature: The temperature for the API.
-        top_p: The top p for the API.
-        custom_llm_provider: The custom LLM provider to use. This is undocumented in openhands, and normally not used. It is documented on the litellm side.
-        max_input_tokens: The maximum number of input tokens. Note that this is currently unused, and the value at runtime is actually the total tokens in OpenAI (e.g. 128,000 tokens for GPT-4).
-        max_output_tokens: The maximum number of output tokens. This is sent to the LLM.
-        input_cost_per_token: The cost per input token. This will available in logs for the user to check.
-        output_cost_per_token: The cost per output token. This will available in logs for the user to check.
-        ollama_base_url: The base URL for the OLLAMA API.
-        drop_params: Drop any unmapped (unsupported) params without causing an exception.
-        disable_vision: If model is vision capable, this option allows to disable image processing (useful for cost reduction).
-        caching_prompt: Using the prompt caching feature provided by the LLM.
-    """
-
-    model: str = 'gpt-4o'
-    api_key: str | None = None
-    base_url: str | None = None
-    api_version: str | None = None
-    embedding_model: str = 'local'
-    embedding_base_url: str | None = None
-    embedding_deployment_name: str | None = None
-    aws_access_key_id: str | None = None
-    aws_secret_access_key: str | None = None
-    aws_region_name: str | None = None
-    num_retries: int = 8
-    retry_multiplier: float = 2
-    retry_min_wait: int = 15
-    retry_max_wait: int = 120
-    timeout: int | None = None
-    max_message_chars: int = 10_000  # maximum number of characters in an observation's content when sent to the llm
-    temperature: float = 0
-    top_p: float = 0.5
-    custom_llm_provider: str | None = None
-    max_input_tokens: int | None = None
-    max_output_tokens: int | None = None
-    input_cost_per_token: float | None = None
-    output_cost_per_token: float | None = None
-    ollama_base_url: str | None = None
-    drop_params: bool | None = None
-    disable_vision: bool | None = None
-    caching_prompt: bool = False
-
-    def defaults_to_dict(self) -> dict:
-        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
-        result = {}
-        for f in fields(self):
-            result[f.name] = get_field_info(f)
-        return result
-
-    def __str__(self):
-        attr_str = []
-        for f in fields(self):
-            attr_name = f.name
-            attr_value = getattr(self, f.name)
-
-            if attr_name in LLM_SENSITIVE_FIELDS:
-                attr_value = '******' if attr_value else None
-
-            attr_str.append(f'{attr_name}={repr(attr_value)}')
-
-        return f"LLMConfig({', '.join(attr_str)})"
-
-    def __repr__(self):
-        return self.__str__()
-
-    def to_safe_dict(self):
-        """Return a dict with the sensitive fields replaced with ******."""
-        ret = self.__dict__.copy()
-        for k, v in ret.items():
-            if k in LLM_SENSITIVE_FIELDS:
-                ret[k] = '******' if v else None
-        return ret
-
-    def set_missing_attributes(self):
-        """Set any missing attributes to their default values."""
-        for field_name, field_obj in self.__dataclass_fields__.items():
-            if not hasattr(self, field_name):
-                setattr(self, field_name, field_obj.default)
-
-
-@dataclass
-class AgentConfig:
-    """Configuration for the agent.
-
-    Attributes:
-        micro_agent_name: The name of the micro agent to use for this agent.
-        memory_enabled: Whether long-term memory (embeddings) is enabled.
-        memory_max_threads: The maximum number of threads indexing at the same time for embeddings.
-        llm_config: The name of the llm config to use. If specified, this will override global llm config.
-    """
-
-    micro_agent_name: str | None = None
-    memory_enabled: bool = False
-    memory_max_threads: int = 2
-    llm_config: str | None = None
-
-    def defaults_to_dict(self) -> dict:
-        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
-        result = {}
-        for f in fields(self):
-            result[f.name] = get_field_info(f)
-        return result
-
-
-@dataclass
-class SecurityConfig:
-    """Configuration for security related functionalities.
-
-    Attributes:
-        confirmation_mode: Whether to enable confirmation mode.
-        security_analyzer: The security analyzer to use.
-    """
-
-    confirmation_mode: bool = False
-    security_analyzer: str | None = None
-
-    def defaults_to_dict(self) -> dict:
-        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
-        dict = {}
-        for f in fields(self):
-            dict[f.name] = get_field_info(f)
-        return dict
-
-    def __str__(self):
-        attr_str = []
-        for f in fields(self):
-            attr_name = f.name
-            attr_value = getattr(self, f.name)
-
-            attr_str.append(f'{attr_name}={repr(attr_value)}')
-
-        return f"SecurityConfig({', '.join(attr_str)})"
-
-    def __repr__(self):
-        return self.__str__()
-
-
-@dataclass
-class SandboxConfig:
-    """Configuration for the sandbox.
-
-    Attributes:
-        api_hostname: The hostname for the EventStream Runtime API.
-        base_container_image: The base container image from which to build the runtime image.
-        runtime_container_image: The runtime container image to use.
-        user_id: The user ID for the sandbox.
-        timeout: The timeout for the sandbox.
-        enable_auto_lint: Whether to enable auto-lint.
-        use_host_network: Whether to use the host network.
-        initialize_plugins: Whether to initialize plugins.
-        runtime_extra_deps: The extra dependencies to install in the runtime image (typically used for evaluation).
-            This will be rendered into the end of the Dockerfile that builds the runtime image.
-            It can contain any valid shell commands (e.g., pip install numpy).
-            The path to the interpreter is available as $OH_INTERPRETER_PATH,
-            which can be used to install dependencies for the OH-specific Python interpreter.
-        runtime_startup_env_vars: The environment variables to set at the launch of the runtime.
-            This is a dictionary of key-value pairs.
-            This is useful for setting environment variables that are needed by the runtime.
-            For example, for specifying the base url of website for browsergym evaluation.
-        browsergym_eval_env: The BrowserGym environment to use for evaluation.
-            Default is None for general purpose browsing. Check evaluation/miniwob and evaluation/webarena for examples.
-    """
-
-    api_hostname: str = 'localhost'
-    api_key: str | None = None
-    base_container_image: str = 'nikolaik/python-nodejs:python3.11-nodejs22'  # default to nikolaik/python-nodejs:python3.11-nodejs22 for eventstream runtime
-    runtime_container_image: str | None = None
-    user_id: int = os.getuid() if hasattr(os, 'getuid') else 1000
-    timeout: int = 120
-    enable_auto_lint: bool = (
-        False  # once enabled, OpenHands would lint files after editing
-    )
-    use_host_network: bool = False
-    initialize_plugins: bool = True
-    runtime_extra_deps: str | None = None
-    runtime_startup_env_vars: dict[str, str] = field(default_factory=dict)
-    browsergym_eval_env: str | None = None
-
-    def defaults_to_dict(self) -> dict:
-        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
-        dict = {}
-        for f in fields(self):
-            dict[f.name] = get_field_info(f)
-        return dict
-
-    def __str__(self):
-        attr_str = []
-        for f in fields(self):
-            attr_name = f.name
-            attr_value = getattr(self, f.name)
-
-            attr_str.append(f'{attr_name}={repr(attr_value)}')
-
-        return f"SandboxConfig({', '.join(attr_str)})"
-
-    def __repr__(self):
-        return self.__str__()
-
-
-class UndefinedString(str, Enum):
-    UNDEFINED = 'UNDEFINED'
-
-
-@dataclass
-class AppConfig:
-    """Configuration for the app.
-
-    Attributes:
-        llms: A dictionary of name -> LLM configuration. Default config is under 'llm' key.
-        agents: A dictionary of name -> Agent configuration. Default config is under 'agent' key.
-        default_agent: The name of the default agent to use.
-        sandbox: The sandbox configuration.
-        runtime: The runtime environment.
-        file_store: The file store to use.
-        file_store_path: The path to the file store.
-        workspace_base: The base path for the workspace. Defaults to ./workspace as an absolute path.
-        workspace_mount_path: The path to mount the workspace. This is set to the workspace base by default.
-        workspace_mount_path_in_sandbox: The path to mount the workspace in the sandbox. Defaults to /workspace.
-        workspace_mount_rewrite: The path to rewrite the workspace mount path to.
-        cache_dir: The path to the cache directory. Defaults to /tmp/cache.
-        run_as_openhands: Whether to run as openhands.
-        max_iterations: The maximum number of iterations.
-        max_budget_per_task: The maximum budget allowed per task, beyond which the agent will stop.
-        e2b_api_key: The E2B API key.
-        disable_color: Whether to disable color. For terminals that don't support color.
-        debug: Whether to enable debugging.
-        enable_cli_session: Whether to enable saving and restoring the session when run from CLI.
-        file_uploads_max_file_size_mb: Maximum file size for uploads in megabytes. 0 means no limit.
-        file_uploads_restrict_file_types: Whether to restrict file types for file uploads. Defaults to False.
-        file_uploads_allowed_extensions: List of allowed file extensions for uploads. ['.*'] means all extensions are allowed.
-    """
-
-    llms: dict[str, LLMConfig] = field(default_factory=dict)
-    agents: dict = field(default_factory=dict)
-    default_agent: str = _DEFAULT_AGENT
-    sandbox: SandboxConfig = field(default_factory=SandboxConfig)
-    security: SecurityConfig = field(default_factory=SecurityConfig)
-    runtime: str = 'eventstream'
-    file_store: str = 'memory'
-    file_store_path: str = '/tmp/file_store'
-    # TODO: clean up workspace path after the removal of ServerRuntime
-    workspace_base: str = os.path.join(os.getcwd(), 'workspace')
-    workspace_mount_path: str | None = (
-        UndefinedString.UNDEFINED  # this path should always be set when config is fully loaded
-    )  # when set to None, do not mount the workspace
-    workspace_mount_path_in_sandbox: str = '/workspace'
-    workspace_mount_rewrite: str | None = None
-    cache_dir: str = '/tmp/cache'
-    run_as_openhands: bool = True
-    max_iterations: int = _MAX_ITERATIONS
-    max_budget_per_task: float | None = None
-    e2b_api_key: str = ''
-    disable_color: bool = False
-    jwt_secret: str = uuid.uuid4().hex
-    debug: bool = False
-    enable_cli_session: bool = False
-    file_uploads_max_file_size_mb: int = 0
-    file_uploads_restrict_file_types: bool = False
-    file_uploads_allowed_extensions: list[str] = field(default_factory=lambda: ['.*'])
-
-    defaults_dict: ClassVar[dict] = {}
-
-    def get_llm_config(self, name='llm') -> LLMConfig:
-        """Llm is the name for default config (for backward compatibility prior to 0.8)"""
-        if name in self.llms:
-            return self.llms[name]
-        if name is not None and name != 'llm':
-            logger.openhands_logger.warning(
-                f'llm config group {name} not found, using default config'
-            )
-        if 'llm' not in self.llms:
-            self.llms['llm'] = LLMConfig()
-        return self.llms['llm']
-
-    def set_llm_config(self, value: LLMConfig, name='llm'):
-        self.llms[name] = value
-
-    def get_agent_config(self, name='agent') -> AgentConfig:
-        """Agent is the name for default config (for backward compability prior to 0.8)"""
-        if name in self.agents:
-            return self.agents[name]
-        if 'agent' not in self.agents:
-            self.agents['agent'] = AgentConfig()
-        return self.agents['agent']
-
-    def set_agent_config(self, value: AgentConfig, name='agent'):
-        self.agents[name] = value
-
-    def get_agent_to_llm_config_map(self) -> dict[str, LLMConfig]:
-        """Get a map of agent names to llm configs."""
-        return {name: self.get_llm_config_from_agent(name) for name in self.agents}
-
-    def get_llm_config_from_agent(self, name='agent') -> LLMConfig:
-        agent_config: AgentConfig = self.get_agent_config(name)
-        llm_config_name = agent_config.llm_config
-        return self.get_llm_config(llm_config_name)
-
-    def get_agent_configs(self) -> dict[str, AgentConfig]:
-        return self.agents
-
-    def __post_init__(self):
-        """Post-initialization hook, called when the instance is created with only default values."""
-        AppConfig.defaults_dict = self.defaults_to_dict()
-
-    def defaults_to_dict(self) -> dict:
-        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
-        result = {}
-        for f in fields(self):
-            field_value = getattr(self, f.name)
-
-            # dataclasses compute their defaults themselves
-            if is_dataclass(type(field_value)):
-                result[f.name] = field_value.defaults_to_dict()
-            else:
-                result[f.name] = get_field_info(f)
-        return result
-
-    def __str__(self):
-        attr_str = []
-        for f in fields(self):
-            attr_name = f.name
-            attr_value = getattr(self, f.name)
-
-            if attr_name in [
-                'e2b_api_key',
-                'github_token',
-                'jwt_secret',
-            ]:
-                attr_value = '******' if attr_value else None
-
-            attr_str.append(f'{attr_name}={repr(attr_value)}')
-
-        return f"AppConfig({', '.join(attr_str)}"
-
-    def __repr__(self):
-        return self.__str__()
-
-
-def get_field_info(f):
-    """Extract information about a dataclass field: type, optional, and default.
-
-    Args:
-        f: The field to extract information from.
-
-    Returns: A dict with the field's type, whether it's optional, and its default value.
-    """
-    field_type = f.type
-    optional = False
-
-    # for types like str | None, find the non-None type and set optional to True
-    # this is useful for the frontend to know if a field is optional
-    # and to show the correct type in the UI
-    # Note: this only works for UnionTypes with None as one of the types
-    if get_origin(field_type) is UnionType:
-        types = get_args(field_type)
-        non_none_arg = next((t for t in types if t is not type(None)), None)
-        if non_none_arg is not None:
-            field_type = non_none_arg
-            optional = True
-
-    # type name in a pretty format
-    type_name = (
-        field_type.__name__ if hasattr(field_type, '__name__') else str(field_type)
-    )
-
-    # default is always present
-    default = f.default
-
-    # return a schema with the useful info for frontend
-    return {'type': type_name.lower(), 'optional': optional, 'default': default}
-
-
-def load_from_env(cfg: AppConfig, env_or_toml_dict: dict | MutableMapping[str, str]):
-    """Reads the env-style vars and sets config attributes based on env vars or a config.toml dict.
-    Compatibility with vars like LLM_BASE_URL, AGENT_MEMORY_ENABLED, SANDBOX_TIMEOUT and others.
-
-    Args:
-        cfg: The AppConfig object to set attributes on.
-        env_or_toml_dict: The environment variables or a config.toml dict.
-    """
-
-    def get_optional_type(union_type: UnionType) -> Any:
-        """Returns the non-None type from a Union."""
-        types = get_args(union_type)
-        return next((t for t in types if t is not type(None)), None)
-
-    # helper function to set attributes based on env vars
-    def set_attr_from_env(sub_config: Any, prefix=''):
-        """Set attributes of a config dataclass based on environment variables."""
-        for field_name, field_type in sub_config.__annotations__.items():
-            # compute the expected env var name from the prefix and field name
-            # e.g. LLM_BASE_URL
-            env_var_name = (prefix + field_name).upper()
-
-            if is_dataclass(field_type):
-                # nested dataclass
-                nested_sub_config = getattr(sub_config, field_name)
-                set_attr_from_env(nested_sub_config, prefix=field_name + '_')
-            elif env_var_name in env_or_toml_dict:
-                # convert the env var to the correct type and set it
-                value = env_or_toml_dict[env_var_name]
-
-                # skip empty config values (fall back to default)
-                if not value:
-                    continue
-
-                try:
-                    # if it's an optional type, get the non-None type
-                    if get_origin(field_type) is UnionType:
-                        field_type = get_optional_type(field_type)
-
-                    # Attempt to cast the env var to type hinted in the dataclass
-                    if field_type is bool:
-                        cast_value = str(value).lower() in ['true', '1']
-                    else:
-                        cast_value = field_type(value)
-                    setattr(sub_config, field_name, cast_value)
-                except (ValueError, TypeError):
-                    logger.openhands_logger.error(
-                        f'Error setting env var {env_var_name}={value}: check that the value is of the right type'
-                    )
-
-    # Start processing from the root of the config object
-    set_attr_from_env(cfg)
-
-    # load default LLM config from env
-    default_llm_config = cfg.get_llm_config()
-    set_attr_from_env(default_llm_config, 'LLM_')
-    # load default agent config from env
-    default_agent_config = cfg.get_agent_config()
-    set_attr_from_env(default_agent_config, 'AGENT_')
-
-
-def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
-    """Load the config from the toml file. Supports both styles of config vars.
-
-    Args:
-        cfg: The AppConfig object to update attributes of.
-        toml_file: The path to the toml file. Defaults to 'config.toml'.
-    """
-    # try to read the config.toml file into the config object
-    try:
-        with open(toml_file, 'r', encoding='utf-8') as toml_contents:
-            toml_config = toml.load(toml_contents)
-    except FileNotFoundError:
-        return
-    except toml.TomlDecodeError as e:
-        logger.openhands_logger.warning(
-            f'Cannot parse config from toml, toml values have not been applied.\nError: {e}',
-            exc_info=False,
-        )
-        return
-
-    # if there was an exception or core is not in the toml, try to use the old-style toml
-    if 'core' not in toml_config:
-        # re-use the env loader to set the config from env-style vars
-        load_from_env(cfg, toml_config)
-        return
-
-    core_config = toml_config['core']
-
-    # load llm configs and agent configs
-    for key, value in toml_config.items():
-        if isinstance(value, dict):
-            try:
-                if key is not None and key.lower() == 'agent':
-                    logger.openhands_logger.debug(
-                        'Attempt to load default agent config from config toml'
-                    )
-                    non_dict_fields = {
-                        k: v for k, v in value.items() if not isinstance(v, dict)
-                    }
-                    agent_config = AgentConfig(**non_dict_fields)
-                    cfg.set_agent_config(agent_config, 'agent')
-                    for nested_key, nested_value in value.items():
-                        if isinstance(nested_value, dict):
-                            logger.openhands_logger.debug(
-                                f'Attempt to load group {nested_key} from config toml as agent config'
-                            )
-                            agent_config = AgentConfig(**nested_value)
-                            cfg.set_agent_config(agent_config, nested_key)
-                elif key is not None and key.lower() == 'llm':
-                    logger.openhands_logger.debug(
-                        'Attempt to load default LLM config from config toml'
-                    )
-                    non_dict_fields = {
-                        k: v for k, v in value.items() if not isinstance(v, dict)
-                    }
-                    llm_config = LLMConfig(**non_dict_fields)
-                    cfg.set_llm_config(llm_config, 'llm')
-                    for nested_key, nested_value in value.items():
-                        if isinstance(nested_value, dict):
-                            logger.openhands_logger.debug(
-                                f'Attempt to load group {nested_key} from config toml as llm config'
-                            )
-                            llm_config = LLMConfig(**nested_value)
-                            cfg.set_llm_config(llm_config, nested_key)
-                elif not key.startswith('sandbox') and key.lower() != 'core':
-                    logger.openhands_logger.warning(
-                        f'Unknown key in {toml_file}: "{key}"'
-                    )
-            except (TypeError, KeyError) as e:
-                logger.openhands_logger.warning(
-                    f'Cannot parse config from toml, toml values have not been applied.\n Error: {e}',
-                    exc_info=False,
-                )
-        else:
-            logger.openhands_logger.warning(f'Unknown key in {toml_file}: "{key}')
-
-    try:
-        # set sandbox config from the toml file
-        sandbox_config = cfg.sandbox
-
-        # migrate old sandbox configs from [core] section to sandbox config
-        keys_to_migrate = [key for key in core_config if key.startswith('sandbox_')]
-        for key in keys_to_migrate:
-            new_key = key.replace('sandbox_', '')
-            if new_key in sandbox_config.__annotations__:
-                # read the key in sandbox and remove it from core
-                setattr(sandbox_config, new_key, core_config.pop(key))
-            else:
-                logger.openhands_logger.warning(f'Unknown sandbox config: {key}')
-
-        # the new style values override the old style values
-        if 'sandbox' in toml_config:
-            sandbox_config = SandboxConfig(**toml_config['sandbox'])
-
-        # update the config object with the new values
-        cfg.sandbox = sandbox_config
-        for key, value in core_config.items():
-            if hasattr(cfg, key):
-                setattr(cfg, key, value)
-            else:
-                logger.openhands_logger.warning(f'Unknown core config key: {key}')
-    except (TypeError, KeyError) as e:
-        logger.openhands_logger.warning(
-            f'Cannot parse config from toml, toml values have not been applied.\nError: {e}',
-            exc_info=False,
-        )
-
-
-def finalize_config(cfg: AppConfig):
-    """More tweaks to the config after it's been loaded."""
-    cfg.workspace_base = os.path.abspath(cfg.workspace_base)
-    # Set workspace_mount_path if not set by the user
-    if cfg.workspace_mount_path is UndefinedString.UNDEFINED:
-        cfg.workspace_mount_path = cfg.workspace_base
-
-    if cfg.workspace_mount_rewrite:  # and not config.workspace_mount_path:
-        # TODO why do we need to check if workspace_mount_path is None?
-        base = cfg.workspace_base or os.getcwd()
-        parts = cfg.workspace_mount_rewrite.split(':')
-        cfg.workspace_mount_path = base.replace(parts[0], parts[1])
-
-    for llm in cfg.llms.values():
-        if llm.embedding_base_url is None:
-            llm.embedding_base_url = llm.base_url
-
-    if cfg.sandbox.use_host_network and platform.system() == 'Darwin':
-        logger.openhands_logger.warning(
-            'Please upgrade to Docker Desktop 4.29.0 or later to use host network mode on macOS. '
-            'See https://github.com/docker/roadmap/issues/238#issuecomment-2044688144 for more information.'
-        )
-
-    # make sure cache dir exists
-    if cfg.cache_dir:
-        pathlib.Path(cfg.cache_dir).mkdir(parents=True, exist_ok=True)
-
-
-# Utility function for command line --group argument
-def get_llm_config_arg(
-    llm_config_arg: str, toml_file: str = 'config.toml'
-) -> LLMConfig | None:
-    """Get a group of llm settings from the config file.
-
-    A group in config.toml can look like this:
-
-    ```
-    [llm.gpt-3.5-for-eval]
-    model = 'gpt-3.5-turbo'
-    api_key = '...'
-    temperature = 0.5
-    num_retries = 8
-    ...
-    ```
-
-    The user-defined group name, like "gpt-3.5-for-eval", is the argument to this function. The function will load the LLMConfig object
-    with the settings of this group, from the config file, and set it as the LLMConfig object for the app.
-
-    Note that the group must be under "llm" group, or in other words, the group name must start with "llm.".
-
-    Args:
-        llm_config_arg: The group of llm settings to get from the config.toml file.
-
-    Returns:
-        LLMConfig: The LLMConfig object with the settings from the config file.
-    """
-    # keep only the name, just in case
-    llm_config_arg = llm_config_arg.strip('[]')
-
-    # truncate the prefix, just in case
-    if llm_config_arg.startswith('llm.'):
-        llm_config_arg = llm_config_arg[4:]
-
-    logger.openhands_logger.info(f'Loading llm config from {llm_config_arg}')
-
-    # load the toml file
-    try:
-        with open(toml_file, 'r', encoding='utf-8') as toml_contents:
-            toml_config = toml.load(toml_contents)
-    except FileNotFoundError as e:
-        logger.openhands_logger.error(f'Config file not found: {e}')
-        return None
-    except toml.TomlDecodeError as e:
-        logger.openhands_logger.error(
-            f'Cannot parse llm group from {llm_config_arg}. Exception: {e}'
-        )
-        return None
-
-    # update the llm config with the specified section
-    if 'llm' in toml_config and llm_config_arg in toml_config['llm']:
-        return LLMConfig(**toml_config['llm'][llm_config_arg])
-    logger.openhands_logger.debug(f'Loading from toml failed for {llm_config_arg}')
-    return None
-
-
-# Command line arguments
-def get_parser() -> argparse.ArgumentParser:
-    """Get the parser for the command line arguments."""
-    parser = argparse.ArgumentParser(description='Run an agent with a specific task')
-    parser.add_argument(
-        '-d',
-        '--directory',
-        type=str,
-        help='The working directory for the agent',
-    )
-    parser.add_argument(
-        '-t',
-        '--task',
-        type=str,
-        default='',
-        help='The task for the agent to perform',
-    )
-    parser.add_argument(
-        '-f',
-        '--file',
-        type=str,
-        help='Path to a file containing the task. Overrides -t if both are provided.',
-    )
-    parser.add_argument(
-        '-c',
-        '--agent-cls',
-        default=_DEFAULT_AGENT,
-        type=str,
-        help='Name of the default agent to use',
-    )
-    parser.add_argument(
-        '-i',
-        '--max-iterations',
-        default=_MAX_ITERATIONS,
-        type=int,
-        help='The maximum number of iterations to run the agent',
-    )
-    parser.add_argument(
-        '-b',
-        '--max-budget-per-task',
-        type=float,
-        help='The maximum budget allowed per task, beyond which the agent will stop.',
-    )
-    # --eval configs are for evaluations only
-    parser.add_argument(
-        '--eval-output-dir',
-        default='evaluation/evaluation_outputs/outputs',
-        type=str,
-        help='The directory to save evaluation output',
-    )
-    parser.add_argument(
-        '--eval-n-limit',
-        default=None,
-        type=int,
-        help='The number of instances to evaluate',
-    )
-    parser.add_argument(
-        '--eval-num-workers',
-        default=4,
-        type=int,
-        help='The number of workers to use for evaluation',
-    )
-    parser.add_argument(
-        '--eval-note',
-        default=None,
-        type=str,
-        help='The note to add to the evaluation directory',
-    )
-    parser.add_argument(
-        '-l',
-        '--llm-config',
-        default=None,
-        type=str,
-        help='Replace default LLM ([llm] section in config.toml) config with the specified LLM config, e.g. "llama3" for [llm.llama3] section in config.toml',
-    )
-    parser.add_argument(
-        '-n',
-        '--name',
-        default='default',
-        type=str,
-        help='Name for the session',
-    )
-    parser.add_argument(
-        '--eval-ids',
-        default=None,
-        type=str,
-        help='The comma-separated list (in quotes) of IDs of the instances to evaluate',
-    )
-    return parser
-
-
-def parse_arguments() -> argparse.Namespace:
-    """Parse the command line arguments."""
-    parser = get_parser()
-    parsed_args, _ = parser.parse_known_args()
-    return parsed_args
-
-
-def load_app_config(set_logging_levels: bool = True) -> AppConfig:
-    """Load the configuration from the config.toml file and environment variables.
-
-    Args:
-        set_logger_levels: Whether to set the global variables for logging levels.
-    """
-    config = AppConfig()
-    load_from_toml(config)
-    load_from_env(config, os.environ)
-    finalize_config(config)
-    if set_logging_levels:
-        logger.DEBUG = config.debug
-        logger.DISABLE_COLOR_PRINTING = config.disable_color
-    return config
--- a/openhands/core/config/README.md
+++ b/openhands/core/config/README.md
@@ -0,0 +1,104 @@
+# Configuration Management in OpenHands
+
+## Overview
+
+OpenHands uses a flexible configuration system that allows settings to be defined through environment variables, TOML files, and command-line arguments. The configuration is managed through a package structure in `openhands/core/config/`.
+
+## Configuration Classes
+
+The main configuration classes are:
+
+- `AppConfig`: The root configuration class
+- `LLMConfig`: Configuration for the Language Model
+- `AgentConfig`: Configuration for the agent
+- `SandboxConfig`: Configuration for the sandbox environment
+- `SecurityConfig`: Configuration for security settings
+
+These classes are defined as dataclasses, with class attributes holding default values for all fields.
+
+## Loading Configuration from Environment Variables
+
+The `load_from_env` function in the config package is responsible for loading configuration values from environment variables. It recursively processes the configuration classes, mapping environment variable names to class attributes.
+
+### Naming Convention for Environment Variables
+
+- Prefix: uppercase name of the configuration class followed by an underscore (e.g., `LLM_`, `AGENT_`)
+- Field Names: all uppercase
+- Full Variable Name: Prefix + Field Name (e.g., `LLM_API_KEY`, `AGENT_MEMORY_ENABLED`)
+
+### Examples
+
+```bash
+export LLM_API_KEY='your_api_key_here'
+export LLM_MODEL='gpt-4'
+export AGENT_MEMORY_ENABLED='true'
+export SANDBOX_TIMEOUT='300'
+```
+
+## Type Handling
+
+The `load_from_env` function attempts to cast environment variable values to the types specified in the dataclasses. It handles:
+
+- Basic types (str, int, bool)
+- Optional types (e.g., `str | None`)
+- Nested dataclasses
+
+If type casting fails, an error is logged, and the default value is retained.
+
+## Default Values
+
+If an environment variable is not set, the default value specified in the dataclass is used.
+
+## Nested Configurations
+
+The `AppConfig` class contains nested configurations like `LLMConfig` and `AgentConfig`. The `load_from_env` function handles these by recursively processing nested dataclasses with updated prefixes.
+
+## Security Considerations
+
+Be cautious when setting sensitive information like API keys in environment variables. Ensure your environment is secure.
+
+## Usage
+
+The `load_app_config()` function is the recommended way to initialize your configuration. It performs the following steps:
+
+1. Creates an instance of `AppConfig`
+2. Loads settings from the `config.toml` file (if present)
+3. Loads settings from environment variables, overriding TOML settings if applicable
+4. Applies final tweaks and validations to the configuration, falling back to the default values specified in the code
+5. Optionally sets global logging levels based on the configuration
+
+Command-line arguments (parsed separately via `parse_arguments()`) are a further source and may be used to override values from the other sources.
+
+Here's an example of how to use `load_app_config()`:
+
+````python
+from openhands.core.config import load_app_config
+
+# Load all configuration settings
+config = load_app_config()
+
+# Now you can access your configuration
+llm_config = config.get_llm_config()
+agent_config = config.get_agent_config()
+sandbox_config = config.sandbox
+
+# Use the configuration in your application
+print(f"Using LLM model: {llm_config.model}")
+print(f"Agent memory enabled: {agent_config.memory_enabled}")
+print(f"Sandbox timeout: {sandbox_config.timeout}")
+````
+
+By using `load_app_config()`, you ensure that all configuration sources are properly loaded and processed, providing a consistent and fully initialized configuration for your application.
+
+## Additional Configuration Methods
+
+While this document focuses on environment variable configuration, OpenHands also supports:
+
+- Loading from TOML files
+- Parsing command-line arguments
+
+These methods are handled by separate functions in the config package.
+
+## Conclusion
+
+The OpenHands configuration system provides a flexible and type-safe way to manage application settings. By following the naming conventions and utilizing the provided functions, developers can easily customize the behavior of OpenHands components through environment variables and other configuration sources.
--- a/openhands/core/config/__init__.py
+++ b/openhands/core/config/__init__.py
@@ -0,0 +1,39 @@
+from openhands.core.config.agent_config import AgentConfig
+from openhands.core.config.app_config import AppConfig
+from openhands.core.config.config_utils import (
+    OH_DEFAULT_AGENT,
+    OH_MAX_ITERATIONS,
+    UndefinedString,
+    get_field_info,
+)
+from openhands.core.config.llm_config import LLMConfig
+from openhands.core.config.sandbox_config import SandboxConfig
+from openhands.core.config.security_config import SecurityConfig
+from openhands.core.config.utils import (
+    finalize_config,
+    get_llm_config_arg,
+    get_parser,
+    load_app_config,
+    load_from_env,
+    load_from_toml,
+    parse_arguments,
+)
+
+__all__ = [
+    'OH_DEFAULT_AGENT',
+    'OH_MAX_ITERATIONS',
+    'UndefinedString',
+    'AgentConfig',
+    'AppConfig',
+    'LLMConfig',
+    'SandboxConfig',
+    'SecurityConfig',
+    'load_app_config',
+    'load_from_env',
+    'load_from_toml',
+    'finalize_config',
+    'get_llm_config_arg',
+    'get_field_info',
+    'get_parser',
+    'parse_arguments',
+]
--- a/openhands/core/config/agent_config.py
+++ b/openhands/core/config/agent_config.py
@@ -0,0 +1,27 @@
+from dataclasses import dataclass, fields
+
+from openhands.core.config.config_utils import get_field_info
+
+
+@dataclass
+class AgentConfig:
+    """Configuration for the agent.
+
+    Attributes:
+        micro_agent_name: The name of the micro agent to use for this agent.
+        memory_enabled: Whether long-term memory (embeddings) is enabled.
+        memory_max_threads: The maximum number of threads indexing at the same time for embeddings.
+        llm_config: The name of the llm config to use. If specified, this will override global llm config.
+    """
+
+    micro_agent_name: str | None = None
+    memory_enabled: bool = False
+    memory_max_threads: int = 2
+    llm_config: str | None = None
+
+    def defaults_to_dict(self) -> dict:
+        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
+        result = {}
+        for f in fields(self):
+            result[f.name] = get_field_info(f)
+        return result
--- a/openhands/core/config/app_config.py
+++ b/openhands/core/config/app_config.py
@@ -0,0 +1,151 @@
+import os
+import uuid
+from dataclasses import dataclass, field, fields, is_dataclass
+from typing import ClassVar
+
+from openhands.core import logger
+from openhands.core.config.agent_config import AgentConfig
+from openhands.core.config.config_utils import (
+    OH_DEFAULT_AGENT,
+    OH_MAX_ITERATIONS,
+    UndefinedString,
+    get_field_info,
+)
+from openhands.core.config.llm_config import LLMConfig
+from openhands.core.config.sandbox_config import SandboxConfig
+from openhands.core.config.security_config import SecurityConfig
+
+
+@dataclass
+class AppConfig:
+    """Configuration for the app.
+
+    Attributes:
+        llms: A dictionary of name -> LLM configuration. Default config is under 'llm' key.
+        agents: A dictionary of name -> Agent configuration. Default config is under 'agent' key.
+        default_agent: The name of the default agent to use.
+        sandbox: The sandbox configuration.
+        runtime: The runtime environment.
+        file_store: The file store to use.
+        file_store_path: The path to the file store.
+        workspace_base: The base path for the workspace. Defaults to ./workspace as an absolute path.
+        workspace_mount_path: The path to mount the workspace. This is set to the workspace base by default.
+        workspace_mount_path_in_sandbox: The path to mount the workspace in the sandbox. Defaults to /workspace.
+        workspace_mount_rewrite: The path to rewrite the workspace mount path to.
+        cache_dir: The path to the cache directory. Defaults to /tmp/cache.
+        run_as_openhands: Whether to run as openhands.
+        max_iterations: The maximum number of iterations.
+        max_budget_per_task: The maximum budget allowed per task, beyond which the agent will stop.
+        e2b_api_key: The E2B API key.
+        disable_color: Whether to disable color. For terminals that don't support color.
+        debug: Whether to enable debugging.
+        enable_cli_session: Whether to enable saving and restoring the session when run from CLI.
+        file_uploads_max_file_size_mb: Maximum file size for uploads in megabytes. 0 means no limit.
+        file_uploads_restrict_file_types: Whether to restrict file types for file uploads. Defaults to False.
+        file_uploads_allowed_extensions: List of allowed file extensions for uploads. ['.*'] means all extensions are allowed.
+    """
+
+    llms: dict[str, LLMConfig] = field(default_factory=dict)
+    agents: dict = field(default_factory=dict)
+    default_agent: str = OH_DEFAULT_AGENT
+    sandbox: SandboxConfig = field(default_factory=SandboxConfig)
+    security: SecurityConfig = field(default_factory=SecurityConfig)
+    runtime: str = 'eventstream'
+    file_store: str = 'memory'
+    file_store_path: str = '/tmp/file_store'
+    # TODO: clean up workspace path after the removal of ServerRuntime
+    workspace_base: str = os.path.join(os.getcwd(), 'workspace')
+    workspace_mount_path: str | None = (
+        UndefinedString.UNDEFINED  # this path should always be set when config is fully loaded
+    )  # when set to None, do not mount the workspace
+    workspace_mount_path_in_sandbox: str = '/workspace'
+    workspace_mount_rewrite: str | None = None
+    cache_dir: str = '/tmp/cache'
+    run_as_openhands: bool = True
+    max_iterations: int = OH_MAX_ITERATIONS
+    max_budget_per_task: float | None = None
+    e2b_api_key: str = ''
+    disable_color: bool = False
+    jwt_secret: str = uuid.uuid4().hex
+    debug: bool = False
+    enable_cli_session: bool = False
+    file_uploads_max_file_size_mb: int = 0
+    file_uploads_restrict_file_types: bool = False
+    file_uploads_allowed_extensions: list[str] = field(default_factory=lambda: ['.*'])
+
+    defaults_dict: ClassVar[dict] = {}
+
+    def get_llm_config(self, name='llm') -> LLMConfig:
+        """Llm is the name for default config (for backward compatibility prior to 0.8)"""
+        if name in self.llms:
+            return self.llms[name]
+        if name is not None and name != 'llm':
+            logger.openhands_logger.warning(
+                f'llm config group {name} not found, using default config'
+            )
+        if 'llm' not in self.llms:
+            self.llms['llm'] = LLMConfig()
+        return self.llms['llm']
+
+    def set_llm_config(self, value: LLMConfig, name='llm'):
+        self.llms[name] = value
+
+    def get_agent_config(self, name='agent') -> AgentConfig:
+        """Return the agent config for `name`; unknown names fall back to the default 'agent' entry, created if missing (name kept for backward compatibility prior to 0.8)."""
+        if name in self.agents:
+            return self.agents[name]
+        if 'agent' not in self.agents:
+            self.agents['agent'] = AgentConfig()
+        return self.agents['agent']
+
+    def set_agent_config(self, value: AgentConfig, name='agent'):
+        self.agents[name] = value
+
+    def get_agent_to_llm_config_map(self) -> dict[str, LLMConfig]:
+        """Get a map of agent names to llm configs."""
+        return {name: self.get_llm_config_from_agent(name) for name in self.agents}
+
+    def get_llm_config_from_agent(self, name='agent') -> LLMConfig:
+        agent_config: AgentConfig = self.get_agent_config(name)
+        llm_config_name = agent_config.llm_config
+        return self.get_llm_config(llm_config_name)
+
+    def get_agent_configs(self) -> dict[str, AgentConfig]:
+        return self.agents
+
+    def __post_init__(self):
+        """Post-initialization hook, run by the dataclass after every __init__; refreshes the class-level defaults_dict."""
+        AppConfig.defaults_dict = self.defaults_to_dict()
+
+    def defaults_to_dict(self) -> dict:
+        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
+        result = {}
+        for f in fields(self):
+            field_value = getattr(self, f.name)
+
+            # dataclasses compute their defaults themselves
+            if is_dataclass(type(field_value)):
+                result[f.name] = field_value.defaults_to_dict()
+            else:
+                result[f.name] = get_field_info(f)
+        return result
+
+    def __str__(self):
+        """Return 'AppConfig(name=value, ...)' with secret fields masked as '******'."""
+        attr_str = []
+        for f in fields(self):
+            attr_value = getattr(self, f.name)
+
+            if f.name in [
+                'e2b_api_key',
+                'github_token',
+                'jwt_secret',
+            ]:
+                attr_value = '******' if attr_value else None  # unset secrets show as None
+
+            attr_str.append(f'{f.name}={repr(attr_value)}')
+
+        return f"AppConfig({', '.join(attr_str)})"
+
+    def __repr__(self):
+        return self.__str__()
--- a/openhands/core/config/config_utils.py
+++ b/openhands/core/config/config_utils.py
@@ -0,0 +1,44 @@
+from enum import Enum
+from types import UnionType
+from typing import get_args, get_origin
+
+OH_DEFAULT_AGENT = 'CodeActAgent'
+OH_MAX_ITERATIONS = 100
+
+
+class UndefinedString(str, Enum):
+    UNDEFINED = 'UNDEFINED'
+
+
+def get_field_info(f):
+    """Extract information about a dataclass field: type, optional, and default.
+
+    Args:
+        f: The dataclass field (as yielded by dataclasses.fields) to extract information from.
+
+    Returns: A dict with the field's type, whether it's optional, and its default value.
+    """
+    field_type = f.type
+    optional = False
+
+    # for types like str | None, find the non-None type and set optional to True
+    # this is useful for the frontend to know if a field is optional
+    # and to show the correct type in the UI
+    # Note: this only works for UnionTypes with None as one of the types
+    if get_origin(field_type) is UnionType:
+        types = get_args(field_type)
+        non_none_arg = next((t for t in types if t is not type(None)), None)
+        if non_none_arg is not None:
+            field_type = non_none_arg
+            optional = True
+
+    # type name in a pretty format
+    type_name = (
+        field_type.__name__ if hasattr(field_type, '__name__') else str(field_type)
+    )
+
+    # NOTE: fields declared with default_factory have no plain default; f.default is dataclasses.MISSING for them
+    default = f.default
+
+    # return a schema with the useful info for frontend
+    return {'type': type_name.lower(), 'optional': optional, 'default': default}
--- a/openhands/core/config/llm_config.py
+++ b/openhands/core/config/llm_config.py
@@ -0,0 +1,116 @@
+import os
+from dataclasses import dataclass, fields
+
+from openhands.core.config.config_utils import get_field_info
+
+LLM_SENSITIVE_FIELDS = ['api_key', 'aws_access_key_id', 'aws_secret_access_key']
+
+
+@dataclass
+class LLMConfig:
+    """Configuration for the LLM model.
+
+    Attributes:
+        model: The model to use.
+        api_key: The API key to use.
+        base_url: The base URL for the API. This is necessary for local LLMs. It is also used for Azure embeddings.
+        api_version: The version of the API.
+        embedding_model: The embedding model to use.
+        embedding_base_url: The base URL for the embedding API.
+        embedding_deployment_name: The name of the deployment for the embedding API. This is used for Azure OpenAI.
+        aws_access_key_id: The AWS access key ID.
+        aws_secret_access_key: The AWS secret access key.
+        aws_region_name: The AWS region name.
+        num_retries: The number of retries to attempt.
+        retry_multiplier: The multiplier for the exponential backoff.
+        retry_min_wait: The minimum time to wait between retries, in seconds. This is exponential backoff minimum. For models with very low limits, this can be set to 15-20.
+        retry_max_wait: The maximum time to wait between retries, in seconds. This is exponential backoff maximum.
+        timeout: The timeout for the API.
+        max_message_chars: The approximate max number of characters in the content of an event included in the prompt to the LLM. Larger observations are truncated.
+        temperature: The temperature for the API.
+        top_p: The top p for the API.
+        custom_llm_provider: The custom LLM provider to use. This is undocumented in openhands, and normally not used. It is documented on the litellm side.
+        max_input_tokens: The maximum number of input tokens. Note that this is currently unused, and the value at runtime is actually the total tokens in OpenAI (e.g. 128,000 tokens for GPT-4).
+        max_output_tokens: The maximum number of output tokens. This is sent to the LLM.
+        input_cost_per_token: The cost per input token. This will be available in logs for the user to check.
+        output_cost_per_token: The cost per output token. This will be available in logs for the user to check.
+        ollama_base_url: The base URL for the OLLAMA API.
+        drop_params: Drop any unmapped (unsupported) params without causing an exception.
+        disable_vision: If model is vision capable, this option allows to disable image processing (useful for cost reduction).
+        caching_prompt: Use the prompt caching feature if provided by the LLM and supported by the provider.
+        log_completions: Whether to log LLM completions to the state.
+    """
+
+    model: str = 'gpt-4o'
+    api_key: str | None = None
+    base_url: str | None = None
+    api_version: str | None = None
+    embedding_model: str = 'local'
+    embedding_base_url: str | None = None
+    embedding_deployment_name: str | None = None
+    aws_access_key_id: str | None = None
+    aws_secret_access_key: str | None = None
+    aws_region_name: str | None = None
+    openrouter_site_url: str = 'https://docs.all-hands.dev/'
+    openrouter_app_name: str = 'OpenHands'
+    num_retries: int = 8
+    retry_multiplier: float = 2
+    retry_min_wait: int = 15
+    retry_max_wait: int = 120
+    timeout: int | None = None
+    max_message_chars: int = 10_000  # maximum number of characters in an observation's content when sent to the llm
+    temperature: float = 0.0
+    top_p: float = 1.0
+    custom_llm_provider: str | None = None
+    max_input_tokens: int | None = None
+    max_output_tokens: int | None = None
+    input_cost_per_token: float | None = None
+    output_cost_per_token: float | None = None
+    ollama_base_url: str | None = None
+    drop_params: bool = True
+    disable_vision: bool | None = None
+    caching_prompt: bool = True
+    log_completions: bool = False
+
+    def defaults_to_dict(self) -> dict:
+        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
+        result = {}
+        for f in fields(self):
+            result[f.name] = get_field_info(f)
+        return result
+
+    def __post_init__(self):
+        """
+        Post-initialization hook to assign OpenRouter-related variables to environment variables.
+        This ensures that these values are accessible to litellm at runtime.
+        """
+
+        # Assign OpenRouter-specific variables to environment variables
+        if self.openrouter_site_url:
+            os.environ['OR_SITE_URL'] = self.openrouter_site_url
+        if self.openrouter_app_name:
+            os.environ['OR_APP_NAME'] = self.openrouter_app_name
+
+    def __str__(self):
+        attr_str = []
+        for f in fields(self):
+            attr_name = f.name
+            attr_value = getattr(self, f.name)
+
+            if attr_name in LLM_SENSITIVE_FIELDS:
+                attr_value = '******' if attr_value else None
+
+            attr_str.append(f'{attr_name}={repr(attr_value)}')
+
+        return f"LLMConfig({', '.join(attr_str)})"
+
+    def __repr__(self):
+        return self.__str__()
+
+    def to_safe_dict(self):
+        """Return a dict with the sensitive fields replaced with ******."""
+        ret = self.__dict__.copy()
+        for k, v in ret.items():
+            if k in LLM_SENSITIVE_FIELDS:
+                ret[k] = '******' if v else None
+        return ret
--- a/openhands/core/config/sandbox_config.py
+++ b/openhands/core/config/sandbox_config.py
@@ -0,0 +1,68 @@
+import os
+from dataclasses import dataclass, field, fields
+
+from openhands.core.config.config_utils import get_field_info
+
+
+@dataclass
+class SandboxConfig:
+    """Configuration for the sandbox.
+
+    Attributes:
+        remote_runtime_api_url: The hostname for the Remote Runtime API.
+        local_runtime_url: The default hostname for the local runtime. You may want to change to http://host.docker.internal for DIND environments
+        base_container_image: The base container image from which to build the runtime image.
+        runtime_container_image: The runtime container image to use.
+        user_id: The user ID for the sandbox.
+        timeout: The timeout for the sandbox.
+        enable_auto_lint: Whether to enable auto-lint.
+        use_host_network: Whether to use the host network.
+        initialize_plugins: Whether to initialize plugins.
+        runtime_extra_deps: The extra dependencies to install in the runtime image (typically used for evaluation).
+            This will be rendered into the end of the Dockerfile that builds the runtime image.
+            It can contain any valid shell commands (e.g., pip install numpy).
+            The path to the interpreter is available as $OH_INTERPRETER_PATH,
+            which can be used to install dependencies for the OH-specific Python interpreter.
+        runtime_startup_env_vars: The environment variables to set at the launch of the runtime.
+            This is a dictionary of key-value pairs.
+            This is useful for setting environment variables that are needed by the runtime.
+            For example, for specifying the base url of website for browsergym evaluation.
+        browsergym_eval_env: The BrowserGym environment to use for evaluation.
+            Default is None for general purpose browsing. Check evaluation/miniwob and evaluation/webarena for examples.
+    """
+
+    remote_runtime_api_url: str = 'http://localhost:8000'
+    local_runtime_url: str = 'http://localhost'
+    api_key: str | None = None
+    base_container_image: str = 'nikolaik/python-nodejs:python3.11-nodejs22'  # default to nikolaik/python-nodejs:python3.11-nodejs22 for eventstream runtime
+    runtime_container_image: str | None = None
+    user_id: int = os.getuid() if hasattr(os, 'getuid') else 1000
+    timeout: int = 120
+    enable_auto_lint: bool = (
+        False  # once enabled, OpenHands would lint files after editing
+    )
+    use_host_network: bool = False
+    initialize_plugins: bool = True
+    runtime_extra_deps: str | None = None
+    runtime_startup_env_vars: dict[str, str] = field(default_factory=dict)
+    browsergym_eval_env: str | None = None
+
+    def defaults_to_dict(self) -> dict:
+        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
+        result = {}
+        for f in fields(self):
+            result[f.name] = get_field_info(f)
+        return result
+
+    def __str__(self):
+        attr_str = []
+        for f in fields(self):
+            attr_name = f.name
+            attr_value = getattr(self, f.name)
+
+            attr_str.append(f'{attr_name}={repr(attr_value)}')
+
+        return f"SandboxConfig({', '.join(attr_str)})"
+
+    def __repr__(self):
+        return self.__str__()
--- a/openhands/core/config/security_config.py
+++ b/openhands/core/config/security_config.py
@@ -0,0 +1,36 @@
+from dataclasses import dataclass, fields
+
+from openhands.core.config.config_utils import get_field_info
+
+
+@dataclass
+class SecurityConfig:
+    """Configuration for security related functionalities.
+
+    Attributes:
+        confirmation_mode: Whether to enable confirmation mode.
+        security_analyzer: The security analyzer to use.
+    """
+
+    confirmation_mode: bool = False
+    security_analyzer: str | None = None
+
+    def defaults_to_dict(self) -> dict:
+        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
+        result = {}
+        for f in fields(self):
+            result[f.name] = get_field_info(f)
+        return result
+
+    def __str__(self):
+        attr_str = []
+        for f in fields(self):
+            attr_name = f.name
+            attr_value = getattr(self, f.name)
+
+            attr_str.append(f'{attr_name}={repr(attr_value)}')
+
+        return f"SecurityConfig({', '.join(attr_str)})"
+
+    def __repr__(self):
+        return self.__str__()
--- a/openhands/core/config/utils.py
+++ b/openhands/core/config/utils.py
@@ -0,0 +1,391 @@
+import argparse
+import os
+import pathlib
+import platform
+from dataclasses import is_dataclass
+from types import UnionType
+from typing import Any, MutableMapping, get_args, get_origin
+
+import toml
+from dotenv import load_dotenv
+
+from openhands.core import logger
+from openhands.core.config.agent_config import AgentConfig
+from openhands.core.config.app_config import AppConfig
+from openhands.core.config.config_utils import (
+    OH_DEFAULT_AGENT,
+    OH_MAX_ITERATIONS,
+    UndefinedString,
+)
+from openhands.core.config.llm_config import LLMConfig
+from openhands.core.config.sandbox_config import SandboxConfig
+
+load_dotenv()
+
+
+def load_from_env(cfg: AppConfig, env_or_toml_dict: dict | MutableMapping[str, str]):
+    """Reads the env-style vars and sets config attributes based on env vars or a config.toml dict.
+    Compatibility with vars like LLM_BASE_URL, AGENT_MEMORY_ENABLED, SANDBOX_TIMEOUT and others.
+
+    The lookup key for a field is `(prefix + field_name).upper()`. Values that cannot be
+    cast to the annotated type are reported through the logger and skipped; nothing is raised.
+
+    Args:
+        cfg: The AppConfig object to set attributes on.
+        env_or_toml_dict: The environment variables or a config.toml dict.
+    """
+
+    def get_optional_type(union_type: UnionType) -> Any:
+        """Returns the non-None type from a Union."""
+        types = get_args(union_type)
+        # first member that is not NoneType; None when the union has no such member
+        return next((t for t in types if t is not type(None)), None)
+
+    # helper function to set attributes based on env vars
+    def set_attr_from_env(sub_config: Any, prefix=''):
+        """Set attributes of a config dataclass based on environment variables."""
+        # walks the annotations declared on the object, not the instance attributes
+        for field_name, field_type in sub_config.__annotations__.items():
+            # compute the expected env var name from the prefix and field name
+            # e.g. LLM_BASE_URL
+            env_var_name = (prefix + field_name).upper()
+
+            if is_dataclass(field_type):
+                # nested dataclass
+                # the nested prefix is the field name alone; the outer prefix is not carried over
+                nested_sub_config = getattr(sub_config, field_name)
+                set_attr_from_env(nested_sub_config, prefix=field_name + '_')
+            elif env_var_name in env_or_toml_dict:
+                # convert the env var to the correct type and set it
+                value = env_or_toml_dict[env_var_name]
+
+                # skip empty config values (fall back to default)
+                # NOTE: this is a truthiness test, so falsy non-string values coming
+                # from a toml dict (0, False) are skipped as well
+                if not value:
+                    continue
+
+                try:
+                    # if it's an optional type, get the non-None type
+                    if get_origin(field_type) is UnionType:
+                        field_type = get_optional_type(field_type)
+
+                    # Attempt to cast the env var to type hinted in the dataclass
+                    if field_type is bool:
+                        # only 'true' and '1' (case-insensitive) count as True
+                        cast_value = str(value).lower() in ['true', '1']
+                    else:
+                        cast_value = field_type(value)
+                    setattr(sub_config, field_name, cast_value)
+                except (ValueError, TypeError):
+                    # a bad value is logged and the attribute keeps its current value
+                    logger.openhands_logger.error(
+                        f'Error setting env var {env_var_name}={value}: check that the value is of the right type'
+                    )
+
+    # Start processing from the root of the config object
+    set_attr_from_env(cfg)
+
+    # load default LLM config from env
+    default_llm_config = cfg.get_llm_config()
+    set_attr_from_env(default_llm_config, 'LLM_')
+    # load default agent config from env
+    default_agent_config = cfg.get_agent_config()
+    set_attr_from_env(default_agent_config, 'AGENT_')
+
+
+def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
+    """Load the config from the toml file. Supports both styles of config vars.
+
+    A missing file is ignored silently. A file that cannot be parsed is reported with a
+    warning. In both cases `cfg` is left untouched. A file without a `[core]` table is
+    treated as old-style and handed to `load_from_env`.
+
+    Args:
+        cfg: The AppConfig object to update attributes of.
+        toml_file: The path to the toml file. Defaults to 'config.toml'.
+    """
+    # try to read the config.toml file into the config object
+    try:
+        with open(toml_file, 'r', encoding='utf-8') as toml_contents:
+            toml_config = toml.load(toml_contents)
+    except FileNotFoundError:
+        return
+    except toml.TomlDecodeError as e:
+        logger.openhands_logger.warning(
+            f'Cannot parse config from toml, toml values have not been applied.\nError: {e}',
+            exc_info=False,
+        )
+        return
+
+    # if there was an exception or core is not in the toml, try to use the old-style toml
+    if 'core' not in toml_config:
+        # re-use the env loader to set the config from env-style vars
+        load_from_env(cfg, toml_config)
+        return
+
+    core_config = toml_config['core']
+
+    # load llm configs and agent configs
+    for key, value in toml_config.items():
+        if isinstance(value, dict):
+            try:
+                if key is not None and key.lower() == 'agent':
+                    logger.openhands_logger.debug(
+                        'Attempt to load default agent config from config toml'
+                    )
+                    # scalar entries form the default agent config;
+                    # nested tables are named agent configs
+                    non_dict_fields = {
+                        k: v for k, v in value.items() if not isinstance(v, dict)
+                    }
+                    agent_config = AgentConfig(**non_dict_fields)
+                    cfg.set_agent_config(agent_config, 'agent')
+                    for nested_key, nested_value in value.items():
+                        if isinstance(nested_value, dict):
+                            logger.openhands_logger.debug(
+                                f'Attempt to load group {nested_key} from config toml as agent config'
+                            )
+                            agent_config = AgentConfig(**nested_value)
+                            cfg.set_agent_config(agent_config, nested_key)
+                elif key is not None and key.lower() == 'llm':
+                    logger.openhands_logger.debug(
+                        'Attempt to load default LLM config from config toml'
+                    )
+                    # scalar entries form the default LLM config;
+                    # nested tables are named LLM configs
+                    non_dict_fields = {
+                        k: v for k, v in value.items() if not isinstance(v, dict)
+                    }
+                    llm_config = LLMConfig(**non_dict_fields)
+                    cfg.set_llm_config(llm_config, 'llm')
+                    for nested_key, nested_value in value.items():
+                        if isinstance(nested_value, dict):
+                            logger.openhands_logger.debug(
+                                f'Attempt to load group {nested_key} from config toml as llm config'
+                            )
+                            llm_config = LLMConfig(**nested_value)
+                            cfg.set_llm_config(llm_config, nested_key)
+                elif not key.startswith('sandbox') and key.lower() != 'core':
+                    logger.openhands_logger.warning(
+                        f'Unknown key in {toml_file}: "{key}"'
+                    )
+            except (TypeError, KeyError) as e:
+                logger.openhands_logger.warning(
+                    f'Cannot parse config from toml, toml values have not been applied.\n Error: {e}',
+                    exc_info=False,
+                )
+        else:
+            # top-level scalars are not part of the new-style layout
+            logger.openhands_logger.warning(f'Unknown key in {toml_file}: "{key}"')
+
+    try:
+        # set sandbox config from the toml file
+        sandbox_config = cfg.sandbox
+
+        # migrate old sandbox configs from [core] section to sandbox config
+        keys_to_migrate = [key for key in core_config if key.startswith('sandbox_')]
+        for key in keys_to_migrate:
+            # strip only the leading prefix, so a later 'sandbox_' inside the name survives
+            new_key = key.removeprefix('sandbox_')
+            if new_key in sandbox_config.__annotations__:
+                # read the key in sandbox and remove it from core
+                setattr(sandbox_config, new_key, core_config.pop(key))
+            else:
+                logger.openhands_logger.warning(f'Unknown sandbox config: {key}')
+
+        # the new style values override the old style values
+        # (a [sandbox] table builds a fresh SandboxConfig that replaces the migrated one)
+        if 'sandbox' in toml_config:
+            sandbox_config = SandboxConfig(**toml_config['sandbox'])
+
+        # update the config object with the new values
+        cfg.sandbox = sandbox_config
+        for key, value in core_config.items():
+            if hasattr(cfg, key):
+                setattr(cfg, key, value)
+            else:
+                logger.openhands_logger.warning(f'Unknown core config key: {key}')
+    except (TypeError, KeyError) as e:
+        logger.openhands_logger.warning(
+            f'Cannot parse config from toml, toml values have not been applied.\nError: {e}',
+            exc_info=False,
+        )
+
+
+def finalize_config(cfg: AppConfig):
+    """Apply post-load adjustments to an already populated config.
+
+    Makes the workspace base absolute, fills in the mount path, applies the optional
+    mount rewrite rule, defaults each LLM's embedding base URL to its base URL, warns
+    about host networking on macOS and creates the cache directory.
+
+    Args:
+        cfg: The AppConfig object to adjust in place.
+    """
+    cfg.workspace_base = os.path.abspath(cfg.workspace_base)
+
+    # No mount path chosen by the user: mount the workspace base itself
+    if cfg.workspace_mount_path is UndefinedString.UNDEFINED:
+        cfg.workspace_mount_path = cfg.workspace_base
+
+    if cfg.workspace_mount_rewrite:
+        # TODO why do we need to check if workspace_mount_path is None?
+        # The rule has the form 'old:new'; 'old' is replaced by 'new' in the base path
+        source_path = cfg.workspace_base or os.getcwd()
+        rule_parts = cfg.workspace_mount_rewrite.split(':')
+        cfg.workspace_mount_path = source_path.replace(rule_parts[0], rule_parts[1])
+
+    # Embeddings fall back to the same endpoint as completions
+    for llm_config in cfg.llms.values():
+        if llm_config.embedding_base_url is None:
+            llm_config.embedding_base_url = llm_config.base_url
+
+    if cfg.sandbox.use_host_network and platform.system() == 'Darwin':
+        logger.openhands_logger.warning(
+            'Please upgrade to Docker Desktop 4.29.0 or later to use host network mode on macOS. '
+            'See https://github.com/docker/roadmap/issues/238#issuecomment-2044688144 for more information.'
+        )
+
+    # make sure cache dir exists
+    if cfg.cache_dir:
+        cache_path = pathlib.Path(cfg.cache_dir)
+        cache_path.mkdir(parents=True, exist_ok=True)
+
+
+# Utility function for command line --group argument
+def get_llm_config_arg(
+    llm_config_arg: str, toml_file: str = 'config.toml'
+) -> LLMConfig | None:
+    """Read one named group of llm settings from the config file.
+
+    A group in config.toml can look like this:
+
+    ```
+    [llm.gpt-3.5-for-eval]
+    model = 'gpt-3.5-turbo'
+    api_key = '...'
+    temperature = 0.5
+    num_retries = 8
+    ...
+    ```
+
+    The user-defined group name, like "gpt-3.5-for-eval", is the argument to this function.
+    Surrounding square brackets and a leading "llm." are stripped from it, so the group is
+    always looked up below the "llm" table of the file.
+
+    Args:
+        llm_config_arg: The group of llm settings to get from the config.toml file.
+        toml_file: The path to the toml file. Defaults to 'config.toml'.
+
+    Returns:
+        The LLMConfig built from the group, or None when the file is missing, cannot be
+        parsed, or does not contain the group.
+    """
+    # keep only the name, just in case
+    group_name = llm_config_arg.strip('[]')
+
+    # truncate the prefix, just in case
+    if group_name.startswith('llm.'):
+        group_name = group_name[4:]
+
+    logger.openhands_logger.info(f'Loading llm config from {group_name}')
+
+    # load the toml file
+    try:
+        with open(toml_file, 'r', encoding='utf-8') as toml_contents:
+            toml_config = toml.load(toml_contents)
+    except FileNotFoundError as e:
+        logger.openhands_logger.error(f'Config file not found: {e}')
+        return None
+    except toml.TomlDecodeError as e:
+        logger.openhands_logger.error(
+            f'Cannot parse llm group from {group_name}. Exception: {e}'
+        )
+        return None
+
+    # bail out when the requested section is absent
+    if 'llm' not in toml_config or group_name not in toml_config['llm']:
+        logger.openhands_logger.debug(f'Loading from toml failed for {group_name}')
+        return None
+
+    return LLMConfig(**toml_config['llm'][group_name])
+
+
+# Command line arguments
+def get_parser() -> argparse.ArgumentParser:
+    """Build the argument parser for running an agent from the command line.
+
+    Returns:
+        An ArgumentParser with the agent, evaluation, LLM and session options registered.
+    """
+    parser = argparse.ArgumentParser(description='Run an agent with a specific task')
+
+    # (option strings, add_argument keywords), registered in exactly this order
+    option_table = [
+        (
+            ('-d', '--directory'),
+            dict(type=str, help='The working directory for the agent'),
+        ),
+        (
+            ('-t', '--task'),
+            dict(type=str, default='', help='The task for the agent to perform'),
+        ),
+        (
+            ('-f', '--file'),
+            dict(
+                type=str,
+                help='Path to a file containing the task. Overrides -t if both are provided.',
+            ),
+        ),
+        (
+            ('-c', '--agent-cls'),
+            dict(
+                default=OH_DEFAULT_AGENT,
+                type=str,
+                help='Name of the default agent to use',
+            ),
+        ),
+        (
+            ('-i', '--max-iterations'),
+            dict(
+                default=OH_MAX_ITERATIONS,
+                type=int,
+                help='The maximum number of iterations to run the agent',
+            ),
+        ),
+        (
+            ('-b', '--max-budget-per-task'),
+            dict(
+                type=float,
+                help='The maximum budget allowed per task, beyond which the agent will stop.',
+            ),
+        ),
+        # --eval configs are for evaluations only
+        (
+            ('--eval-output-dir',),
+            dict(
+                default='evaluation/evaluation_outputs/outputs',
+                type=str,
+                help='The directory to save evaluation output',
+            ),
+        ),
+        (
+            ('--eval-n-limit',),
+            dict(default=None, type=int, help='The number of instances to evaluate'),
+        ),
+        (
+            ('--eval-num-workers',),
+            dict(
+                default=4,
+                type=int,
+                help='The number of workers to use for evaluation',
+            ),
+        ),
+        (
+            ('--eval-note',),
+            dict(
+                default=None,
+                type=str,
+                help='The note to add to the evaluation directory',
+            ),
+        ),
+        (
+            ('-l', '--llm-config'),
+            dict(
+                default=None,
+                type=str,
+                help='Replace default LLM ([llm] section in config.toml) config with the specified LLM config, e.g. "llama3" for [llm.llama3] section in config.toml',
+            ),
+        ),
+        (
+            ('-n', '--name'),
+            dict(default='default', type=str, help='Name for the session'),
+        ),
+        (
+            ('--eval-ids',),
+            dict(
+                default=None,
+                type=str,
+                help='The comma-separated list (in quotes) of IDs of the instances to evaluate',
+            ),
+        ),
+    ]
+    for option_strings, keywords in option_table:
+        parser.add_argument(*option_strings, **keywords)
+    return parser
+
+
+def parse_arguments() -> argparse.Namespace:
+    """Parse the command line with the parser from `get_parser`.
+
+    Arguments the parser does not recognise are discarded rather than reported.
+    """
+    known_args, _unrecognised = get_parser().parse_known_args()
+    return known_args
+
+
+def load_app_config(set_logging_levels: bool = True) -> AppConfig:
+    """Build the application config from config.toml and the environment.
+
+    The toml file is applied first and the environment variables second, so the
+    environment is applied on top of the file. `finalize_config` runs last.
+
+    Args:
+        set_logging_levels: Whether to copy the debug and color settings onto the
+            module-level flags of the logger module.
+
+    Returns:
+        The populated AppConfig.
+    """
+    app_config = AppConfig()
+    load_from_toml(app_config)
+    load_from_env(app_config, os.environ)
+    finalize_config(app_config)
+
+    if not set_logging_levels:
+        return app_config
+
+    logger.DEBUG = app_config.debug
+    logger.DISABLE_COLOR_PRINTING = app_config.disable_color
+    return app_config
--- a/openhands/core/exceptions.py
+++ b/openhands/core/exceptions.py
@@ -77,3 +77,10 @@ class UserCancelledError(Exception):
 class MicroAgentValidationError(Exception):
    def __init__(self, message='Micro agent validation failed'):
        super().__init__(message)
+
+
+class OperationCancelled(Exception):
+    """Signals that an operation was cancelled, for example by a keyboard interrupt.
+
+    Args:
+        message: Text carried by the exception. Defaults to 'Operation was cancelled'.
+    """
+
+    def __init__(self, message: str = 'Operation was cancelled') -> None:
+        super().__init__(message)
--- a/openhands/core/main.py
+++ b/openhands/core/main.py
@@ -55,7 +55,6 @@ def create_runtime(

    config: The app config.
    sid: The session id.
-    runtime_tools_config: (will be deprecated) The runtime tools config.
    """
    # if sid is provided on the command line, use it as the name of the event stream
    # otherwise generate it on the basis of the configured jwt_secret
@@ -144,6 +143,9 @@ async def run_controller(
        headless_mode=headless_mode,
    )

+    if controller is not None:
+        controller.agent_task = asyncio.create_task(controller.start_step_loop())
+
    assert isinstance(task_str, str), f'task_str must be a string, got {type(task_str)}'
    # Logging
    logger.info(
--- a/openhands/core/message.py
+++ b/openhands/core/message.py
@@ -1,10 +1,7 @@
 from enum import Enum
-from typing import Union
+from typing import Literal

 from pydantic import BaseModel, Field, model_serializer
-from typing_extensions import Literal
-
-from openhands.core.logger import openhands_logger as logger


 class ContentType(Enum):
@@ -53,6 +50,8 @@ class ImageContent(Content):
 class Message(BaseModel):
    role: Literal['user', 'system', 'assistant']
    content: list[TextContent | ImageContent] = Field(default=list)
+    cache_enabled: bool = False
+    vision_enabled: bool = False

    @property
    def contains_image(self) -> bool:
@@ -60,60 +59,23 @@ class Message(BaseModel):

    @model_serializer
    def serialize_model(self) -> dict:
-        content: list[dict[str, str | dict[str, str]]] = []
-
-        for item in self.content:
-            if isinstance(item, TextContent):
-                content.append(item.model_dump())
-            elif isinstance(item, ImageContent):
-                content.extend(item.model_dump())
-
-        return {'content': content, 'role': self.role}
-
-
-def format_messages(
-    messages: Union[Message, list[Message]],
-    with_images: bool,
-    with_prompt_caching: bool,
-) -> list[dict]:
-    if not isinstance(messages, list):
-        messages = [messages]
-
-    if with_images or with_prompt_caching:
-        return [message.model_dump() for message in messages]
-
-    converted_messages = []
-    for message in messages:
-        content_parts = []
-        role = 'user'
-
-        if isinstance(message, str) and message:
-            content_parts.append(message)
-        elif isinstance(message, dict):
-            role = message.get('role', 'user')
-            if 'content' in message and message['content']:
-                content_parts.append(message['content'])
-        elif isinstance(message, Message):
-            role = message.role
-            for content in message.content:
-                if isinstance(content, list):
-                    for item in content:
-                        if isinstance(item, TextContent) and item.text:
-                            content_parts.append(item.text)
-                elif isinstance(content, TextContent) and content.text:
-                    content_parts.append(content.text)
+        content: list[dict] | str
+        # two kinds of serializer:
+        # 1. vision serializer: when prompt caching or vision is enabled
+        # 2. single text serializer: for other cases
+        # remove this when liteLLM or providers support this format translation
+        if self.cache_enabled or self.vision_enabled:
+            # when prompt caching or vision is enabled, use vision serializer
+            content = []
+            for item in self.content:
+                if isinstance(item, TextContent):
+                    content.append(item.model_dump())
+                elif isinstance(item, ImageContent):
+                    content.extend(item.model_dump())
        else:
-            logger.error(
-                f'>>> `message` is not a string, dict, or Message: {type(message)}'
+            # for other cases, concatenate all text content
+            # into a single string per message
+            content = '\n'.join(
+                item.text for item in self.content if isinstance(item, TextContent)
            )
-
-        if content_parts:
-            content_str = '\n'.join(content_parts)
-            converted_messages.append(
-                {
-                    'role': role,
-                    'content': content_str,
-                }
-            )
-
-    return converted_messages
+        return {'content': content, 'role': self.role}
--- a/openhands/events/stream.py
+++ b/openhands/events/stream.py
@@ -8,6 +8,7 @@ from openhands.core.logger import openhands_logger as logger
 from openhands.core.utils import json
 from openhands.events.event import Event, EventSource
 from openhands.events.serialization.event import event_from_dict, event_to_dict
+from openhands.runtime.utils.shutdown_listener import should_continue
 from openhands.storage import FileStore


@@ -85,7 +86,7 @@ class EventStream:
                event_id -= 1
        else:
            event_id = start_id
-            while True:
+            while should_continue():
                if end_id is not None and event_id > end_id:
                    break
                try:
--- a/openhands/linter/init.py
+++ b/openhands/linter/init.py
@@ -0,0 +1,9 @@
+"""Linter module for OpenHands.
+
+Part of this Linter module is adapted from Aider (Apache 2.0 License, [original code](https://github.com/paul-gauthier/aider/blob/main/aider/linter.py)). Please see the [original repository](https://github.com/paul-gauthier/aider) for more information.
+"""
+
+from openhands.linter.base import LintResult
+from openhands.linter.linter import DefaultLinter
+
+__all__ = ['DefaultLinter', 'LintResult']
--- a/openhands/linter/base.py
+++ b/openhands/linter/base.py
@@ -0,0 +1,79 @@
+from abc import ABC, abstractmethod
+
+from pydantic import BaseModel
+
+
+class LintResult(BaseModel):
+    # One lint finding: where it is in the file and what was reported.
+    file: str
+    line: int  # 1-indexed
+    column: int  # 1-indexed
+    message: str
+
+    def visualize(self, half_window: int = 3) -> str:
+        """Return the offending line with surrounding context and a caret under the error column.
+
+        The file named by `self.file` is read from disk. Every shown line is prefixed with
+        its right-aligned line number, the offending line is wrapped in ANSI red, and a
+        hint line carrying the message is inserted directly below it.
+
+        Args:
+            half_window: The number of context lines to display around the error on each side.
+
+        Returns:
+            The rendered window as a single newline-joined string.
+        """
+        with open(self.file, 'r') as f:
+            raw_lines = f.readlines()
+
+        # Gutter is as wide as the largest line number in the file
+        gutter_width = len(str(len(raw_lines)))
+        numbered_lines = [
+            f'{number:>{gutter_width}}|{text.rstrip()}'
+            for number, text in enumerate(raw_lines, start=1)
+        ]
+
+        # The reported line must exist in the file
+        assert self.line <= len(numbered_lines) and self.line > 0
+        target_idx = self.line - 1
+        window_start = max(0, target_idx - half_window)
+        window_stop = min(len(numbered_lines), target_idx + half_window + 1)
+        window = numbered_lines[window_start:window_stop]
+        target_in_window = target_idx - window_start
+
+        # Caret line: skip the gutter, then `column` more characters
+        caret_line = (
+            gutter_width * ' '
+            + ' ' * (self.column)
+            + '^'
+            + ' ERROR HERE: '
+            + self.message
+        )
+        highlighted = f'\033[91m{window[target_in_window]}\033[0m'
+        window[target_in_window] = highlighted + '\n' + caret_line
+        return '\n'.join(window)
+
+
+class LinterException(Exception):
+    """Common base class for the errors raised by the linter module."""
+
+
+class BaseLinter(ABC):
+    """Abstract interface shared by all linters.
+
+    A concrete linter declares which file extensions it handles and turns a file
+    into a list of parsed lint results.
+    """
+
+    encoding: str = 'utf-8'
+
+    @property
+    @abstractmethod
+    def supported_extensions(self) -> list[str]:
+        """File extensions handled by this linter, such as .py or .tsx."""
+        return []
+
+    @abstractmethod
+    def lint(self, file_path: str) -> list[LintResult]:
+        """Lint one file and return its findings.
+
+        Args:
+            file_path: The path to the file to lint. Required to be absolute.
+        """
--- a/openhands/linter/languages/python.py
+++ b/openhands/linter/languages/python.py
@@ -0,0 +1,77 @@
+from typing import List
+
+from openhands.linter.base import BaseLinter, LintResult
+from openhands.linter.utils import run_cmd
+
+
+def python_compile_lint(fname: str) -> list[LintResult]:
+    """Check a Python file for syntax errors by compiling it.
+
+    Args:
+        fname: Path of the file to read and compile.
+
+    Returns:
+        An empty list when the file compiles, otherwise a single LintResult placed at
+        the end position of the syntax error (falling back to its start position).
+    """
+    try:
+        with open(fname, 'r') as f:
+            code = f.read()
+        compile(code, fname, 'exec')
+        return []
+    except SyntaxError as err:
+        # end_lineno / end_offset may be present but None, and LintResult requires
+        # integers, so fall back to the start position and finally to 1
+        err_lineno = getattr(err, 'end_lineno', None) or err.lineno or 1
+        err_offset = getattr(err, 'end_offset', None)
+        if err_offset is None or err_offset < 0:
+            err_offset = err.offset
+        return [
+            LintResult(
+                file=fname, line=err_lineno, column=err_offset or 1, message=err.msg
+            )
+        ]
+
+
+def flake_lint(filepath: str) -> list[LintResult]:
+    """Run flake8 on a file, restricted to a fixed set of fatal error codes.
+
+    Args:
+        filepath: Path of the file to lint.
+
+    Returns:
+        One LintResult per `path:line:column: message` output line. The list is empty
+        when flake8 is not installed, exits successfully, or prints nothing usable.
+    """
+    fatal = 'F821,F822,F831,E112,E113,E999,E902'
+    flake8_cmd = f'flake8 --select={fatal} --isolated {filepath}'
+
+    try:
+        cmd_outputs = run_cmd(flake8_cmd)
+    except FileNotFoundError:
+        # flake8 executable is not available
+        return []
+    results: list[LintResult] = []
+    if not cmd_outputs:
+        return results
+    for line in cmd_outputs.splitlines():
+        parts = line.split(':')
+        if len(parts) < 4:
+            continue
+        try:
+            line_no = int(parts[1])
+            column_no = int(parts[2])
+        except ValueError:
+            # not a diagnostic line (e.g. a warning or traceback that contains colons)
+            continue
+        # the message itself may contain colons; keep every segment of it
+        _msg = ': '.join(part.strip() for part in parts[3:])
+        results.append(
+            LintResult(
+                file=filepath,
+                line=line_no,
+                column=column_no,
+                message=_msg,
+            )
+        )
+    return results
+
+
+class PythonLinter(BaseLinter):
+    """Linter for Python files: flake8 first, a plain compile check as fallback."""
+
+    @property
+    def supported_extensions(self) -> List[str]:
+        """Only `.py` files are handled."""
+        return ['.py']
+
+    def lint(self, file_path: str) -> list[LintResult]:
+        """Lint a Python file.
+
+        Args:
+            file_path: Path of the file to lint.
+
+        Returns:
+            The flake8 findings when there are any, otherwise the result of the
+            compile check.
+        """
+        error = flake_lint(file_path)
+        if not error:
+            error = python_compile_lint(file_path)
+        return error
+
+    def compile_lint(self, file_path: str, code: str) -> List[LintResult]:
+        """Compile source text that is already in memory and report a syntax error.
+
+        Args:
+            file_path: File name recorded in the result and passed to `compile`.
+            code: The source text to compile.
+
+        Returns:
+            An empty list when the code compiles, otherwise a single LintResult.
+        """
+        try:
+            compile(code, file_path, 'exec')
+            return []
+        except SyntaxError as e:
+            # LintResult has no `rule` field, and its line/column must be integers,
+            # while lineno/offset of a SyntaxError can be None
+            return [
+                LintResult(
+                    file=file_path,
+                    line=e.lineno or 1,
+                    column=e.offset or 1,
+                    message=str(e),
+                )
+            ]
--- a/openhands/linter/languages/treesitter.py
+++ b/openhands/linter/languages/treesitter.py
@@ -0,0 +1,74 @@
+import warnings
+
+from grep_ast import TreeContext, filename_to_lang
+from grep_ast.parsers import PARSERS
+from tree_sitter_languages import get_parser
+
+from openhands.linter.base import BaseLinter, LintResult
+
+# tree_sitter is throwing a FutureWarning
+warnings.simplefilter('ignore', category=FutureWarning)
+
+
+def tree_context(fname, code, line_nums):
+    """Format the given lines of a source file together with their tree context.
+
+    Args:
+        fname: File name handed to `TreeContext`.
+        code: Source text of the file.
+        line_nums: Iterable of line numbers to mark as lines of interest.
+
+    Returns:
+        The text produced by `TreeContext.format()`.
+    """
+    options = dict(
+        color=False,
+        line_number=True,
+        child_context=False,
+        last_line=False,
+        margin=0,
+        mark_lois=True,
+        loi_pad=3,
+        # header_max=30,
+        show_top_of_file_parent_scope=False,
+    )
+    ctx = TreeContext(fname, code, **options)
+    # duplicates are collapsed before the lines are registered
+    ctx.add_lines_of_interest(set(line_nums))
+    ctx.add_context()
+    return ctx.format()
+
+
+def traverse_tree(node):
+    """Collect every error node in the tree rooted at `node`.
+
+    The walk is iterative, so the depth of the tree is not limited by the
+    interpreter's recursion limit. Nodes are visited parent first, children left
+    to right.
+
+    Args:
+        node: Root node; it and its descendants must expose `type`, `is_missing`,
+            `start_point` (row, column; both 0-based) and `children`.
+
+    Returns:
+        A list of `(line, column, kind)` tuples with 1-based positions, where kind is
+        'Missing node' or 'Syntax error'.
+    """
+    errors = []
+    pending = [node]
+    while pending:
+        current = pending.pop()
+        if current.type == 'ERROR' or current.is_missing:
+            line_no = current.start_point[0] + 1
+            col_no = current.start_point[1] + 1
+            error_type = 'Missing node' if current.is_missing else 'Syntax error'
+            errors.append((line_no, col_no, error_type))
+        # push in reverse so the leftmost child is popped (visited) first
+        pending.extend(reversed(current.children))
+
+    return errors
+
+
+class TreesitterBasicLinter(BaseLinter):
+    """Language-agnostic syntax checker built on tree-sitter parsers."""
+
+    @property
+    def supported_extensions(self) -> list[str]:
+        """Every key of grep_ast's `PARSERS` table."""
+        return list(PARSERS.keys())
+
+    def lint(self, file_path: str) -> list[LintResult]:
+        """Use tree-sitter to look for syntax errors.
+
+        Args:
+            file_path: Path of the file to parse.
+
+        Returns:
+            One LintResult per error or missing node found in the parse tree. The list
+            is empty when the language is not recognised or the tree has no errors.
+        """
+        lang = filename_to_lang(file_path)
+        if not lang:
+            return []
+        parser = get_parser(lang)
+        # read with the declared linter encoding instead of the locale default, so the
+        # text matches the UTF-8 bytes handed to the parser
+        with open(file_path, 'r', encoding=self.encoding) as f:
+            code = f.read()
+        tree = parser.parse(bytes(code, 'utf-8'))
+        errors = traverse_tree(tree.root_node)
+        if not errors:
+            return []
+        return [
+            LintResult(
+                file=file_path,
+                line=int(line),
+                column=int(col),
+                message=error_details,
+            )
+            for line, col, error_details in errors
+        ]
--- a/openhands/linter/linter.py
+++ b/openhands/linter/linter.py
@@ -0,0 +1,35 @@
+import os
+from collections import defaultdict
+
+from openhands.linter.base import BaseLinter, LinterException, LintResult
+from openhands.linter.languages.python import PythonLinter
+from openhands.linter.languages.treesitter import TreesitterBasicLinter
+
+
+class DefaultLinter(BaseLinter):
+    """Dispatches a file to the linters registered for its extension."""
+
+    def __init__(self):
+        """Register the Python linter for `.py` and the tree-sitter linter for every extension it supports."""
+        self.linters: dict[str, list[BaseLinter]] = defaultdict(list)
+        self.linters['.py'] = [PythonLinter()]
+
+        # Add treesitter linter as a fallback for all linters
+        self.basic_linter = TreesitterBasicLinter()
+        for ext in self.basic_linter.supported_extensions:
+            self.linters[ext].append(self.basic_linter)
+        self._supported_extensions = list(self.linters)
+
+    @property
+    def supported_extensions(self) -> list[str]:
+        """Extensions that had at least one linter registered at construction time."""
+        return self._supported_extensions
+
+    def lint(self, file_path: str) -> list[LintResult]:
+        """Lint a file with the linters registered for its extension.
+
+        Args:
+            file_path: Absolute path of the file to lint.
+
+        Returns:
+            The findings of the first linter that reports any, or an empty list.
+
+        Raises:
+            LinterException: If `file_path` is not absolute.
+        """
+        if not os.path.isabs(file_path):
+            raise LinterException(f'File path {file_path} is not an absolute path')
+        _, file_extension = os.path.splitext(file_path)
+
+        # Linters are tried in registration order; the first non-empty result wins
+        for candidate in self.linters.get(file_extension, []):
+            findings = candidate.lint(file_path)
+            if findings:
+                return findings
+        return []
--- a/openhands/linter/utils/init.py
+++ b/openhands/linter/utils/init.py
@@ -0,0 +1,3 @@
+from .cmd import run_cmd, check_tool_installed
+
+__all__ = ['run_cmd', 'check_tool_installed']
--- a/openhands/linter/utils/cmd.py
+++ b/openhands/linter/utils/cmd.py
@@ -0,0 +1,36 @@
+import subprocess
+import os
+
+def run_cmd(cmd: str, cwd: str | None = None) -> str | None:
+    """Run a command and report its output only when it fails.
+
+    The command string is split on whitespace and stderr is merged into stdout.
+
+    Args:
+        cmd: The command line to run.
+        cwd: Working directory for the command, or None to inherit the current one.
+
+    Returns:
+        None when the command exits with status 0, otherwise the captured output.
+    """
+    completed = subprocess.run(
+        cmd.split(),
+        cwd=cwd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        encoding='utf-8',
+        errors='replace',
+    )
+    return None if completed.returncode == 0 else completed.stdout
+
+
+def check_tool_installed(tool_name: str) -> bool:
+    """Report whether `<tool_name> --version` can be run successfully.
+
+    Args:
+        tool_name: Name or path of the executable to probe.
+
+    Returns:
+        True when the command exits with status 0. False when the executable is not
+        found or exits with a non-zero status.
+    """
+    probe = [tool_name, '--version']
+    try:
+        subprocess.run(
+            probe,
+            check=True,
+            cwd=os.getcwd(),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        return False
+    else:
+        return True
--- a/openhands/llm/init.py
+++ b/openhands/llm/init.py
@@ -0,0 +1,5 @@
+from openhands.llm.async_llm import AsyncLLM
+from openhands.llm.llm import LLM
+from openhands.llm.streaming_llm import StreamingLLM
+
+__all__ = ['LLM', 'AsyncLLM', 'StreamingLLM']
--- a/openhands/llm/async_llm.py
+++ b/openhands/llm/async_llm.py
@@ -0,0 +1,117 @@
+import asyncio
+from functools import partial
+from typing import Any
+
+# This must be litellm's coroutine API: `_call_acompletion` awaits the call, and
+# the synchronous `completion` does not return an awaitable.
+from litellm import acompletion as litellm_acompletion
+
+from openhands.core.exceptions import UserCancelledError
+from openhands.core.logger import openhands_logger as logger
+from openhands.llm.llm import LLM, LLM_RETRY_EXCEPTIONS
+from openhands.runtime.utils.shutdown_listener import should_continue
+
+
+class AsyncLLM(LLM):
+    """Asynchronous LLM class.
+
+    Extends LLM with `async_completion`, an awaitable (non-streaming) completion
+    call that is pre-bound to the configured parameters and wrapped with the
+    retry decorator.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # bind the configured values once, so callers only need to pass messages
+        self._async_completion = partial(
+            self._call_acompletion,
+            model=self.config.model,
+            api_key=self.config.api_key,
+            base_url=self.config.base_url,
+            api_version=self.config.api_version,
+            custom_llm_provider=self.config.custom_llm_provider,
+            max_tokens=self.config.max_output_tokens,
+            timeout=self.config.timeout,
+            temperature=self.config.temperature,
+            top_p=self.config.top_p,
+            drop_params=self.config.drop_params,
+        )
+
+        # keep a handle on the bare partial; the attribute itself is replaced by
+        # the wrapper at the end of __init__
+        async_completion_unwrapped = self._async_completion
+
+        @self.retry_decorator(
+            num_retries=self.config.num_retries,
+            retry_exceptions=LLM_RETRY_EXCEPTIONS,
+            retry_min_wait=self.config.retry_min_wait,
+            retry_max_wait=self.config.retry_max_wait,
+            retry_multiplier=self.config.retry_multiplier,
+        )
+        async def async_completion_wrapper(*args, **kwargs):
+            """Wrapper for the litellm acompletion function.
+
+            Normalizes and logs the messages, awaits the completion, then logs
+            and post-processes the response.
+
+            Raises:
+                ValueError: If no messages were passed.
+                UserCancelledError: If a cancellation was requested.
+            """
+            messages: list[dict[str, Any]] | dict[str, Any] = []
+
+            # some callers might send the model and messages directly
+            # litellm allows positional args, like completion(model, messages, **kwargs)
+            # see llm.py for more details
+            if len(args) > 1:
+                # args[0] would be the model, which is already bound as a keyword
+                messages = args[1]
+                kwargs['messages'] = messages
+
+                # remove the first args, they're sent in kwargs
+                args = args[2:]
+            elif 'messages' in kwargs:
+                messages = kwargs['messages']
+
+            # ensure we work with a list of messages
+            messages = messages if isinstance(messages, list) else [messages]
+
+            # if we have no messages, something went very wrong
+            if not messages:
+                raise ValueError(
+                    'The messages list is empty. At least one message is required.'
+                )
+
+            self.log_prompt(messages)
+
+            async def check_stopped():
+                # poll the optional cancellation callback every 0.1s for as long
+                # as should_continue() holds
+                while should_continue():
+                    if (
+                        hasattr(self.config, 'on_cancel_requested_fn')
+                        and self.config.on_cancel_requested_fn is not None
+                        and await self.config.on_cancel_requested_fn()
+                    ):
+                        raise UserCancelledError('LLM request cancelled by user')
+                    await asyncio.sleep(0.1)
+
+            # NOTE(review): this task is only awaited in the `finally` block, so a
+            # UserCancelledError raised by it surfaces after the completion call
+            # has returned or failed, not while it is in flight - confirm intended
+            stop_check_task = asyncio.create_task(check_stopped())
+
+            try:
+                # Directly call and await litellm_acompletion
+                resp = await async_completion_unwrapped(*args, **kwargs)
+
+                message_back = resp['choices'][0]['message']['content']
+                self.log_response(message_back)
+                self._post_completion(resp)
+
+                # We do not support streaming in this method, thus return resp
+                return resp
+
+            except UserCancelledError:
+                logger.info('LLM request cancelled by user.')
+                raise
+            except Exception as e:
+                logger.error(f'Completion Error occurred:\n{e}')
+                raise
+
+            finally:
+                # presumably leaves the checker one more polling interval before
+                # it is torn down - TODO confirm
+                await asyncio.sleep(0.1)
+                stop_check_task.cancel()
+                try:
+                    await stop_check_task
+                except asyncio.CancelledError:
+                    pass
+
+        self._async_completion = async_completion_wrapper  # type: ignore
+
+    async def _call_acompletion(self, *args, **kwargs):
+        """Await litellm's acompletion with the given arguments and return its result."""
+        # Used in testing?
+        return await litellm_acompletion(*args, **kwargs)
+
+    @property
+    def async_completion(self):
+        """The retry-wrapped async completion callable built in __init__."""
+        return self._async_completion
--- a/openhands/llm/debug_mixin.py
+++ b/openhands/llm/debug_mixin.py
@@ -0,0 +1,51 @@
+from typing import Any
+
+from openhands.core.logger import llm_prompt_logger, llm_response_logger
+from openhands.core.logger import openhands_logger as logger
+
+MESSAGE_SEPARATOR = '\n\n----------\n\n'
+
+
+class DebugMixin:
+    """Mixin that logs LLM prompts and responses for debugging.
+
+    The class using this mixin must implement `vision_is_active()`.
+    """
+
+    def log_prompt(self, messages: list[dict[str, Any]] | dict[str, Any]):
+        """Log the content of every non-empty message to the prompt logger.
+
+        Args:
+            messages: A single message dict or a list of message dicts.
+        """
+        if not messages:
+            logger.debug('No completion messages!')
+            return
+
+        messages = messages if isinstance(messages, list) else [messages]
+        debug_message = MESSAGE_SEPARATOR.join(
+            self._format_message_content(msg)
+            for msg in messages
+            # .get(): a message without a 'content' key is skipped like an
+            # empty one, so this logging helper cannot raise KeyError
+            if msg.get('content')
+        )
+
+        if debug_message:
+            llm_prompt_logger.debug(debug_message)
+        else:
+            logger.debug('No completion messages!')
+
+    def log_response(self, message_back: str):
+        """Log the response text to the response logger, unless it is empty."""
+        if message_back:
+            llm_response_logger.debug(message_back)
+
+    def _format_message_content(self, message: dict[str, Any]):
+        """Return the message content as text; list content is joined by newlines."""
+        content = message['content']
+        if isinstance(content, list):
+            return '\n'.join(
+                self._format_content_element(element) for element in content
+            )
+        return str(content)
+
+    def _format_content_element(self, element: dict[str, Any]):
+        """Return the text of one content element.
+
+        Uses the 'text' entry if present, the image url when vision is active,
+        and the string form of the element otherwise.
+        """
+        if isinstance(element, dict):
+            if 'text' in element:
+                # str(): the result is joined with other strings, so a
+                # non-string value must not leak through
+                return str(element['text'])
+            if (
+                self.vision_is_active()
+                and 'image_url' in element
+                and 'url' in element['image_url']
+            ):
+                return element['image_url']['url']
+        return str(element)
+
+    # This method should be implemented in the class that uses DebugMixin
+    def vision_is_active(self):
+        raise NotImplementedError
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -1,50 +1,54 @@
-import asyncio
 import copy
+import time
 import warnings
 from functools import partial
-from typing import Union
+from typing import Any

 from openhands.core.config import LLMConfig

 with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    import litellm
+from litellm import ModelInfo
 from litellm import completion as litellm_completion
 from litellm import completion_cost as litellm_completion_cost
 from litellm.exceptions import (
    APIConnectionError,
-    ContentPolicyViolationError,
    InternalServerError,
-    NotFoundError,
-    OpenAIError,
    RateLimitError,
    ServiceUnavailableError,
 )
-from litellm.types.utils import CostPerToken
-from tenacity import (
-    retry,
-    retry_if_exception_type,
-    stop_after_attempt,
-    wait_exponential,
-)
+from litellm.types.utils import CostPerToken, ModelResponse, Usage

-from openhands.core.exceptions import LLMResponseError, UserCancelledError
-from openhands.core.logger import llm_prompt_logger, llm_response_logger
 from openhands.core.logger import openhands_logger as logger
-from openhands.core.message import Message, format_messages
+from openhands.core.message import Message
 from openhands.core.metrics import Metrics
+from openhands.llm.debug_mixin import DebugMixin
+from openhands.llm.retry_mixin import RetryMixin

 __all__ = ['LLM']

-message_separator = '\n\n----------\n\n'
+# tuple of exceptions to retry on
+LLM_RETRY_EXCEPTIONS: tuple[type[Exception], ...] = (
+    APIConnectionError,
+    InternalServerError,
+    RateLimitError,
+    ServiceUnavailableError,
+)

-cache_prompting_supported_models = [
+# cache prompt supporting models
+# remove this when gemini and deepseek are supported
+CACHE_PROMPT_SUPPORTED_MODELS = [
    'claude-3-5-sonnet-20240620',
    'claude-3-haiku-20240307',
+    'claude-3-opus-20240229',
+    'anthropic/claude-3-opus-20240229',
+    'anthropic/claude-3-haiku-20240307',
+    'anthropic/claude-3-5-sonnet-20240620',
 ]


-class LLM:
+class LLM(RetryMixin, DebugMixin):
    """The LLM class represents a Language Model instance.

    Attributes:
@@ -61,17 +65,20 @@ class LLM:
        Passing simple parameters always overrides config.

        Args:
-            config: The LLM configuration
+            config: The LLM configuration.
+            metrics: The metrics to use.
        """
-        self.metrics = metrics if metrics is not None else Metrics()
-        self.cost_metric_supported = True
-        self.config = copy.deepcopy(config)
+        self.metrics: Metrics = metrics if metrics is not None else Metrics()
+        self.cost_metric_supported: bool = True
+        self.config: LLMConfig = copy.deepcopy(config)

-        # Set up config attributes with default values to prevent AttributeError
-        LLMConfig.set_missing_attributes(self.config)
+        # list of LLM completions (for logging purposes). Each completion is a dict with the following keys:
+        # - 'messages': list of messages
+        # - 'response': response from the LLM
+        self.llm_completions: list[dict[str, Any]] = []

        # litellm actually uses base Exception here for unknown model
-        self.model_info = None
+        self.model_info: ModelInfo | None = None
        try:
            if self.config.model.startswith('openrouter'):
                self.model_info = litellm.get_model_info(self.config.model)
@@ -83,15 +90,6 @@ class LLM:
        except Exception as e:
            logger.warning(f'Could not get model info for {config.model}:\n{e}')

-        # Tuple of exceptions to retry on
-        self.retry_exceptions = (
-            APIConnectionError,
-            ContentPolicyViolationError,
-            InternalServerError,
-            OpenAIError,
-            RateLimitError,
-        )
-
        # Set the max tokens in an LM-specific way if not set
        if self.config.max_input_tokens is None:
            if (
@@ -101,22 +99,23 @@ class LLM:
            ):
                self.config.max_input_tokens = self.model_info['max_input_tokens']
            else:
-                # Max input tokens for gpt3.5, so this is a safe fallback for any potentially viable model
+                # Safe fallback for any potentially viable model
                self.config.max_input_tokens = 4096

        if self.config.max_output_tokens is None:
-            if (
-                self.model_info is not None
-                and 'max_output_tokens' in self.model_info
-                and isinstance(self.model_info['max_output_tokens'], int)
-            ):
-                self.config.max_output_tokens = self.model_info['max_output_tokens']
-            else:
-                # Max output tokens for gpt3.5, so this is a safe fallback for any potentially viable model
-                self.config.max_output_tokens = 1024
-
-        if self.config.drop_params:
-            litellm.drop_params = self.config.drop_params
+            # Safe default for any potentially viable model
+            self.config.max_output_tokens = 4096
+            if self.model_info is not None:
+                # max_output_tokens has precedence over max_tokens, if either exists.
+                # litellm has models with both, one or none of these 2 parameters!
+                if 'max_output_tokens' in self.model_info and isinstance(
+                    self.model_info['max_output_tokens'], int
+                ):
+                    self.config.max_output_tokens = self.model_info['max_output_tokens']
+                elif 'max_tokens' in self.model_info and isinstance(
+                    self.model_info['max_tokens'], int
+                ):
+                    self.config.max_output_tokens = self.model_info['max_tokens']

        self._completion = partial(
            litellm_completion,
@@ -129,71 +128,53 @@ class LLM:
            timeout=self.config.timeout,
            temperature=self.config.temperature,
            top_p=self.config.top_p,
+            drop_params=self.config.drop_params,
        )

        if self.vision_is_active():
            logger.debug('LLM: model has vision enabled')
+        if self.is_caching_prompt_active():
+            logger.debug('LLM: caching prompt enabled')

        completion_unwrapped = self._completion

-        def attempt_on_error(retry_state):
-            """Custom attempt function for litellm completion."""
-            logger.error(
-                f'{retry_state.outcome.exception()}. Attempt #{retry_state.attempt_number} | You can customize retry values in the configuration.',
-                exc_info=False,
-            )
-            return None
-
-        def custom_completion_wait(retry_state):
-            """Custom wait function for litellm completion."""
-            if not retry_state:
-                return 0
-            exception = retry_state.outcome.exception() if retry_state.outcome else None
-            if exception is None:
-                return 0
-
-            min_wait_time = self.config.retry_min_wait
-            max_wait_time = self.config.retry_max_wait
-
-            # for rate limit errors, wait 1 minute by default, max 4 minutes between retries
-            exception_type = type(exception).__name__
-            logger.error(f'\nexception_type: {exception_type}\n')
-
-            if exception_type == 'RateLimitError':
-                min_wait_time = 60
-                max_wait_time = 240
-            elif exception_type == 'BadRequestError' and exception.response:
-                # this should give us the burried, actual error message from
-                # the LLM model.
-                logger.error(f'\n\nBadRequestError: {exception.response}\n\n')
-
-            # Return the wait time using exponential backoff
-            exponential_wait = wait_exponential(
-                multiplier=self.config.retry_multiplier,
-                min=min_wait_time,
-                max=max_wait_time,
-            )
-
-            # Call the exponential wait function with retry_state to get the actual wait time
-            return exponential_wait(retry_state)
-
-        @retry(
-            after=attempt_on_error,
-            stop=stop_after_attempt(self.config.num_retries),
-            reraise=True,
-            retry=retry_if_exception_type(self.retry_exceptions),
-            wait=custom_completion_wait,
+        @self.retry_decorator(
+            num_retries=self.config.num_retries,
+            retry_exceptions=LLM_RETRY_EXCEPTIONS,
+            retry_min_wait=self.config.retry_min_wait,
+            retry_max_wait=self.config.retry_max_wait,
+            retry_multiplier=self.config.retry_multiplier,
        )
        def wrapper(*args, **kwargs):
            """Wrapper for the litellm completion function. Logs the input and output of the completion function."""
-            # some callers might just send the messages directly
-            if 'messages' in kwargs:
-                messages = kwargs['messages']
-            else:
-                messages = args[1] if len(args) > 1 else []
+            messages: list[dict[str, Any]] | dict[str, Any] = []

-            # this serves to prevent empty messages and logging the messages
-            debug_message = self._get_debug_message(messages)
+            # some callers might send the model and messages directly
+            # litellm allows positional args, like completion(model, messages, **kwargs)
+            if len(args) > 1:
+                # ignore the first argument if it's provided (it would be the model)
+                # design wise: we don't allow overriding the configured values
+                # implementation wise: the partial function set the model as a kwarg already
+                # as well as other kwargs
+                messages = args[1] if len(args) > 1 else args[0]
+                kwargs['messages'] = messages
+
+                # remove the first args, they're sent in kwargs
+                args = args[2:]
+            elif 'messages' in kwargs:
+                messages = kwargs['messages']
+
+            # ensure we work with a list of messages
+            messages = messages if isinstance(messages, list) else [messages]
+
+            # if we have no messages, something went very wrong
+            if not messages:
+                raise ValueError(
+                    'The messages list is empty. At least one message is required.'
+                )
+
+            # log the entire LLM prompt
+            self.log_prompt(messages)

            if self.is_caching_prompt_active():
                # Anthropic-specific prompt caching
@@ -202,215 +183,31 @@ class LLM:
                        'anthropic-beta': 'prompt-caching-2024-07-31',
                    }

-            # skip if messages is empty (thus debug_message is empty)
-            if debug_message:
-                llm_prompt_logger.debug(debug_message)
-                resp = completion_unwrapped(*args, **kwargs)
-            else:
-                logger.debug('No completion messages!')
-                resp = {'choices': [{'message': {'content': ''}}]}
+            # we don't support streaming here, thus we get a ModelResponse
+            resp: ModelResponse = completion_unwrapped(*args, **kwargs)

-            # log the response
-            message_back = resp['choices'][0]['message']['content']
-            if message_back:
-                llm_response_logger.debug(message_back)
+            # log for evals or other scripts that need the raw completion
+            if self.config.log_completions:
+                self.llm_completions.append(
+                    {
+                        'messages': messages,
+                        'response': resp,
+                        'timestamp': time.time(),
+                        'cost': self._completion_cost(resp),
+                    }
+                )

-                # post-process to log costs
-                self._post_completion(resp)
+            message_back: str = resp['choices'][0]['message']['content']
+
+            # log the LLM response
+            self.log_response(message_back)
+
+            # post-process the response
+            self._post_completion(resp)

            return resp

-        self._completion = wrapper  # type: ignore
-
-        # Async version
-        self._async_completion = partial(
-            self._call_acompletion,
-            model=self.config.model,
-            api_key=self.config.api_key,
-            base_url=self.config.base_url,
-            api_version=self.config.api_version,
-            custom_llm_provider=self.config.custom_llm_provider,
-            max_tokens=self.config.max_output_tokens,
-            timeout=self.config.timeout,
-            temperature=self.config.temperature,
-            top_p=self.config.top_p,
-            drop_params=True,
-        )
-
-        async_completion_unwrapped = self._async_completion
-
-        @retry(
-            after=attempt_on_error,
-            stop=stop_after_attempt(self.config.num_retries),
-            reraise=True,
-            retry=retry_if_exception_type(self.retry_exceptions),
-            wait=custom_completion_wait,
-        )
-        async def async_completion_wrapper(*args, **kwargs):
-            """Async wrapper for the litellm acompletion function."""
-            # some callers might just send the messages directly
-            if 'messages' in kwargs:
-                messages = kwargs['messages']
-            else:
-                messages = args[1] if len(args) > 1 else []
-
-            # this serves to prevent empty messages and logging the messages
-            debug_message = self._get_debug_message(messages)
-
-            async def check_stopped():
-                while True:
-                    if (
-                        hasattr(self.config, 'on_cancel_requested_fn')
-                        and self.config.on_cancel_requested_fn is not None
-                        and await self.config.on_cancel_requested_fn()
-                    ):
-                        raise UserCancelledError('LLM request cancelled by user')
-                    await asyncio.sleep(0.1)
-
-            stop_check_task = asyncio.create_task(check_stopped())
-
-            try:
-                # Directly call and await litellm_acompletion
-                if debug_message:
-                    llm_prompt_logger.debug(debug_message)
-                    resp = await async_completion_unwrapped(*args, **kwargs)
-                else:
-                    logger.debug('No completion messages!')
-                    resp = {'choices': [{'message': {'content': ''}}]}
-
-                # skip if messages is empty (thus debug_message is empty)
-                if debug_message:
-                    message_back = resp['choices'][0]['message']['content']
-                    llm_response_logger.debug(message_back)
-                else:
-                    resp = {'choices': [{'message': {'content': ''}}]}
-                self._post_completion(resp)
-
-                # We do not support streaming in this method, thus return resp
-                return resp
-
-            except UserCancelledError:
-                logger.info('LLM request cancelled by user.')
-                raise
-            except (
-                APIConnectionError,
-                ContentPolicyViolationError,
-                InternalServerError,
-                NotFoundError,
-                OpenAIError,
-                RateLimitError,
-                ServiceUnavailableError,
-            ) as e:
-                logger.error(f'Completion Error occurred:\n{e}')
-                raise
-
-            finally:
-                await asyncio.sleep(0.1)
-                stop_check_task.cancel()
-                try:
-                    await stop_check_task
-                except asyncio.CancelledError:
-                    pass
-
-        @retry(
-            after=attempt_on_error,
-            stop=stop_after_attempt(self.config.num_retries),
-            reraise=True,
-            retry=retry_if_exception_type(self.retry_exceptions),
-            wait=custom_completion_wait,
-        )
-        async def async_acompletion_stream_wrapper(*args, **kwargs):
-            """Async wrapper for the litellm acompletion with streaming function."""
-            # some callers might just send the messages directly
-            if 'messages' in kwargs:
-                messages = kwargs['messages']
-            else:
-                messages = args[1] if len(args) > 1 else []
-
-            # log the prompt
-            debug_message = ''
-            for message in messages:
-                debug_message += message_separator + message['content']
-            llm_prompt_logger.debug(debug_message)
-
-            try:
-                # Directly call and await litellm_acompletion
-                resp = await async_completion_unwrapped(*args, **kwargs)
-
-                # For streaming we iterate over the chunks
-                async for chunk in resp:
-                    # Check for cancellation before yielding the chunk
-                    if (
-                        hasattr(self.config, 'on_cancel_requested_fn')
-                        and self.config.on_cancel_requested_fn is not None
-                        and await self.config.on_cancel_requested_fn()
-                    ):
-                        raise UserCancelledError(
-                            'LLM request cancelled due to CANCELLED state'
-                        )
-                    # with streaming, it is "delta", not "message"!
-                    message_back = chunk['choices'][0]['delta']['content']
-                    llm_response_logger.debug(message_back)
-                    self._post_completion(chunk)
-
-                    yield chunk
-
-            except UserCancelledError:
-                logger.info('LLM request cancelled by user.')
-                raise
-            except (
-                APIConnectionError,
-                ContentPolicyViolationError,
-                InternalServerError,
-                NotFoundError,
-                OpenAIError,
-                RateLimitError,
-                ServiceUnavailableError,
-            ) as e:
-                logger.error(f'Completion Error occurred:\n{e}')
-                raise
-
-            finally:
-                if kwargs.get('stream', False):
-                    await asyncio.sleep(0.1)
-
-        self._async_completion = async_completion_wrapper  # type: ignore
-        self._async_streaming_completion = async_acompletion_stream_wrapper  # type: ignore
-
-    def _get_debug_message(self, messages):
-        if not messages:
-            return ''
-
-        messages = messages if isinstance(messages, list) else [messages]
-        return message_separator.join(
-            self._format_message_content(msg) for msg in messages if msg['content']
-        )
-
-    def _format_message_content(self, message):
-        content = message['content']
-        if isinstance(content, list):
-            return self._format_list_content(content)
-        return str(content)
-
-    def _format_list_content(self, content_list):
-        return '\n'.join(
-            self._format_content_element(element) for element in content_list
-        )
-
-    def _format_content_element(self, element):
-        if isinstance(element, dict):
-            if 'text' in element:
-                return element['text']
-            if (
-                self.vision_is_active()
-                and 'image_url' in element
-                and 'url' in element['image_url']
-            ):
-                return element['image_url']['url']
-        return str(element)
-
-    async def _call_acompletion(self, *args, **kwargs):
-        return await litellm.acompletion(*args, **kwargs)
+        self._completion = wrapper

    @property
    def completion(self):
@@ -418,32 +215,7 @@ class LLM:

        Check the complete documentation at https://litellm.vercel.app/docs/completion
        """
-        try:
-            return self._completion
-        except Exception as e:
-            raise LLMResponseError(e)
-
-    @property
-    def async_completion(self):
-        """Decorator for the async litellm acompletion function.
-
-        Check the complete documentation at https://litellm.vercel.app/docs/providers/ollama#example-usage---streaming--acompletion
-        """
-        try:
-            return self._async_completion
-        except Exception as e:
-            raise LLMResponseError(e)
-
-    @property
-    def async_streaming_completion(self):
-        """Decorator for the async litellm acompletion function with streaming.
-
-        Check the complete documentation at https://litellm.vercel.app/docs/providers/ollama#example-usage---streaming--acompletion
-        """
-        try:
-            return self._async_streaming_completion
-        except Exception as e:
-            raise LLMResponseError(e)
+        return self._completion

    def vision_is_active(self):
        return not self.config.disable_vision and self._supports_vision()
@@ -454,47 +226,65 @@ class LLM:
        Returns:
            bool: True if model is vision capable. If model is not supported by litellm, it will return False.
        """
-        try:
-            return litellm.supports_vision(self.config.model)
-        except Exception:
-            return False
-
-    def is_caching_prompt_active(self) -> bool:
-        """Check if prompt caching is enabled and supported for current model.
-
-        Returns:
-            boolean: True if prompt caching is active for the given model.
-        """
-        return self.config.caching_prompt is True and any(
-            model in self.config.model for model in cache_prompting_supported_models
+        # litellm.supports_vision currently returns False for 'openai/gpt-...' or 'anthropic/claude-...' (with prefixes)
+        # but model_info will have the correct value for some reason.
+        # we can go with it, but we will need to keep an eye if model_info is correct for Vertex or other providers
+        # remove when litellm is updated to fix https://github.com/BerriAI/litellm/issues/5608
+        return litellm.supports_vision(self.config.model) or (
+            self.model_info is not None
+            and self.model_info.get('supports_vision', False)
        )

-    def _post_completion(self, response) -> None:
-        """Post-process the completion response."""
+    def is_caching_prompt_active(self) -> bool:
+        """Check if prompt caching is supported and enabled for current model.
+
+        Returns:
+            boolean: True only if caching is enabled in the config, the model
+                info reports prompt caching support, and the configured model
+                is listed in CACHE_PROMPT_SUPPORTED_MODELS.
+        """
+        if self.config.caching_prompt is not True or self.model_info is None:
+            return False
+        # bool(): the model info entry may hold something other than a real
+        # boolean, and this method is annotated to return bool
+        return (
+            bool(self.model_info.get('supports_prompt_caching', False))
+            and self.config.model in CACHE_PROMPT_SUPPORTED_MODELS
+        )
+
+    def _post_completion(self, response: ModelResponse) -> None:
+        """Post-process the completion response.
+
+        Logs the cost and usage stats of the completion call.
+        """
        try:
-            cur_cost = self.completion_cost(response)
+            cur_cost = self._completion_cost(response)
        except Exception:
            cur_cost = 0

        stats = ''
        if self.cost_metric_supported:
+            # keep track of the cost
            stats = 'Cost: %.2f USD | Accumulated Cost: %.2f USD\n' % (
                cur_cost,
                self.metrics.accumulated_cost,
            )

-        usage = response.get('usage')
+        usage: Usage | None = response.get('usage')

        if usage:
+            # keep track of the input and output tokens
            input_tokens = usage.get('prompt_tokens')
            output_tokens = usage.get('completion_tokens')

            if input_tokens:
-                stats += 'Input tokens: ' + str(input_tokens) + '\n'
+                stats += 'Input tokens: ' + str(input_tokens)

            if output_tokens:
-                stats += 'Output tokens: ' + str(output_tokens) + '\n'
+                stats += (
+                    (' | ' if input_tokens else '')
+                    + 'Output tokens: '
+                    + str(output_tokens)
+                    + '\n'
+                )

+            # read the prompt caching status as received from the provider
            model_extra = usage.get('model_extra', {})

            cache_creation_input_tokens = model_extra.get('cache_creation_input_tokens')
@@ -511,6 +301,7 @@ class LLM:
                    'Input tokens (cache read): ' + str(cache_read_input_tokens) + '\n'
                )

+        # log the stats
        if stats:
            logger.info(stats)

@@ -529,7 +320,7 @@ class LLM:
            # TODO: this is to limit logspam in case token count is not supported
            return 0

-    def is_local(self):
+    def _is_local(self):
        """Determines if the system is using a locally running LLM.

        Returns:
@@ -544,7 +335,7 @@ class LLM:
                return True
        return False

-    def completion_cost(self, response):
+    def _completion_cost(self, response):
        """Calculate the cost of a completion response based on the model.  Local models are treated as free.
        Add the current cost into total cost in metrics.

@@ -569,7 +360,7 @@ class LLM:
            logger.info(f'Using custom cost per token: {cost_per_token}')
            extra_kwargs['custom_cost_per_token'] = cost_per_token

-        if not self.is_local():
+        if not self._is_local():
            try:
                cost = litellm_completion_cost(
                    completion_response=response, **extra_kwargs
@@ -593,10 +384,16 @@ class LLM:

    def reset(self):
        self.metrics = Metrics()
+        self.llm_completions = []

-    def format_messages_for_llm(
-        self, messages: Union[Message, list[Message]]
-    ) -> list[dict]:
-        return format_messages(
-            messages, self.vision_is_active(), self.is_caching_prompt_active()
-        )
+    def format_messages_for_llm(self, messages: Message | list[Message]) -> list[dict]:
+        """Serialize one message, or a list of messages, into a list of dicts.
+
+        Before serialization every message has its `cache_enabled` and
+        `vision_enabled` flags set from this LLM, so the input objects are
+        modified in place.
+
+        Args:
+            messages: A single Message or a list of Message objects.
+
+        Returns:
+            One dict per message, in input order, as produced by `model_dump()`.
+        """
+        batch = [messages] if isinstance(messages, Message) else messages
+
+        # first pass: stamp each message with the flags that drive its serialization
+        for item in batch:
+            item.cache_enabled = self.is_caching_prompt_active()
+            item.vision_enabled = self.vision_is_active()
+
+        # second pass: dump every message once all flags are in place
+        serialized: list[dict] = []
+        for item in batch:
+            serialized.append(item.model_dump())
+        return serialized
--- a/openhands/llm/retry_mixin.py
+++ b/openhands/llm/retry_mixin.py
@@ -0,0 +1,53 @@
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
+
+from openhands.core.exceptions import OperationCancelled
+from openhands.core.logger import openhands_logger as logger
+from openhands.runtime.utils.shutdown_listener import should_exit
+
+
+class RetryMixin:
+    """Mixin that builds tenacity retry decorators and logs each retry."""
+
+    def retry_decorator(self, **kwargs):
+        """
+        Build a tenacity `retry` decorator for LLM calls (used for 429 errors and a few other exceptions).
+
+        Args:
+            **kwargs: Retry settings. Recognized keys:
+                      num_retries, retry_exceptions, retry_min_wait, retry_max_wait, retry_multiplier.
+                      A key that is not supplied is passed on to tenacity as None.
+
+        Returns:
+            A decorator that retries on `retry_exceptions` with exponential backoff,
+            stops after `num_retries` attempts and re-raises the last exception.
+        """
+        # exponential backoff between attempts, bounded by the min/max waits
+        backoff = wait_exponential(
+            multiplier=kwargs.get('retry_multiplier'),
+            min=kwargs.get('retry_min_wait'),
+            max=kwargs.get('retry_max_wait'),
+        )
+        attempt_limit = stop_after_attempt(kwargs.get('num_retries'))
+        retry_condition = retry_if_exception_type(kwargs.get('retry_exceptions'))
+
+        return retry(
+            before_sleep=self.log_retry_attempt,
+            stop=attempt_limit,
+            reraise=True,
+            retry=retry_condition,
+            wait=backoff,
+        )
+
+    def log_retry_attempt(self, retry_state):
+        """Log a failed attempt before tenacity sleeps; abort if a shutdown was requested.
+
+        Raises:
+            OperationCancelled: If `should_exit()` reports that the process is shutting down.
+        """
+        if should_exit():
+            # raising here breaks out of the @retry loop
+            raise OperationCancelled('Operation cancelled.')
+
+        exception = retry_state.outcome.exception()
+        logger.error(
+            f'{exception}. Attempt #{retry_state.attempt_number} | You can customize retry values in the configuration.',
+            exc_info=False,
+        )
--- a/openhands/llm/streaming_llm.py
+++ b/openhands/llm/streaming_llm.py
@@ -0,0 +1,106 @@
+import asyncio
+from functools import partial
+from typing import Any
+
+from openhands.core.exceptions import UserCancelledError
+from openhands.core.logger import openhands_logger as logger
+from openhands.llm.async_llm import LLM_RETRY_EXCEPTIONS, AsyncLLM
+
+
+class StreamingLLM(AsyncLLM):
+    """AsyncLLM variant whose completion call yields response chunks as they arrive."""
+
+    def __init__(self, *args, **kwargs):
+        """Bind the config values to the completion call and wrap it for streaming.
+
+        All positional and keyword arguments are forwarded to `AsyncLLM.__init__`.
+        """
+        super().__init__(*args, **kwargs)
+
+        # pre-bind every config-derived argument; callers only supply the messages
+        self._async_streaming_completion = partial(
+            self._call_acompletion,
+            model=self.config.model,
+            api_key=self.config.api_key,
+            base_url=self.config.base_url,
+            api_version=self.config.api_version,
+            custom_llm_provider=self.config.custom_llm_provider,
+            max_tokens=self.config.max_output_tokens,
+            timeout=self.config.timeout,
+            temperature=self.config.temperature,
+            top_p=self.config.top_p,
+            drop_params=self.config.drop_params,
+            stream=True,  # Ensure streaming is enabled
+        )
+
+        async_streaming_completion_unwrapped = self._async_streaming_completion
+
+        # NOTE(review): the wrapper below is an async generator function; confirm
+        # that the retry decorator really re-invokes it when an exception is raised
+        # while the chunks are being consumed.
+        @self.retry_decorator(
+            num_retries=self.config.num_retries,
+            retry_exceptions=LLM_RETRY_EXCEPTIONS,
+            retry_min_wait=self.config.retry_min_wait,
+            retry_max_wait=self.config.retry_max_wait,
+            retry_multiplier=self.config.retry_multiplier,
+        )
+        async def async_streaming_completion_wrapper(*args, **kwargs):
+            """Call the bound completion and yield its chunks one by one.
+
+            Raises:
+                ValueError: If no messages were supplied.
+                UserCancelledError: If `on_cancel_requested_fn` reports a cancellation.
+            """
+            messages: list[dict[str, Any]] | dict[str, Any] = []
+
+            # some callers might send the model and messages directly
+            # litellm allows positional args, like completion(model, messages, **kwargs)
+            # see llm.py for more details
+            if len(args) > 1:
+                # args[0] would be the model, args[1] the messages
+                messages = args[1]
+                kwargs['messages'] = messages
+
+                # remove the first args, they're sent in kwargs
+                args = args[2:]
+            elif 'messages' in kwargs:
+                messages = kwargs['messages']
+
+            # ensure we work with a list of messages
+            messages = messages if isinstance(messages, list) else [messages]
+
+            # if we have no messages, something went very wrong
+            if not messages:
+                raise ValueError(
+                    'The messages list is empty. At least one message is required.'
+                )
+
+            self.log_prompt(messages)
+
+            try:
+                # Directly call and await litellm_acompletion
+                resp = await async_streaming_completion_unwrapped(*args, **kwargs)
+
+                # For streaming we iterate over the chunks
+                async for chunk in resp:
+                    # Check for cancellation before yielding the chunk
+                    if (
+                        hasattr(self.config, 'on_cancel_requested_fn')
+                        and self.config.on_cancel_requested_fn is not None
+                        and await self.config.on_cancel_requested_fn()
+                    ):
+                        raise UserCancelledError(
+                            'LLM request cancelled due to CANCELLED state'
+                        )
+                    # with streaming, it is "delta", not "message"!
+                    message_back = chunk['choices'][0]['delta'].get('content', '')
+                    if message_back:
+                        self.log_response(message_back)
+                    self._post_completion(chunk)
+
+                    yield chunk
+
+            except UserCancelledError:
+                logger.info('LLM request cancelled by user.')
+                raise
+            except Exception as e:
+                logger.error(f'Completion Error occurred:\n{e}')
+                raise
+
+            finally:
+                # sleep for 0.1 seconds to allow the stream to be flushed
+                # NOTE(review): `kwargs` here are the caller's keyword arguments; the
+                # `stream=True` bound in the partial above is not part of them, so this
+                # only sleeps when the caller passes `stream` explicitly - confirm intent.
+                if kwargs.get('stream', False):
+                    await asyncio.sleep(0.1)
+
+        self._async_streaming_completion = async_streaming_completion_wrapper
+
+    @property
+    def async_streaming_completion(self):
+        """Decorator for the async litellm acompletion function with streaming."""
+        return self._async_streaming_completion
--- a/openhands/runtime/README.md
+++ b/openhands/runtime/README.md
@@ -74,6 +74,41 @@ Key features of the `RuntimeClient` class:
 - The system uses a plugin architecture for extensibility.
 - All interactions with the external environment are managed through the Runtime, ensuring a controlled and secure execution environment.

+## Runtime Types
+
+### EventStream Runtime
+
+The EventStream Runtime is designed for local execution using Docker containers:
+
+- Creates and manages a Docker container for each session
+- Executes actions within the container
+- Supports direct file system access and local resource management
+- Ideal for development, testing, and scenarios requiring full control over the execution environment
+
+Key features:
+- Real-time logging and debugging capabilities
+- Direct access to the local file system
+- Faster execution due to local resources
+
+This is the default runtime used within OpenHands.
+
+### Remote Runtime
+
+The Remote Runtime is designed for execution in a remote environment:
+
+- Connects to a remote server running the RuntimeClient
+- Executes actions by sending requests to the remote client
+- Supports distributed execution and cloud-based deployments
+- Ideal for production environments, scalability, and scenarios where local resource constraints are a concern
+
+Key features:
+- Scalability and resource flexibility
+- Reduced local resource usage
+- Support for cloud-based deployments
+- Potential for improved security through isolation
+
+At the time of this writing, this is mostly used in parallel evaluation, such as this example for [SWE-Bench](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/swe_bench#run-inference-on-remoteruntime-experimental).
+
 ## Related Components

 - The runtime interacts closely with the event system defined in the `openhands.events` module.
--- a/openhands/runtime/browser/browser_env.py
+++ b/openhands/runtime/browser/browser_env.py
@@ -16,6 +16,7 @@ from PIL import Image

 from openhands.core.exceptions import BrowserInitException
 from openhands.core.logger import openhands_logger as logger
+from openhands.runtime.utils.shutdown_listener import should_continue, should_exit

 BROWSER_EVAL_GET_GOAL_ACTION = 'GET_EVAL_GOAL'
 BROWSER_EVAL_GET_REWARDS_ACTION = 'GET_EVAL_REWARDS'
@@ -99,7 +100,7 @@ class BrowserEnv:
            self.eval_goal = obs['goal']

        logger.info('Browser env started.')
-        while True:
+        while should_continue():
            try:
                if self.browser_side.poll(timeout=0.01):
                    unique_request_id, action_data = self.browser_side.recv()
@@ -157,7 +158,7 @@ class BrowserEnv:
        self.agent_side.send((unique_request_id, {'action': action_str}))
        start_time = time.time()
        while True:
-            if time.time() - start_time > timeout:
+            if should_exit() or time.time() - start_time > timeout:
                raise TimeoutError('Browser environment took too long to respond.')
            if self.agent_side.poll(timeout=0.01):
                response_id, obs = self.agent_side.recv()
--- a/openhands/runtime/builder/base.py
+++ b/openhands/runtime/builder/base.py
@@ -26,12 +26,13 @@ class RuntimeBuilder(abc.ABC):
        pass

    @abc.abstractmethod
-    def image_exists(self, image_name: str) -> bool:
+    def image_exists(self, image_name: str, pull_from_repo: bool = True) -> bool:
        """
        Check if the runtime image exists.

        Args:
            image_name (str): The name of the runtime image (e.g., "repo:sha").
+            pull_from_repo (bool): Whether to pull from the remote repo if the image is not present locally

        Returns:
            bool: Whether the runtime image exists.
--- a/openhands/runtime/builder/docker.py
+++ b/openhands/runtime/builder/docker.py
@@ -1,7 +1,12 @@
+import datetime
+import os
+import subprocess
 import sys
+import time

 import docker

+from openhands import __version__ as oh_version
 from openhands.core.logger import openhands_logger as logger
 from openhands.runtime.builder.base import RuntimeBuilder

@@ -10,40 +15,139 @@ class DockerRuntimeBuilder(RuntimeBuilder):
    def __init__(self, docker_client: docker.DockerClient):
        self.docker_client = docker_client

-    def build(self, path: str, tags: list[str]) -> str:
+        # BuildKit needs a Docker server >= 18.09. Only the leading digits of the
+        # major and minor components are compared, so vendor/distro suffixes such as
+        # '24.0.7-ce' or '20.10.24+dfsg1' no longer crash the int() conversion.
+        version_info = self.docker_client.version()
+        server_version = version_info.get('Version', '')
+        parsed_version: list[int] = []
+        for component in server_version.split('.')[:2]:
+            digits = ''
+            for char in component:
+                if not '0' <= char <= '9':
+                    break
+                digits += char
+            if not digits:
+                break
+            parsed_version.append(int(digits))
+        if not parsed_version:
+            # a missing or non-numeric version cannot be verified; fail with a clear message
+            raise RuntimeError(
+                f'Could not parse Docker server version [{server_version}]; it must be >= 18.09 to use BuildKit'
+            )
+        if tuple(parsed_version) < (18, 9):
+            raise RuntimeError('Docker server version must be >= 18.09 to use BuildKit')
+
+        # rolling window of the most recent build log lines shown on the console
+        self.max_lines = 10
+        self.log_lines = [''] * self.max_lines
+
+    def build(
+        self,
+        path: str,
+        tags: list[str],
+        use_local_cache: bool = False,
+        extra_build_args: list[str] | None = None,
+    ) -> str:
+        """Builds a Docker image using BuildKit and handles the build logs appropriately.
+
+        Args:
+            path (str): The path to the Docker build context.
+            tags (list[str]): A list of image tags to apply to the built image.
+            use_local_cache (bool, optional): Whether to use and update the local build cache. Defaults to False.
+            extra_build_args (list[str], optional): Additional arguments to pass to the Docker build command. Defaults to None.
+
+        Returns:
+            str: The name of the built Docker image.
+
+        Raises:
+            RuntimeError: If the Docker server version is incompatible or if the build process fails.
+
+        Note:
+            This method uses Docker BuildKit for improved build performance and caching capabilities.
+            If `use_local_cache` is True, it will attempt to use and update the build cache in a local directory.
+            The `extra_build_args` parameter allows for passing additional Docker build arguments as needed.
+        """
+        self.docker_client = docker.from_env()
+        version_info = self.docker_client.version()
+        server_version = version_info.get('Version', '')
+        if tuple(map(int, server_version.split('.'))) < (18, 9):
+            raise RuntimeError('Docker server version must be >= 18.09 to use BuildKit')
+
        target_image_hash_name = tags[0]
        target_image_repo, target_image_hash_tag = target_image_hash_name.split(':')
        target_image_tag = tags[1].split(':')[1] if len(tags) > 1 else None

-        try:
-            build_logs = self.docker_client.api.build(
-                path=path,
-                tag=target_image_hash_name,
-                rm=True,
-                decode=True,
-            )
-        except docker.errors.BuildError as e:
-            logger.error(f'Sandbox image build failed: {e}')
-            raise RuntimeError(f'Sandbox image build failed: {e}')
+        # Check if the image exists and pull if necessary
+        self.image_exists(target_image_repo)

-        for log in build_logs:
-            if 'stream' in log:
-                logger.info(log['stream'].strip())
-            elif 'error' in log:
-                logger.error(log['error'].strip())
-            else:
-                logger.info(str(log))
+        buildx_cmd = [
+            'docker',
+            'buildx',
+            'build',
+            '--progress=plain',
+            f'--build-arg=OPENHANDS_RUNTIME_VERSION={oh_version}',
+            f'--build-arg=OPENHANDS_RUNTIME_BUILD_TIME={datetime.datetime.now().isoformat()}',
+            f'--tag={target_image_hash_name}',
+            '--load',
+        ]
+
+        cache_dir = '/tmp/.buildx-cache'
+        if use_local_cache and self._is_cache_usable(cache_dir):
+            buildx_cmd.extend(
+                [
+                    f'--cache-from=type=local,src={cache_dir}',
+                    f'--cache-to=type=local,dest={cache_dir},mode=max',
+                ]
+            )
+
+        if extra_build_args:
+            buildx_cmd.extend(extra_build_args)
+
+        buildx_cmd.append(path)  # must be last!
+
+        print('================ DOCKER BUILD STARTED ================')
+        if sys.stdout.isatty():
+            sys.stdout.write('\n' * self.max_lines)
+            sys.stdout.flush()
+
+        try:
+            process = subprocess.Popen(
+                buildx_cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                universal_newlines=True,
+                bufsize=1,
+            )
+
+            if process.stdout:
+                for line in iter(process.stdout.readline, ''):
+                    line = line.strip()
+                    if line:
+                        self._output_logs(line)
+
+            return_code = process.wait()
+
+            if return_code != 0:
+                raise subprocess.CalledProcessError(
+                    return_code,
+                    process.args,
+                    output=None,
+                    stderr=None,
+                )
+
+        except subprocess.CalledProcessError as e:
+            logger.error(f'Image build failed:\n{e}')
+            logger.error(f'Command output:\n{e.output}')
+            raise
+
+        except subprocess.TimeoutExpired:
+            logger.error('Image build timed out')
+            raise
+
+        except FileNotFoundError as e:
+            logger.error(f'Python executable not found: {e}')
+            raise
+
+        except PermissionError as e:
+            logger.error(
+                f'Permission denied when trying to execute the build command:\n{e}'
+            )
+            raise
+
+        except Exception as e:
+            logger.error(f'An unexpected error occurred during the build process: {e}')
+            raise

        logger.info(f'Image [{target_image_hash_name}] build finished.')

-        assert (
-            target_image_tag
-        ), f'Expected target image tag [{target_image_tag}] is None'
-        image = self.docker_client.images.get(target_image_hash_name)
-        image.tag(target_image_repo, target_image_tag)
-        logger.info(
-            f'Re-tagged image [{target_image_hash_name}] with more generic tag [{target_image_tag}]'
-        )
+        if target_image_tag:
+            image = self.docker_client.images.get(target_image_hash_name)
+            image.tag(target_image_repo, target_image_tag)
+            logger.info(
+                f'Re-tagged image [{target_image_hash_name}] with more generic tag [{target_image_tag}]'
+            )

        # Check if the image is built successfully
        image = self.docker_client.images.get(target_image_hash_name)
@@ -62,11 +166,12 @@ class DockerRuntimeBuilder(RuntimeBuilder):
        )
        return target_image_hash_name

-    def image_exists(self, image_name: str) -> bool:
+    def image_exists(self, image_name: str, pull_from_repo: bool = True) -> bool:
        """Check if the image exists in the registry (try to pull it first) or in the local store.

        Args:
            image_name (str): The Docker image to check (<image repo>:<image tag>)
+            pull_from_repo (bool): Whether to pull from the remote repo if the image is not present locally
        Returns:
            bool: Whether the Docker image exists in the registry or in the local store
        """
@@ -75,53 +180,32 @@ class DockerRuntimeBuilder(RuntimeBuilder):
            return False

        try:
-            logger.info(f'Checking, if image exists locally:\n{image_name}')
+            logger.debug(f'Checking, if image exists locally:\n{image_name}')
            self.docker_client.images.get(image_name)
-            logger.info('Image found locally.')
+            logger.debug('Image found locally.')
            return True
        except docker.errors.ImageNotFound:
+            if not pull_from_repo:
+                logger.debug(
+                    f'Image {image_name} not found locally'
+                )
+                return False
            try:
-                logger.info(
+                logger.debug(
                    'Image not found locally. Trying to pull it, please wait...'
                )

-                layers = {}
+                layers: dict[str, dict[str, str]] = {}
                previous_layer_count = 0
                for line in self.docker_client.api.pull(
                    image_name, stream=True, decode=True
                ):
-                    if 'id' in line and 'progressDetail' in line:
-                        layer_id = line['id']
-                        if layer_id not in layers:
-                            layers[layer_id] = {'last_logged': 0}
-
-                        if (
-                            'total' in line['progressDetail']
-                            and 'current' in line['progressDetail']
-                        ):
-                            total = line['progressDetail']['total']
-                            current = line['progressDetail']['current']
-                            percentage = (current / total) * 100
-
-                            # refresh process bar in console if stdout is a tty
-                            if sys.stdout.isatty():
-                                layers[layer_id]['last_logged'] = percentage
-                                self._output_pull_progress(layers, previous_layer_count)
-                                previous_layer_count = len(layers)
-                            # otherwise Log only if percentage is at least 10% higher than last logged
-                            elif percentage - layers[layer_id]['last_logged'] >= 10:
-                                logger.info(
-                                    f'Layer {layer_id}: {percentage:.0f}% downloaded'
-                                )
-                                layers[layer_id]['last_logged'] = percentage
-
-                    elif 'status' in line:
-                        logger.info(line['status'])
-
-                logger.info('Image pulled')
+                    self._output_build_progress(line, layers, previous_layer_count)
+                    previous_layer_count = len(layers)
+                logger.debug('Image pulled')
                return True
            except docker.errors.ImageNotFound:
-                logger.info('Could not find image locally or in registry.')
+                logger.debug('Could not find image locally or in registry.')
                return False
            except Exception as e:
                msg = 'Image could not be pulled: '
@@ -130,12 +214,131 @@ class DockerRuntimeBuilder(RuntimeBuilder):
                    msg += 'image not found in registry.'
                else:
                    msg += f'{ex_msg}'
-                logger.warning(msg)
+                logger.debug(msg)
                return False

-    def _output_pull_progress(self, layers: dict, previous_layer_count: int) -> None:
-        sys.stdout.write('\033[F' * previous_layer_count)
-        for lid, layer_data in sorted(layers.items()):
-            sys.stdout.write('\033[K')
-            print(f'Layer {lid}: {layer_data["last_logged"]:.0f}% downloaded')
+    def _output_logs(self, new_line: str) -> None:
+        """Show `new_line` as part of a rolling window of the last build log lines.
+
+        When stdout is not a TTY the line is only sent to the debug log. Otherwise
+        the oldest entry of `self.log_lines` is dropped, `new_line` (cut to 80
+        characters) is appended, and all `self.max_lines` rows are redrawn in place.
+
+        Escape sequences used: ESC[F moves the cursor up one line and ESC[2K
+        clears the current line.
+
+        Args:
+            new_line (str): The log line to display.
+        """
+        if not sys.stdout.isatty():
+            logger.debug(new_line)
+            return
+
+        # slide the window: drop the oldest row, append the newest (truncated)
+        self.log_lines.pop(0)
+        self.log_lines.append(new_line[:80])
+
+        # jump back to the top row of the window before redrawing it
+        sys.stdout.write('\033[F' * (self.max_lines))
         sys.stdout.flush()
+
+        # clear and rewrite every row of the window
+        for line in self.log_lines:
+            sys.stdout.write('\033[2K' + line + '\n')
+            sys.stdout.flush()
+
+    def _output_build_progress(
+        self, current_line: dict, layers: dict, previous_layer_count: int
+    ) -> None:
+        """Report the progress described by one decoded image pull event.
+
+        Events carrying both 'id' and 'progressDetail' update that layer's entry in
+        `layers`. On a TTY the whole layer table is redrawn in place; otherwise a
+        debug line is logged when the layer advanced by at least 10 percentage
+        points since the last line logged for it, or when it reached 100%.
+        Events that only carry a 'status' are logged at debug level.
+
+        Args:
+            current_line (dict): One decoded event from the pull stream.
+            layers (dict): Per-layer state keyed by layer id; updated in place.
+            previous_layer_count (int): Number of layer rows printed by the previous
+                call, i.e. how many lines to move the cursor up before redrawing.
+        """
+        if 'id' in current_line and 'progressDetail' in current_line:
+            layer_id = current_line['id']
+            if layer_id not in layers:
+                layers[layer_id] = {'status': '', 'progress': '', 'last_logged': 0}
+
+            if 'status' in current_line:
+                layers[layer_id]['status'] = current_line['status']
+
+            if 'progress' in current_line:
+                layers[layer_id]['progress'] = current_line['progress']
+
+            progress_detail = current_line['progressDetail']
+            total = progress_detail.get('total')
+            current = progress_detail.get('current')
+            if total and current is not None:
+                # a total of 0 is skipped here so the division cannot fail
+                percentage = min(
+                    (current / total) * 100, 100
+                )  # Ensure it doesn't exceed 100%
+            else:
+                percentage = (
+                    100 if layers[layer_id]['status'] == 'Download complete' else 0
+                )
+
+            last_logged = layers[layer_id]['last_logged']
+            if sys.stdout.isatty():
+                sys.stdout.write('\033[F' * previous_layer_count)
+                for lid, layer_data in sorted(layers.items()):
+                    sys.stdout.write('\033[2K\r')
+                    status = layer_data['status']
+                    progress = layer_data['progress']
+                    if status == 'Download complete':
+                        print(f'Layer {lid}: Download complete')
+                    elif status == 'Already exists':
+                        print(f'Layer {lid}: Already exists')
+                    else:
+                        print(f'Layer {lid}: {progress} {status}')
+                sys.stdout.flush()
+                layers[layer_id]['last_logged'] = percentage
+            elif percentage != 0 and (
+                percentage - last_logged >= 10 or percentage == 100
+            ):
+                logger.debug(
+                    f'Layer {layer_id}: {layers[layer_id]["progress"]} {layers[layer_id]["status"]}'
+                )
+                # move the baseline only when a line was actually emitted; updating it
+                # on every event would keep slow, steady progress below the threshold
+                layers[layer_id]['last_logged'] = percentage
+            elif percentage < last_logged:
+                # progress went backwards (e.g. a new phase started): restart the
+                # baseline so the next 10-point steps are measured from here
+                layers[layer_id]['last_logged'] = percentage
+        elif 'status' in current_line:
+            logger.debug(current_line['status'])
+
+    def _prune_old_cache_files(self, cache_dir: str, max_age_days: int = 7) -> None:
+        """
+        Delete files under the cache directory that have not been modified recently.
+
+        The directory tree is walked recursively. Errors on a single file, or on
+        the walk as a whole, are logged as warnings and never raised.
+
+        Args:
+            cache_dir (str): The path to the cache directory.
+            max_age_days (int): Files whose modification time is older than this
+                many days are removed.
+        """
+        try:
+            now = time.time()
+            age_limit_seconds = max_age_days * 24 * 60 * 60
+
+            for dirpath, _dirnames, filenames in os.walk(cache_dir):
+                for filename in filenames:
+                    file_path = os.path.join(dirpath, filename)
+                    try:
+                        # keep anything that is still within the age limit
+                        if now - os.path.getmtime(file_path) <= age_limit_seconds:
+                            continue
+                        os.remove(file_path)
+                        logger.debug(f'Removed old cache file: {file_path}')
+                    except Exception as e:
+                        logger.warning(f'Error processing cache file {file_path}: {e}')
+        except Exception as e:
+            logger.warning(f'Error during build cache pruning: {e}')
+
+    def _is_cache_usable(self, cache_dir: str) -> bool:
+        """
+        Decide whether the build cache directory can be used.
+
+        The directory is created when it is missing. A usable directory also has
+        its old files pruned before this method returns.
+
+        Args:
+            cache_dir (str): The path to the cache directory.
+
+        Returns:
+            bool: True if the directory exists (or could be created) and is
+                writable, False otherwise.
+        """
+        cache_missing = not os.path.exists(cache_dir)
+        if cache_missing:
+            try:
+                os.makedirs(cache_dir, exist_ok=True)
+                logger.debug(f'Created cache directory: {cache_dir}')
+            except OSError as e:
+                logger.debug(f'Failed to create cache directory {cache_dir}: {e}')
+                return False
+
+        writable = os.access(cache_dir, os.W_OK)
+        if writable:
+            # housekeeping happens only once the directory is known to be usable
+            self._prune_old_cache_files(cache_dir)
+            logger.debug(f'Cache directory {cache_dir} is usable')
+            return True
+
+        logger.warning(
+            f'Cache directory {cache_dir} is not writable. Caches will not be used for Docker builds.'
+        )
+        return False
--- a/openhands/runtime/builder/remote.py
+++ b/openhands/runtime/builder/remote.py
@@ -8,6 +8,7 @@ import requests
 from openhands.core.logger import openhands_logger as logger
 from openhands.runtime.builder import RuntimeBuilder
 from openhands.runtime.utils.request import send_request
+from openhands.runtime.utils.shutdown_listener import should_exit, sleep_if_should_continue


 class RemoteRuntimeBuilder(RuntimeBuilder):
@@ -57,7 +58,7 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
        start_time = time.time()
        timeout = 30 * 60  # 20 minutes in seconds
        while True:
-            if time.time() - start_time > timeout:
+            if should_exit() or time.time() - start_time > timeout:
                logger.error('Build timed out after 30 minutes')
                raise RuntimeError('Build timed out after 30 minutes')

@@ -95,9 +96,9 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
                raise RuntimeError(error_message)

            # Wait before polling again
-            time.sleep(30)
+            sleep_if_should_continue(30)

-    def image_exists(self, image_name: str) -> bool:
+    def image_exists(self, image_name: str, pull_from_repo: bool = True) -> bool:
        """Checks if an image exists in the remote registry using the /image_exists endpoint."""
        params = {'image': image_name}
        response = send_request(
--- a/openhands/runtime/client/client.py
+++ b/openhands/runtime/client/client.py
@@ -11,13 +11,16 @@ import os
 import re
 import shutil
 import subprocess
+import time
 from contextlib import asynccontextmanager
 from pathlib import Path

 import pexpect
 from fastapi import FastAPI, HTTPException, Request, UploadFile
+from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
+from starlette.exceptions import HTTPException as StarletteHTTPException
 from uvicorn import run

 from openhands.core.logger import openhands_logger as logger
@@ -84,6 +87,8 @@ class RuntimeClient:
        self.lock = asyncio.Lock()
        self.plugins: dict[str, Plugin] = {}
        self.browser = BrowserEnv(browsergym_eval_env)
+        self.start_time = time.time()
+        self.last_execution_time = self.start_time

    @property
    def initial_pwd(self):
@@ -184,38 +189,25 @@ class RuntimeClient:
                raise

        # Add sudoer
-        sudoer_line = r'%sudo ALL=(ALL) NOPASSWD:ALL\n'
-        sudoers_path = '/etc/sudoers.d/99_sudo'
-        if not Path(sudoers_path).exists():
-            with open(sudoers_path, 'w') as f:
-                f.write(sudoer_line)
-            output = subprocess.run(['chmod', '0440', sudoers_path])
-            if output.returncode != 0:
-                logger.error('Failed to chmod 99_sudo file!')
-            else:
-                logger.debug('Added sudoer successfully.')
+        sudoer_line = r"echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers"
+        output = subprocess.run(sudoer_line, shell=True, capture_output=True)
+        if output.returncode != 0:
+            raise RuntimeError(f'Failed to add sudoer: {output.stderr.decode()}')
+        logger.debug(f'Added sudoer successfully. Output: [{output.stdout.decode()}]')

-        # Attempt to add the user, retrying with incremented user_id if necessary
-        while True:
-            command = (
-                f'useradd -rm -d /home/{username} -s /bin/bash '
-                f'-g root -G sudo -u {user_id} {username}'
+        command = (
+            f'useradd -rm -d /home/{username} -s /bin/bash '
+            f'-g root -G sudo -u {user_id} {username}'
+        )
+        output = subprocess.run(command, shell=True, capture_output=True)
+        if output.returncode == 0:
+            logger.debug(
+                f'Added user `{username}` successfully with UID {user_id}. Output: [{output.stdout.decode()}]'
+            )
+        else:
+            raise RuntimeError(
+                f'Failed to create user `{username}` with UID {user_id}. Output: [{output.stderr.decode()}]'
            )
-            output = subprocess.run(command, shell=True, capture_output=True)
-            if output.returncode == 0:
-                logger.debug(
-                    f'Added user `{username}` successfully with UID {user_id}. Output: [{output.stdout.decode()}]'
-                )
-                break
-            elif f'UID {user_id} is not unique' in output.stderr.decode():
-                logger.warning(
-                    f'UID {user_id} is not unique. Incrementing UID and retrying...'
-                )
-                user_id += 1
-            else:
-                raise RuntimeError(
-                    f'Failed to create user `{username}`! Output: [{output.stderr.decode()}]'
-                )

    def _init_bash_shell(self, work_dir: str, username: str) -> None:
        self.shell = pexpect.spawn(
@@ -331,7 +323,13 @@ class RuntimeClient:
            logger.debug('Requesting exit code...')
            self.shell.expect(self.__bash_expect_regex, timeout=timeout)
            _exit_code_output = self.shell.before
-            exit_code = int(_exit_code_output.strip().split()[0])
+            try:
+                exit_code = int(_exit_code_output.strip().split()[0])
+            except:
+                logger.error('Error getting exit code from bash script')
+                # If we try to run an invalid shell script the output sometimes includes error text
+                # rather than the error code - we assume this is an error
+                exit_code = 2

        except pexpect.TIMEOUT as e:
            if kill_on_timeout:
@@ -575,6 +573,35 @@ if __name__ == '__main__':

    app = FastAPI(lifespan=lifespan)

+    # TODO: the three exception handlers below were recommended by Sonnet.
+    # Decide whether we should keep them.
+    @app.exception_handler(Exception)
+    async def global_exception_handler(request: Request, exc: Exception):
+        """Handler registered for the base `Exception` type.
+
+        Logs through `logger.exception` and answers with a fixed, generic
+        500 JSON body; the exception's own text is not sent to the caller.
+        """
+        logger.exception('Unhandled exception occurred:')
+        generic_body = {
+            'message': 'An unexpected error occurred. Please try again later.'
+        }
+        return JSONResponse(status_code=500, content=generic_body)
+
+    @app.exception_handler(StarletteHTTPException)
+    async def http_exception_handler(request: Request, exc: StarletteHTTPException):
+        """Log an HTTP exception and return it as a JSON `message` payload.
+
+        The response reuses the exception's status code and detail. Headers
+        attached to the exception (for example `WWW-Authenticate`) are
+        forwarded so that overriding the default handler does not drop them.
+        """
+        logger.error(f'HTTP exception occurred: {exc.detail}')
+        return JSONResponse(
+            status_code=exc.status_code,
+            content={'message': exc.detail},
+            # getattr tolerates exception objects that carry no `headers`.
+            headers=getattr(exc, 'headers', None),
+        )
+
+    @app.exception_handler(RequestValidationError)
+    async def validation_exception_handler(
+        request: Request, exc: RequestValidationError
+    ):
+        """Log a request validation failure and return a 422 JSON response.
+
+        The body holds a fixed `message` plus the validation error list under
+        `details`. The list is passed through `jsonable_encoder` because it
+        can contain values that the JSON response cannot serialize directly.
+        """
+        # Function-scope import so this handler needs no new module-level import.
+        from fastapi.encoders import jsonable_encoder
+
+        logger.error(f'Validation error occurred: {exc}')
+        return JSONResponse(
+            status_code=422,
+            content={
+                'message': 'Invalid request parameters',
+                'details': jsonable_encoder(exc.errors()),
+            },
+        )
+
    @app.middleware('http')
    async def one_request_at_a_time(request: Request, call_next):
        assert client is not None
@@ -582,6 +609,14 @@ if __name__ == '__main__':
            response = await call_next(request)
        return response

+    @app.get('/server_info')
+    async def get_server_info():
+        """Return the client's uptime and idle time.
+
+        Both values are the current `time.time()` minus a timestamp stored on
+        the client: `start_time` for uptime, `last_execution_time` for idle.
+        """
+        assert client is not None
+        now = time.time()
+        return {
+            'uptime': now - client.start_time,
+            'idle_time': now - client.last_execution_time,
+        }
+
    @app.post('/execute_action')
    async def execute_action(action_request: ActionRequest):
        assert client is not None
@@ -589,10 +624,11 @@ if __name__ == '__main__':
            action = event_from_dict(action_request.action)
            if not isinstance(action, Action):
                raise HTTPException(status_code=400, detail='Invalid action type')
+            client.last_execution_time = time.time()
            observation = await client.run_action(action)
            return event_to_dict(observation)
        except Exception as e:
-            logger.error(f'Error processing command: {str(e)}')
+            logger.error(f'Error processing command: {str(e)}', exc_info=True, stack_info=True)
            raise HTTPException(status_code=500, detail=str(e))

    @app.post('/upload_file')
--- a/Show More
+++ b/Show More