fix: reuse config parser for cli (#4187)

Enable authentication for runtime environments (#4179)
Co-authored-by: Xingyao Wang <xingyao@all-hands.dev>
2026-04-29 03:00:45 -04:00 · 2024-10-03 09:41:02 -04:00 · 2024-10-03 09:14:39 -04:00 · 2024-10-03 07:58:13 -04:00 · 2024-10-03 07:00:45 +00:00 · 2024-10-03 01:54:49 +02:00
182 changed files with 3567 additions and 3194 deletions
@@ -1,6 +1,6 @@
-**Short description of the problem this fixes or functionality that this introduces. This may be used for the CHANGELOG**
-
+**End-user friendly description of the problem this fixes or functionality that this introduces**

+- [ ] Include this change in the Release Notes. If checked, you must provide an **end-user friendly** description for your change below

 ---
 **Give a summary of what the PR does, explaining any non-trivial design decisions**
@@ -14,20 +14,38 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Install poetry via pipx
+        run: pipx install poetry
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
-      - name: Set up environment
-        run: |
-          curl -sSL https://install.python-poetry.org | python3 -
-          poetry install --without evaluation,llama-index
-          poetry run playwright install --with-deps chromium
-          wget https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json -P /tmp/llama_index/models--BAAI--bge-small-en-v1.5/snapshots/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/1_Pooling/
+          cache: 'poetry'
+      - name: Install Python dependencies using Poetry
+        run: poetry install --without evaluation,llama-index
+      - name: Build Environment
+        run: make build
      - name: Run tests
        run: |
          set -e
-          poetry run python openhands/core/main.py -t "do a flip" -d ./workspace/ -c DummyAgent
+          SANDBOX_FORCE_REBUILD_RUNTIME=True poetry run python3 openhands/core/main.py -t "do a flip" -d ./workspace/ -c DummyAgent
      - name: Check exit code
        run: |
          if [ $? -ne 0 ]; then
@@ -25,7 +25,71 @@ on:
        required: true
        default: ''

+env:
+  BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST: nikolaik/python-nodejs:python3.11-nodejs22
+
 jobs:
+  # Builds the OpenHands Docker images
+  ghcr_build_app:
+    name: Build App Image
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    outputs:
+      hash_from_app_image: ${{ steps.get_hash_in_app_image.outputs.hash_from_app_image }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3.0.0
+        with:
+          image: tonistiigi/binfmt:latest
+      - name: Login to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Build and push app image
+        if: "!github.event.pull_request.head.repo.fork"
+        run: |
+          ./containers/build.sh -i openhands -o ${{ github.repository_owner }} --push
+      - name: Build app image
+        if: "github.event.pull_request.head.repo.fork"
+        run: |
+          ./containers/build.sh -i openhands -o ${{ github.repository_owner }} --load
+      - name: Get hash in App Image
+        id: get_hash_in_app_image
+        run: |
+          # Lowercase the repository owner
+          export REPO_OWNER=${{ github.repository_owner }}
+          REPO_OWNER=$(echo $REPO_OWNER | tr '[:upper:]' '[:lower:]')
+          # Run the build script in the app image
+          docker run -e SANDBOX_USER_ID=0 -v /var/run/docker.sock:/var/run/docker.sock ghcr.io/${REPO_OWNER}/openhands:${{ github.sha }} /bin/bash -c "mkdir -p containers/runtime; python3 openhands/runtime/utils/runtime_build.py --base_image ${{ env.BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST }} --build_folder containers/runtime --force_rebuild" 2>&1 | tee docker-outputs.txt
+          # Get the hash from the build script
+          hash_from_app_image=$(cat docker-outputs.txt | grep "Hash for docker build directory" | awk -F "): " '{print $2}' | uniq | head -n1)
+          echo "hash_from_app_image=$hash_from_app_image" >> $GITHUB_OUTPUT
+          echo "Hash from app image: $hash_from_app_image"
+
+
  # Builds the runtime Docker images
  ghcr_build_runtime:
    name: Build Image
@@ -56,7 +120,9 @@ jobs:
          docker-images: false
          swap-storage: true
      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
+        uses: docker/setup-qemu-action@v3.0.0
+        with:
+          image: tonistiigi/binfmt:latest
      - name: Login to GHCR
        uses: docker/login-action@v3
        with:
@@ -88,7 +154,7 @@ jobs:
      - name: Build and push runtime image ${{ matrix.base_image.image }}
        if: github.event.pull_request.head.repo.fork != true
        run: |
-          ./containers/build.sh runtime ${{ github.repository_owner }} --push ${{ matrix.base_image.tag }}
+          ./containers/build.sh -i runtime -o ${{ github.repository_owner }} --push -t ${{ matrix.base_image.tag }}
      # Forked repos can't push to GHCR, so we need to upload the image as an artifact
      - name: Build runtime image ${{ matrix.base_image.image }} for fork
        if: github.event.pull_request.head.repo.fork
@@ -104,6 +170,56 @@ jobs:
          name: runtime-${{ matrix.base_image.tag }}
          path: /tmp/runtime-${{ matrix.base_image.tag }}.tar

+  verify_hash_equivalence_in_runtime_and_app:
+    name: Verify Hash Equivalence in Runtime and Docker images
+    runs-on: ubuntu-latest
+    needs: [ghcr_build_runtime, ghcr_build_app]
+    strategy:
+      fail-fast: false
+      matrix:
+        base_image: ['nikolaik']
+    steps:
+      - uses: actions/checkout@v4
+      - name: Cache Poetry dependencies
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/pypoetry
+            ~/.virtualenvs
+          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-poetry-
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install poetry via pipx
+        run: pipx install poetry
+      - name: Install Python dependencies using Poetry
+        run: make install-python-dependencies
+      - name: Get hash in App Image
+        run: |
+          echo "Hash from app image: ${{ needs.ghcr_build_app.outputs.hash_from_app_image }}"
+          echo "hash_from_app_image=${{ needs.ghcr_build_app.outputs.hash_from_app_image }}" >> $GITHUB_ENV
+
+      - name: Get hash using code (development mode)
+        run: |
+          mkdir -p containers/runtime
+          poetry run python3 openhands/runtime/utils/runtime_build.py --base_image ${{ env.BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST }} --build_folder containers/runtime --force_rebuild > output.txt 2>&1
+          hash_from_code=$(cat output.txt | grep "Hash for docker build directory" | awk -F "): " '{print $2}' | uniq | head -n1)
+          echo "hash_from_code=$hash_from_code" >> $GITHUB_ENV
+
+      - name: Compare hashes
+        run: |
+          echo "Hash from App Image: ${{ env.hash_from_app_image }}"
+          echo "Hash from Code: ${{ env.hash_from_code }}"
+          if [ "${{ env.hash_from_app_image }}" = "${{ env.hash_from_code }}" ]; then
+            echo "Hashes match!"
+          else
+            echo "Hashes do not match!"
+            exit 1
+          fi
+
  # Run unit tests with the EventStream runtime Docker images as root
  test_runtime_root:
    name: RT Unit Tests (Root)
@@ -115,6 +231,23 @@ jobs:
        base_image: ['nikolaik']
    steps:
      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
      # Forked repos can't push to GHCR, so we need to download the image as an artifact
      - name: Download runtime image for fork
        if: github.event.pull_request.head.repo.fork
@@ -160,7 +293,7 @@ jobs:
          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
          RUN_AS_OPENHANDS=false \
-          poetry run pytest -n 3 -raR --reruns 1 --reruns-delay 3 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
+          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
@@ -176,6 +309,23 @@ jobs:
        base_image: ['nikolaik']
    steps:
      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
      # Forked repos can't push to GHCR, so we need to download the image as an artifact
      - name: Download runtime image for fork
        if: github.event.pull_request.head.repo.fork
@@ -221,7 +371,7 @@ jobs:
          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
          RUN_AS_OPENHANDS=true \
-          poetry run pytest -n 3 -raR --reruns 1 --reruns-delay 3 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
+          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
@@ -238,6 +388,23 @@ jobs:
        base_image: ['nikolaik']
    steps:
      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
      # Forked repos can't push to GHCR, so we need to download the image as an artifact
      - name: Download runtime image for fork
        if: github.event.pull_request.head.repo.fork
@@ -290,7 +457,7 @@ jobs:
    name: All Runtime Tests Passed
    if: ${{ !cancelled() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}
    runs-on: ubuntu-latest
-    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux]
+    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux, verify_hash_equivalence_in_runtime_and_app]
    steps:
      - name: All tests passed
        run: echo "All runtime tests have passed successfully!"
@@ -299,7 +466,7 @@ jobs:
    name: All Runtime Tests Passed
    if: ${{ cancelled() || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') }}
    runs-on: ubuntu-latest
-    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux]
+    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux, verify_hash_equivalence_in_runtime_and_app]
    steps:
      - name: Some tests failed
        run: |
@@ -1,65 +0,0 @@
-# Workflow that builds, tests and then pushes the app docker images to the ghcr.io repository
-name: Build and Publish App Image
-
-# Always run on "main"
-# Always run on tags
-# Always run on PRs
-# Can also be triggered manually
-on:
-  push:
-    branches:
-      - main
-    tags:
-      - '*'
-  pull_request:
-  workflow_dispatch:
-    inputs:
-      reason:
-        description: 'Reason for manual trigger'
-        required: true
-        default: ''
-
-jobs:
-  # Builds the OpenHands Docker images
-  ghcr_build:
-    name: Build App Image
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      packages: write
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Free Disk Space (Ubuntu)
-        uses: jlumbroso/free-disk-space@main
-        with:
-          # this might remove tools that are actually needed,
-          # if set to "true" but frees about 6 GB
-          tool-cache: true
-          # all of these default to true, but feel free to set to
-          # "false" if necessary for your workflow
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: true
-          docker-images: false
-          swap-storage: true
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
-      - name: Login to GHCR
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Build and push app image
-        if: "!github.event.pull_request.head.repo.fork"
-        run: |
-          ./containers/build.sh openhands ${{ github.repository_owner }} --push
-      - name: Build app image
-        if: "github.event.pull_request.head.repo.fork"
-        run: |
-          ./containers/build.sh openhands image ${{ github.repository_owner }}
@@ -0,0 +1,13 @@
+name: Resolve Issues with OpenHands
+
+on:
+  issues:
+    types: [labeled]
+
+jobs:
+  call-openhands-resolver:
+    uses: All-Hands-AI/openhands-resolver/.github/workflows/openhands-resolver.yml@main
+    if: github.event.label.name == 'fix-me'
+    with:
+      issue_number: ${{ github.event.issue.number }}
+    secrets: inherit
@@ -89,6 +89,9 @@ jobs:
          sudo ln -sf $HOME/.colima/default/docker.sock /var/run/docker.sock
      - name: Build Environment
        run: make build
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
      - name: Run Tests
        run: poetry run pytest --forked --cov=agenthub --cov=openhands --cov-report=xml ./tests/unit
      - name: Upload coverage to Codecov
@@ -107,6 +110,9 @@ jobs:
        python-version: ['3.11']
    steps:
      - uses: actions/checkout@v4
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
      - name: Install poetry via pipx
        run: pipx install poetry
      - name: Set up Python
@@ -29,6 +29,9 @@ jobs:
    steps:
    - name: Checkout repository
      uses: actions/checkout@v4
+    - name: Set up Docker Buildx
+      id: buildx
+      uses: docker/setup-buildx-action@v3
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
@@ -15,6 +15,9 @@ jobs:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v4
+    - name: Set up Docker Buildx
+      id: buildx
+      uses: docker/setup-buildx-action@v3
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
@@ -1,113 +0,0 @@
-# Workflow that uses OpenHands to resolve a GitHub issue. Issue must be labeled 'solve-this'
-name: Use OpenHands to Resolve GitHub Issue
-
-on:
-  issues:
-    types: [labeled]
-
-permissions:
-  contents: write
-  pull-requests: write
-  issues: write
-
-jobs:
-  dogfood:
-    if: github.event.label.name == 'solve-this'
-    runs-on: ubuntu-latest
-    container:
-      image: ghcr.io/all-hands-ai/openhands
-      volumes:
-        - /var/run/docker.sock:/var/run/docker.sock
-    steps:
-    - name: install git, github cli
-      run: apt-get install -y git gh
-    - name: Checkout Repository
-      uses: actions/checkout@v4
-    - name: Write Task File
-      env:
-        ISSUE_TITLE: ${{ github.event.issue.title }}
-        ISSUE_BODY: ${{ github.event.issue.body }}
-      run: |
-        echo "TITLE:" > task.txt
-        echo "${ISSUE_TITLE}" >> task.txt
-        echo "" >> task.txt
-        echo "BODY:" >> task.txt
-        echo "${ISSUE_BODY}" >> task.txt
-    - name: Set up environment
-      run: |
-        curl -sSL https://install.python-poetry.org | python3 -
-        export PATH="/github/home/.local/bin:$PATH"
-        poetry install --without evaluation,llama-index
-        poetry run playwright install --with-deps chromium
-    - name: Run OpenHands
-      env:
-        ISSUE_TITLE: ${{ github.event.issue.title }}
-        ISSUE_BODY: ${{ github.event.issue.body }}
-        LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-      run: |
-        # Append path to launch poetry
-        export PATH="/github/home/.local/bin:$PATH"
-        # Append path to correctly import package, note: must set pwd at first
-        export PYTHONPATH=$(pwd):$PYTHONPATH
-        WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE poetry run python ./openhands/core/main.py -i 50 -f task.txt -d $GITHUB_WORKSPACE
-        rm task.txt
-    - name: Setup Git, Create Branch, and Commit Changes
-      run: |
-        # Setup Git configuration
-        git config --global --add safe.directory $PWD
-        git config --global user.name 'OpenHands'
-        git config --global user.email 'OpenHands@users.noreply.github.com'
-
-        # Create a unique branch name with a timestamp
-        BRANCH_NAME="fix/${{ github.event.issue.number }}-$(date +%Y%m%d%H%M%S)"
-
-        # Checkout new branch
-        git checkout -b $BRANCH_NAME
-
-        # Add all changes to staging, except task.txt
-        git add --all -- ':!task.txt'
-
-        # Commit the changes, if any
-        git commit -m "OpenHands: Resolve Issue #${{ github.event.issue.number }}"
-        if [ $? -ne 0 ]; then
-          echo "No changes to commit."
-          exit 0
-        fi
-
-        # Push changes
-        git push --set-upstream origin $BRANCH_NAME
-    - name: Fetch Default Branch
-      env:
-        GH_TOKEN: ${{ github.token }}
-      run: |
-        # Fetch the default branch using gh cli
-        DEFAULT_BRANCH=$(gh repo view --json defaultBranchRef --jq .defaultBranchRef.name)
-        echo "Default branch is $DEFAULT_BRANCH"
-        echo "DEFAULT_BRANCH=$DEFAULT_BRANCH" >> $GITHUB_ENV
-    - name: Generate PR
-      env:
-        GH_TOKEN: ${{ github.token }}
-      run: |
-        # Create PR and capture URL
-        PR_URL=$(gh pr create \
-          --title "OpenHands: Resolve Issue #2" \
-          --body "This PR was generated by OpenHands to resolve issue #2" \
-          --repo "foragerr/OpenHands" \
-          --head "${{ github.head_ref }}" \
-          --base "${{ env.DEFAULT_BRANCH }}" \
-          | grep -o 'https://github.com/[^ ]*')
-
-        # Extract PR number from URL
-        PR_NUMBER=$(echo "$PR_URL" | grep -o '[0-9]\+$')
-
-        # Set environment vars
-        echo "PR_URL=$PR_URL" >> $GITHUB_ENV
-        echo "PR_NUMBER=$PR_NUMBER" >> $GITHUB_ENV
-
-    - name: Post Comment
-      env:
-        GH_TOKEN: ${{ github.token }}
-      run: |
-        gh issue comment ${{ github.event.issue.number }} \
-          -b "OpenHands raised [PR #${{ env.PR_NUMBER }}](${{ env.PR_URL }}) to resolve this issue."
@@ -217,8 +217,6 @@ config.toml
 config.toml_
 config.toml.bak

-containers/agnostic_sandbox
-
 # swe-bench-eval
 image_build_logs
 run_instance_logs
@@ -0,0 +1,28 @@
+OpenHands is an automated AI software engineer. It is a repo with a Python backend
+(in the `openhands` directory) and TypeScript frontend (in the `frontend` directory).
+
+General Setup:
+- To set up the entire repo, including frontend and backend, run `make build`
+- To run linting and type-checking before finishing the job, run `poetry run pre-commit run --all-files --config ./dev_config/python/.pre-commit-config.yaml`
+
+Backend:
+- Located in the `openhands` directory
+- Testing:
+  - All tests are in `tests/unit/test_*.py`
+  - To test new code, run `poetry run pytest tests/unit/test_xxx.py` where `xxx` is the appropriate file for the current functionality
+  - Write all tests with pytest
+
+Frontend:
+- Located in the `frontend` directory
+- Prerequisites: A recent version of NodeJS / NPM
+- Setup: Run `npm install` in the frontend directory
+- Testing:
+  - Run tests: `npm run test`
+  - To run specific tests: `npm run test -- -t "TestName"`
+- Building:
+  - Build for production: `npm run build`
+- Environment Variables:
+  - Set in `frontend/.env` or as environment variables
+  - Available variables: VITE_BACKEND_HOST, VITE_USE_TLS, VITE_INSECURE_SKIP_VERIFY, VITE_FRONTEND_PORT
+- Internationalization:
+  - Generate i18n declaration file: `npm run make-i18n`
@@ -190,7 +190,7 @@ build-frontend:
 # Start backend
 start-backend:
 	@echo "$(YELLOW)Starting backend...$(RESET)"
-	@poetry run uvicorn openhands.server.listen:app --host $(BACKEND_HOST) --port $(BACKEND_PORT) --reload --reload-exclude "workspace/*"
+	@poetry run uvicorn openhands.server.listen:app --host $(BACKEND_HOST) --port $(BACKEND_PORT) --reload --reload-exclude "$(shell pwd)/workspace"

 # Start frontend
 start-frontend:
@@ -42,6 +42,8 @@ system requirements and more information.
 ```bash
 export WORKSPACE_BASE=$(pwd)/workspace

+docker pull ghcr.io/all-hands-ai/runtime:0.9-nikolaik
+
 docker run -it --pull=always \
    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.9-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
@@ -56,6 +58,10 @@ docker run -it --pull=always \

 You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)!

+You'll need a model provider and API key. One option that works well: [Claude 3.5 Sonnet](https://www.anthropic.com/api), but you have [many options](https://docs.all-hands.dev/modules/usage/llms).
+
+---
+
 You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode),
 or as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode).

@@ -218,7 +218,6 @@ class BrowsingAgent(Agent):

        response = self.llm.completion(
            messages=self.llm.format_messages_for_llm(messages),
-            temperature=0.0,
            stop=[')```', ')\n```'],
        )
        return self.response_parser.parse(response)
@@ -10,20 +10,3 @@ The conceptual idea is illustrated below. At each turn, the agent can:
   - Execute any valid `Python` code with [an interactive Python interpreter](https://ipython.org/). This is simulated through `bash` command, see plugin system below for more details.

 ![image](https://github.com/All-Hands-AI/OpenHands/assets/38853559/92b622e3-72ad-4a61-8f41-8c040b6d5fb3)
-
-## Plugin System
-
-To make the CodeAct agent more powerful with only access to `bash` action space, CodeAct agent leverages OpenHands's plugin system:
- [Jupyter plugin](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/runtime/plugins/jupyter): for IPython execution via bash command
- [Agent Skills plugin](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/runtime/plugins/agent_skills): Powerful bash command line tools for software development tasks introduced by [swe-agent](https://github.com/princeton-nlp/swe-agent).
-
-## Demo
-
-https://github.com/All-Hands-AI/OpenHands/assets/38853559/f592a192-e86c-4f48-ad31-d69282d5f6ac
-
-*Example of CodeActAgent with `gpt-4-turbo-2024-04-09` performing a data science task (linear regression)*
-
-## Work-in-progress & Next step
-
-[] Support web-browsing
-[] Complete the workflow for CodeAct agent to submit Github PRs
@@ -6,7 +6,6 @@ from openhands.events.action import (
    AgentDelegateAction,
    AgentFinishAction,
    CmdRunAction,
-    FileEditAction,
    IPythonRunCellAction,
    MessageAction,
 )
@@ -17,7 +16,6 @@ class CodeActResponseParser(ResponseParser):
    - CmdRunAction(command) - bash command to run
    - IPythonRunCellAction(code) - IPython code to run
    - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
-    - FileEditAction(diff_block) - Search/Replace block to edit.
    - MessageAction(content) - Message action to run (e.g. ask for clarification)
    - AgentFinishAction() - end the interaction
    """
@@ -30,7 +28,6 @@ class CodeActResponseParser(ResponseParser):
            CodeActActionParserCmdRun(),
            CodeActActionParserIPythonRunCell(),
            CodeActActionParserAgentDelegate(),
-            CodeActActionParserFileEdit(),
        ]
        self.default_parser = CodeActActionParserMessage()

@@ -42,7 +39,11 @@ class CodeActResponseParser(ResponseParser):
        action = response.choices[0].message.content
        if action is None:
            return ''
-        for lang in ['bash', 'ipython', 'edit', 'browse']:
+        for lang in ['bash', 'ipython', 'browse']:
+            # Special handling for DeepSeek: it has a stop-word bug and returns `</execute_ipython` (missing the closing `>`) instead of `</execute_ipython>`
+            if f'</execute_{lang}' in action and f'</execute_{lang}>' not in action:
+                action = action.replace(f'</execute_{lang}', f'</execute_{lang}>')
+
            if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
                action += f'</execute_{lang}>'
        return action
@@ -161,33 +162,6 @@ class CodeActActionParserAgentDelegate(ActionParser):
        return AgentDelegateAction(agent='BrowsingAgent', inputs={'task': task})


-class CodeActActionParserFileEdit(ActionParser):
-    """Parser action:
-    - FileEditAction(diff_block) - Search/Replace block to edit.
-    """
-
-    def __init__(
-        self,
-    ):
-        self.diff_block = None
-
-    def check_condition(self, action_str: str) -> bool:
-        self.diff_block = re.search(
-            r'<execute_edit>(.*)</execute_edit>', action_str, re.DOTALL
-        )
-        return self.diff_block is not None
-
-    def parse(self, action_str: str) -> Action:
-        assert (
-            self.diff_block is not None
-        ), 'self.diff_block should not be None when parse is called'
-        thought = action_str.replace(self.diff_block.group(0), '').strip()
-        return FileEditAction(
-            diff_block=self.diff_block.group(1).strip(),
-            thought=thought,
-        )
-
-
 class CodeActActionParserMessage(ActionParser):
    """Parser action:
    - MessageAction(content) - Message action to run (e.g. ask for clarification)
@@ -5,22 +5,18 @@ from agenthub.codeact_agent.action_parser import CodeActResponseParser
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
-from openhands.core.exceptions import OperationCancelled
-from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import ImageContent, Message, TextContent
 from openhands.events.action import (
    Action,
    AgentDelegateAction,
    AgentFinishAction,
    CmdRunAction,
-    FileEditAction,
    IPythonRunCellAction,
    MessageAction,
 )
 from openhands.events.observation import (
    AgentDelegateObservation,
    CmdOutputObservation,
-    FileEditObservation,
    IPythonRunCellObservation,
    UserRejectObservation,
 )
@@ -38,7 +34,7 @@ from openhands.utils.prompt import PromptManager


 class CodeActAgent(Agent):
-    VERSION = '1.10'
+    VERSION = '1.9'
    """
    The Code Act Agent is a minimalist agent.
    The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
@@ -106,8 +102,6 @@ class CodeActAgent(Agent):
            return f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
        elif isinstance(action, AgentDelegateAction):
            return f'{action.thought}\n<execute_browse>\n{action.inputs["task"]}\n</execute_browse>'
-        elif isinstance(action, FileEditAction):
-            return f'{action.thought}\n<execute_edit>\n{action.diff_block}\n</execute_edit>'
        elif isinstance(action, MessageAction):
            return action.content
        elif isinstance(action, AgentFinishAction) and action.source == 'agent':
@@ -117,7 +111,6 @@ class CodeActAgent(Agent):
    def get_action_message(self, action: Action) -> Message | None:
        if (
            isinstance(action, AgentDelegateAction)
-            or isinstance(action, FileEditAction)
            or isinstance(action, CmdRunAction)
            or isinstance(action, IPythonRunCellAction)
            or isinstance(action, MessageAction)
@@ -158,9 +151,6 @@ class CodeActAgent(Agent):
            text = '\n'.join(splitted)
            text = truncate_content(text, max_message_chars)
            return Message(role='user', content=[TextContent(text=text)])
-        elif isinstance(obs, FileEditObservation):
-            text = obs_prefix + truncate_content(obs.content, max_message_chars)
-            return Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, AgentDelegateObservation):
            text = obs_prefix + truncate_content(
                obs.outputs['content'] if 'content' in obs.outputs else '',
@@ -172,7 +162,7 @@ class CodeActAgent(Agent):
            text += '\n[Error occurred in processing last action]'
            return Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, UserRejectObservation):
-            text = obs_prefix + truncate_content(obs.content, max_message_chars)
+            text = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
            text += '\n[Last action has been rejected by the user]'
            return Message(role='user', content=[TextContent(text=text)])
        else:
@@ -195,7 +185,6 @@ class CodeActAgent(Agent):
        - CmdRunAction(command) - bash command to run
        - IPythonRunCellAction(code) - IPython code to run
        - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
-        - FileEditAction(diff_block) - Search/Replace block to edit.
        - MessageAction(content) - Message action to run (e.g. ask for clarification)
        - AgentFinishAction() - end the interaction
        """
@@ -212,26 +201,10 @@ class CodeActAgent(Agent):
                '</execute_ipython>',
                '</execute_bash>',
                '</execute_browse>',
-                '</execute_edit>',
            ],
        }

-        if self.llm.is_caching_prompt_active():
-            params['extra_headers'] = {
-                'anthropic-beta': 'prompt-caching-2024-07-31',
-            }
-
-        # TODO: move exception handling to agent_controller
-        try:
-            response = self.llm.completion(**params)
-        except OperationCancelled as e:
-            raise e
-        except Exception as e:
-            logger.error(f'{e}')
-            error_message = '{}: {}'.format(type(e).__name__, str(e).split('\n')[0])
-            return AgentFinishAction(
-                thought=f'Agent encountered an error while processing the last action.\nError: {error_message}\nPlease try again.'
-            )
+        response = self.llm.completion(**params)

        return self.action_parser.parse(response)

@@ -19,44 +19,22 @@ the assistant should retry running the command in the background.
 The assistant can browse the Internet with <execute_browse> and </execute_browse>.
 For example, <execute_browse> Tell me the usa's president using google search </execute_browse>.
 Or <execute_browse> Tell me what is in http://example.com </execute_browse>.
-{% endset %}
-{% set EDIT_DIFF_PREFIX %}
-The assistant can edit files with <execute_edit> and </execute_edit>. Each change must be described with a SEARCH/REPLACE block.
-Every SEARCH section must EXACTLY MATCH the existing file content, character for character, including all comments, docstrings, etc. SEARCH/REPLACE blocks will replace all matching occurrences. Include enough lines to make the SEARCH blocks uniquely match the lines to change.
-Keep SEARCH/REPLACE blocks as concise as possible. Break large SEARCH/REPLACE blocks into a series of smaller blocks that each change a small portion of the file.
-To move code within a file, use 2 SEARCH/REPLACE blocks: 1 to delete it from its current location, 1 to insert it in the new location.
-If you want to put code in a new file, use a SEARCH/REPLACE block with: a new file path, an empty `SEARCH` section and the new file's contents in the `REPLACE` section.
-
-Every SEARCH/REPLACE block must use this format:
-1. The FULL file path alone on a line, verbatim. No bold asterisks, no quotes around it, no escaping of characters, etc.
-2. The start of search block: <<<<<<< SEARCH
-3. A contiguous chunk of lines to search for in the existing source code
-4. The dividing line: =======
-5. The lines to replace into the source code
-6. The end of the replace block: >>>>>>> REPLACE
-
-For example,
-<execute_edit>
-demo.py
-<<<<<<< SEARCH
-    print("hello")
-=======
-    print("goodbye")
->>>>>>> REPLACE
-</execute_edit>
-
 {% endset %}
 {% set PIP_INSTALL_PREFIX %}
 The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them.
 {% endset %}
-{% set SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + BROWSING_PREFIX + EDIT_DIFF_PREFIX + PIP_INSTALL_PREFIX %}
+{% set SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + BROWSING_PREFIX + PIP_INSTALL_PREFIX %}
 {% set COMMAND_DOCS %}
 Apart from the standard Python library, the assistant can also use the following functions (already imported) in <execute_ipython> environment:
 {{ agent_skills_docs }}
 IMPORTANT:
 - `open_file` only returns the first 100 lines of the file by default! The assistant MUST use `scroll_down` repeatedly to read the full file BEFORE making edits!
+- The assistant shall adhere to THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRING PROPER INDENTATION. If the assistant would like to add the line '        print(x)', it must fully write the line out, with all leading spaces before the code!
 - Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
 - Any code issued should be less than 50 lines to avoid context being cut off!
+- After EVERY `create_file` the method `append_file` shall be used to write the FIRST content!
+- For `edit_file_by_replace` NEVER provide empty parameters!
+- For `edit_file_by_replace` the file must be read fully before any replacements!
 {% endset %}
 {% set SYSTEM_SUFFIX %}
 Responses should be concise.
@@ -4,12 +4,23 @@
 USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.

 ASSISTANT:
-Sure! Let me write the Python code for starting a web server and save it to a new Python file `app.py`:
-<execute_edit>
-app.py
-<<<<<<< SEARCH
-=======
-from flask import Flask
+Sure! Let me create a Python file `app.py`:
+<execute_ipython>
+create_file('app.py')
+</execute_ipython>
+
+USER:
+OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+(this is the beginning of the file)
+1|
+(this is the end of the file)
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+<execute_ipython>
+EDITED_CODE="""from flask import Flask
 app = Flask(__name__)

@app.route('/')
@@ -18,17 +29,30 @@ def index():
    return str(numbers)

 if __name__ == '__main__':
-    app.run(port=5000)
->>>>>>> REPLACE
-</execute_edit>
+    app.run(port=5000)"""

+insert_content_at_line(
+  'app.py',
+  1,
+  EDITED_CODE,
+)
+</execute_ipython>

 USER:
 OBSERVATION:
-I updated the file app.py by
-appending:
-from flask import Flask
-app = Flask(__name__)
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return str(numbers)
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]

 ASSISTANT:
 I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
@@ -153,22 +177,30 @@ USER:

 ASSISTANT:
 I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
-<execute_edit>
-app.py
-<<<<<<< SEARCH
-    return str(numbers)
-=======
-    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
->>>>>>> REPLACE
-</execute_edit>
+<execute_ipython>
+edit_file_by_replace(
+  'app.py',
+  to_replace="    return str(numbers)",
+  new_content="    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'",
+)
+</execute_ipython>

 USER:
 Observation:
-I updated the file app.py by
-replacing:
-    return str(numbers)
-with:
-    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+[File: /workspace/app.py (10 lines total after edit)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]

 ASSISTANT:
 Running the updated file:
@@ -166,7 +166,6 @@ class CodeActSWEAgent(Agent):
                '</execute_ipython>',
                '</execute_bash>',
            ],
-            temperature=0.0,
        )

        return self.response_parser.parse(response)
@@ -78,7 +78,6 @@ class MicroAgent(Agent):
        message = Message(role='user', content=content)
        resp = self.llm.completion(
            messages=self.llm.format_messages_for_llm(message),
-            temperature=0.0,
        )
        action_resp = resp['choices'][0]['message']['content']
        action = parse_response(action_resp)
@@ -112,7 +112,7 @@ api_key = "your-api-key"
 #embedding_deployment_name = ""

 # Embedding model to use
-embedding_model = ""
+embedding_model = "local"

 # Maximum number of characters in an observation's content
 #max_message_chars = 10000
@@ -146,8 +146,8 @@ model = "gpt-4o"
 # Drop any unmapped (unsupported) params without causing an exception
 #drop_params = false

-# Using the prompt caching feature provided by the LLM
-#caching_prompt = false
+# Use the prompt caching feature if it is provided by the LLM and supported
+#caching_prompt = true

 # Base URL for the OLLAMA API
 #ollama_base_url = ""
@@ -188,7 +188,7 @@ model = "gpt-4o-mini"
 #memory_max_threads = 2

 # LLM config group to use
-#llm_config = 'llm'
+#llm_config = 'your-llm-config-group'

 [agent.RepoExplorerAgent]
 # Example: use a cheaper model for RepoExplorerAgent to reduce cost, especially
@@ -232,7 +232,7 @@ llm_config = 'gpt3'
 [security]

 # Enable confirmation mode
-#confirmation_mode = true
+#confirmation_mode = false

 # The security analyzer to use
 #security_analyzer = ""
@@ -37,7 +37,7 @@ ARG OPENHANDS_BUILD_VERSION #re-declare for this section
 ENV RUN_AS_OPENHANDS=true
 # A random number--we need this to be different from the user's UID on the host machine
 ENV OPENHANDS_USER_ID=42420
-ENV SANDBOX_API_HOSTNAME=host.docker.internal
+ENV SANDBOX_LOCAL_RUNTIME_URL=http://host.docker.internal
 ENV USE_HOST_NETWORK=false
 ENV WORKSPACE_BASE=/opt/workspace_base
 ENV OPENHANDS_BUILD_VERSION=$OPENHANDS_BUILD_VERSION
@@ -70,10 +70,11 @@ RUN playwright install --with-deps chromium
 COPY --chown=openhands:app --chmod=770 ./openhands ./openhands
 COPY --chown=openhands:app --chmod=777 ./openhands/runtime/plugins ./openhands/runtime/plugins
 COPY --chown=openhands:app --chmod=770 ./agenthub ./agenthub
-COPY --chown=openhands:app --chmod=770 ./pyproject.toml ./pyproject.toml
-COPY --chown=openhands:app --chmod=770 ./poetry.lock ./poetry.lock
-COPY --chown=openhands:app --chmod=770 ./README.md ./README.md
-COPY --chown=openhands:app --chmod=770 ./MANIFEST.in ./MANIFEST.in
+COPY --chown=openhands:app ./pyproject.toml ./pyproject.toml
+COPY --chown=openhands:app ./poetry.lock ./poetry.lock
+COPY --chown=openhands:app ./README.md ./README.md
+COPY --chown=openhands:app ./MANIFEST.in ./MANIFEST.in
+COPY --chown=openhands:app ./LICENSE ./LICENSE

 # This is run as "openhands" user, and will create __pycache__ with openhands:openhands ownership
 RUN python openhands/core/download.py # No-op to download assets
@@ -1,13 +1,40 @@
 #!/bin/bash
 set -eo pipefail

-image_name=$1
-org_name=$2
+# Initialize variables with default values
+image_name=""
+org_name=""
 push=0
-if [[ $3 == "--push" ]]; then
-  push=1
+load=0
+tag_suffix=""
+
+# Function to display usage information
+usage() {
+    echo "Usage: $0 -i <image_name> [-o <org_name>] [--push] [--load] [-t <tag_suffix>]"
+    echo "  -i: Image name (required)"
+    echo "  -o: Organization name"
+    echo "  --push: Push the image"
+    echo "  --load: Load the image"
+    echo "  -t: Tag suffix"
+    exit 1
+}
+
+# Parse command-line options
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -i) image_name="$2"; shift 2 ;;
+        -o) org_name="$2"; shift 2 ;;
+        --push) push=1; shift ;;
+        --load) load=1; shift ;;
+        -t) tag_suffix="$2"; shift 2 ;;
+        *) usage ;;
+    esac
+done
+# Check if required arguments are provided
+if [[ -z "$image_name" ]]; then
+    echo "Error: Image name is required."
+    usage
 fi
-tag_suffix=$4

 echo "Building: $image_name"
 tags=()
@@ -95,14 +122,35 @@ if [[ $push -eq 1 ]]; then
  args+=" --cache-to=type=registry,ref=$DOCKER_REPOSITORY:$cache_tag,mode=max"
 fi

+if [[ $load -eq 1 ]]; then
+  args+=" --load"
+fi
+
 echo "Args: $args"

+# Modify the platform selection based on --load flag
+if [[ $load -eq 1 ]]; then
+  # When loading, build only for the current platform
+  platform=$(docker version -f '{{.Server.Os}}/{{.Server.Arch}}')
+else
+  # For push or without load, build for multiple platforms
+  platform="linux/amd64,linux/arm64"
+fi
+
+echo "Building for platform(s): $platform"
+
 docker buildx build \
  $args \
  --build-arg OPENHANDS_BUILD_VERSION="$OPENHANDS_BUILD_VERSION" \
  --cache-from=type=registry,ref=$DOCKER_REPOSITORY:$cache_tag \
  --cache-from=type=registry,ref=$DOCKER_REPOSITORY:$cache_tag_base-main \
-  --platform linux/amd64,linux/arm64 \
+  --platform $platform \
  --provenance=false \
  -f "$dir/Dockerfile" \
  "$DOCKER_BASE_DIR"
+
+# If load was requested, print the loaded images
+if [[ $load -eq 1 ]]; then
+  echo "Local images built:"
+  docker images "$DOCKER_REPOSITORY" --format "{{.Repository}}:{{.Tag}}"
+fi
@@ -1,44 +0,0 @@
-FROM ubuntu:22.04
-
-# install basic packages
-RUN apt-get update && apt-get install -y \
-    curl \
-    wget \
-    git \
-    vim \
-    nano \
-    unzip \
-    zip \
-    python3 \
-    python3-pip \
-    python3-venv \
-    python3-dev \
-    build-essential \
-    openssh-server \
-    sudo \
-    gcc \
-    jq \
-    g++ \
-    make \
-    iproute2 \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN mkdir -p -m0755 /var/run/sshd
-
-# symlink python3 to python
-RUN ln -s /usr/bin/python3 /usr/bin/python
-
-# ==== OpenHands Runtime Client ====
-RUN mkdir -p /openhands && mkdir -p /openhands/logs && chmod 777 /openhands/logs
-RUN wget --progress=bar:force -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
-RUN bash Miniforge3.sh -b -p /openhands/miniforge3
-RUN chmod -R g+w /openhands/miniforge3
-RUN bash -c ". /openhands/miniforge3/etc/profile.d/conda.sh && conda config --set changeps1 False && conda config --append channels conda-forge"
-RUN echo "" > /openhands/bash.bashrc
-RUN rm -f Miniforge3.sh
-
-# - agentskills dependencies
-RUN /openhands/miniforge3/bin/pip install --upgrade pip
-RUN /openhands/miniforge3/bin/pip install jupyterlab notebook jupyter_kernel_gateway flake8
-RUN /openhands/miniforge3/bin/pip install python-docx PyPDF2 python-pptx pylatexenc openai
-RUN /openhands/miniforge3/bin/pip install python-dotenv toml termcolor pydantic python-docx pyyaml docker pexpect tenacity e2b browsergym minio
@@ -1,4 +0,0 @@
-DOCKER_REGISTRY=ghcr.io
-DOCKER_ORG=all-hands-ai
-DOCKER_IMAGE=sandbox
-DOCKER_BASE_DIR="."
@@ -6,9 +6,9 @@ sidebar_position: 2

 ## System Requirements

-* Docker version 26.0.0+ or Docker Desktop 4.31.0+
-* You must be using Linux or Mac OS
-  * If you are on Windows, you must use [WSL](https://learn.microsoft.com/en-us/windows/wsl/install)
+* Docker version 26.0.0+ or Docker Desktop 4.31.0+.
+* You must be using Linux or Mac OS.
+  * If you are on Windows, you must use [WSL](https://learn.microsoft.com/en-us/windows/wsl/install).

 ## Installation

@@ -18,6 +18,8 @@ existing code that you'd like to modify.
 ```bash
 export WORKSPACE_BASE=$(pwd)/workspace

+docker pull ghcr.io/all-hands-ai/runtime:0.9-nikolaik
+
 docker run -it --pull=always \
    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.9-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
@@ -30,8 +32,7 @@ docker run -it --pull=always \
    ghcr.io/all-hands-ai/openhands:0.9
 ```

-You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode),
-or as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode).
+You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode), as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode), or using the [OpenHands GitHub Action](https://docs.all-hands.dev/modules/usage/how-to/github-action).

 ## Setup

@@ -54,7 +55,7 @@ The `Advanced Options` also allow you to specify a `Base URL` if required.

 ## Versions

-The command above pulls the `0.9` tag, which represents the most recent stable release of OpenHands. You have other options as well:
+The command above pulls the most recent stable release of OpenHands. You have other options as well:
 - For a specific release, use `ghcr.io/all-hands-ai/openhands:$VERSION`, replacing $VERSION with the version number.
 - We use semver, and release major, minor, and patch tags. So `0.9` will automatically point to the latest `0.9.x` release, and `0` will point to the latest `0.x.x` release.
 - For the most up-to-date development version, you can use `ghcr.io/all-hands-ai/openhands:main`. This version is unstable and is recommended for testing or development purposes only.
@@ -8,7 +8,7 @@ This mode is different from the [headless mode](headless-mode), which is non-int

 To start an interactive OpenHands session via the command line, follow these steps:

-1. Ensure you have followed the [Development setup instructions](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md)
+1. Ensure you have followed the [Development setup instructions](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).

 2. Run the following command:

@@ -9,8 +9,8 @@ as python and Node.js but your use case may need additional software installed b

 There are two ways you can do so:

-1. Use an existing image from docker hub
-2. Creating your own custom docker image and using it
+1. Use an existing image from Docker Hub.
+2. Create your own custom Docker image and use it.

 If you want to take the first approach, you can skip the `Create Your Docker Image` section.

@@ -0,0 +1,71 @@
+# Debugging
+
+The following is intended as a primer on debugging OpenHands for development purposes.
+
+## Server / VSCode
+
+The following `launch.json` will allow debugging the agent, controller and server elements, but not the sandbox (which runs inside Docker). It will ignore any changes inside the `workspace/` directory:
+
+```
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "OpenHands CLI",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "openhands.core.cli",
+            "justMyCode": false
+        },
+        {
+            "name": "OpenHands WebApp",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "uvicorn",
+            "args": [
+                "openhands.server.listen:app",
+                "--reload",
+                "--reload-exclude",
+                "${workspaceFolder}/workspace",
+                "--port",
+                "3000"
+            ],
+            "justMyCode": false
+        }
+    ]
+}
+```
+
+More specific debugging configurations which include more parameters may be specified:
+
+```
+    ...
+    {
+      "name": "Debug CodeAct",
+      "type": "debugpy",
+      "request": "launch",
+      "module": "openhands.core.main",
+      "args": [
+        "-t",
+        "Ask me what your task is.",
+        "-d",
+        "${workspaceFolder}/workspace",
+        "-c",
+        "CodeActAgent",
+        "-l",
+        "llm.o1",
+        "-n",
+        "prompts"
+      ],
+      "justMyCode": false
+    }
+    ...
+```
+
+Values in the snippet above can be updated such that:
+
+* `-t`: the task
+* `-d`: the OpenHands workspace directory
+* `-c`: the agent
+* `-l`: the LLM config (pre-defined in `config.toml`)
+* `-n`: the session name (e.g. eventstream name)
@@ -0,0 +1,15 @@
+# Using the OpenHands GitHub Action
+
+This guide explains how to use the OpenHands GitHub Action, both within the OpenHands repository and in your own projects.
+
+## Using the Action in the OpenHands Repository
+
+To use the OpenHands GitHub Action in the OpenHands repository, an OpenHands maintainer can:
+
+1. Create an issue in the repository.
+2. Add the `fix-me` label to the issue.
+3. The action will automatically trigger and attempt to resolve the issue.
+
+## Installing the Action in a New Repository
+
+To install the OpenHands GitHub Action in your own repository, follow the [directions in the OpenHands Resolver repo](https://github.com/All-Hands-AI/OpenHands-resolver?tab=readme-ov-file#using-the-github-actions-workflow).
@@ -177,6 +177,7 @@ spec:
      claimName: docker-pvc
 ```

+
 ```bash
 # create the pod
 $ oc create -f pod.yaml
@@ -262,3 +263,167 @@ Events:                   <none>
 6. Connect to OpenHands UI, configure the Agent, then test:

 ![image](https://github.com/user-attachments/assets/12f94804-a0c7-4744-b873-e003c9caf40e)
+
+
+
+## GCP GKE OpenHands Deployment
+
+**Warning**: this deployment grants the OpenHands application access to the Kubernetes docker socket, which creates a security risk. Use at your own discretion.
+1- Create a policy for privileged access
+2- Create GKE credentials (optional)
+3- Create the OpenHands deployment
+4- Verification and UI access commands
+5- Troubleshooting pod to verify the internal container
+
+1. Create a policy for privileged access
+```yaml
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: privileged-role
+rules:
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["create", "get", "list", "watch", "delete"]
+- apiGroups: ["apps"]
+  resources: ["deployments"]
+  verbs: ["create", "get", "list", "watch", "delete"]
+- apiGroups: [""]
+  resources: ["pods/exec"]
+  verbs: ["create"]
+- apiGroups: [""]
+  resources: ["pods/log"]
+  verbs: ["get"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: privileged-role-binding
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: privileged-role
+subjects:
+- kind: ServiceAccount
+  name: default  # Change to your service account name
+  namespace: default
+```
+2. Create GKE credentials (optional)
+```bash
+kubectl create secret generic google-cloud-key \
+  --from-file=key.json=/path/to/your/google-cloud-key.json
+  ```
+3. Create the OpenHands deployment
+## This has been tested on a single worker node. If you have multiple worker nodes, specify the flag for a single worker.
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: openhands-app-2024
+  labels:
+    app: openhands-app-2024
+spec:
+  replicas: 1  # You can increase this number for multiple replicas
+  selector:
+    matchLabels:
+      app: openhands-app-2024
+  template:
+    metadata:
+      labels:
+        app: openhands-app-2024
+    spec:
+      containers:
+      - name: openhands-app-2024
+        image: ghcr.io/all-hands-ai/openhands:main
+        env:
+        - name: SANDBOX_USER_ID
+          value: "1000"
+        - name: SANDBOX_API_HOSTNAME
+          value: '10.164.0.4'
+        - name: WORKSPACE_MOUNT_PATH
+          value: "/tmp/workspace_base"
+        - name: GOOGLE_APPLICATION_CREDENTIALS
+          value: "/tmp/workspace_base/google-cloud-key.json"
+        volumeMounts:
+        - name: workspace-volume
+          mountPath: /tmp/workspace_base
+        - name: docker-sock
+          mountPath: /var/run/docker.sock
+        - name: google-credentials
+          mountPath: "/tmp/workspace_base/google-cloud-key.json"
+        securityContext:
+          privileged: true  # Add this to allow privileged access
+        ports:
+        - containerPort: 3000
+      - name: openhands-sandbox-2024
+        image: ghcr.io/opendevin/sandbox:main
+    #    securityContext:
+    #      privileged: true  # Add this to allow privileged access
+        ports:
+        - containerPort: 51963
+        command: ["/usr/sbin/sshd", "-D", "-p 51963", "-o", "PermitRootLogin=yes"]
+      volumes:
+      #- name: workspace-volume
+      #  persistentVolumeClaim:
+      #    claimName: workspace-pvc
+      - name: workspace-volume
+        emptyDir: {}
+      - name: docker-sock
+        hostPath:
+          path: /var/run/docker.sock       # Use host's Docker socket
+          type: Socket
+      - name: google-credentials
+        secret:
+          secretName: google-cloud-key
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: openhands-app-2024-svc
+spec:
+  selector:
+    app: openhands-app-2024
+  ports:
+  - name: http
+    protocol: TCP
+    port: 80
+    targetPort: 3000
+  - name: ssh
+    protocol: TCP
+    port: 51963
+    targetPort: 51963
+  type: LoadBalancer
+  ```
+
+5. Troubleshooting pod to verify the internal container
+### If you want to know more about the internal container runtime, use the pod deployment below. Use `kubectl exec -it` to enter the container, then check the container runtime using normal Docker commands such as `docker ps -a`.
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: docker-in-docker
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: docker-in-docker
+  template:
+    metadata:
+      labels:
+        app: docker-in-docker
+    spec:
+      containers:
+      - name: dind
+        image: docker:20.10-dind
+        securityContext:
+          privileged: true
+        volumeMounts:
+        - name: docker-sock
+          mountPath: /var/run/docker.sock
+      volumes:
+      - name: docker-sock
+        hostPath:
+          path: /var/run/docker.sock
+          type: Socket
+```
@@ -8,8 +8,8 @@ When running OpenHands, you'll need to set the following in the OpenHands UI thr
 * `LLM Provider` to `Groq`
 * `LLM Model` to the model you will be using. [Visit here to see the list of
 models that Groq hosts](https://console.groq.com/docs/models). If the model is not in the list, toggle
-`Advanced Options`, and enter it in `Custom Model` (e.g. groq/&lt;model-name&gt; like `groq/llama3-70b-8192`)
-* `API key` to your Groq API key. To find or create your Groq API Key, [see here](https://console.groq.com/keys)
+`Advanced Options`, and enter it in `Custom Model` (e.g. groq/&lt;model-name&gt; like `groq/llama3-70b-8192`).
+* `API key` to your Groq API key. To find or create your Groq API Key, [see here](https://console.groq.com/keys).



@@ -7,11 +7,10 @@ sidebar_position: 3
 OpenHands can connect to any LLM supported by LiteLLM. However, it requires a powerful model to work.
 The following are verified by the community to work with OpenHands:

-* claude-3-5-sonnet
+* claude-3-5-sonnet (recommended)
 * gemini-1.5-pro / gemini-1.5-flash
 * gpt-4 / gpt-4o
 * llama-3.1-405b / hermes-3-llama-3.1-405b
-* wizardlm-2-8x22b

 :::warning
 OpenHands will issue many prompts to the LLM you configure. Most of these LLMs cost money, so be sure to set spending
@@ -59,9 +58,21 @@ We have a few guides for running OpenHands with specific model providers:

 ### API retries and rate limits

-Some LLMs have rate limits and may require retries. OpenHands will automatically retry requests if it receives a 429 error or API connection error.
-You can set the following environment variables to control the number of retries and the time between retries:
+LLM providers typically have rate limits, sometimes very low, and may require retries. OpenHands will automatically retry requests if it receives a Rate Limit Error (429 error code), API connection error, or other transient errors.
+
+You can customize these options as you need for the provider you're using. Check their documentation, and set the following environment variables to control the number of retries and the time between retries:

 * `LLM_NUM_RETRIES` (Default of 8)
 * `LLM_RETRY_MIN_WAIT` (Default of 15 seconds)
 * `LLM_RETRY_MAX_WAIT` (Default of 120 seconds)
+* `LLM_RETRY_MULTIPLIER` (Default of 2)
+
+If you are running OpenHands in development mode, you can also set these options in the `config.toml` file:
+
+```toml
+[llm]
+num_retries = 8
+retry_min_wait = 15
+retry_max_wait = 120
+retry_multiplier = 2
+```
@@ -0,0 +1,39 @@
+# Prompting Best Practices
+
+When working with the OpenHands AI software developer, it's crucial to provide clear and effective prompts. This guide outlines best practices for creating prompts that will yield the most accurate and useful responses.
+
+## Characteristics of Good Prompts
+
+Good prompts are:
+
+1. **Concrete**: They explain exactly what functionality should be added or what error needs to be fixed.
+2. **Location-specific**: If known, they explain the locations in the code base that should be modified.
+3. **Appropriately scoped**: They should be the size of a single feature, typically not exceeding 100 lines of code.
+
+## Examples
+
+### Good Prompt Examples
+
+1. "Add a function `calculate_average` in `utils/math_operations.py` that takes a list of numbers as input and returns their average."
+
+2. "Fix the TypeError in `frontend/src/components/UserProfile.tsx` occurring on line 42. The error suggests we're trying to access a property of undefined."
+
+3. "Implement input validation for the email field in the registration form. Update `frontend/src/components/RegistrationForm.tsx` to check if the email is in a valid format before submission."
+
+### Bad Prompt Examples
+
+1. "Make the code better." (Too vague, not concrete)
+
+2. "Rewrite the entire backend to use a different framework." (Not appropriately scoped)
+
+3. "There's a bug somewhere in the user authentication. Can you find and fix it?" (Lacks specificity and location information)
+
+## Tips for Effective Prompting
+
+1. Be as specific as possible about the desired outcome or the problem to be solved.
+2. Provide context, including relevant file paths and line numbers if available.
+3. Break down large tasks into smaller, manageable prompts.
+4. Include any relevant error messages or logs.
+5. Specify the programming language or framework if it's not obvious from the context.
+
+Remember, the more precise and informative your prompt is, the better the AI can assist you in developing or modifying the OpenHands software.
@@ -10,44 +10,81 @@ const sidebars: SidebarsConfig = {
    },
    {
      type: 'category',
-      label: 'LLMs',
+      label: 'Usage Methods',
      items: [
        {
          type: 'doc',
-          label: 'Overview',
-          id: 'usage/llms/llms',
+          label: 'CLI Mode',
+          id: 'usage/how-to/cli-mode',
        },
+        {
+          type: 'doc',
+          label: 'Headless Mode',
+          id: 'usage/how-to/headless-mode',
+        },
+        {
+          type: 'doc',
+          label: 'Github Actions',
+          id: 'usage/how-to/github-action',
+        },
+        {
+          type: 'doc',
+          label: 'Prompting Best Practices',
+          id: 'usage/prompting-best-practices',
+        },
+      ],
+    },
+    {
+      type: 'category',
+      label: 'Advanced Configuration',
+      items: [
        {
          type: 'category',
-          label: 'Providers',
+          label: 'LLM Configuration',
          items: [
            {
              type: 'doc',
-              label: 'Azure',
-              id: 'usage/llms/azure-llms',
+              label: 'Overview',
+              id: 'usage/llms/llms',
            },
            {
-              type: 'doc',
-              label: 'Google',
-              id: 'usage/llms/google-llms',
-            },
-            {
-              type: 'doc',
-              label: 'Groq',
-              id: 'usage/llms/groq',
-            },
-            {
-              type: 'doc',
-              label: 'OpenAI',
-              id: 'usage/llms/openai-llms',
-            },
-            {
-              type: 'doc',
-              label: 'OpenRouter',
-              id: 'usage/llms/openrouter',
+              type: 'category',
+              label: 'Providers',
+              items: [
+                {
+                  type: 'doc',
+                  label: 'Azure',
+                  id: 'usage/llms/azure-llms',
+                },
+                {
+                  type: 'doc',
+                  label: 'Google',
+                  id: 'usage/llms/google-llms',
+                },
+                {
+                  type: 'doc',
+                  label: 'Groq',
+                  id: 'usage/llms/groq',
+                },
+                {
+                  type: 'doc',
+                  label: 'OpenAI',
+                  id: 'usage/llms/openai-llms',
+                },
+                {
+                  type: 'doc',
+                  label: 'OpenRouter',
+                  id: 'usage/llms/openrouter',
+                },
+              ],
            },
          ],
        },
+        {
+          type: 'doc',
+          label: 'Custom Sandbox',
+          id: 'usage/how-to/custom-sandbox-guide',
+        },
      ],
    },
    {
@@ -62,44 +99,39 @@ const sidebars: SidebarsConfig = {
    },
    {
      type: 'category',
-      label: 'How-to Guides',
+      label: 'For OpenHands Developers',
      items: [
        {
-          type: 'doc',
-          id: 'usage/how-to/cli-mode',
+          type: 'category',
+          label: 'Architecture',
+          items: [
+            {
+              type: 'doc',
+              label: 'Backend',
+              id: 'usage/architecture/backend',
+            },
+            {
+              type: 'doc',
+              label: 'Runtime',
+              id: 'usage/architecture/runtime',
+            },
+          ],
        },
        {
          type: 'doc',
-          id: 'usage/how-to/headless-mode',
-        },
-        {
-          type: 'doc',
-          id: 'usage/how-to/custom-sandbox-guide',
+          label: 'Debugging',
+          id: 'usage/how-to/debugging',
        },
        {
          type: 'doc',
+          label: 'Evaluation',
          id: 'usage/how-to/evaluation-harness',
        },
        {
          type: 'doc',
+          label: 'Kubernetes Deployment',
          id: 'usage/how-to/openshift-example',
-        }
-      ]
-    },
-    {
-      type: 'category',
-      label: 'Architecture',
-      items: [
-        {
-          type: 'doc',
-          label: 'Backend',
-          id: 'usage/architecture/backend',
        },
-        {
-          type: 'doc',
-          label: 'Runtime',
-          id: 'usage/architecture/runtime',
-        }
      ],
    },
    {
@@ -256,7 +256,6 @@ def process_instance(
 if __name__ == '__main__':
    args = parse_arguments()
    dataset = load_dataset('RajMaheshwari/Exercism-Python')
-    dataset = dataset.shuffle(seed=42)
    aider_bench_tests = dataset['train'].to_pandas()

    llm_config = None
@@ -28,7 +28,9 @@ When the `run_infer.sh` script is started, it will automatically pull the releva

 ```bash
 ./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
-# e.g., ./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 300 30 1 princeton-nlp/SWE-bench_Lite test
+
+# Example
+./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 300 30 1 princeton-nlp/SWE-bench_Lite test
 ```

 where `model_config` is mandatory, and the rest are optional.
@@ -68,10 +70,11 @@ then your command would be:
 This is in limited beta. Contact Xingyao over slack if you want to try this out!

 ```bash
-# ./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
-ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" \
+./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
+
+# Example - This runs evaluation on CodeActAgent for 300 instances of the "princeton-nlp/SWE-bench_Lite" test set, with at most 30 iterations per instance and 16 workers running in parallel
+ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
 ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test
-# This example runs evaluation on CodeActAgent for 300 instances on "princeton-nlp/SWE-bench_Lite"'s test set, with max 30 iteration per instances, with 16 number of workers running in parallel
 ```

 To clean-up all existing runtime you've already started, run:
@@ -133,8 +136,9 @@ NOTE, you should have already pulled the instance-level OR env-level docker imag
 Then you can run the following:

 ```bash
-# ./evaluation/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL [instance_id] [dataset_name] [split]
-# For example:
+./evaluation/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL [instance_id] [dataset_name] [split]
+
+# Example
 ./evaluation/swe_bench/scripts/eval_infer.sh evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl
 ```

@@ -162,9 +166,11 @@ The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_be
 This is in limited beta. Contact Xingyao over slack if you want to try this out!

 ```bash
-# ./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers]
-ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/outputs/swe_bench_lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
-# This example evaluate patches generated by CodeActAgent on Llama-3.1-70B-Instruct-Turbo on "princeton-nlp/SWE-bench_Lite"'s test set, with 16 number of workers running in parallel
+./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers] [dataset] [dataset_split]
+
+# Example - This evaluates patches generated by CodeActAgent with Llama-3.1-70B-Instruct-Turbo on the "princeton-nlp/SWE-bench_Lite" test set, using 16 workers running in parallel
+ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
+evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/swe-bench-lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
 ```

 To clean-up all existing runtimes that you've already started, run:
@@ -81,6 +81,7 @@ def get_config(instance: pd.Series) -> AppConfig:
            # large enough timeout, since some testcases take very long to run
            timeout=1800,
            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
        ),
        # do not mount workspace
        workspace_base=None,
@@ -131,6 +131,7 @@ def get_config(
            # large enough timeout, since some testcases take very long to run
            timeout=300,
            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
        ),
        # do not mount workspace
        workspace_base=None,
@@ -2,10 +2,10 @@


 # API base URL
-BASE_URL="https://api.all-hands.dev/v0"
+BASE_URL="https://runtime.eval.all-hands.dev"

 # Get the list of runtimes
-response=$(curl --silent --location --request GET "${BASE_URL}/runtime/list" \
+response=$(curl --silent --location --request GET "${BASE_URL}/list" \
  --header "X-API-Key: ${ALLHANDS_API_KEY}")

 n_runtimes=$(echo $response | jq -r '.total')
@@ -16,7 +16,7 @@ runtime_ids=$(echo $response | jq -r '.runtimes | .[].runtime_id')
 counter=1
 for runtime_id in $runtime_ids; do
  echo "Stopping runtime ${counter}/${n_runtimes}: ${runtime_id}"
-  curl --silent --location --request POST "${BASE_URL}/runtime/stop" \
+  curl --silent --location --request POST "${BASE_URL}/stop" \
    --header "X-API-Key: ${ALLHANDS_API_KEY}" \
    --header "Content-Type: application/json" \
    --data-raw "{\"runtime_id\": \"${runtime_id}\"}"
@@ -106,7 +106,7 @@ if [ -z "$INSTANCE_ID" ]; then
        rm -rf $RESULT_OUTPUT_DIR/eval_outputs
    fi

-    mv run_instance_logs/$RUN_ID/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR
+    mv logs/run_evaluation/$RUN_ID/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR
    mv $RESULT_OUTPUT_DIR/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR/eval_outputs
    echo "RUN_ID: $RUN_ID" > $RESULT_OUTPUT_DIR/run_id.txt

@@ -40,9 +40,9 @@ class EvalMetadata(BaseModel):
    def model_dump_json(self, *args, **kwargs):
        dumped = super().model_dump_json(*args, **kwargs)
        dumped_dict = json.loads(dumped)
-        logger.debug(f'Dumped metadata: {dumped_dict}')
        # avoid leaking sensitive information
        dumped_dict['llm_config'] = self.llm_config.to_safe_dict()
+        logger.debug(f'Dumped metadata: {dumped_dict}')
        return json.dumps(dumped_dict)


@@ -61,7 +61,7 @@ class EvalOutput(BaseModel):
    history: (
        list[dict[str, Any]] | list[tuple[dict[str, Any], dict[str, Any]]] | None
    ) = None
-    llm_completions: list[dict[str, Any]]
+    llm_completions: list[dict[str, Any]] | None = None
    metrics: dict[str, Any] | None = None
    error: str | None = None

@@ -375,18 +375,27 @@ def reset_logger_for_multiprocessing(
    # Remove all existing handlers from logger
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
-    # add back the console handler to print ONE line
-    logger.addHandler(get_console_handler())
+
+    # add console handler to print ONE line
+    console_handler = get_console_handler(log_level=logging.INFO)
+    console_handler.setFormatter(
+        logging.Formatter(
+            f'Instance {instance_id} - ' + '%(asctime)s - %(levelname)s - %(message)s'
+        )
+    )
+    logger.addHandler(console_handler)
    logger.info(
        f'Starting evaluation for instance {instance_id}.\n'
        f'Hint: run "tail -f {log_file}" to see live logs in a separate shell'
    )
-    # Remove all existing handlers from logger
-    for handler in logger.handlers[:]:
-        logger.removeHandler(handler)
+    # Only log WARNING or higher to console
+    console_handler.setLevel(logging.WARNING)
+
+    # Log INFO and above to file
    os.makedirs(os.path.dirname(log_file), exist_ok=True)
    file_handler = logging.FileHandler(log_file)
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    )
+    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)
@@ -1,16 +1,16 @@
 {
  "name": "openhands-frontend",
-  "version": "0.9.4",
+  "version": "0.9.7",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "openhands-frontend",
-      "version": "0.9.4",
+      "version": "0.9.7",
      "dependencies": {
        "@monaco-editor/react": "^4.6.0",
        "@nextui-org/react": "^2.4.8",
-        "@react-types/shared": "^3.24.1",
+        "@react-types/shared": "^3.25.0",
        "@reduxjs/toolkit": "^2.2.7",
        "@vitejs/plugin-react": "^4.3.1",
        "@xterm/addon-fit": "^0.10.0",
@@ -33,7 +33,7 @@
        "react-syntax-highlighter": "^15.5.0",
        "remark-gfm": "^4.0.0",
        "tailwind-merge": "^2.5.2",
-        "vite": "^5.4.7",
+        "vite": "^5.4.8",
        "web-vitals": "^3.5.2"
      },
      "devDependencies": {
@@ -41,8 +41,8 @@
        "@testing-library/jest-dom": "^6.5.0",
        "@testing-library/react": "^16.0.1",
        "@testing-library/user-event": "^14.5.2",
-        "@types/node": "^22.6.1",
-        "@types/react": "^18.3.8",
+        "@types/node": "^22.7.4",
+        "@types/react": "^18.3.10",
        "@types/react-dom": "^18.3.0",
        "@types/react-highlight": "^0.12.8",
        "@types/react-syntax-highlighter": "^15.5.13",
@@ -60,11 +60,11 @@
        "eslint-plugin-react": "^7.35.0",
        "eslint-plugin-react-hooks": "^4.6.2",
        "husky": "^9.1.6",
-        "jsdom": "^25.0.0",
+        "jsdom": "^25.0.1",
        "lint-staged": "^15.2.10",
        "postcss": "^8.4.47",
        "prettier": "^3.3.3",
-        "tailwindcss": "^3.4.12",
+        "tailwindcss": "^3.4.13",
        "typescript": "^5.6.2",
        "vite-tsconfig-paths": "^5.0.1",
        "vitest": "^1.6.0"
@@ -4336,9 +4336,9 @@
      }
    },
    "node_modules/@react-types/shared": {
-      "version": "3.24.1",
-      "resolved": "https://registry.npmjs.org/@react-types/shared/-/shared-3.24.1.tgz",
-      "integrity": "sha512-AUQeGYEm/zDTN6zLzdXolDxz3Jk5dDL7f506F07U8tBwxNNI3WRdhU84G0/AaFikOZzDXhOZDr3MhQMzyE7Ydw==",
+      "version": "3.25.0",
+      "resolved": "https://registry.npmjs.org/@react-types/shared/-/shared-3.25.0.tgz",
+      "integrity": "sha512-OZSyhzU6vTdW3eV/mz5i6hQwQUhkRs7xwY2d1aqPvTdMe0+2cY7Fwp45PAiwYLEj73i9ro2FxF9qC4DvHGSCgQ==",
      "peerDependencies": {
        "react": "^16.8.0 || ^17.0.0-rc.1 || ^18.0.0 || ^19.0.0"
      }
@@ -4860,9 +4860,9 @@
      "integrity": "sha512-nG96G3Wp6acyAgJqGasjODb+acrI7KltPiRxzHPXnP3NgI28bpQDRv53olbqGXbfcgF5aiiHmO3xpwEpS5Ld9g=="
    },
    "node_modules/@types/node": {
-      "version": "22.6.1",
-      "resolved": "https://registry.npmjs.org/@types/node/-/node-22.6.1.tgz",
-      "integrity": "sha512-V48tCfcKb/e6cVUigLAaJDAILdMP0fUW6BidkPK4GpGjXcfbnoHasCZDwz3N3yVt5we2RHm4XTQCpv0KJz9zqw==",
+      "version": "22.7.4",
+      "resolved": "https://registry.npmjs.org/@types/node/-/node-22.7.4.tgz",
+      "integrity": "sha512-y+NPi1rFzDs1NdQHHToqeiX2TIS79SWEAw9GYhkkx8bD0ChpfqC+n2j5OXOCpzfojBEBt6DnEnnG9MY0zk1XLg==",
      "devOptional": true,
      "dependencies": {
        "undici-types": "~6.19.2"
@@ -4874,9 +4874,9 @@
      "integrity": "sha512-5zvhXYtRNRluoE/jAp4GVsSduVUzNWKkOZrCDBWYtE7biZywwdC2AcEzg+cSMLFRfVgeAFqpfNabiPjxFddV1Q=="
    },
    "node_modules/@types/react": {
-      "version": "18.3.8",
-      "resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.8.tgz",
-      "integrity": "sha512-syBUrW3/XpnW4WJ41Pft+I+aPoDVbrBVQGEnbD7NijDGlVC+8gV/XKRY+7vMDlfPpbwYt0l1vd/Sj8bJGMbs9Q==",
+      "version": "18.3.10",
+      "resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.10.tgz",
+      "integrity": "sha512-02sAAlBnP39JgXwkAq3PeU9DVaaGpZyF3MGcC0MKgQVkZor5IiiDAipVaxQHtDJAmO4GIy/rVBy/LzVj76Cyqg==",
      "dependencies": {
        "@types/prop-types": "*",
        "csstype": "^3.0.2"
@@ -6160,23 +6160,17 @@
      }
    },
    "node_modules/cssstyle": {
-      "version": "4.0.1",
-      "resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-4.0.1.tgz",
-      "integrity": "sha512-8ZYiJ3A/3OkDd093CBT/0UKDWry7ak4BdPTFP2+QEP7cmhouyq/Up709ASSj2cK02BbZiMgk7kYjZNS4QP5qrQ==",
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-4.1.0.tgz",
+      "integrity": "sha512-h66W1URKpBS5YMI/V8PyXvTMFT8SupJ1IzoIV8IeBC/ji8WVmrO8dGlTi+2dh6whmdk6BiKJLD/ZBkhWbcg6nA==",
      "dev": true,
      "dependencies": {
-        "rrweb-cssom": "^0.6.0"
+        "rrweb-cssom": "^0.7.1"
      },
      "engines": {
        "node": ">=18"
      }
    },
-    "node_modules/cssstyle/node_modules/rrweb-cssom": {
-      "version": "0.6.0",
-      "resolved": "https://registry.npmjs.org/rrweb-cssom/-/rrweb-cssom-0.6.0.tgz",
-      "integrity": "sha512-APM0Gt1KoXBz0iIkkdB/kfvGOwC4UuJFeG/c+yV7wSc7q96cG/kJ0HiYCnzivD9SB53cLV1MlHFNfOuPaadYSw==",
-      "dev": true
-    },
    "node_modules/csstype": {
      "version": "3.1.3",
      "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.3.tgz",
@@ -8872,12 +8866,12 @@
      }
    },
    "node_modules/jsdom": {
-      "version": "25.0.0",
-      "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-25.0.0.tgz",
-      "integrity": "sha512-OhoFVT59T7aEq75TVw9xxEfkXgacpqAhQaYgP9y/fDqWQCMB/b1H66RfmPm/MaeaAIU9nDwMOVTlPN51+ao6CQ==",
+      "version": "25.0.1",
+      "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-25.0.1.tgz",
+      "integrity": "sha512-8i7LzZj7BF8uplX+ZyOlIz86V6TAsSs+np6m1kpW9u0JWi4z/1t+FzcK1aek+ybTnAC4KhBL4uXCNT0wcUIeCw==",
      "dev": true,
      "dependencies": {
-        "cssstyle": "^4.0.1",
+        "cssstyle": "^4.1.0",
        "data-urls": "^5.0.0",
        "decimal.js": "^10.4.3",
        "form-data": "^4.0.0",
@@ -8890,7 +8884,7 @@
        "rrweb-cssom": "^0.7.1",
        "saxes": "^6.0.0",
        "symbol-tree": "^3.2.4",
-        "tough-cookie": "^4.1.4",
+        "tough-cookie": "^5.0.0",
        "w3c-xmlserializer": "^5.0.0",
        "webidl-conversions": "^7.0.0",
        "whatwg-encoding": "^3.1.1",
@@ -11061,12 +11055,6 @@
        "url": "https://github.com/sponsors/wooorm"
      }
    },
-    "node_modules/psl": {
-      "version": "1.9.0",
-      "resolved": "https://registry.npmjs.org/psl/-/psl-1.9.0.tgz",
-      "integrity": "sha512-E/ZsdU4HLs/68gYzgGTkMicWTLPdAftJLfJFlLUAAKZGkStNU72sZjT66SnMDVOfOWY/YAoiD7Jxa9iHvngcag==",
-      "dev": true
-    },
    "node_modules/punycode": {
      "version": "2.3.1",
      "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz",
@@ -11076,12 +11064,6 @@
        "node": ">=6"
      }
    },
-    "node_modules/querystringify": {
-      "version": "2.2.0",
-      "resolved": "https://registry.npmjs.org/querystringify/-/querystringify-2.2.0.tgz",
-      "integrity": "sha512-FIqgj2EUvTa7R50u0rGsyTftzjYmv/a3hO345bZNrqabNqjtgiDMgmo4mkUjd+nzU5oF3dClKqFIPUKybUyqoQ==",
-      "dev": true
-    },
    "node_modules/queue-microtask": {
      "version": "1.2.3",
      "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz",
@@ -11593,12 +11575,6 @@
        "url": "https://opencollective.com/unified"
      }
    },
-    "node_modules/requires-port": {
-      "version": "1.0.0",
-      "resolved": "https://registry.npmjs.org/requires-port/-/requires-port-1.0.0.tgz",
-      "integrity": "sha512-KigOCHcocU3XODJxsu8i/j8T9tzT4adHiecwORRQ0ZZFcp7ahwXuRU1m+yuO90C5ZUyGeGfocHDI14M3L3yDAQ==",
-      "dev": true
-    },
    "node_modules/reselect": {
      "version": "5.1.1",
      "resolved": "https://registry.npmjs.org/reselect/-/reselect-5.1.1.tgz",
@@ -12426,9 +12402,9 @@
      }
    },
    "node_modules/tailwindcss": {
-      "version": "3.4.12",
-      "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.12.tgz",
-      "integrity": "sha512-Htf/gHj2+soPb9UayUNci/Ja3d8pTmu9ONTfh4QY8r3MATTZOzmv6UYWF7ZwikEIC8okpfqmGqrmDehua8mF8w==",
+      "version": "3.4.13",
+      "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.13.tgz",
+      "integrity": "sha512-KqjHOJKogOUt5Bs752ykCeiwvi0fKVkr5oqsFNt/8px/tA8scFPIlkygsf6jXrfCqGHz7VflA6+yytWuM+XhFw==",
      "dependencies": {
        "@alloc/quick-lru": "^5.2.0",
        "arg": "^5.0.2",
@@ -12566,6 +12542,24 @@
        "node": ">=14.0.0"
      }
    },
+    "node_modules/tldts": {
+      "version": "6.1.47",
+      "resolved": "https://registry.npmjs.org/tldts/-/tldts-6.1.47.tgz",
+      "integrity": "sha512-R/K2tZ5MiY+mVrnSkNJkwqYT2vUv1lcT6wJvd2emGaMJ7PHUGRY4e3tUsdFCXgqxi2QgbHjL3yJgXCo40v9Hxw==",
+      "dev": true,
+      "dependencies": {
+        "tldts-core": "^6.1.47"
+      },
+      "bin": {
+        "tldts": "bin/cli.js"
+      }
+    },
+    "node_modules/tldts-core": {
+      "version": "6.1.47",
+      "resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-6.1.47.tgz",
+      "integrity": "sha512-6SWyFMnlst1fEt7GQVAAu16EGgFK0cLouH/2Mk6Ftlwhv3Ol40L0dlpGMcnnNiiOMyD2EV/aF3S+U2nKvvLvrA==",
+      "dev": true
+    },
    "node_modules/to-fast-properties": {
      "version": "2.0.0",
      "resolved": "https://registry.npmjs.org/to-fast-properties/-/to-fast-properties-2.0.0.tgz",
@@ -12586,18 +12580,15 @@
      }
    },
    "node_modules/tough-cookie": {
-      "version": "4.1.4",
-      "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.1.4.tgz",
-      "integrity": "sha512-Loo5UUvLD9ScZ6jh8beX1T6sO1w2/MpCRpEP7V280GKMVUQ0Jzar2U3UJPsrdbziLEMMhu3Ujnq//rhiFuIeag==",
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-5.0.0.tgz",
+      "integrity": "sha512-FRKsF7cz96xIIeMZ82ehjC3xW2E+O2+v11udrDYewUbszngYhsGa8z6YUMMzO9QJZzzyd0nGGXnML/TReX6W8Q==",
      "dev": true,
      "dependencies": {
-        "psl": "^1.1.33",
-        "punycode": "^2.1.1",
-        "universalify": "^0.2.0",
-        "url-parse": "^1.5.3"
+        "tldts": "^6.1.32"
      },
      "engines": {
-        "node": ">=6"
+        "node": ">=16"
      }
    },
    "node_modules/tr46": {
@@ -12936,15 +12927,6 @@
        "url": "https://opencollective.com/unified"
      }
    },
-    "node_modules/universalify": {
-      "version": "0.2.0",
-      "resolved": "https://registry.npmjs.org/universalify/-/universalify-0.2.0.tgz",
-      "integrity": "sha512-CJ1QgKmNg3CwvAv/kOFmtnEN05f0D/cn9QntgNOQlQF9dgvVTHj3t+8JPdjqawCHk7V/KA+fbUqzZ9XWhcqPUg==",
-      "dev": true,
-      "engines": {
-        "node": ">= 4.0.0"
-      }
-    },
    "node_modules/update-browserslist-db": {
      "version": "1.1.0",
      "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.1.0.tgz",
@@ -12983,16 +12965,6 @@
        "punycode": "^2.1.0"
      }
    },
-    "node_modules/url-parse": {
-      "version": "1.5.10",
-      "resolved": "https://registry.npmjs.org/url-parse/-/url-parse-1.5.10.tgz",
-      "integrity": "sha512-WypcfiRhfeUP9vvF0j6rw0J3hrWrw6iZv3+22h6iRMJ/8z1Tj6XfLP4DsUix5MhMPnXpiHDoKyoZ/bdCkwBCiQ==",
-      "dev": true,
-      "dependencies": {
-        "querystringify": "^2.1.1",
-        "requires-port": "^1.0.0"
-      }
-    },
    "node_modules/use-callback-ref": {
      "version": "1.3.2",
      "resolved": "https://registry.npmjs.org/use-callback-ref/-/use-callback-ref-1.3.2.tgz",
@@ -13112,9 +13084,9 @@
      }
    },
    "node_modules/vite": {
-      "version": "5.4.7",
-      "resolved": "https://registry.npmjs.org/vite/-/vite-5.4.7.tgz",
-      "integrity": "sha512-5l2zxqMEPVENgvzTuBpHer2awaetimj2BGkhBPdnwKbPNOlHsODU+oiazEZzLK7KhAnOrO+XGYJYn4ZlUhDtDQ==",
+      "version": "5.4.8",
+      "resolved": "https://registry.npmjs.org/vite/-/vite-5.4.8.tgz",
+      "integrity": "sha512-FqrItQ4DT1NC4zCUqMB4c4AZORMKIa0m8/URVCZ77OZ/QSNeJ54bU1vrFADbDsuwfIPcgknRkmqakQcgnL4GiQ==",
      "dependencies": {
        "esbuild": "^0.21.3",
        "postcss": "^8.4.43",
@@ -1,6 +1,6 @@
 {
  "name": "openhands-frontend",
-  "version": "0.9.4",
+  "version": "0.9.7",
  "private": true,
  "type": "module",
  "engines": {
@@ -9,7 +9,7 @@
  "dependencies": {
    "@monaco-editor/react": "^4.6.0",
    "@nextui-org/react": "^2.4.8",
-    "@react-types/shared": "^3.24.1",
+    "@react-types/shared": "^3.25.0",
    "@reduxjs/toolkit": "^2.2.7",
    "@vitejs/plugin-react": "^4.3.1",
    "@xterm/addon-fit": "^0.10.0",
@@ -32,7 +32,7 @@
    "react-syntax-highlighter": "^15.5.0",
    "remark-gfm": "^4.0.0",
    "tailwind-merge": "^2.5.2",
-    "vite": "^5.4.7",
+    "vite": "^5.4.8",
    "web-vitals": "^3.5.2"
  },
  "scripts": {
@@ -64,8 +64,8 @@
    "@testing-library/jest-dom": "^6.5.0",
    "@testing-library/react": "^16.0.1",
    "@testing-library/user-event": "^14.5.2",
-    "@types/node": "^22.6.1",
-    "@types/react": "^18.3.8",
+    "@types/node": "^22.7.4",
+    "@types/react": "^18.3.10",
    "@types/react-dom": "^18.3.0",
    "@types/react-highlight": "^0.12.8",
    "@types/react-syntax-highlighter": "^15.5.13",
@@ -83,11 +83,11 @@
    "eslint-plugin-react": "^7.35.0",
    "eslint-plugin-react-hooks": "^4.6.2",
    "husky": "^9.1.6",
-    "jsdom": "^25.0.0",
+    "jsdom": "^25.0.1",
    "lint-staged": "^15.2.10",
    "postcss": "^8.4.47",
    "prettier": "^3.3.3",
-    "tailwindcss": "^3.4.12",
+    "tailwindcss": "^3.4.13",
    "typescript": "^5.6.2",
    "vite-tsconfig-paths": "^5.0.1",
    "vitest": "^1.6.0"
@@ -94,13 +94,13 @@ function AgentStatusBar() {
  const [statusMessage, setStatusMessage] = React.useState<string>("");

  React.useEffect(() => {
-    const trimmedCustomMessage = curStatusMessage.message.trim();
+    const trimmedCustomMessage = curStatusMessage.status.trim();
    if (trimmedCustomMessage) {
      setStatusMessage(t(trimmedCustomMessage));
    } else {
      setStatusMessage(AgentStatusMap[curAgentState].message);
    }
-  }, [curAgentState, curStatusMessage.message]);
+  }, [curAgentState, curStatusMessage.status]);

  return (
    <div className="flex flex-col items-center">
@@ -7,6 +7,7 @@ import { I18nKey } from "../../../i18n/declaration";
 import { AutocompleteCombobox } from "./AutocompleteCombobox";
 import { Settings } from "#/services/settings";
 import { organizeModelsAndProviders } from "#/utils/organizeModelsAndProviders";
+import { extractModelAndProvider } from "#/utils/extractModelAndProvider";
 import { ModelSelector } from "./ModelSelector";

 interface SettingsFormProps {
@@ -41,17 +42,29 @@ function SettingsForm({
 }: SettingsFormProps) {
  const { t } = useTranslation();
  const { isOpen: isVisible, onOpenChange: onVisibleChange } = useDisclosure();
-  const advancedAlreadyInUse = React.useMemo(
-    () =>
+  const advancedAlreadyInUse = React.useMemo(() => {
+    const organizedModels = organizeModelsAndProviders(models);
+    const { provider, model } = extractModelAndProvider(
+      settings.LLM_MODEL || "",
+    );
+    const isKnownModel =
+      provider in organizedModels &&
+      organizedModels[provider].models.includes(model);
+
+    return (
      !!settings.SECURITY_ANALYZER ||
      !!settings.CONFIRMATION_MODE ||
      !!settings.LLM_BASE_URL ||
-      (!!settings.LLM_MODEL && !models.includes(settings.LLM_MODEL)),
-    [],
-  );
+      (!!settings.LLM_MODEL && !isKnownModel)
+    );
+  }, [settings, models]);
  const [enableAdvanced, setEnableAdvanced] =
    React.useState(advancedAlreadyInUse);

+  React.useEffect(() => {
+    setEnableAdvanced(advancedAlreadyInUse);
+  }, [advancedAlreadyInUse]);
+
  const handleAdvancedChange = (value: boolean) => {
    setEnableAdvanced(value);
  };
@@ -140,11 +140,11 @@ export function handleActionMessage(message: ActionMessage) {
 }

 export function handleStatusMessage(message: StatusMessage) {
-  const msg = message.message == null ? "" : message.message.trim();
+  const msg = message.status == null ? "" : message.status.trim();
  store.dispatch(
    setCurStatusMessage({
      ...message,
-      message: msg,
+      status: msg,
    }),
  );
 }
@@ -160,9 +160,9 @@ export function handleAssistantMessage(data: string | SocketMessage) {

  if ("action" in socketMessage) {
    handleActionMessage(socketMessage);
-  } else if ("observation" in socketMessage) {
-    handleObservationMessage(socketMessage);
-  } else if ("message" in socketMessage) {
+  } else if ("status" in socketMessage) {
    handleStatusMessage(socketMessage);
+  } else {
+    handleObservationMessage(socketMessage);
  }
 }
@@ -2,7 +2,7 @@ import { createSlice, PayloadAction } from "@reduxjs/toolkit";
 import { StatusMessage } from "#/types/Message";

 const initialStatusMessage: StatusMessage = {
-  message: "",
+  status: "",
  is_error: false,
 };

@@ -38,5 +38,5 @@ export interface StatusMessage {
  is_error: boolean;

  // A status message to display to the user
-  message: string;
+  status: string;
 }
@@ -1,3 +1,6 @@
+import os
+
+
 def get_version():
    try:
        from importlib.metadata import PackageNotFoundError, version
@@ -19,6 +22,16 @@ def get_version():
    except ImportError:
        pass

+    # Try getting the version from pyproject.toml
+    try:
+        root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        with open(os.path.join(root_dir, 'pyproject.toml'), 'r') as f:
+            for line in f:
+                if line.startswith('version ='):
+                    return line.split('=')[1].strip().strip('"')
+    except FileNotFoundError:
+        pass
+
    return 'unknown'


@@ -1,4 +1,5 @@
 import asyncio
+import copy
 import traceback
 from typing import Type

@@ -36,6 +37,7 @@ from openhands.events.observation import (
    ErrorObservation,
    Observation,
 )
+from openhands.events.serialization.event import truncate_content
 from openhands.llm.llm import LLM
 from openhands.runtime.utils.shutdown_listener import should_continue

@@ -117,8 +119,6 @@ class AgentController:

    async def close(self):
        """Closes the agent controller, canceling any ongoing tasks and unsubscribing from the event stream."""
-        if self.agent_task is not None:
-            self.agent_task.cancel()
        await self.set_agent_state_to(AgentState.STOPPED)
        self.event_stream.unsubscribe(EventStreamSubscriber.AGENT_CONTROLLER)

@@ -220,7 +220,13 @@ class AgentController:
        ):
            return

-        logger.info(observation, extra={'msg_type': 'OBSERVATION'})
+        # Make sure we print the observation in the same way as the LLM sees it
+        observation_to_print = copy.deepcopy(observation)
+        if len(observation_to_print.content) > self.agent.llm.config.max_message_chars:
+            observation_to_print.content = truncate_content(
+                observation_to_print.content, self.agent.llm.config.max_message_chars
+            )
+        logger.info(observation_to_print, extra={'msg_type': 'OBSERVATION'})
        if self._pending_action and self._pending_action.id == observation.cause:
            self._pending_action = None
            if self.state.agent_state == AgentState.USER_CONFIRMED:
@@ -1,4 +1,3 @@
-import argparse
 import asyncio
 import logging
 from typing import Type
@@ -10,6 +9,7 @@ from openhands import __version__
 from openhands.controller import AgentController
 from openhands.controller.agent import Agent
 from openhands.core.config import (
+    get_parser,
    load_app_config,
 )
 from openhands.core.logger import openhands_logger as logger
@@ -63,10 +63,10 @@ def display_event(event: Event):
        display_command_output(event.content)


-def get_parser() -> argparse.ArgumentParser:
-    """Get the parser for the command line arguments."""
-    parser = argparse.ArgumentParser(description='Run an agent with a specific task')
+async def main():
+    """Runs the agent in CLI mode"""

+    parser = get_parser()
    # Add the version argument
    parser.add_argument(
        '-v',
@@ -76,14 +76,6 @@ def get_parser() -> argparse.ArgumentParser:
        help='Show the version number and exit',
        default=None,
    )
-
-    return parser
-
-
-async def main():
-    """Runs the agent in CLI mode"""
-
-    parser = get_parser()
    args = parser.parse_args()

    if args.version:
@@ -91,7 +83,7 @@ async def main():
        return

    logger.setLevel(logging.WARNING)
-    config = load_app_config()
+    config = load_app_config(config_file=args.config_file)
    sid = 'cli'

    agent_cls: Type[Agent] = Agent.get_cls(config.default_agent)
@@ -1,3 +1,4 @@
+import os
 from dataclasses import dataclass, fields

 from openhands.core.config.config_utils import get_field_info
@@ -36,7 +37,7 @@ class LLMConfig:
        ollama_base_url: The base URL for the OLLAMA API.
        drop_params: Drop any unmapped (unsupported) params without causing an exception.
        disable_vision: If model is vision capable, this option allows to disable image processing (useful for cost reduction).
-        caching_prompt: Using the prompt caching feature provided by the LLM.
+        caching_prompt: Use the prompt caching feature if provided by the LLM and supported by the provider.
        log_completions: Whether to log LLM completions to the state.
    """

@@ -68,7 +69,7 @@ class LLMConfig:
    ollama_base_url: str | None = None
    drop_params: bool = True
    disable_vision: bool | None = None
-    caching_prompt: bool = False
+    caching_prompt: bool = True
    log_completions: bool = False

    def defaults_to_dict(self) -> dict:
@@ -78,6 +79,18 @@ class LLMConfig:
            result[f.name] = get_field_info(f)
        return result

+    def __post_init__(self):
+        """
+        Post-initialization hook to assign OpenRouter-related variables to environment variables.
+        This ensures that these values are accessible to litellm at runtime.
+        """
+
+        # Assign OpenRouter-specific variables to environment variables
+        if self.openrouter_site_url:
+            os.environ['OR_SITE_URL'] = self.openrouter_site_url
+        if self.openrouter_app_name:
+            os.environ['OR_APP_NAME'] = self.openrouter_app_name
+
    def __str__(self):
        attr_str = []
        for f in fields(self):
@@ -101,9 +114,3 @@ class LLMConfig:
            if k in LLM_SENSITIVE_FIELDS:
                ret[k] = '******' if v else None
        return ret
-
-    def set_missing_attributes(self):
-        """Set any missing attributes to their default values."""
-        for field_name, field_obj in self.__dataclass_fields__.items():
-            if not hasattr(self, field_name):
-                setattr(self, field_name, field_obj.default)
@@ -9,7 +9,8 @@ class SandboxConfig:
    """Configuration for the sandbox.

    Attributes:
-        api_hostname: The hostname for the EventStream Runtime API.
+        remote_runtime_api_url: The base URL of the Remote Runtime API.
+        local_runtime_url: The default URL of the local runtime. You may want to change it to http://host.docker.internal for DIND environments.
        base_container_image: The base container image from which to build the runtime image.
        runtime_container_image: The runtime container image to use.
        user_id: The user ID for the sandbox.
@@ -17,6 +18,7 @@ class SandboxConfig:
        enable_auto_lint: Whether to enable auto-lint.
        use_host_network: Whether to use the host network.
        initialize_plugins: Whether to initialize plugins.
+        force_rebuild_runtime: Whether to force rebuild the runtime image.
        runtime_extra_deps: The extra dependencies to install in the runtime image (typically used for evaluation).
            This will be rendered into the end of the Dockerfile that builds the runtime image.
            It can contain any valid shell commands (e.g., pip install numpy).
@@ -30,7 +32,8 @@ class SandboxConfig:
            Default is None for general purpose browsing. Check evaluation/miniwob and evaluation/webarena for examples.
    """

-    api_hostname: str = 'localhost'
+    remote_runtime_api_url: str = 'http://localhost:8000'
+    local_runtime_url: str = 'http://localhost'
    api_key: str | None = None
    base_container_image: str = 'nikolaik/python-nodejs:python3.11-nodejs22'  # default to nikolaik/python-nodejs:python3.11-nodejs22 for eventstream runtime
    runtime_container_image: str | None = None
@@ -41,6 +44,7 @@ class SandboxConfig:
    )
    use_host_network: bool = False
    initialize_plugins: bool = True
+    force_rebuild_runtime: bool = False
    runtime_extra_deps: str | None = None
    runtime_startup_env_vars: dict[str, str] = field(default_factory=dict)
    browsergym_eval_env: str | None = None
@@ -281,6 +281,12 @@ def get_llm_config_arg(
 def get_parser() -> argparse.ArgumentParser:
    """Get the parser for the command line arguments."""
    parser = argparse.ArgumentParser(description='Run an agent with a specific task')
+    parser.add_argument(
+        '--config-file',
+        type=str,
+        default='config.toml',
+        help='Path to the config file (default: config.toml in the current directory)',
+    )
    parser.add_argument(
        '-d',
        '--directory',
@@ -375,14 +381,17 @@ def parse_arguments() -> argparse.Namespace:
    return parsed_args


-def load_app_config(set_logging_levels: bool = True) -> AppConfig:
-    """Load the configuration from the config.toml file and environment variables.
+def load_app_config(
+    set_logging_levels: bool = True, config_file: str = 'config.toml'
+) -> AppConfig:
+    """Load the configuration from the specified config file and environment variables.

    Args:
        set_logging_levels: Whether to set the global variables for logging levels.
+        config_file: Path to the config file. Defaults to 'config.toml' in the current directory.
    """
    config = AppConfig()
-    load_from_toml(config)
+    load_from_toml(config, config_file)
    load_from_env(config, os.environ)
    finalize_config(config)
    if set_logging_levels:
@@ -228,7 +228,7 @@ if __name__ == '__main__':
    # Load the app config
    # this will load config from config.toml in the current directory
    # as well as from the environment variables
-    config = load_app_config()
+    config = load_app_config(config_file=args.config_file)

    # Override default LLM configs ([llm] section in config.toml)
    if args.llm_config:
@@ -50,6 +50,8 @@ class ImageContent(Content):
 class Message(BaseModel):
    role: Literal['user', 'system', 'assistant']
    content: list[TextContent | ImageContent] = Field(default=list)
+    cache_enabled: bool = False
+    vision_enabled: bool = False

    @property
    def contains_image(self) -> bool:
@@ -58,23 +60,22 @@ class Message(BaseModel):
    @model_serializer
    def serialize_model(self) -> dict:
        content: list[dict] | str
-        if self.role == 'system':
-            # For system role, concatenate all text content into a single string
-            content = '\n'.join(
-                item.text for item in self.content if isinstance(item, TextContent)
-            )
-        elif self.role == 'assistant' and not self.contains_image:
-            # For assistant role without vision, concatenate all text content into a single string
-            content = '\n'.join(
-                item.text for item in self.content if isinstance(item, TextContent)
-            )
-        else:
-            # For user role or assistant role with vision enabled, serialize each content item
+        # two kinds of serializer:
+        # 1. vision serializer: when prompt caching or vision is enabled
+        # 2. single text serializer: for other cases
+        # remove this when liteLLM or providers support this format translation
+        if self.cache_enabled or self.vision_enabled:
+            # when prompt caching or vision is enabled, use vision serializer
            content = []
            for item in self.content:
                if isinstance(item, TextContent):
                    content.append(item.model_dump())
                elif isinstance(item, ImageContent):
                    content.extend(item.model_dump())
-
+        else:
+            # for other cases, concatenate all text content
+            # into a single string per message
+            content = '\n'.join(
+                item.text for item in self.content if isinstance(item, TextContent)
+            )
        return {'content': content, 'role': self.role}
@@ -24,10 +24,6 @@ class ActionTypeSchema(BaseModel):
    """Writes the content to a file.
    """

-    EDIT: str = Field(default='edit')
-    """Edits the content of a file.
-    """
-
    RUN: str = Field(default='run')
    """Runs a command.
    """
@@ -10,10 +10,6 @@ class ObservationTypeSchema(BaseModel):

    WRITE: str = Field(default='write')

-    EDIT: str = Field(default='edit')
-    """The edited file
-    """
-
    BROWSE: str = Field(default='browse')
    """The HTML content of a URL
    """
@@ -9,11 +9,7 @@ from openhands.events.action.agent import (
 from openhands.events.action.browse import BrowseInteractiveAction, BrowseURLAction
 from openhands.events.action.commands import CmdRunAction, IPythonRunCellAction
 from openhands.events.action.empty import NullAction
-from openhands.events.action.files import (
-    FileEditAction,
-    FileReadAction,
-    FileWriteAction,
-)
+from openhands.events.action.files import FileReadAction, FileWriteAction
 from openhands.events.action.message import MessageAction
 from openhands.events.action.tasks import AddTaskAction, ModifyTaskAction

@@ -25,7 +21,6 @@ __all__ = [
    'BrowseInteractiveAction',
    'FileReadAction',
    'FileWriteAction',
-    'FileEditAction',
    'AgentFinishAction',
    'AgentRejectAction',
    'AgentDelegateAction',
@@ -39,23 +39,3 @@ class FileWriteAction(Action):
    @property
    def message(self) -> str:
        return f'Writing file: {self.path}'
-
-
-@dataclass
-class FileEditAction(Action):
-    diff_block: str
-    thought: str = ''
-    action: str = ActionType.EDIT
-    runnable: ClassVar[bool] = True
-    security_risk: ActionSecurityRisk | None = None
-
-    def __str__(self) -> str:
-        ret = '**EditFileAction**\n'
-        if self.thought:
-            ret += f'THOUGHT: {self.thought}\n'
-        ret += f'DIFF BLOCK:\n{self.diff_block}\n'
-        return ret
-
-    @property
-    def message(self) -> str:
-        return f'Edit Diff block: {self.diff_block}'
@@ -7,11 +7,7 @@ from openhands.events.observation.commands import (
 from openhands.events.observation.delegate import AgentDelegateObservation
 from openhands.events.observation.empty import NullObservation
 from openhands.events.observation.error import ErrorObservation
-from openhands.events.observation.files import (
-    FileEditObservation,
-    FileReadObservation,
-    FileWriteObservation,
-)
+from openhands.events.observation.files import FileReadObservation, FileWriteObservation
 from openhands.events.observation.observation import Observation
 from openhands.events.observation.reject import UserRejectObservation
 from openhands.events.observation.success import SuccessObservation
@@ -24,7 +20,6 @@ __all__ = [
    'BrowserOutputObservation',
    'FileReadObservation',
    'FileWriteObservation',
-    'FileEditObservation',
    'ErrorObservation',
    'AgentStateChangedObservation',
    'AgentDelegateObservation',
@@ -26,30 +26,3 @@ class FileWriteObservation(Observation):
    @property
    def message(self) -> str:
        return f'I wrote to the file {self.path}.'
-
-
-@dataclass
-class FileEditObservation(Observation):
-    """This data class represents a file edit operation"""
-
-    path: str
-    search_block: str
-    replace_block: str
-    observation: str = ObservationType.EDIT
-
-    @property
-    def message(self) -> str:
-        if self.search_block:
-            return (
-                f'I updated the file {self.path} by \n'
-                f'replacing:\n {self.search_block}\n'
-                f'with:\n {self.replace_block}\n'
-            )
-        else:
-            return (
-                f'I updated the file {self.path} by \n'
-                f'appending:\n {self.replace_block}\n'
-            )
-
-    def __str__(self) -> str:
-        return f'**FileEditObservation**\n' f'DIFF BLOCK: {self.content}\n'
@@ -12,11 +12,7 @@ from openhands.events.action.commands import (
    IPythonRunCellAction,
 )
 from openhands.events.action.empty import NullAction
-from openhands.events.action.files import (
-    FileEditAction,
-    FileReadAction,
-    FileWriteAction,
-)
+from openhands.events.action.files import FileReadAction, FileWriteAction
 from openhands.events.action.message import MessageAction
 from openhands.events.action.tasks import AddTaskAction, ModifyTaskAction

@@ -28,7 +24,6 @@ actions = (
    BrowseInteractiveAction,
    FileReadAction,
    FileWriteAction,
-    FileEditAction,
    AgentFinishAction,
    AgentRejectAction,
    AgentDelegateAction,
@@ -80,6 +80,13 @@ def event_to_memory(event: 'Event', max_message_chars: int) -> dict:
    d.pop('timestamp', None)
    d.pop('message', None)
    d.pop('images_urls', None)
+
+    # runnable actions have some extra fields used in the BE/FE, which should not be sent to the LLM
+    if 'args' in d:
+        d['args'].pop('blocking', None)
+        d['args'].pop('keep_prompt', None)
+        d['args'].pop('is_confirmed', None)
+
    if 'extras' in d:
        remove_fields(d['extras'], DELETE_FROM_MEMORY_EXTRAS)
    if isinstance(event, Observation) and 'content' in d:
@@ -7,11 +7,7 @@ from openhands.events.observation.commands import (
 from openhands.events.observation.delegate import AgentDelegateObservation
 from openhands.events.observation.empty import NullObservation
 from openhands.events.observation.error import ErrorObservation
-from openhands.events.observation.files import (
-    FileEditObservation,
-    FileReadObservation,
-    FileWriteObservation,
-)
+from openhands.events.observation.files import FileReadObservation, FileWriteObservation
 from openhands.events.observation.observation import Observation
 from openhands.events.observation.reject import UserRejectObservation
 from openhands.events.observation.success import SuccessObservation
@@ -23,7 +19,6 @@ observations = (
    BrowserOutputObservation,
    FileReadObservation,
    FileWriteObservation,
-    FileEditObservation,
    AgentDelegateObservation,
    SuccessObservation,
    ErrorObservation,
@@ -0,0 +1,9 @@
+"""Linter module for OpenHands.
+
+Part of this Linter module is adapted from Aider (Apache 2.0 License, [original code](https://github.com/paul-gauthier/aider/blob/main/aider/linter.py)). Please see the [original repository](https://github.com/paul-gauthier/aider) for more information.
+"""
+
+from openhands.linter.base import LintResult
+from openhands.linter.linter import DefaultLinter
+
+__all__ = ['DefaultLinter', 'LintResult']
@@ -0,0 +1,79 @@
+from abc import ABC, abstractmethod
+
+from pydantic import BaseModel
+
+
+class LintResult(BaseModel):
+    """A single finding reported by a linter."""
+
+    file: str
+    line: int  # 1-indexed
+    column: int  # 1-indexed
+    message: str
+
+    def visualize(self, half_window: int = 3) -> str:
+        """Render the offending line with surrounding context and a caret marker.
+
+        The file named by `file` is read from disk. Every shown line is prefixed
+        with its right-aligned line number and a `|`; the offending line is
+        wrapped in ANSI red and followed by a hint line carrying the message.
+
+        Args:
+            half_window: Number of context lines shown on each side of the error.
+
+        Returns:
+            The rendered excerpt as one newline-joined string.
+
+        Raises:
+            AssertionError: If `line` is not within 1..number of lines in the file.
+        """
+        with open(self.file, 'r') as handle:
+            raw_lines = handle.readlines()
+
+        # Width of the line-number gutter: digits of the largest line number.
+        gutter = len(str(len(raw_lines)))
+        numbered = [
+            f'{number:>{gutter}}|{text.rstrip()}'
+            for number, text in enumerate(raw_lines, start=1)
+        ]
+
+        assert 0 < self.line <= len(numbered)
+        target = self.line - 1
+        first = max(0, target - half_window)
+        last = min(len(numbered), target + half_window + 1)
+
+        window = numbered[first:last]
+        position = target - first
+
+        # Gutter padding plus `column` spaces puts the caret under the
+        # 1-indexed column, since the `|` separator occupies one character.
+        caret_line = ''.join(
+            [gutter * ' ', ' ' * (self.column), '^', ' ERROR HERE: ', self.message]
+        )
+        window[position] = f'\033[91m{window[position]}\033[0m' + '\n' + caret_line
+        return '\n'.join(window)
+
+
+class LinterException(Exception):
+    """Common base class for every exception raised by the linters."""
+
+
+class BaseLinter(ABC):
+    """Abstract interface shared by every linter.
+
+    A concrete linter declares which file extensions it handles and turns a
+    file into a list of parsed lint results.
+    """
+
+    # Encoding name exposed to subclasses.
+    encoding: str = 'utf-8'
+
+    @property
+    @abstractmethod
+    def supported_extensions(self) -> list[str]:
+        """File extensions handled by this linter, for example .py or .tsx."""
+        return list()
+
+    @abstractmethod
+    def lint(self, file_path: str) -> list[LintResult]:
+        """Lint a single file and return its findings.
+
+        Args:
+            file_path: Path of the file to lint. Required to be absolute.
+        """
@@ -0,0 +1,77 @@
+from typing import List
+
+from openhands.linter.base import BaseLinter, LintResult
+from openhands.linter.utils import run_cmd
+
+
+def python_compile_lint(fname: str) -> list[LintResult]:
+    """Check a Python file for syntax errors by compiling it.
+
+    Args:
+        fname: Path of the Python file to check.
+
+    Returns:
+        An empty list when the file compiles, otherwise a one-element list
+        describing the syntax error.
+    """
+    try:
+        with open(fname, 'r') as f:
+            code = f.read()
+        compile(code, fname, 'exec')  # USE TRACEBACK BELOW HERE
+        return []
+    except SyntaxError as err:
+        # Prefer the end position when the interpreter provides one. The end_*
+        # attributes can be present but None, and lineno itself can be None
+        # for errors without location info, so fall back step by step to keep
+        # LintResult's integer, 1-indexed fields valid.
+        err_lineno = getattr(err, 'end_lineno', None) or err.lineno or 1
+        err_offset = getattr(err, 'end_offset', None)
+        if err_offset is None or err_offset < 0:
+            err_offset = err.offset
+        return [
+            LintResult(
+                file=fname, line=err_lineno, column=err_offset or 1, message=err.msg
+            )
+        ]
+
+
+def flake_lint(filepath: str) -> list[LintResult]:
+    """Run flake8 on a file, restricted to a fixed set of fatal error codes.
+
+    Args:
+        filepath: Path of the file to check.
+
+    Returns:
+        One LintResult per `path:row:col: message` line in the command output.
+        The list is empty when the flake8 executable is missing, when the
+        command produces no output, or when no output line has that shape.
+    """
+    fatal = 'F821,F822,F831,E112,E113,E999,E902'
+    flake8_cmd = f'flake8 --select={fatal} --isolated {filepath}'
+
+    try:
+        cmd_outputs = run_cmd(flake8_cmd)
+    except FileNotFoundError:
+        # flake8 is not installed: nothing to report.
+        return []
+    results: list[LintResult] = []
+    if not cmd_outputs:
+        return results
+    path_prefix = filepath + ':'
+    for line in cmd_outputs.splitlines():
+        # When the line starts with the path we passed in, drop it first so
+        # that colons inside the path itself (e.g. a Windows drive letter) do
+        # not shift the row/column fields. Otherwise fall back to treating the
+        # first colon-separated field as the path.
+        if line.startswith(path_prefix):
+            fields = line[len(path_prefix) :].split(':', 2)
+        else:
+            fields = line.split(':', 3)[1:]
+        if len(fields) < 3:
+            continue
+        row, col, msg = fields
+        try:
+            lineno, colno = int(row), int(col)
+        except ValueError:
+            # Not a diagnostic line; the command's stderr is part of the
+            # output, so arbitrary text containing colons can show up here.
+            continue
+        results.append(
+            LintResult(
+                file=filepath,
+                line=lineno,
+                column=colno,
+                message=msg.strip(),
+            )
+        )
+    return results
+
+
+class PythonLinter(BaseLinter):
+    """Linter for Python files: flake8 first, then a plain compile check."""
+
+    @property
+    def supported_extensions(self) -> List[str]:
+        """Extensions handled by this linter."""
+        return ['.py']
+
+    def lint(self, file_path: str) -> list[LintResult]:
+        """Lint a Python file.
+
+        Args:
+            file_path: Path of the file to lint.
+
+        Returns:
+            The flake8 findings if there are any, otherwise the result of
+            compiling the file (empty when it compiles cleanly).
+        """
+        error = flake_lint(file_path)
+        if not error:
+            error = python_compile_lint(file_path)
+        return error
+
+    def compile_lint(self, file_path: str, code: str) -> List[LintResult]:
+        """Compile in-memory source and report a syntax error, if any.
+
+        Args:
+            file_path: File name recorded in the result and passed to `compile`.
+            code: The source text to compile.
+
+        Returns:
+            An empty list when `code` compiles, otherwise a one-element list.
+        """
+        try:
+            compile(code, file_path, 'exec')
+            return []
+        except SyntaxError as e:
+            # lineno/offset may be None on a SyntaxError; LintResult needs
+            # integers, so default to the first line/column. LintResult has no
+            # `rule` field, so that keyword is no longer passed.
+            return [
+                LintResult(
+                    file=file_path,
+                    line=e.lineno or 1,
+                    column=e.offset or 1,
+                    message=str(e),
+                )
+            ]
@@ -0,0 +1,74 @@
+import warnings
+
+from grep_ast import TreeContext, filename_to_lang
+from grep_ast.parsers import PARSERS
+from tree_sitter_languages import get_parser
+
+from openhands.linter.base import BaseLinter, LintResult
+
+# tree_sitter is throwing a FutureWarning
+warnings.simplefilter('ignore', category=FutureWarning)
+
+
+def tree_context(fname, code, line_nums):
+    """Format `code` around the given lines of interest using grep_ast's TreeContext.
+
+    Args:
+        fname: File name handed to TreeContext.
+        code: Source text handed to TreeContext.
+        line_nums: Iterable of line numbers to mark as lines of interest.
+
+    Returns:
+        Whatever `TreeContext.format()` produces for the marked lines.
+    """
+    options = dict(
+        color=False,
+        line_number=True,
+        child_context=False,
+        last_line=False,
+        margin=0,
+        mark_lois=True,
+        loi_pad=3,
+        # header_max=30,
+        show_top_of_file_parent_scope=False,
+    )
+    renderer = TreeContext(fname, code, **options)
+    renderer.add_lines_of_interest(set(line_nums))
+    renderer.add_context()
+    return renderer.format()
+
+
+def traverse_tree(node):
+    """Collect syntax problems from `node` and all of its descendants.
+
+    A node counts as a problem when its type is 'ERROR' or it is flagged as
+    missing. Results are in pre-order: the node itself, then each child subtree.
+
+    Returns:
+        A list of (line, column, description) tuples with 1-indexed positions.
+    """
+    if node.type == 'ERROR' or node.is_missing:
+        row, col = node.start_point[0], node.start_point[1]
+        label = 'Missing node' if node.is_missing else 'Syntax error'
+        found = [(row + 1, col + 1, label)]
+    else:
+        found = []
+
+    for sub_node in node.children:
+        found.extend(traverse_tree(sub_node))
+
+    return found
+
+
+class TreesitterBasicLinter(BaseLinter):
+    """Syntax-only linter that parses files with tree-sitter."""
+
+    @property
+    def supported_extensions(self) -> list[str]:
+        """The keys of grep_ast's PARSERS mapping."""
+        return [*PARSERS]
+
+    def lint(self, file_path: str) -> list[LintResult]:
+        """Parse the file with tree-sitter and report error/missing nodes.
+
+        Args:
+            file_path: Path of the file to lint.
+
+        Returns:
+            One LintResult per problem node found by `traverse_tree`; an empty
+            list when the language is not recognised or the tree has no problems.
+        """
+        language = filename_to_lang(file_path)
+        if not language:
+            return []
+        ts_parser = get_parser(language)
+        with open(file_path, 'r') as source_file:
+            source_text = source_file.read()
+        syntax_tree = ts_parser.parse(source_text.encode('utf-8'))
+        return [
+            LintResult(
+                file=file_path,
+                line=int(row),
+                column=int(col),
+                message=description,
+            )
+            for row, col, description in traverse_tree(syntax_tree.root_node)
+        ]
@@ -0,0 +1,35 @@
+import os
+from collections import defaultdict
+
+from openhands.linter.base import BaseLinter, LinterException, LintResult
+from openhands.linter.languages.python import PythonLinter
+from openhands.linter.languages.treesitter import TreesitterBasicLinter
+
+
+class DefaultLinter(BaseLinter):
+    """Dispatches a file to the linters registered for its extension."""
+
+    def __init__(self):
+        """Register the Python linter and the tree-sitter fallback."""
+        # Maps a file extension to its linters, tried in list order.
+        self.linters: dict[str, list[BaseLinter]] = defaultdict(list)
+        self.linters['.py'] = [PythonLinter()]
+
+        # The tree-sitter linter goes last for every extension it supports,
+        # so a dedicated linter (if any) is consulted first.
+        self.basic_linter = TreesitterBasicLinter()
+        for ext in self.basic_linter.supported_extensions:
+            self.linters[ext].append(self.basic_linter)
+        self._supported_extensions = [*self.linters]
+
+    @property
+    def supported_extensions(self) -> list[str]:
+        """Every extension that has at least one registered linter."""
+        return self._supported_extensions
+
+    def lint(self, file_path: str) -> list[LintResult]:
+        """Lint a file with the linters registered for its extension.
+
+        Args:
+            file_path: Absolute path of the file to lint.
+
+        Returns:
+            The first non-empty result in registration order, or an empty list
+            when no linter is registered or none reports anything.
+
+        Raises:
+            LinterException: If `file_path` is not an absolute path.
+        """
+        if not os.path.isabs(file_path):
+            raise LinterException(f'File path {file_path} is not an absolute path')
+        _, suffix = os.path.splitext(file_path)
+
+        for candidate in self.linters.get(suffix, []):
+            findings = candidate.lint(file_path)
+            if findings:
+                return findings
+        return []
@@ -0,0 +1,3 @@
+from .cmd import check_tool_installed, run_cmd
+
+__all__ = ['run_cmd', 'check_tool_installed']
@@ -0,0 +1,37 @@
+import os
+import subprocess
+
+
+def run_cmd(cmd: str | list[str], cwd: str | None = None) -> str | None:
+    """Run a command and return its output only when it fails.
+
+    Args:
+        cmd: The command to run. A string is split on whitespace (no shell
+            quoting is interpreted); pass a list of arguments instead when an
+            argument, such as a file path, may itself contain whitespace.
+        cwd: Working directory for the command, or None to use the current one.
+
+    Returns:
+        None if the command exits with status 0. Otherwise the command's
+        stdout with stderr merged into it, decoded as UTF-8 with undecodable
+        bytes replaced.
+
+    Raises:
+        FileNotFoundError: If the executable cannot be found (raised by
+            `subprocess.Popen`).
+    """
+    argv = cmd.split() if isinstance(cmd, str) else list(cmd)
+
+    process = subprocess.Popen(
+        argv,
+        cwd=cwd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        encoding='utf-8',
+        errors='replace',
+    )
+    stdout, _ = process.communicate()
+    if process.returncode == 0:
+        return None
+    return stdout
+
+
+def check_tool_installed(tool_name: str) -> bool:
+    """Report whether `<tool_name> --version` can be run successfully.
+
+    Returns:
+        True when the command exits with status 0; False when the executable
+        is not found or exits with a non-zero status.
+    """
+    probe = [tool_name, '--version']
+    try:
+        subprocess.run(
+            probe,
+            cwd=os.getcwd(),
+            capture_output=True,
+            check=True,
+        )
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        return False
+    return True
@@ -0,0 +1,5 @@
+from openhands.llm.async_llm import AsyncLLM
+from openhands.llm.llm import LLM
+from openhands.llm.streaming_llm import StreamingLLM
+
+__all__ = ['LLM', 'AsyncLLM', 'StreamingLLM']
@@ -0,0 +1,117 @@
+import asyncio
+from functools import partial
+from typing import Any
+
+from litellm import completion as litellm_acompletion
+
+from openhands.core.exceptions import UserCancelledError
+from openhands.core.logger import openhands_logger as logger
+from openhands.llm.llm import LLM, LLM_RETRY_EXCEPTIONS
+from openhands.runtime.utils.shutdown_listener import should_continue
+
+
+class AsyncLLM(LLM):
+    """LLM variant exposing a retry-wrapped, cancellable asynchronous completion call."""
+
+    def __init__(self, *args, **kwargs):
+        """Build the async completion callable on top of the base LLM setup.
+
+        All arguments are forwarded unchanged to `LLM.__init__`.
+        """
+        super().__init__(*args, **kwargs)
+
+        # Pre-bind the request parameters taken from the config.
+        self._async_completion = partial(
+            self._call_acompletion,
+            model=self.config.model,
+            api_key=self.config.api_key,
+            base_url=self.config.base_url,
+            api_version=self.config.api_version,
+            custom_llm_provider=self.config.custom_llm_provider,
+            max_tokens=self.config.max_output_tokens,
+            timeout=self.config.timeout,
+            temperature=self.config.temperature,
+            top_p=self.config.top_p,
+            drop_params=self.config.drop_params,
+        )
+
+        # Keep a handle on the bare partial: the attribute itself is rebound
+        # to the wrapper at the end of __init__.
+        async_completion_unwrapped = self._async_completion
+
+        @self.retry_decorator(
+            num_retries=self.config.num_retries,
+            retry_exceptions=LLM_RETRY_EXCEPTIONS,
+            retry_min_wait=self.config.retry_min_wait,
+            retry_max_wait=self.config.retry_max_wait,
+            retry_multiplier=self.config.retry_multiplier,
+        )
+        async def async_completion_wrapper(*args, **kwargs):
+            """Validate and log the messages, run the completion, and watch for cancellation.
+
+            Returns:
+                The response object returned by the completion call.
+
+            Raises:
+                ValueError: If no messages were supplied.
+                UserCancelledError: If the cancellation callback reports a cancel request.
+            """
+            messages: list[dict[str, Any]] | dict[str, Any] = []
+
+            # some callers might send the model and messages directly
+            # litellm allows positional args, like completion(model, messages, **kwargs)
+            # see llm.py for more details
+            if len(args) > 1:
+                messages = args[1]
+                kwargs['messages'] = messages
+
+                # remove the first args, they're sent in kwargs
+                args = args[2:]
+            elif 'messages' in kwargs:
+                messages = kwargs['messages']
+
+            # ensure we work with a list of messages
+            messages = messages if isinstance(messages, list) else [messages]
+
+            # if we have no messages, something went very wrong
+            if not messages:
+                raise ValueError(
+                    'The messages list is empty. At least one message is required.'
+                )
+
+            self.log_prompt(messages)
+
+            async def check_stopped():
+                """Poll the optional cancellation callback every 0.1 seconds."""
+                while should_continue():
+                    if (
+                        hasattr(self.config, 'on_cancel_requested_fn')
+                        and self.config.on_cancel_requested_fn is not None
+                        and await self.config.on_cancel_requested_fn()
+                    ):
+                        raise UserCancelledError('LLM request cancelled by user')
+                    await asyncio.sleep(0.1)
+
+            stop_check_task = asyncio.create_task(check_stopped())
+
+            try:
+                # Directly call and await litellm_acompletion
+                resp = await async_completion_unwrapped(*args, **kwargs)
+
+                message_back = resp['choices'][0]['message']['content']
+                self.log_response(message_back)
+                self._post_completion(resp)
+
+                # We do not support streaming in this method, thus return resp
+                return resp
+
+            except UserCancelledError:
+                logger.info('LLM request cancelled by user.')
+                raise
+            except Exception as e:
+                logger.error(f'Completion Error occurred:\n{e}')
+                raise
+
+            finally:
+                # Stop the watcher task. Awaiting it swallows the expected
+                # CancelledError; an exception the task raised on its own
+                # (e.g. UserCancelledError) propagates from here.
+                await asyncio.sleep(0.1)
+                stop_check_task.cancel()
+                try:
+                    await stop_check_task
+                except asyncio.CancelledError:
+                    pass
+
+        self._async_completion = async_completion_wrapper  # type: ignore
+
+    async def _call_acompletion(self, *args, **kwargs):
+        """Invoke the completion callable bound as `litellm_acompletion`.
+
+        Kept as a separate method, presumably so tests can replace it.
+        """
+        import inspect
+
+        # NOTE(review): this module binds `litellm_acompletion` to litellm's
+        # synchronous `completion`, whose return value is not awaitable; the
+        # import most likely should be `acompletion`. Awaiting only when the
+        # result is awaitable makes this method work with either binding.
+        result = litellm_acompletion(*args, **kwargs)
+        if inspect.isawaitable(result):
+            result = await result
+        return result
+
+    @property
+    def async_completion(self):
+        """The retry-wrapped async completion callable built in `__init__`."""
+        return self._async_completion
@@ -0,0 +1,51 @@
+from typing import Any
+
+from openhands.core.logger import llm_prompt_logger, llm_response_logger
+from openhands.core.logger import openhands_logger as logger
+
+MESSAGE_SEPARATOR = '\n\n----------\n\n'
+
+
+class DebugMixin:
+    """Mixin that writes LLM prompts and responses to the debug loggers.
+
+    The host class must implement `vision_is_active()`.
+    """
+
+    def log_prompt(self, messages: list[dict[str, Any]] | dict[str, Any]):
+        """Log the messages that have content, joined by MESSAGE_SEPARATOR.
+
+        A single message dict is accepted as well as a list. When there is
+        nothing to log, a 'No completion messages!' note is logged instead.
+        """
+        if messages:
+            batch = messages if isinstance(messages, list) else [messages]
+            rendered = [
+                self._format_message_content(entry)
+                for entry in batch
+                if entry['content']
+            ]
+            prompt_text = MESSAGE_SEPARATOR.join(rendered)
+            if prompt_text:
+                llm_prompt_logger.debug(prompt_text)
+                return
+        logger.debug('No completion messages!')
+
+    def log_response(self, message_back: str):
+        """Log the response text unless it is empty."""
+        if not message_back:
+            return
+        llm_response_logger.debug(message_back)
+
+    def _format_message_content(self, message: dict[str, Any]):
+        """Turn a message's content into text; list items are joined by newlines."""
+        body = message['content']
+        if not isinstance(body, list):
+            return str(body)
+        return '\n'.join(map(self._format_content_element, body))
+
+    def _format_content_element(self, element: dict[str, Any]):
+        """Extract the text of one content element.
+
+        A dict with a 'text' key yields that text. A dict with an image URL
+        yields the URL when vision is active. Anything else is stringified.
+        """
+        if not isinstance(element, dict):
+            return str(element)
+        if 'text' in element:
+            return element['text']
+        has_image_url = (
+            self.vision_is_active()
+            and 'image_url' in element
+            and 'url' in element['image_url']
+        )
+        return element['image_url']['url'] if has_image_url else str(element)
+
+    # This method should be implemented in the class that uses DebugMixin
+    def vision_is_active(self):
+        """Hook for the host class; the mixin itself raises NotImplementedError."""
+        raise NotImplementedError
@@ -1,59 +1,56 @@
-import asyncio
 import copy
-import os
 import time
 import warnings
 from functools import partial
 from typing import Any

 from openhands.core.config import LLMConfig
-from openhands.runtime.utils.shutdown_listener import should_continue

 with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    import litellm
+from litellm import ModelInfo
 from litellm import completion as litellm_completion
 from litellm import completion_cost as litellm_completion_cost
 from litellm.exceptions import (
    APIConnectionError,
-    ContentPolicyViolationError,
+    APIError,
    InternalServerError,
-    NotFoundError,
-    OpenAIError,
    RateLimitError,
    ServiceUnavailableError,
 )
-from litellm.types.utils import CostPerToken
-from tenacity import (
-    retry,
-    retry_if_exception_type,
-    retry_if_not_exception_type,
-    stop_after_attempt,
-    wait_exponential,
-)
+from litellm.types.utils import CostPerToken, ModelResponse, Usage

-from openhands.core.exceptions import (
-    LLMResponseError,
-    OperationCancelled,
-    UserCancelledError,
-)
-from openhands.core.logger import llm_prompt_logger, llm_response_logger
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import Message
 from openhands.core.metrics import Metrics
-from openhands.runtime.utils.shutdown_listener import should_exit
+from openhands.llm.debug_mixin import DebugMixin
+from openhands.llm.retry_mixin import RetryMixin

 __all__ = ['LLM']

-message_separator = '\n\n----------\n\n'
+# tuple of exceptions to retry on
+LLM_RETRY_EXCEPTIONS: tuple[type[Exception], ...] = (
+    APIConnectionError,
+    APIError,
+    InternalServerError,
+    RateLimitError,
+    ServiceUnavailableError,
+)

-cache_prompting_supported_models = [
+# cache prompt supporting models
+# remove this when gemini and deepseek are supported
+CACHE_PROMPT_SUPPORTED_MODELS = [
    'claude-3-5-sonnet-20240620',
    'claude-3-haiku-20240307',
+    'claude-3-opus-20240229',
+    'anthropic/claude-3-opus-20240229',
+    'anthropic/claude-3-haiku-20240307',
+    'anthropic/claude-3-5-sonnet-20240620',
 ]


-class LLM:
+class LLM(RetryMixin, DebugMixin):
    """The LLM class represents a Language Model instance.

    Attributes:
@@ -70,25 +67,20 @@ class LLM:
        Passing simple parameters always overrides config.

        Args:
-            config: The LLM configuration
+            config: The LLM configuration.
+            metrics: The metrics to use.
        """
-        self.metrics = metrics if metrics is not None else Metrics()
-        self.cost_metric_supported = True
-        self.config = copy.deepcopy(config)
-
-        os.environ['OR_SITE_URL'] = self.config.openrouter_site_url
-        os.environ['OR_APP_NAME'] = self.config.openrouter_app_name
+        self.metrics: Metrics = metrics if metrics is not None else Metrics()
+        self.cost_metric_supported: bool = True
+        self.config: LLMConfig = copy.deepcopy(config)

        # list of LLM completions (for logging purposes). Each completion is a dict with the following keys:
        # - 'messages': list of messages
        # - 'response': response from the LLM
        self.llm_completions: list[dict[str, Any]] = []

-        # Set up config attributes with default values to prevent AttributeError
-        LLMConfig.set_missing_attributes(self.config)
-
        # litellm actually uses base Exception here for unknown model
-        self.model_info = None
+        self.model_info: ModelInfo | None = None
        try:
            if self.config.model.startswith('openrouter'):
                self.model_info = litellm.get_model_info(self.config.model)
@@ -100,15 +92,6 @@ class LLM:
        except Exception as e:
            logger.warning(f'Could not get model info for {config.model}:\n{e}')

-        # Tuple of exceptions to retry on
-        self.retry_exceptions = (
-            APIConnectionError,
-            ContentPolicyViolationError,
-            InternalServerError,
-            OpenAIError,
-            RateLimitError,
-        )
-
        # Set the max tokens in an LM-specific way if not set
        if self.config.max_input_tokens is None:
            if (
@@ -136,30 +119,6 @@ class LLM:
                ):
                    self.config.max_output_tokens = self.model_info['max_tokens']

-        # This only seems to work with Google as the provider, not with OpenRouter!
-        gemini_safety_settings = (
-            [
-                {
-                    'category': 'HARM_CATEGORY_HARASSMENT',
-                    'threshold': 'BLOCK_NONE',
-                },
-                {
-                    'category': 'HARM_CATEGORY_HATE_SPEECH',
-                    'threshold': 'BLOCK_NONE',
-                },
-                {
-                    'category': 'HARM_CATEGORY_SEXUALLY_EXPLICIT',
-                    'threshold': 'BLOCK_NONE',
-                },
-                {
-                    'category': 'HARM_CATEGORY_DANGEROUS_CONTENT',
-                    'threshold': 'BLOCK_NONE',
-                },
-            ]
-            if self.config.model.lower().startswith('gemini')
-            else None
-        )
-
        self._completion = partial(
            litellm_completion,
            model=self.config.model,
@@ -172,84 +131,52 @@ class LLM:
            temperature=self.config.temperature,
            top_p=self.config.top_p,
            drop_params=self.config.drop_params,
-            **(
-                {'safety_settings': gemini_safety_settings}
-                if gemini_safety_settings is not None
-                else {}
-            ),
        )

        if self.vision_is_active():
            logger.debug('LLM: model has vision enabled')
+        if self.is_caching_prompt_active():
+            logger.debug('LLM: caching prompt enabled')

        completion_unwrapped = self._completion

-        def log_retry_attempt(retry_state):
-            """With before_sleep, this is called before `custom_completion_wait` and
-            ONLY if the retry is triggered by an exception."""
-            if should_exit():
-                raise OperationCancelled(
-                    'Operation cancelled.'
-                )  # exits the @retry loop
-            exception = retry_state.outcome.exception()
-            logger.error(
-                f'{exception}. Attempt #{retry_state.attempt_number} | You can customize retry values in the configuration.',
-                exc_info=False,
-            )
-
-        def custom_completion_wait(retry_state):
-            """Custom wait function for litellm completion."""
-            if not retry_state:
-                return 0
-            exception = retry_state.outcome.exception() if retry_state.outcome else None
-            if exception is None:
-                return 0
-
-            min_wait_time = self.config.retry_min_wait
-            max_wait_time = self.config.retry_max_wait
-
-            # for rate limit errors, wait 1 minute by default, max 4 minutes between retries
-            exception_type = type(exception).__name__
-            logger.error(f'\nexception_type: {exception_type}\n')
-
-            if exception_type == 'RateLimitError':
-                min_wait_time = 60
-                max_wait_time = 240
-            elif exception_type == 'BadRequestError' and exception.response:
-                # this should give us the burried, actual error message from
-                # the LLM model.
-                logger.error(f'\n\nBadRequestError: {exception.response}\n\n')
-
-            # Return the wait time using exponential backoff
-            exponential_wait = wait_exponential(
-                multiplier=self.config.retry_multiplier,
-                min=min_wait_time,
-                max=max_wait_time,
-            )
-
-            # Call the exponential wait function with retry_state to get the actual wait time
-            return exponential_wait(retry_state)
-
-        @retry(
-            before_sleep=log_retry_attempt,
-            stop=stop_after_attempt(self.config.num_retries),
-            reraise=True,
-            retry=(
-                retry_if_exception_type(self.retry_exceptions)
-                & retry_if_not_exception_type(OperationCancelled)
-            ),
-            wait=custom_completion_wait,
+        @self.retry_decorator(
+            num_retries=self.config.num_retries,
+            retry_exceptions=LLM_RETRY_EXCEPTIONS,
+            retry_min_wait=self.config.retry_min_wait,
+            retry_max_wait=self.config.retry_max_wait,
+            retry_multiplier=self.config.retry_multiplier,
        )
        def wrapper(*args, **kwargs):
            """Wrapper for the litellm completion function. Logs the input and output of the completion function."""
-            # some callers might just send the messages directly
-            if 'messages' in kwargs:
-                messages = kwargs['messages']
-            else:
-                messages = args[1] if len(args) > 1 else []
+            messages: list[dict[str, Any]] | dict[str, Any] = []

-            # this serves to prevent empty messages and logging the messages
-            debug_message = self._get_debug_message(messages)
+            # some callers might send the model and messages directly
+            # litellm allows positional args, like completion(model, messages, **kwargs)
+            if len(args) > 1:
+                # ignore the first argument if it's provided (it would be the model)
+                # design wise: we don't allow overriding the configured values
+                # implementation wise: the partial function sets the model as a kwarg already
+                # as well as other kwargs
+                messages = args[1] if len(args) > 1 else args[0]
+                kwargs['messages'] = messages
+
+                # remove the first args, they're sent in kwargs
+                args = args[2:]
+            elif 'messages' in kwargs:
+                messages = kwargs['messages']
+
+            # ensure we work with a list of messages
+            messages = messages if isinstance(messages, list) else [messages]
+
+            # if we have no messages, something went very wrong
+            if not messages:
+                raise ValueError(
+                    'The messages list is empty. At least one message is required.'
+                )
+
+            # log the entire LLM prompt
+            self.log_prompt(messages)

            if self.is_caching_prompt_active():
                # Anthropic-specific prompt caching
@@ -258,239 +185,31 @@ class LLM:
                        'anthropic-beta': 'prompt-caching-2024-07-31',
                    }

-            # skip if messages is empty (thus debug_message is empty)
-            if debug_message:
-                llm_prompt_logger.debug(debug_message)
-                resp = completion_unwrapped(*args, **kwargs)
-            else:
-                logger.debug('No completion messages!')
-                resp = {'choices': [{'message': {'content': ''}}]}
+            # we don't support streaming here, thus we get a ModelResponse
+            resp: ModelResponse = completion_unwrapped(*args, **kwargs)

+            # log for evals or other scripts that need the raw completion
            if self.config.log_completions:
                self.llm_completions.append(
                    {
                        'messages': messages,
                        'response': resp,
                        'timestamp': time.time(),
-                        'cost': self.completion_cost(resp),
+                        'cost': self._completion_cost(resp),
                    }
                )

-            # log the response
-            message_back = resp['choices'][0]['message']['content']
-            if message_back:
-                llm_response_logger.debug(message_back)
+            message_back: str = resp['choices'][0]['message']['content']

-                # post-process to log costs
-                self._post_completion(resp)
+            # log the LLM response
+            self.log_response(message_back)
+
+            # post-process the response
+            self._post_completion(resp)

            return resp

-        self._completion = wrapper  # type: ignore
-
-        # Async version
-        self._async_completion = partial(
-            self._call_acompletion,
-            model=self.config.model,
-            api_key=self.config.api_key,
-            base_url=self.config.base_url,
-            api_version=self.config.api_version,
-            custom_llm_provider=self.config.custom_llm_provider,
-            max_tokens=self.config.max_output_tokens,
-            timeout=self.config.timeout,
-            temperature=self.config.temperature,
-            top_p=self.config.top_p,
-            drop_params=self.config.drop_params,
-            **(
-                {'safety_settings': gemini_safety_settings}
-                if gemini_safety_settings is not None
-                else {}
-            ),
-        )
-
-        async_completion_unwrapped = self._async_completion
-
-        @retry(
-            before_sleep=log_retry_attempt,
-            stop=stop_after_attempt(self.config.num_retries),
-            reraise=True,
-            retry=(
-                retry_if_exception_type(self.retry_exceptions)
-                & retry_if_not_exception_type(OperationCancelled)
-            ),
-            wait=custom_completion_wait,
-        )
-        async def async_completion_wrapper(*args, **kwargs):
-            """Async wrapper for the litellm acompletion function."""
-            # some callers might just send the messages directly
-            if 'messages' in kwargs:
-                messages = kwargs['messages']
-            else:
-                messages = args[1] if len(args) > 1 else []
-
-            # this serves to prevent empty messages and logging the messages
-            debug_message = self._get_debug_message(messages)
-
-            async def check_stopped():
-                while should_continue():
-                    if (
-                        hasattr(self.config, 'on_cancel_requested_fn')
-                        and self.config.on_cancel_requested_fn is not None
-                        and await self.config.on_cancel_requested_fn()
-                    ):
-                        raise UserCancelledError('LLM request cancelled by user')
-                    await asyncio.sleep(0.1)
-
-            stop_check_task = asyncio.create_task(check_stopped())
-
-            try:
-                # Directly call and await litellm_acompletion
-                if debug_message:
-                    llm_prompt_logger.debug(debug_message)
-                    resp = await async_completion_unwrapped(*args, **kwargs)
-                else:
-                    logger.debug('No completion messages!')
-                    resp = {'choices': [{'message': {'content': ''}}]}
-
-                # skip if messages is empty (thus debug_message is empty)
-                if debug_message:
-                    message_back = resp['choices'][0]['message']['content']
-                    llm_response_logger.debug(message_back)
-                else:
-                    resp = {'choices': [{'message': {'content': ''}}]}
-                self._post_completion(resp)
-
-                # We do not support streaming in this method, thus return resp
-                return resp
-
-            except UserCancelledError:
-                logger.info('LLM request cancelled by user.')
-                raise
-            except (
-                APIConnectionError,
-                ContentPolicyViolationError,
-                InternalServerError,
-                NotFoundError,
-                OpenAIError,
-                RateLimitError,
-                ServiceUnavailableError,
-            ) as e:
-                logger.error(f'Completion Error occurred:\n{e}')
-                raise
-
-            finally:
-                await asyncio.sleep(0.1)
-                stop_check_task.cancel()
-                try:
-                    await stop_check_task
-                except asyncio.CancelledError:
-                    pass
-
-        @retry(
-            before_sleep=log_retry_attempt,
-            stop=stop_after_attempt(self.config.num_retries),
-            reraise=True,
-            retry=(
-                retry_if_exception_type(self.retry_exceptions)
-                & retry_if_not_exception_type(OperationCancelled)
-            ),
-            wait=custom_completion_wait,
-        )
-        async def async_acompletion_stream_wrapper(*args, **kwargs):
-            """Async wrapper for the litellm acompletion with streaming function."""
-            # some callers might just send the messages directly
-            if 'messages' in kwargs:
-                messages = kwargs['messages']
-            else:
-                messages = args[1] if len(args) > 1 else []
-
-            # log the prompt
-            debug_message = ''
-            for message in messages:
-                debug_message += message_separator + message['content']
-            llm_prompt_logger.debug(debug_message)
-
-            try:
-                # Directly call and await litellm_acompletion
-                resp = await async_completion_unwrapped(*args, **kwargs)
-
-                # For streaming we iterate over the chunks
-                async for chunk in resp:
-                    # Check for cancellation before yielding the chunk
-                    if (
-                        hasattr(self.config, 'on_cancel_requested_fn')
-                        and self.config.on_cancel_requested_fn is not None
-                        and await self.config.on_cancel_requested_fn()
-                    ):
-                        raise UserCancelledError(
-                            'LLM request cancelled due to CANCELLED state'
-                        )
-                    # with streaming, it is "delta", not "message"!
-                    message_back = chunk['choices'][0]['delta']['content']
-                    llm_response_logger.debug(message_back)
-                    self._post_completion(chunk)
-
-                    yield chunk
-
-            except UserCancelledError:
-                logger.info('LLM request cancelled by user.')
-                raise
-            except (
-                APIConnectionError,
-                ContentPolicyViolationError,
-                InternalServerError,
-                NotFoundError,
-                OpenAIError,
-                RateLimitError,
-                ServiceUnavailableError,
-            ) as e:
-                logger.error(f'Completion Error occurred:\n{e}')
-                raise
-
-            finally:
-                if kwargs.get('stream', False):
-                    await asyncio.sleep(0.1)
-
-        self._async_completion = async_completion_wrapper  # type: ignore
-        self._async_streaming_completion = async_acompletion_stream_wrapper  # type: ignore
-
-    def _get_debug_message(self, messages):
-        if not messages:
-            return ''
-
-        messages = messages if isinstance(messages, list) else [messages]
-        return message_separator.join(
-            self._format_message_content(msg) for msg in messages if msg['content']
-        )
-
-    def _format_message_content(self, message):
-        content = message['content']
-        if isinstance(content, list):
-            return self._format_list_content(content)
-        return str(content)
-
-    def _format_list_content(self, content_list):
-        return '\n'.join(
-            self._format_content_element(element) for element in content_list
-        )
-
-    def _format_content_element(self, element):
-        if isinstance(element, dict):
-            if 'text' in element:
-                return element['text']
-            if (
-                self.vision_is_active()
-                and 'image_url' in element
-                and 'url' in element['image_url']
-            ):
-                return element['image_url']['url']
-        return str(element)
-
-    async def _call_acompletion(self, *args, **kwargs):
-        """This is a wrapper for the litellm acompletion function which
-        makes it mockable for testing.
-        """
-        return await litellm.acompletion(*args, **kwargs)
+        self._completion = wrapper

    @property
    def completion(self):
@@ -498,32 +217,7 @@ class LLM:

        Check the complete documentation at https://litellm.vercel.app/docs/completion
        """
-        try:
-            return self._completion
-        except Exception as e:
-            raise LLMResponseError(e)
-
-    @property
-    def async_completion(self):
-        """Decorator for the async litellm acompletion function.
-
-        Check the complete documentation at https://litellm.vercel.app/docs/providers/ollama#example-usage---streaming--acompletion
-        """
-        try:
-            return self._async_completion
-        except Exception as e:
-            raise LLMResponseError(e)
-
-    @property
-    def async_streaming_completion(self):
-        """Decorator for the async litellm acompletion function with streaming.
-
-        Check the complete documentation at https://litellm.vercel.app/docs/providers/ollama#example-usage---streaming--acompletion
-        """
-        try:
-            return self._async_streaming_completion
-        except Exception as e:
-            raise LLMResponseError(e)
+        return self._completion

    def vision_is_active(self):
        return not self.config.disable_vision and self._supports_vision()
@@ -534,38 +228,50 @@ class LLM:
        Returns:
            bool: True if model is vision capable. If model is not supported by litellm, it will return False.
        """
-        try:
-            return litellm.supports_vision(self.config.model)
-        except Exception:
-            return False
-
-    def is_caching_prompt_active(self) -> bool:
-        """Check if prompt caching is enabled and supported for current model.
-
-        Returns:
-            boolean: True if prompt caching is active for the given model.
-        """
-        return self.config.caching_prompt is True and any(
-            model in self.config.model for model in cache_prompting_supported_models
+        # litellm.supports_vision currently returns False for 'openai/gpt-...' or 'anthropic/claude-...' (with prefixes)
+        # but model_info will have the correct value for some reason.
+        # we can go with it, but we will need to keep an eye if model_info is correct for Vertex or other providers
+        # remove when litellm is updated to fix https://github.com/BerriAI/litellm/issues/5608
+        return litellm.supports_vision(self.config.model) or (
+            self.model_info is not None
+            and self.model_info.get('supports_vision', False)
        )

-    def _post_completion(self, response) -> None:
-        """Post-process the completion response."""
+    def is_caching_prompt_active(self) -> bool:
+        """Check if prompt caching is supported and enabled for current model.
+
+        Returns:
+            boolean: True if prompt caching is supported and enabled for the given model.
+        """
+        return (
+            self.config.caching_prompt is True
+            and self.model_info is not None
+            and self.model_info.get('supports_prompt_caching', False)
+            and self.config.model in CACHE_PROMPT_SUPPORTED_MODELS
+        )
+
+    def _post_completion(self, response: ModelResponse) -> None:
+        """Post-process the completion response.
+
+        Logs the cost and usage stats of the completion call.
+        """
        try:
-            cur_cost = self.completion_cost(response)
+            cur_cost = self._completion_cost(response)
        except Exception:
            cur_cost = 0

        stats = ''
        if self.cost_metric_supported:
+            # keep track of the cost
            stats = 'Cost: %.2f USD | Accumulated Cost: %.2f USD\n' % (
                cur_cost,
                self.metrics.accumulated_cost,
            )

-        usage = response.get('usage')
+        usage: Usage | None = response.get('usage')

        if usage:
+            # keep track of the input and output tokens
            input_tokens = usage.get('prompt_tokens')
            output_tokens = usage.get('completion_tokens')

@@ -580,6 +286,7 @@ class LLM:
                    + '\n'
                )

+            # read the prompt caching status as received from the provider
            model_extra = usage.get('model_extra', {})

            cache_creation_input_tokens = model_extra.get('cache_creation_input_tokens')
@@ -596,6 +303,7 @@ class LLM:
                    'Input tokens (cache read): ' + str(cache_read_input_tokens) + '\n'
                )

+        # log the stats
        if stats:
            logger.info(stats)

@@ -614,7 +322,7 @@ class LLM:
            # TODO: this is to limit logspam in case token count is not supported
            return 0

-    def is_local(self):
+    def _is_local(self):
        """Determines if the system is using a locally running LLM.

        Returns:
@@ -629,7 +337,7 @@ class LLM:
                return True
        return False

-    def completion_cost(self, response):
+    def _completion_cost(self, response):
        """Calculate the cost of a completion response based on the model.  Local models are treated as free.
        Add the current cost into total cost in metrics.

@@ -654,7 +362,7 @@ class LLM:
            logger.info(f'Using custom cost per token: {cost_per_token}')
            extra_kwargs['custom_cost_per_token'] = cost_per_token

-        if not self.is_local():
+        if not self._is_local():
            try:
                cost = litellm_completion_cost(
                    completion_response=response, **extra_kwargs
@@ -682,5 +390,12 @@ class LLM:

    def format_messages_for_llm(self, messages: Message | list[Message]) -> list[dict]:
        if isinstance(messages, Message):
-            return [messages.model_dump()]
+            messages = [messages]
+
+        # set flags to know how to serialize the messages
+        for message in messages:
+            message.cache_enabled = self.is_caching_prompt_active()
+            message.vision_enabled = self.vision_is_active()
+
+        # let pydantic handle the serialization
        return [message.model_dump() for message in messages]
@@ -0,0 +1,50 @@
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
+
+from openhands.core.logger import openhands_logger as logger
+from openhands.utils.tenacity_stop import stop_if_should_exit
+
+
+class RetryMixin:
+    """Mixin class for retry logic."""
+
+    def retry_decorator(self, **kwargs):
+        """
+        Create an LLM retry decorator with customizable parameters. This is used for 429 errors, and a few other exceptions in LLM classes.
+
+        Args:
+            **kwargs: Keyword arguments to override default retry behavior.
+                      Keys: num_retries, retry_exceptions, retry_min_wait, retry_max_wait, retry_multiplier
+
+        Returns:
+            A retry decorator with the parameters customizable in configuration.
+        """
+        num_retries = kwargs.get('num_retries')
+        retry_exceptions = kwargs.get('retry_exceptions')
+        retry_min_wait = kwargs.get('retry_min_wait')
+        retry_max_wait = kwargs.get('retry_max_wait')
+        retry_multiplier = kwargs.get('retry_multiplier')
+
+        return retry(
+            before_sleep=self.log_retry_attempt,
+            stop=stop_after_attempt(num_retries) | stop_if_should_exit(),
+            reraise=True,
+            retry=(retry_if_exception_type(retry_exceptions)),
+            wait=wait_exponential(
+                multiplier=retry_multiplier,
+                min=retry_min_wait,
+                max=retry_max_wait,
+            ),
+        )
+
+    def log_retry_attempt(self, retry_state):
+        """Log retry attempts."""
+        exception = retry_state.outcome.exception()
+        logger.error(
+            f'{exception}. Attempt #{retry_state.attempt_number} | You can customize retry values in the configuration.',
+            exc_info=False,
+        )
@@ -0,0 +1,106 @@
+import asyncio
+from functools import partial
+from typing import Any
+
+from openhands.core.exceptions import UserCancelledError
+from openhands.core.logger import openhands_logger as logger
+from openhands.llm.async_llm import LLM_RETRY_EXCEPTIONS, AsyncLLM
+
+
+class StreamingLLM(AsyncLLM):
+    """Streaming LLM class."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._async_streaming_completion = partial(
+            self._call_acompletion,
+            model=self.config.model,
+            api_key=self.config.api_key,
+            base_url=self.config.base_url,
+            api_version=self.config.api_version,
+            custom_llm_provider=self.config.custom_llm_provider,
+            max_tokens=self.config.max_output_tokens,
+            timeout=self.config.timeout,
+            temperature=self.config.temperature,
+            top_p=self.config.top_p,
+            drop_params=self.config.drop_params,
+            stream=True,  # Ensure streaming is enabled
+        )
+
+        async_streaming_completion_unwrapped = self._async_streaming_completion
+
+        @self.retry_decorator(
+            num_retries=self.config.num_retries,
+            retry_exceptions=LLM_RETRY_EXCEPTIONS,
+            retry_min_wait=self.config.retry_min_wait,
+            retry_max_wait=self.config.retry_max_wait,
+            retry_multiplier=self.config.retry_multiplier,
+        )
+        async def async_streaming_completion_wrapper(*args, **kwargs):
+            messages: list[dict[str, Any]] | dict[str, Any] = []
+
+            # some callers might send the model and messages directly
+            # litellm allows positional args, like completion(model, messages, **kwargs)
+            # see llm.py for more details
+            if len(args) > 1:
+                messages = args[1] if len(args) > 1 else args[0]
+                kwargs['messages'] = messages
+
+                # remove the first args, they're sent in kwargs
+                args = args[2:]
+            elif 'messages' in kwargs:
+                messages = kwargs['messages']
+
+            # ensure we work with a list of messages
+            messages = messages if isinstance(messages, list) else [messages]
+
+            # if we have no messages, something went very wrong
+            if not messages:
+                raise ValueError(
+                    'The messages list is empty. At least one message is required.'
+                )
+
+            self.log_prompt(messages)
+
+            try:
+                # Directly call and await litellm_acompletion
+                resp = await async_streaming_completion_unwrapped(*args, **kwargs)
+
+                # For streaming we iterate over the chunks
+                async for chunk in resp:
+                    # Check for cancellation before yielding the chunk
+                    if (
+                        hasattr(self.config, 'on_cancel_requested_fn')
+                        and self.config.on_cancel_requested_fn is not None
+                        and await self.config.on_cancel_requested_fn()
+                    ):
+                        raise UserCancelledError(
+                            'LLM request cancelled due to CANCELLED state'
+                        )
+                    # with streaming, it is "delta", not "message"!
+                    message_back = chunk['choices'][0]['delta'].get('content', '')
+                    if message_back:
+                        self.log_response(message_back)
+                    self._post_completion(chunk)
+
+                    yield chunk
+
+            except UserCancelledError:
+                logger.info('LLM request cancelled by user.')
+                raise
+            except Exception as e:
+                logger.error(f'Completion Error occurred:\n{e}')
+                raise
+
+            finally:
+                # sleep for 0.1 seconds to allow the stream to be flushed
+                if kwargs.get('stream', False):
+                    await asyncio.sleep(0.1)
+
+        self._async_streaming_completion = async_streaming_completion_wrapper
+
+    @property
+    def async_streaming_completion(self):
+        """Decorator for the async litellm acompletion function with streaming."""
+        return self._async_streaming_completion
@@ -11,6 +11,7 @@ from tenacity import (
 from openhands.core.config import LLMConfig
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.utils import json
+from openhands.utils.tenacity_stop import stop_if_should_exit

 try:
    import chromadb
@@ -50,7 +51,7 @@ if LLAMA_INDEX_AVAILABLE:

    @retry(
        reraise=True,
-        stop=stop_after_attempt(num_retries),
+        stop=stop_after_attempt(num_retries) | stop_if_should_exit(),
        wait=wait_random_exponential(min=retry_min_wait, max=retry_max_wait),
        retry=retry_if_exception_type(
            (RateLimitError, APIConnectionError, InternalServerError)
@@ -17,6 +17,7 @@ from PIL import Image
 from openhands.core.exceptions import BrowserInitException
 from openhands.core.logger import openhands_logger as logger
 from openhands.runtime.utils.shutdown_listener import should_continue, should_exit
+from openhands.utils.tenacity_stop import stop_if_should_exit

 BROWSER_EVAL_GET_GOAL_ACTION = 'GET_EVAL_GOAL'
 BROWSER_EVAL_GET_REWARDS_ACTION = 'GET_EVAL_REWARDS'
@@ -52,7 +53,7 @@ class BrowserEnv:

    @tenacity.retry(
        wait=tenacity.wait_fixed(1),
-        stop=tenacity.stop_after_attempt(5),
+        stop=tenacity.stop_after_attempt(5) | stop_if_should_exit(),
        retry=tenacity.retry_if_exception_type(BrowserInitException),
    )
    def init_browser(self):
@@ -26,12 +26,13 @@ class RuntimeBuilder(abc.ABC):
        pass

    @abc.abstractmethod
-    def image_exists(self, image_name: str) -> bool:
+    def image_exists(self, image_name: str, pull_from_repo: bool = True) -> bool:
        """
        Check if the runtime image exists.

        Args:
            image_name (str): The name of the runtime image (e.g., "repo:sha").
+            pull_from_repo (bool): Whether to pull from the remote repo if the image is not present locally.

        Returns:
            bool: Whether the runtime image exists.
@@ -1,7 +1,12 @@
+import datetime
+import os
+import subprocess
 import sys
+import time

 import docker

+from openhands import __version__ as oh_version
 from openhands.core.logger import openhands_logger as logger
 from openhands.runtime.builder.base import RuntimeBuilder

@@ -10,45 +15,139 @@ class DockerRuntimeBuilder(RuntimeBuilder):
    def __init__(self, docker_client: docker.DockerClient):
        self.docker_client = docker_client

-    def build(self, path: str, tags: list[str]) -> str:
+        version_info = self.docker_client.version()
+        server_version = version_info.get('Version', '')
+        if tuple(map(int, server_version.split('.'))) < (18, 9):
+            raise RuntimeError('Docker server version must be >= 18.09 to use BuildKit')
+
+        self.max_lines = 10
+        self.log_lines = [''] * self.max_lines
+
+    def build(
+        self,
+        path: str,
+        tags: list[str],
+        use_local_cache: bool = False,
+        extra_build_args: list[str] | None = None,
+    ) -> str:
+        """Builds a Docker image using BuildKit and handles the build logs appropriately.
+
+        Args:
+            path (str): The path to the Docker build context.
+            tags (list[str]): A list of image tags to apply to the built image.
+            use_local_cache (bool, optional): Whether to use and update the local build cache. Defaults to False.
+            extra_build_args (list[str], optional): Additional arguments to pass to the Docker build command. Defaults to None.
+
+        Returns:
+            str: The name of the built Docker image.
+
+        Raises:
+            RuntimeError: If the Docker server version is incompatible or if the build process fails.
+
+        Note:
+            This method uses Docker BuildKit for improved build performance and caching capabilities.
+            If `use_local_cache` is True, it will attempt to use and update the build cache in a local directory.
+            The `extra_build_args` parameter allows for passing additional Docker build arguments as needed.
+        """
+        self.docker_client = docker.from_env()
+        version_info = self.docker_client.version()
+        server_version = version_info.get('Version', '')
+        if tuple(map(int, server_version.split('.'))) < (18, 9):
+            raise RuntimeError('Docker server version must be >= 18.09 to use BuildKit')
+
        target_image_hash_name = tags[0]
        target_image_repo, target_image_hash_tag = target_image_hash_name.split(':')
        target_image_tag = tags[1].split(':')[1] if len(tags) > 1 else None

-        try:
-            build_logs = self.docker_client.api.build(
-                path=path,
-                tag=target_image_hash_name,
-                rm=True,
-                decode=True,
-            )
-        except docker.errors.BuildError as e:
-            logger.error(f'Sandbox image build failed: {e}')
-            raise RuntimeError(f'Sandbox image build failed: {e}')
+        # Check if the image exists and pull if necessary
+        self.image_exists(target_image_hash_name)

-        layers: dict[str, dict[str, str]] = {}
-        previous_layer_count = 0
-        for log in build_logs:
-            if 'stream' in log:
-                logger.info(log['stream'].strip())
-            elif 'error' in log:
-                logger.error(log['error'].strip())
-            elif 'status' in log:
-                self._output_build_progress(log, layers, previous_layer_count)
-                previous_layer_count = len(layers)
-            else:
-                logger.info(str(log))
+        buildx_cmd = [
+            'docker',
+            'buildx',
+            'build',
+            '--progress=plain',
+            f'--build-arg=OPENHANDS_RUNTIME_VERSION={oh_version}',
+            f'--build-arg=OPENHANDS_RUNTIME_BUILD_TIME={datetime.datetime.now().isoformat()}',
+            f'--tag={target_image_hash_name}',
+            '--load',
+        ]
+
+        cache_dir = '/tmp/.buildx-cache'
+        if use_local_cache and self._is_cache_usable(cache_dir):
+            buildx_cmd.extend(
+                [
+                    f'--cache-from=type=local,src={cache_dir}',
+                    f'--cache-to=type=local,dest={cache_dir},mode=max',
+                ]
+            )
+
+        if extra_build_args:
+            buildx_cmd.extend(extra_build_args)
+
+        buildx_cmd.append(path)  # must be last!
+
+        print('================ DOCKER BUILD STARTED ================')
+        if sys.stdout.isatty():
+            sys.stdout.write('\n' * self.max_lines)
+            sys.stdout.flush()
+
+        try:
+            process = subprocess.Popen(
+                buildx_cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                universal_newlines=True,
+                bufsize=1,
+            )
+
+            if process.stdout:
+                for line in iter(process.stdout.readline, ''):
+                    line = line.strip()
+                    if line:
+                        self._output_logs(line)
+
+            return_code = process.wait()
+
+            if return_code != 0:
+                raise subprocess.CalledProcessError(
+                    return_code,
+                    process.args,
+                    output=process.stdout.read() if process.stdout else None,
+                    stderr=process.stderr.read() if process.stderr else None,
+                )
+
+        except subprocess.CalledProcessError as e:
+            logger.error(f'Image build failed:\n{e}')
+            logger.error(f'Command output:\n{e.output}')
+            raise
+
+        except subprocess.TimeoutExpired:
+            logger.error('Image build timed out')
+            raise
+
+        except FileNotFoundError as e:
+            logger.error(f'Python executable not found: {e}')
+            raise
+
+        except PermissionError as e:
+            logger.error(
+                f'Permission denied when trying to execute the build command:\n{e}'
+            )
+            raise
+
+        except Exception as e:
+            logger.error(f'An unexpected error occurred during the build process: {e}')
+            raise

        logger.info(f'Image [{target_image_hash_name}] build finished.')

-        assert (
-            target_image_tag
-        ), f'Expected target image tag [{target_image_tag}] is None'
-        image = self.docker_client.images.get(target_image_hash_name)
-        image.tag(target_image_repo, target_image_tag)
-        logger.info(
-            f'Re-tagged image [{target_image_hash_name}] with more generic tag [{target_image_tag}]'
-        )
+        if target_image_tag:
+            image = self.docker_client.images.get(target_image_hash_name)
+            image.tag(target_image_repo, target_image_tag)
+            logger.info(
+                f'Re-tagged image [{target_image_hash_name}] with more generic tag [{target_image_tag}]'
+            )

        # Check if the image is built successfully
        image = self.docker_client.images.get(target_image_hash_name)
@@ -67,11 +166,12 @@ class DockerRuntimeBuilder(RuntimeBuilder):
        )
        return target_image_hash_name

-    def image_exists(self, image_name: str) -> bool:
+    def image_exists(self, image_name: str, pull_from_repo: bool = True) -> bool:
        """Check if the image exists in the registry (try to pull it first) or in the local store.

        Args:
            image_name (str): The Docker image to check (<image repo>:<image tag>)
+            pull_from_repo (bool): Whether to pull from the remote repo if the image is not present locally
        Returns:
            bool: Whether the Docker image exists in the registry or in the local store
        """
@@ -80,27 +180,37 @@ class DockerRuntimeBuilder(RuntimeBuilder):
            return False

        try:
-            logger.info(f'Checking, if image exists locally:\n{image_name}')
+            logger.debug(f'Checking, if image exists locally:\n{image_name}')
            self.docker_client.images.get(image_name)
-            logger.info('Image found locally.')
+            logger.debug('Image found locally.')
            return True
        except docker.errors.ImageNotFound:
+            if not pull_from_repo:
+                logger.debug(f'Image {image_name} not found locally')
+                return False
            try:
-                logger.info(
+                logger.debug(
                    'Image not found locally. Trying to pull it, please wait...'
                )

                layers: dict[str, dict[str, str]] = {}
                previous_layer_count = 0
+
+                if ':' in image_name:
+                    image_repo, image_tag = image_name.split(':', 1)
+                else:
+                    image_repo = image_name
+                    image_tag = None
+
                for line in self.docker_client.api.pull(
-                    image_name, stream=True, decode=True
+                    image_repo, tag=image_tag, stream=True, decode=True
                ):
                    self._output_build_progress(line, layers, previous_layer_count)
                    previous_layer_count = len(layers)
-                logger.info('Image pulled')
+                logger.debug('Image pulled')
                return True
            except docker.errors.ImageNotFound:
-                logger.info('Could not find image locally or in registry.')
+                logger.debug('Could not find image locally or in registry.')
                return False
            except Exception as e:
                msg = 'Image could not be pulled: '
@@ -109,9 +219,30 @@ class DockerRuntimeBuilder(RuntimeBuilder):
                    msg += 'image not found in registry.'
                else:
                    msg += f'{ex_msg}'
-                logger.warning(msg)
+                logger.debug(msg)
                return False

+    def _output_logs(self, new_line: str) -> None:
+        """Display the last 10 log_lines in the console (not for file logging).
+        This will create the effect of a rolling display in the console.
+
+        '\033[F'    moves the cursor up one line.
+        '\033[2K\r' clears the line and moves the cursor to the beginning of the line.
+        """
+        if not sys.stdout.isatty():
+            logger.debug(new_line)
+            return
+
+        self.log_lines.pop(0)
+        self.log_lines.append(new_line[:80])
+
+        sys.stdout.write('\033[F' * (self.max_lines))
+        sys.stdout.flush()
+
+        for line in self.log_lines:
+            sys.stdout.write('\033[2K' + line + '\n')
+            sys.stdout.flush()
+
    def _output_build_progress(
        self, current_line: dict, layers: dict, previous_layer_count: int
    ) -> None:
@@ -126,31 +257,93 @@ class DockerRuntimeBuilder(RuntimeBuilder):
            if 'progress' in current_line:
                layers[layer_id]['progress'] = current_line['progress']

-            if (
-                'total' in current_line['progressDetail']
-                and 'current' in current_line['progressDetail']
-            ):
-                total = current_line['progressDetail']['total']
-                current = current_line['progressDetail']['current']
-                percentage = (current / total) * 100
-            else:
-                percentage = 0
+            if 'progressDetail' in current_line:
+                progress_detail = current_line['progressDetail']
+                if 'total' in progress_detail and 'current' in progress_detail:
+                    total = progress_detail['total']
+                    current = progress_detail['current']
+                    percentage = min(
+                        (current / total) * 100, 100
+                    )  # Ensure it doesn't exceed 100%
+                else:
+                    percentage = (
+                        100 if layers[layer_id]['status'] == 'Download complete' else 0
+                    )

-            # refresh process bar in console if stdout is a tty
            if sys.stdout.isatty():
                sys.stdout.write('\033[F' * previous_layer_count)
                for lid, layer_data in sorted(layers.items()):
-                    sys.stdout.write('\033[K')
-                    print(
-                        f'Layer {lid}: {layer_data["progress"]} {layer_data["status"]}'
-                    )
+                    sys.stdout.write('\033[2K\r')
+                    status = layer_data['status']
+                    progress = layer_data['progress']
+                    if status == 'Download complete':
+                        print(f'Layer {lid}: Download complete')
+                    elif status == 'Already exists':
+                        print(f'Layer {lid}: Already exists')
+                    else:
+                        print(f'Layer {lid}: {progress} {status}')
                sys.stdout.flush()
-            # otherwise Log only if percentage is at least 10% higher than last logged
-            elif percentage != 0 and percentage - layers[layer_id]['last_logged'] >= 10:
-                logger.info(
+            elif percentage != 0 and (
+                percentage - layers[layer_id]['last_logged'] >= 10 or percentage == 100
+            ):
+                logger.debug(
                    f'Layer {layer_id}: {layers[layer_id]["progress"]} {layers[layer_id]["status"]}'
                )

            layers[layer_id]['last_logged'] = percentage
        elif 'status' in current_line:
-            logger.info(current_line['status'])
+            logger.debug(current_line['status'])
+
+    def _prune_old_cache_files(self, cache_dir: str, max_age_days: int = 7) -> None:
+        """
+        Prune cache files older than the specified number of days.
+
+        Args:
+            cache_dir (str): The path to the cache directory.
+            max_age_days (int): The maximum age of cache files in days.
+        """
+        try:
+            current_time = time.time()
+            max_age_seconds = max_age_days * 24 * 60 * 60
+
+            for root, _, files in os.walk(cache_dir):
+                for file in files:
+                    file_path = os.path.join(root, file)
+                    try:
+                        file_age = current_time - os.path.getmtime(file_path)
+                        if file_age > max_age_seconds:
+                            os.remove(file_path)
+                            logger.debug(f'Removed old cache file: {file_path}')
+                    except Exception as e:
+                        logger.warning(f'Error processing cache file {file_path}: {e}')
+        except Exception as e:
+            logger.warning(f'Error during build cache pruning: {e}')
+
+    def _is_cache_usable(self, cache_dir: str) -> bool:
+        """
+        Check if the cache directory is usable (exists and is writable).
+
+        Args:
+            cache_dir (str): The path to the cache directory.
+
+        Returns:
+            bool: True if the cache directory is usable, False otherwise.
+        """
+        if not os.path.exists(cache_dir):
+            try:
+                os.makedirs(cache_dir, exist_ok=True)
+                logger.debug(f'Created cache directory: {cache_dir}')
+            except OSError as e:
+                logger.debug(f'Failed to create cache directory {cache_dir}: {e}')
+                return False
+
+        if not os.access(cache_dir, os.W_OK):
+            logger.warning(
+                f'Cache directory {cache_dir} is not writable. Caches will not be used for Docker builds.'
+            )
+            return False
+
+        self._prune_old_cache_files(cache_dir)
+
+        logger.debug(f'Cache directory {cache_dir} is usable')
+        return True
@@ -8,7 +8,10 @@ import requests
 from openhands.core.logger import openhands_logger as logger
 from openhands.runtime.builder import RuntimeBuilder
 from openhands.runtime.utils.request import send_request
-from openhands.runtime.utils.shutdown_listener import should_exit, sleep_if_should_continue
+from openhands.runtime.utils.shutdown_listener import (
+    should_continue,
+    sleep_if_should_continue,
+)


 class RemoteRuntimeBuilder(RuntimeBuilder):
@@ -57,8 +60,8 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
        # Poll /build_status until the build is complete
        start_time = time.time()
        timeout = 30 * 60  # 30 minutes in seconds
-        while True:
-            if should_exit() or time.time() - start_time > timeout:
+        while should_continue():
+            if time.time() - start_time > timeout:
                logger.error('Build timed out after 30 minutes')
                raise RuntimeError('Build timed out after 30 minutes')

@@ -98,7 +101,9 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
            # Wait before polling again
            sleep_if_should_continue(30)

-    def image_exists(self, image_name: str) -> bool:
+        raise RuntimeError('Build interrupted (likely received SIGTERM or SIGINT).')
+
+    def image_exists(self, image_name: str, pull_from_repo: bool = True) -> bool:
        """Checks if an image exists in the remote registry using the /image_exists endpoint."""
        params = {'image': image_name}
        response = send_request(
@@ -11,13 +11,15 @@ import os
 import re
 import shutil
 import subprocess
+import time
 from contextlib import asynccontextmanager
 from pathlib import Path

 import pexpect
-from fastapi import FastAPI, HTTPException, Request, UploadFile
+from fastapi import Depends, FastAPI, HTTPException, Request, UploadFile
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse
+from fastapi.security import APIKeyHeader
 from pydantic import BaseModel
 from starlette.exceptions import HTTPException as StarletteHTTPException
 from uvicorn import run
@@ -28,7 +30,6 @@ from openhands.events.action import (
    BrowseInteractiveAction,
    BrowseURLAction,
    CmdRunAction,
-    FileEditAction,
    FileReadAction,
    FileWriteAction,
    IPythonRunCellAction,
@@ -36,7 +37,6 @@ from openhands.events.action import (
 from openhands.events.observation import (
    CmdOutputObservation,
    ErrorObservation,
-    FileEditObservation,
    FileReadObservation,
    FileWriteObservation,
    IPythonRunCellObservation,
@@ -50,11 +50,6 @@ from openhands.runtime.plugins import (
    JupyterPlugin,
    Plugin,
 )
-from openhands.runtime.plugins.agent_skills.file_ops import (
-    append_file,
-    create_file,
-    edit_file_by_replace,
-)
 from openhands.runtime.utils import split_bash_commands
 from openhands.runtime.utils.files import insert_lines, read_lines

@@ -69,9 +64,14 @@ INIT_COMMANDS = [
 ]
 SOFT_TIMEOUT_SECONDS = 5

-HEAD = '<<<<<<< SEARCH'
-DIVIDER = '======='
-TAIL = '>>>>>>> REPLACE'
+SESSION_API_KEY = os.environ.get('SESSION_API_KEY')
+api_key_header = APIKeyHeader(name='X-Session-API-Key', auto_error=False)
+
+
+def verify_api_key(api_key: str = Depends(api_key_header)):
+    if SESSION_API_KEY and api_key != SESSION_API_KEY:
+        raise HTTPException(status_code=403, detail='Invalid API Key')
+    return api_key


 class RuntimeClient:
@@ -97,6 +97,8 @@ class RuntimeClient:
        self.lock = asyncio.Lock()
        self.plugins: dict[str, Plugin] = {}
        self.browser = BrowserEnv(browsergym_eval_env)
+        self.start_time = time.time()
+        self.last_execution_time = self.start_time

    @property
    def initial_pwd(self):
@@ -331,7 +333,13 @@ class RuntimeClient:
            logger.debug('Requesting exit code...')
            self.shell.expect(self.__bash_expect_regex, timeout=timeout)
            _exit_code_output = self.shell.before
-            exit_code = int(_exit_code_output.strip().split()[0])
+            try:
+                exit_code = int(_exit_code_output.strip().split()[0])
+            except Exception:
+                logger.error('Error getting exit code from bash script')
+                # If we try to run an invalid shell script the output sometimes includes error text
+                # rather than the error code - we assume this is an error
+                exit_code = 2

        except pexpect.TIMEOUT as e:
            if kill_on_timeout:
@@ -520,55 +528,6 @@ class RuntimeClient:
            return ErrorObservation(f'Malformed paths not permitted: {filepath}')
        return FileWriteObservation(content='', path=filepath)

-    async def edit(self, action: FileEditAction) -> Observation:
-        diff_blocks = re.search(
-            f'(.*)\n{HEAD}(.*)\n{DIVIDER}(.*)\n{TAIL}', action.diff_block, re.DOTALL
-        )
-        if not diff_blocks or len(diff_blocks.groups()) < 3:
-            found_head = re.search(f'{HEAD}', action.diff_block) is not None
-            found_divider = re.search(f'{DIVIDER}', action.diff_block) is not None
-            found_tail = re.search(f'{TAIL}', action.diff_block) is not None
-
-            error_msg = 'Could not resolve diff block into search/replace blocks.'
-            if found_head and (not found_tail):
-                error_msg = 'The diff block got cut off because it is too long. Try breaking it into smaller SEARCH/REPLACE blocks.'
-            elif found_head and found_tail and (not found_divider):
-                error_msg = 'Could not find the divider between SEARCH/REPLACE blocks.'
-            return ErrorObservation(error_msg)
-
-        path = diff_blocks.group(1)
-        search_block = diff_blocks.group(2)
-        replace_block = diff_blocks.group(3)
-        if search_block:
-            search_block = search_block[1:]
-        if replace_block:
-            replace_block = replace_block[1:]
-
-        working_dir = self._get_working_directory()
-        filepath = self._resolve_path(path, working_dir)
-        if not search_block:
-            create_file(filename=filepath)
-            append_file(
-                file_name=filepath,
-                content=replace_block,
-            )
-        else:
-            if search_block == replace_block:
-                return ErrorObservation(
-                    'Search block should not be same as Replace block.'
-                )
-            edit_file_by_replace(
-                file_name=filepath,
-                to_replace=search_block,
-                new_content=replace_block,
-            )
-        return FileEditObservation(
-            content=action.diff_block,
-            path=filepath,
-            search_block=search_block,
-            replace_block=replace_block,
-        )
-
    async def browse(self, action: BrowseURLAction) -> Observation:
        return await browse(action, self.browser)

@@ -660,6 +619,24 @@ if __name__ == '__main__':
            response = await call_next(request)
        return response

+    @app.middleware('http')
+    async def authenticate_requests(request: Request, call_next):
+        if request.url.path != '/alive' and request.url.path != '/server_info':
+            try:
+                verify_api_key(request.headers.get('X-Session-API-Key'))
+            except HTTPException as e:
+                return e
+        response = await call_next(request)
+        return response
+
+    @app.get('/server_info')
+    async def get_server_info():
+        assert client is not None
+        current_time = time.time()
+        uptime = current_time - client.start_time
+        idle_time = current_time - client.last_execution_time
+        return {'uptime': uptime, 'idle_time': idle_time}
+
    @app.post('/execute_action')
    async def execute_action(action_request: ActionRequest):
        assert client is not None
@@ -667,10 +644,13 @@ if __name__ == '__main__':
            action = event_from_dict(action_request.action)
            if not isinstance(action, Action):
                raise HTTPException(status_code=400, detail='Invalid action type')
+            client.last_execution_time = time.time()
            observation = await client.run_action(action)
            return event_to_dict(observation)
        except Exception as e:
-            logger.error(f'Error processing command: {str(e)}')
+            logger.error(
+                f'Error processing command: {str(e)}', exc_info=True, stack_info=True
+            )
            raise HTTPException(status_code=500, detail=str(e))

    @app.post('/upload_file')
@@ -10,6 +10,7 @@ import requests
 import tenacity

 from openhands.core.config import AppConfig
+from openhands.core.logger import DEBUG
 from openhands.core.logger import openhands_logger as logger
 from openhands.events import EventStream
 from openhands.events.action import (
@@ -17,7 +18,6 @@ from openhands.events.action import (
    BrowseInteractiveAction,
    BrowseURLAction,
    CmdRunAction,
-    FileEditAction,
    FileReadAction,
    FileWriteAction,
    IPythonRunCellAction,
@@ -36,6 +36,7 @@ from openhands.runtime.plugins import PluginRequirement
 from openhands.runtime.runtime import Runtime
 from openhands.runtime.utils import find_available_tcp_port
 from openhands.runtime.utils.runtime_build import build_runtime_image
+from openhands.utils.tenacity_stop import stop_if_should_exit


 class LogBuffer:
@@ -126,9 +127,7 @@ class EventStreamRuntime(Runtime):
        self.config = config
        self._host_port = 30000  # initial dummy value
        self._container_port = 30001  # initial dummy value
-        self.api_url = (
-            f'http://{self.config.sandbox.api_hostname}:{self._container_port}'
-        )
+        self.api_url = f'{self.config.sandbox.local_runtime_url}:{self._container_port}'
        self.session = requests.Session()
        self.instance_id = (
            sid + '_' + str(uuid.uuid4()) if sid is not None else str(uuid.uuid4())
@@ -168,6 +167,7 @@ class EventStreamRuntime(Runtime):
                self.base_container_image,
                self.runtime_builder,
                extra_deps=self.config.sandbox.runtime_extra_deps,
+                force_rebuild=self.config.sandbox.force_rebuild_runtime,
            )
        self.container = self._init_container(
            sandbox_workspace_dir=self.config.workspace_mount_path_in_sandbox,  # e.g. /workspace
@@ -203,7 +203,7 @@ class EventStreamRuntime(Runtime):
            raise ex

    @tenacity.retry(
-        stop=tenacity.stop_after_attempt(5),
+        stop=tenacity.stop_after_attempt(5) | stop_if_should_exit(),
        wait=tenacity.wait_exponential(multiplier=1, min=4, max=60),
    )
    def _init_container(
@@ -226,7 +226,7 @@ class EventStreamRuntime(Runtime):
                self._host_port
            )  # in future this might differ from host port
            self.api_url = (
-                f'http://{self.config.sandbox.api_hostname}:{self._container_port}'
+                f'{self.config.sandbox.local_runtime_url}:{self._container_port}'
            )

            use_host_network = self.config.sandbox.use_host_network
@@ -249,7 +249,7 @@ class EventStreamRuntime(Runtime):
                'port': str(self._container_port),
                'PYTHONUNBUFFERED': 1,
            }
-            if self.config.debug:
+            if self.config.debug or DEBUG:
                environment['DEBUG'] = 'true'

            logger.debug(f'Workspace Base: {self.config.workspace_base}')
@@ -274,7 +274,7 @@ class EventStreamRuntime(Runtime):
            container = self.docker_client.containers.run(
                self.runtime_container_image,
                command=(
-                    f'/openhands/miniforge3/bin/mamba run --no-capture-output -n base '
+                    f'/openhands/micromamba/bin/micromamba run -n openhands '
                    f'poetry run '
                    f'python -u -m openhands.runtime.client.client {self._container_port} '
                    f'--working-dir "{sandbox_workspace_dir}" '
@@ -324,7 +324,7 @@ class EventStreamRuntime(Runtime):
            )

    @tenacity.retry(
-        stop=tenacity.stop_after_attempt(10),
+        stop=tenacity.stop_after_delay(120) | stop_if_should_exit(),
        wait=tenacity.wait_exponential(multiplier=2, min=1, max=20),
        reraise=(ConnectionRefusedError,),
    )
@@ -430,13 +430,15 @@ class EventStreamRuntime(Runtime):
                    logger.debug(f'response: {response}')
                    error_message = response.text
                    logger.error(f'Error from server: {error_message}')
-                    obs = ErrorObservation(f'Command execution failed: {error_message}')
+                    obs = ErrorObservation(f'Action execution failed: {error_message}')
            except requests.Timeout:
                logger.error('No response received within the timeout period.')
-                obs = ErrorObservation('Command execution timed out')
+                obs = ErrorObservation(
+                    f'Action execution timed out after {action.timeout} seconds.'
+                )
            except Exception as e:
-                logger.error(f'Error during command execution: {e}')
-                obs = ErrorObservation(f'Command execution failed: {str(e)}')
+                logger.error(f'Error during action execution: {e}')
+                obs = ErrorObservation(f'Action execution failed: {str(e)}')
            self._refresh_logs()
            return obs

@@ -452,9 +454,6 @@ class EventStreamRuntime(Runtime):
    def write(self, action: FileWriteAction) -> Observation:
        return self.run_action(action)

-    def edit(self, action: FileEditAction) -> Observation:
-        return self.run_action(action)
-
    def browse(self, action: BrowseURLAction) -> Observation:
        return self.run_action(action)

@@ -10,18 +10,9 @@ import_functions(
    module=file_reader, function_names=file_reader.__all__, target_globals=globals()
 )
 __all__ = file_ops.__all__ + file_reader.__all__
-__except__ = [
-    'create_file',
-    'edit_file_by_replace',
-    'insert_content_at_line',
-    'append_file',
-]  ## DISABLED TEMPORARILY.

 DOCUMENTATION = ''
 for func_name in __all__:
-    if func_name in __except__:
-        continue
-
    func = globals()[func_name]

    cur_doc = func.__doc__
@@ -5,8 +5,3 @@ import_functions(
    module=file_ops, function_names=file_ops.__all__, target_globals=globals()
 )
 __all__ = file_ops.__all__
-
-create_file = file_ops.create_file
-append_file = file_ops.append_file
-edit_file_by_replace = file_ops.edit_file_by_replace
-insert_content_at_line = file_ops.insert_content_at_line
@@ -22,10 +22,7 @@ import shutil
 import tempfile
 import uuid

-if __package__ is None or __package__ == '':
-    from aider import Linter
-else:
-    from openhands.runtime.plugins.agent_skills.utils.aider import Linter
+from openhands.linter import DefaultLinter, LintResult

 CURRENT_FILE: str | None = None
 CURRENT_LINE = 1
@@ -98,13 +95,16 @@ def _lint_file(file_path: str) -> tuple[str | None, int | None]:
    Returns:
        tuple[str | None, int | None]: (lint_error, first_error_line_number)
    """
-    linter = Linter(root=os.getcwd())
-    lint_error = linter.lint(file_path)
+    linter = DefaultLinter()
+    lint_error: list[LintResult] = linter.lint(file_path)
    if not lint_error:
        # Linting successful. No issues found.
        return None, None
-    first_error_line = lint_error.lines[0] if lint_error.lines else None
-    return 'ERRORS:\n' + lint_error.text, first_error_line
+    first_error_line = lint_error[0].line if len(lint_error) > 0 else None
+    error_text = 'ERRORS:\n' + '\n'.join(
+        [f'{file_path}:{err.line}:{err.column}: {err.message}' for err in lint_error]
+    )
+    return error_text, first_error_line


 def _print_window(
@@ -518,7 +518,8 @@ def _edit_file_impl(
            with open(original_file_backup_path, 'w') as f:
                f.writelines(lines)

-            lint_error, first_error_line = _lint_file(file_name)
+            file_name_abs = os.path.abspath(file_name)
+            lint_error, first_error_line = _lint_file(file_name_abs)

            # Select the errors caused by the modification
            def extract_last_part(line):
@@ -786,7 +787,6 @@ def append_file(file_name: str, content: str) -> None:

    Args:
        file_name: str: The name of the file to edit.
-        line_number: int: The line number (starting from 1) to insert the content after.
        content: str: The content to insert.
    """
    ret_str = _edit_file_impl(
@@ -1,202 +0,0 @@
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
@@ -1,8 +0,0 @@
-# Aider is AI pair programming in your terminal
-
-Aider lets you pair program with LLMs,
-to edit code in your local git repository.
-
-Please see the [original repository](https://github.com/paul-gauthier/aider) for more information.
-
-OpenHands has adapted and integrated its linter module ([original code](https://github.com/paul-gauthier/aider/blob/main/aider/linter.py)).
@@ -1,9 +0,0 @@
-if __package__ is None or __package__ == '':
-    from linter import Linter, LintResult
-else:
-    from openhands.runtime.plugins.agent_skills.utils.aider.linter import (
-        Linter,
-        LintResult,
-    )
-
-__all__ = ['Linter', 'LintResult']
@@ -1,378 +0,0 @@
-import json
-import os
-import subprocess
-import sys
-import tempfile
-import traceback
-import warnings
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Optional
-
-from grep_ast import TreeContext, filename_to_lang
-from tree_sitter_languages import get_parser  # noqa: E402
-
-# tree_sitter is throwing a FutureWarning
-warnings.simplefilter('ignore', category=FutureWarning)
-
-
-@dataclass
-class LintResult:
-    text: str
-    lines: list
-
-
-class Linter:
-    def __init__(self, encoding='utf-8', root=None):
-        self.encoding = encoding
-        self.root = root
-
-        self.ts_installed = self._check_tool_installed('tsc')
-        self.eslint_installed = self._check_tool_installed('eslint')
-
-        self.languages = dict(
-            python=self.py_lint,
-        )
-        if self.eslint_installed:
-            self.languages['javascript'] = self.ts_eslint
-            self.languages['typescript'] = self.ts_eslint
-        elif self.ts_installed:
-            self.languages['javascript'] = self.ts_tsc_lint
-            self.languages['typescript'] = self.ts_tsc_lint
-        self.all_lint_cmd = None
-
-    def set_linter(self, lang, cmd):
-        if lang:
-            self.languages[lang] = cmd
-            return
-
-        self.all_lint_cmd = cmd
-
-    def get_rel_fname(self, fname):
-        if self.root:
-            return os.path.relpath(fname, self.root)
-        else:
-            return fname
-
-    def run_cmd(self, cmd, rel_fname, code):
-        cmd += ' ' + rel_fname
-        cmd = cmd.split()
-
-        process = subprocess.Popen(
-            cmd,
-            cwd=self.root,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.STDOUT,
-            stdin=subprocess.PIPE,  # Add stdin parameter
-        )
-        stdout, _ = process.communicate(
-            input=code.encode()
-        )  # Pass the code to the process
-        errors = stdout.decode().strip()
-        self.returncode = process.returncode
-        if self.returncode == 0:
-            return  # zero exit status
-
-        cmd = ' '.join(cmd)
-        res = ''
-        res += errors
-        line_num = extract_error_line_from(res)
-        return LintResult(text=res, lines=[line_num])
-
-    def get_abs_fname(self, fname):
-        if os.path.isabs(fname):
-            return fname
-        elif os.path.isfile(fname):
-            rel_fname = self.get_rel_fname(fname)
-            return os.path.abspath(rel_fname)
-        else:  # if a temp file
-            return self.get_rel_fname(fname)
-
-    def lint(self, fname, cmd=None) -> LintResult | None:
-        code = Path(fname).read_text(self.encoding)
-        absolute_fname = self.get_abs_fname(fname)
-        if cmd:
-            cmd = cmd.strip()
-        if not cmd:
-            lang = filename_to_lang(fname)
-            if not lang:
-                return None
-            if self.all_lint_cmd:
-                cmd = self.all_lint_cmd
-            else:
-                cmd = self.languages.get(lang)
-        if callable(cmd):
-            linkres = cmd(fname, absolute_fname, code)
-        elif cmd:
-            linkres = self.run_cmd(cmd, absolute_fname, code)
-        else:
-            linkres = basic_lint(absolute_fname, code)
-        return linkres
-
-    def flake_lint(self, rel_fname, code):
-        fatal = 'F821,F822,F831,E112,E113,E999,E902'
-        flake8 = f'flake8 --select={fatal} --isolated'
-
-        try:
-            flake_res = self.run_cmd(flake8, rel_fname, code)
-        except FileNotFoundError:
-            flake_res = None
-        return flake_res
-
-    def py_lint(self, fname, rel_fname, code):
-        error = self.flake_lint(rel_fname, code)
-        if not error:
-            error = lint_python_compile(fname, code)
-        if not error:
-            error = basic_lint(rel_fname, code)
-        return error
-
-    def _check_tool_installed(self, tool_name: str) -> bool:
-        """Check if a tool is installed."""
-        try:
-            subprocess.run(
-                [tool_name, '--version'],
-                check=True,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-            )
-            return True
-        except (subprocess.CalledProcessError, FileNotFoundError):
-            return False
-
-    def print_lint_result(self, lint_result: LintResult) -> None:
-        print(f'\n{lint_result.text.strip()}')
-        if isinstance(lint_result.lines, list) and lint_result.lines:
-            if isinstance(lint_result.lines[0], LintResult):
-                self.print_lint_result(lint_result.lines[0])
-
-    def ts_eslint(self, fname: str, rel_fname: str, code: str) -> Optional[LintResult]:
-        """Use ESLint to check for errors. If ESLint is not installed return None."""
-        if not self.eslint_installed:
-            return None
-
-        # Enhanced ESLint configuration with React support
-        eslint_config = {
-            'env': {'es6': True, 'browser': True, 'node': True},
-            'extends': ['eslint:recommended', 'plugin:react/recommended'],
-            'parserOptions': {
-                'ecmaVersion': 2021,
-                'sourceType': 'module',
-                'ecmaFeatures': {'jsx': True},
-            },
-            'plugins': ['react'],
-            'rules': {
-                'no-unused-vars': 'warn',
-                'no-console': 'off',
-                'react/prop-types': 'warn',
-                'semi': ['error', 'always'],
-            },
-            'settings': {'react': {'version': 'detect'}},
-        }
-
-        # Write config to a temporary file
-        with tempfile.NamedTemporaryFile(
-            mode='w', suffix='.json', delete=False
-        ) as temp_config:
-            json.dump(eslint_config, temp_config)
-            temp_config_path = temp_config.name
-
-        try:
-            # Point to frontend node_modules directory
-            if self.root:
-                plugin_path = f'{self.root}/frontend/node_modules/'
-            else:
-                return None
-
-            eslint_cmd = f'eslint --no-eslintrc --config {temp_config_path} --resolve-plugins-relative-to {plugin_path} --format json'
-            eslint_res = ''
-            try:
-                eslint_res = self.run_cmd(eslint_cmd, rel_fname, code)
-                if eslint_res and hasattr(eslint_res, 'text'):
-                    # Parse the ESLint JSON output
-                    eslint_output = json.loads(eslint_res.text)
-                    error_lines = []
-                    error_messages = []
-                    for result in eslint_output:
-                        for message in result.get('messages', []):
-                            line = message.get('line', 0)
-                            error_lines.append(line)
-                            error_messages.append(
-                                f"{rel_fname}:{line}:{message.get('column', 0)}: {message.get('message')} ({message.get('ruleId')})"
-                            )
-                    if not error_messages:
-                        return None
-
-                    return LintResult(text='\n'.join(error_messages), lines=error_lines)
-            except json.JSONDecodeError as e:
-                return LintResult(text=f'\nJSONDecodeError: {e}', lines=[eslint_res])
-            except FileNotFoundError:
-                return None
-            except Exception as e:
-                return LintResult(text=f'\nUnexpected error: {e}', lines=[])
-        finally:
-            os.unlink(temp_config_path)
-        return None
-
-    def ts_tsc_lint(self, fname, rel_fname, code):
-        """Use typescript compiler to check for errors. If TypeScript is not installed return None."""
-        if self.ts_installed:
-            tsc_cmd = 'tsc --noEmit --allowJs --checkJs --strict --noImplicitAny --strictNullChecks --strictFunctionTypes --strictBindCallApply --strictPropertyInitialization --noImplicitThis --alwaysStrict'
-            try:
-                tsc_res = self.run_cmd(tsc_cmd, rel_fname, code)
-                if tsc_res:
-                    # Parse the TSC output
-                    error_lines = []
-                    for line in tsc_res.text.split('\n'):
-                        # Extract lines and column numbers
-                        if ': error TS' in line or ': warning TS' in line:
-                            try:
-                                location_part = line.split('(')[1].split(')')[0]
-                                line_num, _ = map(int, location_part.split(','))
-                                error_lines.append(line_num)
-                            except (IndexError, ValueError):
-                                continue
-                    return LintResult(text=tsc_res.text, lines=error_lines)
-            except FileNotFoundError:
-                pass
-
-        # If still no errors, check for missing semicolons
-        lines = code.split('\n')
-        error_lines = []
-        for i, line in enumerate(lines):
-            stripped_line = line.strip()
-            if (
-                stripped_line
-                and not stripped_line.endswith(';')
-                and not stripped_line.endswith('{')
-                and not stripped_line.endswith('}')
-                and not stripped_line.startswith('//')
-            ):
-                error_lines.append(i + 1)
-
-        if error_lines:
-            error_message = (
-                f"{rel_fname}({error_lines[0]},1): error TS1005: ';' expected."
-            )
-            return LintResult(text=error_message, lines=error_lines)
-
-        # If tsc is not available return None (basic_lint causes other problems!)
-        return None
-
-
-def lint_python_compile(fname, code):
-    try:
-        compile(code, fname, 'exec')  # USE TRACEBACK BELOW HERE
-        return
-    except IndentationError as err:
-        end_lineno = getattr(err, 'end_lineno', err.lineno)
-        if isinstance(end_lineno, int):
-            line_numbers = list(range(end_lineno - 1, end_lineno))
-        else:
-            line_numbers = []
-
-        tb_lines = traceback.format_exception(type(err), err, err.__traceback__)
-        last_file_i = 0
-
-        target = '# USE TRACEBACK'
-        target += ' BELOW HERE'
-        for i in range(len(tb_lines)):
-            if target in tb_lines[i]:
-                last_file_i = i
-                break
-        tb_lines = tb_lines[:1] + tb_lines[last_file_i + 1 :]
-
-    res = ''.join(tb_lines)
-    return LintResult(text=res, lines=line_numbers)
-
-
-def basic_lint(fname, code):
-    """Use tree-sitter to look for syntax errors, display them with tree context."""
-    lang = filename_to_lang(fname)
-    if not lang:
-        return
-
-    parser = get_parser(lang)
-    tree = parser.parse(bytes(code, 'utf-8'))
-
-    errors = traverse_tree(tree.root_node)
-    if not errors:
-        return
-
-    error_messages = [
-        f'{fname}:{line}:{col}: {error_details}' for line, col, error_details in errors
-    ]
-    return LintResult(
-        text='\n'.join(error_messages), lines=[line for line, _, _ in errors]
-    )
-
-
-def extract_error_line_from(lint_error):
-    # TODO: this is a temporary fix to extract the error line from the error message
-    # it should be replaced with a more robust/unified solution
-    first_error_line = None
-    for line in lint_error.splitlines(True):
-        if line.strip():
-            # The format of the error message is: <filename>:<line>:<column>: <error code> <error message>
-            parts = line.split(':')
-            if len(parts) >= 2:
-                try:
-                    first_error_line = int(parts[1])
-                    break
-                except ValueError:
-                    continue
-    return first_error_line
-
-
-def tree_context(fname, code, line_nums):
-    context = TreeContext(
-        fname,
-        code,
-        color=False,
-        line_number=True,
-        child_context=False,
-        last_line=False,
-        margin=0,
-        mark_lois=True,
-        loi_pad=3,
-        # header_max=30,
-        show_top_of_file_parent_scope=False,
-    )
-    line_nums = set(line_nums)
-    context.add_lines_of_interest(line_nums)
-    context.add_context()
-    output = context.format()
-
-    return output
-
-
-def traverse_tree(node):
-    """Traverses the tree to find errors"""
-    errors = []
-    if node.type == 'ERROR' or node.is_missing:
-        line_no = node.start_point[0] + 1
-        col_no = node.start_point[1] + 1
-        error_type = 'Missing node' if node.is_missing else 'Syntax error'
-        errors.append((line_no, col_no, error_type))
-
-    for child in node.children:
-        errors += traverse_tree(child)
-
-    return errors
-
-
-def main():
-    """Main function to parse files provided as command line arguments."""
-    if len(sys.argv) < 2:
-        print('Usage: python linter.py <file1> <file2> ...')
-        sys.exit(1)
-
-    linter = Linter(root=os.getcwd())
-    for file_path in sys.argv[1:]:
-        errors = linter.lint(file_path)
-        if errors:
-            print(errors)
-
-
-if __name__ == '__main__':
-    main()
@@ -28,7 +28,8 @@ class JupyterPlugin(Plugin):
                'cd /openhands/code\n'
                'export POETRY_VIRTUALENVS_PATH=/openhands/poetry;\n'
                'export PYTHONPATH=/openhands/code:$PYTHONPATH;\n'
-                '/openhands/miniforge3/bin/mamba run -n base '
+                'export MAMBA_ROOT_PREFIX=/openhands/micromamba;\n'
+                '/openhands/micromamba/bin/micromamba run -n openhands '
                'poetry run jupyter kernelgateway '
                '--KernelGatewayApp.ip=0.0.0.0 '
                f'--KernelGatewayApp.port={self.kernel_gateway_port}\n'
@@ -42,6 +42,7 @@ from openhands.runtime.utils.request import (
    send_request,
 )
 from openhands.runtime.utils.runtime_build import build_runtime_image
+from openhands.utils.tenacity_stop import stop_if_should_exit


 class RemoteRuntime(Runtime):
@@ -59,13 +60,6 @@ class RemoteRuntime(Runtime):
        status_message_callback: Optional[Callable] = None,
    ):
        self.config = config
-        if self.config.sandbox.api_hostname == 'localhost':
-            self.config.sandbox.api_hostname = 'api.all-hands.dev/v0/runtime'
-            logger.warning(
-                'Using localhost as the API hostname is not supported in the RemoteRuntime. Please set a proper hostname.\n'
-                'Setting it to default value: api.all-hands.dev/v0/runtime'
-            )
-        self.api_url = f'https://{self.config.sandbox.api_hostname.rstrip("/")}'

        if self.config.sandbox.api_key is None:
            raise ValueError(
@@ -82,7 +76,7 @@ class RemoteRuntime(Runtime):
            )

        self.runtime_builder = RemoteRuntimeBuilder(
-            self.api_url, self.config.sandbox.api_key
+            self.config.sandbox.remote_runtime_api_url, self.config.sandbox.api_key
        )
        self.runtime_id: str | None = None
        self.runtime_url: str | None = None
@@ -90,44 +84,54 @@ class RemoteRuntime(Runtime):
        self.instance_id = (
            sid + str(uuid.uuid4()) if sid is not None else str(uuid.uuid4())
        )
-        if self.config.sandbox.runtime_container_image is not None:
-            raise ValueError(
-                'Setting runtime_container_image is not supported in the remote runtime.'
-            )
-        self.container_image: str = self.config.sandbox.base_container_image
        self.container_name = 'oh-remote-runtime-' + self.instance_id
-        logger.debug(f'RemoteRuntime `{sid}` config:\n{self.config}')
-        response = send_request(self.session, 'GET', f'{self.api_url}/registry_prefix')
-        response_json = response.json()
-        registry_prefix = response_json['registry_prefix']
-        os.environ['OH_RUNTIME_RUNTIME_IMAGE_REPO'] = (
-            registry_prefix.rstrip('/') + '/runtime'
-        )
-        logger.info(
-            f'Runtime image repo: {os.environ["OH_RUNTIME_RUNTIME_IMAGE_REPO"]}'
-        )
-
-        if self.config.sandbox.runtime_extra_deps:
+        if self.config.sandbox.runtime_container_image is not None:
            logger.info(
-                f'Installing extra user-provided dependencies in the runtime image: {self.config.sandbox.runtime_extra_deps}'
+                f'Running remote runtime with image: {self.config.sandbox.runtime_container_image}'
+            )
+            self.container_image = self.config.sandbox.runtime_container_image
+        else:
+            logger.info(
+                f'Building remote runtime with base image: {self.config.sandbox.base_container_image}'
+            )
+            logger.debug(f'RemoteRuntime `{sid}` config:\n{self.config}')
+            response = send_request(
+                self.session,
+                'GET',
+                f'{self.config.sandbox.remote_runtime_api_url}/registry_prefix',
+            )
+            response_json = response.json()
+            registry_prefix = response_json['registry_prefix']
+            os.environ['OH_RUNTIME_RUNTIME_IMAGE_REPO'] = (
+                registry_prefix.rstrip('/') + '/runtime'
+            )
+            logger.info(
+                f'Runtime image repo: {os.environ["OH_RUNTIME_RUNTIME_IMAGE_REPO"]}'
            )

-        # Build the container image
-        self.container_image = build_runtime_image(
-            self.container_image,
-            self.runtime_builder,
-            extra_deps=self.config.sandbox.runtime_extra_deps,
-        )
+            if self.config.sandbox.runtime_extra_deps:
+                logger.info(
+                    f'Installing extra user-provided dependencies in the runtime image: {self.config.sandbox.runtime_extra_deps}'
+                )

-        # Use the /image_exists endpoint to check if the image exists
-        response = send_request(
-            self.session,
-            'GET',
-            f'{self.api_url}/image_exists',
-            params={'image': self.container_image},
-        )
-        if response.status_code != 200 or not response.json()['exists']:
-            raise RuntimeError(f'Container image {self.container_image} does not exist')
+            # Build the container image
+            self.container_image = build_runtime_image(
+                self.config.sandbox.base_container_image,
+                self.runtime_builder,
+                extra_deps=self.config.sandbox.runtime_extra_deps,
+                force_rebuild=self.config.sandbox.force_rebuild_runtime,
+            )
+
+            response = send_request(
+                self.session,
+                'GET',
+                f'{self.config.sandbox.remote_runtime_api_url}/image_exists',
+                params={'image': self.container_image},
+            )
+            if response.status_code != 200 or not response.json()['exists']:
+                raise RuntimeError(
+                    f'Container image {self.container_image} does not exist'
+                )

        # Prepare the request body for the /start endpoint
        plugin_arg = ''
@@ -141,8 +145,8 @@ class RemoteRuntime(Runtime):
        start_request = {
            'image': self.container_image,
            'command': (
-                f'/openhands/miniforge3/bin/mamba run --no-capture-output -n base '
-                'PYTHONUNBUFFERED=1 poetry run '
+                f'/openhands/micromamba/bin/micromamba run -n openhands '
+                'poetry run '
                f'python -u -m openhands.runtime.client.client {self.port} '
                f'--working-dir {self.config.workspace_mount_path_in_sandbox} '
                f'{plugin_arg}'
@@ -157,7 +161,10 @@ class RemoteRuntime(Runtime):

        # Start the sandbox using the /start endpoint
        response = send_request(
-            self.session, 'POST', f'{self.api_url}/start', json=start_request
+            self.session,
+            'POST',
+            f'{self.config.sandbox.remote_runtime_api_url}/start',
+            json=start_request,
        )
        if response.status_code != 201:
            raise RuntimeError(f'Failed to start sandbox: {response.text}')
@@ -169,6 +176,11 @@ class RemoteRuntime(Runtime):
            f'Sandbox started. Runtime ID: {self.runtime_id}, URL: {self.runtime_url}'
        )

+        if 'session_api_key' in start_response:
+            self.session.headers.update(
+                {'X-Session-API-Key': start_response['session_api_key']}
+            )
+
        # Initialize the eventstream and env vars
        super().__init__(
            config, event_stream, sid, plugins, env_vars, status_message_callback
@@ -186,13 +198,13 @@ class RemoteRuntime(Runtime):
        ), 'Runtime URL is not set. This should never happen.'

    @retry(
-        stop=stop_after_attempt(10),
+        stop=stop_after_attempt(10) | stop_if_should_exit(),
        wait=wait_exponential(multiplier=1, min=4, max=60),
        retry=retry_if_exception_type(RuntimeError),
        reraise=True,
    )
    def _wait_until_alive(self):
-        logger.info('Waiting for sandbox to be alive...')
+        logger.info(f'Waiting for runtime to be alive at url: {self.runtime_url}')
        response = send_request(
            self.session,
            'GET',
@@ -215,7 +227,7 @@ class RemoteRuntime(Runtime):
                response = send_request(
                    self.session,
                    'POST',
-                    f'{self.api_url}/stop',
+                    f'{self.config.sandbox.remote_runtime_api_url}/stop',
                    json={'runtime_id': self.runtime_id},
                )
                if response.status_code != 200:
@@ -14,7 +14,6 @@ from openhands.events.action import (
    BrowseInteractiveAction,
    BrowseURLAction,
    CmdRunAction,
-    FileEditAction,
    FileReadAction,
    FileWriteAction,
    IPythonRunCellAction,
@@ -180,10 +179,6 @@ class Runtime:
    def write(self, action: FileWriteAction) -> Observation:
        pass

-    @abstractmethod
-    def edit(self, action: FileEditAction) -> Observation:
-        pass
-
    @abstractmethod
    def browse(self, action: BrowseURLAction) -> Observation:
        pass
@@ -10,6 +10,8 @@ from tenacity import (
    wait_exponential,
 )

+from openhands.utils.tenacity_stop import stop_if_should_exit
+

 def is_server_error(exception):
    return (
@@ -47,10 +49,10 @@ def send_request(
    if retry_fns is not None:
        for fn in retry_fns:
            retry_condition |= retry_if_exception(fn)
-    kwargs["timeout"] = timeout
+    kwargs['timeout'] = timeout

    @retry(
-        stop=stop_after_delay(timeout),
+        stop=stop_after_delay(timeout) | stop_if_should_exit(),
        wait=wait_exponential(multiplier=1, min=4, max=60),
        retry=retry_condition,
        reraise=True,
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Xingyao Wang	16a2cf37da	fix: reuse config parser for cli (#4187 )	2024-10-03 09:41:02 -04:00
Robert Brennan	9c95d0ff58	Enable authentication for runtime environments (#4179 ) Co-authored-by: Xingyao Wang <xingyao@all-hands.dev>	2024-10-03 09:14:39 -04:00
Graham Neubig	9641bfbd3e	Fix issue #4181 : 'Prompting best practices documentation' (#4183 ) Co-authored-by: openhands <openhands@all-hands.dev>	2024-10-03 07:58:13 -04:00
Ikko Eltociear Ashimine	5c31fd9357	chore: update agent_session.py (#4186 )	2024-10-03 07:00:45 +00:00
Engel Nyst	1abfd3b808	Retry on litellm's APIError, which includes 502 (#4167 )	2024-10-03 01:54:49 +02:00
Xingyao Wang	e0594432e2	fix: build shutdown listener (#4147 )	2024-10-02 22:25:10 +00:00
Xingyao Wang	e81c5597d6	feat(runtime): use micromamba instead of mamba and fix build issue (#4154 )	2024-10-02 21:23:18 +00:00
Rehan Ganapathy	c8a933590a	(feat) allow specification of config.toml location via args (solves #3947 ) (#4168 ) Co-authored-by: Rehan Ganapathy <rehanganapathy@MACASF.local>	2024-10-02 20:30:12 +00:00
mamoodi	dd228c07e0	Small reordering of PR template (#4173 )	2024-10-02 13:30:53 -04:00
tofarr	e0f8a5d508	Fix: Add timeout on websocket accept (#4169 )	2024-10-02 10:51:12 -06:00
dependabot[bot]	e93db80769	chore(deps-dev): bump reportlab from 4.2.4 to 4.2.5 (#4170 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-02 18:28:34 +02:00
dependabot[bot]	14a4e1018a	chore(deps): bump litellm from 1.48.7 to 1.48.9 (#4176 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-02 16:15:15 +00:00
dependabot[bot]	bb151655cc	chore(deps-dev): bump streamlit from 1.38.0 to 1.39.0 (#4175 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-02 16:06:58 +00:00
dependabot[bot]	471867859f	chore(deps): bump boto3 from 1.35.30 to 1.35.31 (#4174 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-02 15:50:47 +00:00
dependabot[bot]	a1d09c4437	chore(deps): bump google-cloud-aiplatform from 1.68.0 to 1.69.0 (#4172 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-02 15:40:16 +00:00
dependabot[bot]	240b500acf	chore(deps-dev): bump openai from 1.50.2 to 1.51.0 (#4171 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-02 17:28:43 +02:00
Xingyao Wang	d518ca08b7	standardize error message across remote runtime and eventstream runtime (#4159 )	2024-10-02 22:42:17 +08:00
Graham Neubig	52e0630af8	Update .openhands_instructions with linting (#4165 )	2024-10-02 08:10:09 -04:00
Graham Neubig	178dbfaf4a	Run pre-commit (#4163 )	2024-10-02 04:52:02 +00:00
Xingyao Wang	240a470a1d	Revert "add few seconds to properly receive timeout error from client" This reverts commit `dd2cb4399a`.	2024-10-01 23:44:05 -04:00
Xingyao Wang	dd2cb4399a	add few seconds to properly receive timeout error from client	2024-10-01 23:43:50 -04:00
tofarr	4eaf28d7b1	Fix ctrl c not working during startup (#4155 )	2024-10-02 11:05:00 +08:00
Engel Nyst	5a45c648a8	attributes for BE/FE should not be sent (#4150 )	2024-10-01 23:00:03 +00:00
Xingyao Wang	3cf794faef	fix(runtime build): only check for image exist on exact hash tag (#4152 )	2024-10-01 22:20:25 +00:00
mamoodi	04643d6f3c	Make Claude Sonnet 3.5 the recommended model and update docs accordingly (#4151 )	2024-10-01 20:32:39 +00:00
Xingyao Wang	53a015f718	fix: make llm_completions optional to fix `eval_infer.py` (#4148 )	2024-10-02 03:55:03 +08:00
Graham Neubig	148d22e1af	Fix issue #4136 : 'Restructuring documentation' (#4138 ) Co-authored-by: openhands <openhands@all-hands.dev>	2024-10-01 17:44:48 +00:00
Robert Brennan	31b2e4b5b2	allow specifying exact remote image (#4135 )	2024-10-01 13:17:51 -04:00
dependabot[bot]	1d6633164f	chore(deps-dev): bump @types/node from 22.7.3 to 22.7.4 in /frontend (#4118 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-01 13:16:32 -04:00
dependabot[bot]	dd89cfba2a	chore(deps): bump @react-types/shared from 3.24.1 to 3.25.0 in /frontend (#4139 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-01 13:16:08 -04:00
mamoodi	0144caaf1f	Update eval doc for remote runtime (#4145 )	2024-10-01 13:14:36 -04:00
Robert Brennan	ec1a86f150	Handle errors when starting session (#4134 )	2024-10-01 12:40:09 -04:00
dependabot[bot]	926af7f5fd	chore(deps): bump boto3 from 1.35.29 to 1.35.30 (#4144 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-01 16:19:07 +00:00
dependabot[bot]	cc55c6dbe5	chore(deps): bump litellm from 1.48.6 to 1.48.7 (#4141 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-01 18:18:49 +02:00
OpenHands	c777cfeacf	Fix issue #4113 : Document github action (#4124 ) Co-authored-by: Graham Neubig <neubig@gmail.com>	2024-10-01 18:08:57 +02:00
dependabot[bot]	823966c24e	chore(deps-dev): bump @types/react from 18.3.9 to 18.3.10 in /frontend (#4117 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-01 10:55:45 -04:00
dependabot[bot]	adba7dad96	chore(deps): bump uvicorn from 0.30.6 to 0.31.0 (#4114 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-01 10:55:27 -04:00
Xingyao Wang	1109637efb	Update instruction for new version of eval runtime-api (#4128 )	2024-09-30 23:48:38 +00:00
mamoodi	71adfeebab	Update PR Template for better release notes (#4126 )	2024-09-30 17:06:56 -04:00
Robert Brennan	8059e8e298	make runtime url configurable (#4093 )	2024-09-30 18:59:57 +00:00
Xingyao Wang	54ac340e0b	refactor: standardize linter output data structure and interface (#4077 ) Co-authored-by: Graham Neubig <neubig@gmail.com>	2024-10-01 02:40:23 +08:00
dependabot[bot]	13901b4b5a	chore(deps): bump python-multipart from 0.0.9 to 0.0.12 (#4121 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-30 20:30:55 +02:00
dependabot[bot]	0b27d51135	chore(deps): bump litellm from 1.48.5 to 1.48.6 (#4120 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-30 20:30:09 +02:00
dependabot[bot]	f0ce682fa0	chore(deps): bump json-repair from 0.29.5 to 0.29.7 (#4115 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-30 20:28:32 +02:00
dependabot[bot]	3567911da8	chore(deps): bump boto3 from 1.35.28 to 1.35.29 (#4122 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-30 20:27:58 +02:00
Graham Neubig	215d227e5a	Reference the OpenHands resolver (#4125 )	2024-09-30 14:26:12 -04:00
mamoodi	50e6cc6156	Release 0.9.7 (#4123 )	2024-09-30 11:28:16 -04:00
Xingyao Wang	8d6eda3623	fix eval_infer.sh to correctly copy SWE-Bench logs (#4111 )	2024-09-29 18:39:18 -05:00
Cole Murray	d5f965b474	Update LiteLLLM to 1.48.5 (#4110 )	2024-09-29 06:42:59 +00:00
tobitege	c3bbe604eb	(fix) Fix logging in shared eval file to prevent key disclosure (#4108 )	2024-09-28 19:33:16 +00:00
Ana Noemi	c7fe39998c	Update README to decrease unsuccessful drivebys (#4091 )	2024-09-28 18:52:01 +00:00
Xingyao Wang	ec6e07647f	fix hash equivalance verification ci for fork (#4107 )	2024-09-29 02:19:59 +08:00
Graham Neubig	e744eadb8b	Robustify openhands resolver workflow (#4105 )	2024-09-28 11:35:56 -04:00
Engel Nyst	e582806004	Vision and prompt caching fixes (#4014 )	2024-09-28 14:37:29 +02:00
OpenHands	f427f9d8d4	Fix issue #4103 : Improve description of how to do frontend setup and testing in `.openhands_instructions` (#4104 ) Co-authored-by: Graham Neubig <neubig@gmail.com>	2024-09-28 06:41:34 +00:00
Graham Neubig	d669c7b60d	Add github issue resolution workflow (#4102 )	2024-09-28 04:52:52 +00:00
dependabot[bot]	42be4ee5bc	chore(deps-dev): bump openai from 1.48.0 to 1.50.2 (#4101 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-28 05:04:03 +02:00
Engel Nyst	f994277d0f	Make agents follow configured temperature (#4099 )	2024-09-28 01:15:46 +00:00
tofarr	5ccee7c8a7	Fix Bash commands now do not block and actually respect the timeout (#4058 )	2024-09-28 08:40:00 +08:00
tobitege	575a829d94	(enh) add test_python_version to test_bash.py runtime tests (#4098 )	2024-09-28 08:21:14 +08:00
Xingyao Wang	2bed3a424c	chore: pass logger DEBUG mode to client side (#4096 )	2024-09-28 08:21:04 +08:00
Xingyao Wang	a4cc010110	chore: parser fix for deepseek (#4097 )	2024-09-28 08:20:51 +08:00
tobitege	9651368e6a	revert #3871 dockerfile template: don't write to .bashrc file (#4095 )	2024-09-27 21:49:51 +00:00
tofarr	c5025fb66e	Fix Reducing the amount being downloaded every time the hash changes. (#4078 )	2024-09-27 15:48:33 -06:00
Robert Brennan	3f9111c615	add idle time to client server (#4084 )	2024-09-27 19:41:16 +00:00
dependabot[bot]	89e95f2671	chore(deps): bump boto3 from 1.35.27 to 1.35.28 (#4090 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-27 16:48:34 +00:00
dependabot[bot]	5bfa0c2f8d	chore(deps): bump browsergym from 0.7.0 to 0.7.1 (#4089 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-27 16:21:58 +00:00
dependabot[bot]	84141f656d	chore(deps-dev): bump chromadb from 0.5.9 to 0.5.11 (#4088 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-27 16:19:17 +00:00
dependabot[bot]	6ff7506581	chore(deps-dev): bump reportlab from 4.2.2 to 4.2.4 (#4086 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-27 16:01:32 +00:00
dependabot[bot]	41dc7f0256	chore(deps-dev): bump llama-index from 0.11.13 to 0.11.14 (#4085 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-27 15:50:17 +00:00
Xingyao Wang	34f3b61536	[runtime hash] fix runtime hash mismatch between inside `app` image and in "development mode" (#4039 )	2024-09-27 15:26:26 +00:00
dependabot[bot]	4533c47595	chore(deps-dev): bump @types/node from 22.7.2 to 22.7.3 in /frontend (#4081 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-27 15:20:11 +00:00
Xingyao Wang	47774e60b0	chore: remove deprecated dockerfile (#4079 )	2024-09-27 15:03:23 +00:00
Robert Brennan	b78f646b65	Release 0.9.6 (#4076 )	2024-09-26 21:27:17 +00:00
Amir	3e5c01dfc8	Remove param from docstring that does not exist in the append_file (#4060 )	2024-09-26 22:25:11 +02:00
tobitege	29c34e0b6a	(fix) actions.ts: restored handleAssistantMessage handling order (#4074 )	2024-09-26 19:56:12 +00:00
tofarr	c919086e25	Fix for regression (#4075 ) Regression fixed	2024-09-26 12:58:00 -06:00
Engel Nyst	0a03c802f5	Refactor llm.py (#4057 )	2024-09-26 17:44:18 +00:00
Xingyao Wang	081ebdbdd8	[runtime] do not keep rebuilding from generic image (#4072 )	2024-09-26 17:19:46 +00:00
dependabot[bot]	572c7b726d	chore(deps-dev): bump ruff from 0.6.7 to 0.6.8 (#4067 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-26 17:19:16 +00:00
Xingyao Wang	cfc5bb70c1	Update README.md for CodeAct (#4070 )	2024-09-26 16:55:08 +00:00
dependabot[bot]	008b866a38	chore(deps-dev): bump jsdom from 25.0.0 to 25.0.1 in /frontend (#3992 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-26 16:34:29 +00:00
dependabot[bot]	676ad3e140	chore(deps-dev): bump chromadb from 0.5.7 to 0.5.9 (#4069 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-26 16:08:17 +00:00
dependabot[bot]	19278de5d0	chore(deps): bump json-repair from 0.29.4 to 0.29.5 (#4068 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-26 15:48:39 +00:00
dependabot[bot]	891e4a8d34	chore(deps): bump datasets from 3.0.0 to 3.0.1 (#4065 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-26 15:35:36 +00:00
dependabot[bot]	85be8607e0	chore(deps): bump litellm from 1.48.1 to 1.48.2 (#4066 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-26 23:33:24 +08:00
dependabot[bot]	49b244610c	chore(deps-dev): bump openai from 1.47.1 to 1.48.0 (#4063 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-26 17:16:59 +02:00
dependabot[bot]	b347b1d06f	chore(deps): bump boto3 from 1.35.26 to 1.35.27 (#4064 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-26 17:15:11 +02:00
dependabot[bot]	0c86a60b35	chore(deps-dev): bump @types/node from 22.7.0 to 22.7.2 in /frontend (#4062 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-26 19:11:06 +04:00
tofarr	01317138e2	Fix: uvicorn reloading when python files in workspace change, & started section for debugging instructions for developers (#4041 ) Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>	2024-09-26 08:57:37 -06:00
Xingyao Wang	e03855cd7f	Make sure we print the observation in the same way as the LLM sees it	2024-09-26 14:01:48 +00:00
jaki300	757c9593f1	Create gke-example.md (#3795 ) Co-authored-by: Robert Brennan <accounts@rbren.io>	2024-09-26 09:11:33 -04:00
mamoodi	266e8ff951	Release 0.9.5 (#4061 )	2024-09-26 08:36:31 -04:00
dependabot[bot]	3e79cd12a6	chore(deps-dev): bump @types/react from 18.3.8 to 18.3.9 in /frontend (#4029 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-26 14:41:21 +04:00
tobitege	2cc1c3ef42	(enh) Docker runtime builder with BuildKit support, enh. caching (#4009 )	2024-09-26 08:50:53 +02:00
dependabot[bot]	ef0b08a46e	chore(deps-dev): bump tailwindcss from 3.4.12 to 3.4.13 in /frontend (#4030 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-26 05:28:03 +00:00
dependabot[bot]	f1d5202884	chore(deps): bump vite from 5.4.7 to 5.4.8 in /frontend (#4046 )	2024-09-26 12:56:35 +08:00
dependabot[bot]	11cedfb854	chore(deps): bump google-cloud-aiplatform from 1.67.1 to 1.68.0 (#4051 )	2024-09-26 12:56:16 +08:00
dependabot[bot]	6d103a0db2	chore(deps-dev): bump @types/node from 22.6.1 to 22.7.0 in /frontend (#4047 )	2024-09-26 12:56:00 +08:00
Engel Nyst	798aaeaef6	remove Exception in the agent (#4054 )	2024-09-26 06:39:17 +02:00
tofarr	0df4b97e5b	Fix startup statuses (#4053 )	2024-09-25 14:38:32 -06:00
Xingyao Wang	81b3cd71b3	[eval] log evaluating warnings directly to console (#4026 )	2024-09-26 03:42:32 +08:00
Robert Brennan	9241ae2148	Fix persistence of "advanced settings" (#4038 )	2024-09-25 12:57:08 -04:00
dependabot[bot]	d3f86e052a	chore(deps-dev): bump llama-index from 0.11.12 to 0.11.13 (#4044 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-25 18:24:09 +02:00
dependabot[bot]	e0c65f8f9c	chore(deps): bump google-generativeai from 0.8.1 to 0.8.2 (#4050 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-25 18:22:01 +02:00
dependabot[bot]	394ab360a8	chore(deps): bump boto3 from 1.35.25 to 1.35.26 (#4048 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-25 18:21:27 +02:00
dependabot[bot]	8a146d5ced	chore(deps): bump litellm from 1.48.0 to 1.48.1 (#4049 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-09-25 18:20:47 +02:00