dont close runtimes

try reuse_ids
Merge branch 'main' into rb/parallel-tests
2026-04-29 03:00:45 -04:00 · 2024-08-30 12:14:02 -04:00 · 2024-08-30 11:40:56 -04:00 · 2024-08-30 11:37:20 -04:00 · 2024-08-30 11:30:42 -04:00 · 2024-08-30 11:30:05 -04:00
549 changed files with 25452 additions and 18548 deletions
@@ -1 +1 @@
-The files in this directory configure a development container for GitHub Codespaces.
+The files in this directory configure a development container for GitHub Codespaces.
@@ -1,5 +1,5 @@
 # Workflow that builds, tests and then pushes the runtime docker images to the ghcr.io repository
-name: Build, Test and Publish RT Image
+name: Build, Test and Publish Runtime Image

 # Only run one workflow of the same group at a time.
 # There can be at most one running and one pending job in a concurrency group at any time.
@@ -38,6 +38,10 @@ jobs:
        base_image:
          - image: 'nikolaik/python-nodejs:python3.11-nodejs22'
            tag: nikolaik
+          - image: 'python:3.11-bookworm'
+            tag: python
+          - image: 'node:22-bookworm'
+            tag: node
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -66,162 +70,85 @@ jobs:
      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v3
+      - name: Install poetry via pipx
+        run: pipx install poetry
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
-      - name: Cache Poetry dependencies
-        uses: actions/cache@v4
-        with:
-          path: |
-            ~/.cache/pypoetry
-            ~/.virtualenvs
-          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
-          restore-keys: |
-            ${{ runner.os }}-poetry-
-      - name: Install poetry via pipx
-        run: pipx install poetry
+          cache: 'poetry'
      - name: Install Python dependencies using Poetry
        run: make install-python-dependencies
      - name: Create source distribution and Dockerfile
        run: poetry run python3 openhands/runtime/utils/runtime_build.py --base_image ${{ matrix.base_image.image }} --build_folder containers/runtime --force_rebuild
      - name: Build and push runtime image ${{ matrix.base_image.image }}
-        if: github.event.pull_request.head.repo.fork != true
+        if: "!github.event.pull_request.head.repo.fork"
        run: |
          ./containers/build.sh runtime ${{ github.repository_owner }} --push ${{ matrix.base_image.tag }}
      # Forked repos can't push to GHCR, so we need to upload the image as an artifact
      - name: Build runtime image ${{ matrix.base_image.image }} for fork
-        if: github.event.pull_request.head.repo.fork
+        if: "github.event.pull_request.head.repo.fork"
        uses: docker/build-push-action@v6
        with:
          tags: ghcr.io/all-hands-ai/runtime:${{ github.sha }}-${{ matrix.base_image.tag }}
          outputs: type=docker,dest=/tmp/runtime-${{ matrix.base_image.tag }}.tar
          context: containers/runtime
      - name: Upload runtime image for fork
-        if: github.event.pull_request.head.repo.fork
+        if: "github.event.pull_request.head.repo.fork"
        uses: actions/upload-artifact@v4
        with:
          name: runtime-${{ matrix.base_image.tag }}
          path: /tmp/runtime-${{ matrix.base_image.tag }}.tar

-  # Run unit tests with the EventStream runtime Docker images as root
-  test_runtime_root:
-    name: RT Unit Tests (Root)
-    needs: [ghcr_build_runtime]
+  # Run unit tests with the EventStream runtime Docker images
+  test_runtime:
+    name: Test Runtime
    runs-on: ubuntu-latest
+    needs: [ghcr_build_runtime]
    strategy:
-      fail-fast: false
      matrix:
-        base_image: ['nikolaik']
+        base_image: ['nikolaik', 'python', 'node']
    steps:
      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          tool-cache: true
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          swap-storage: true
      # Forked repos can't push to GHCR, so we need to download the image as an artifact
      - name: Download runtime image for fork
-        if: github.event.pull_request.head.repo.fork
+        if: "github.event.pull_request.head.repo.fork"
        uses: actions/download-artifact@v4
        with:
          name: runtime-${{ matrix.base_image }}
          path: /tmp
      - name: Load runtime image for fork
-        if: github.event.pull_request.head.repo.fork
+        if: "github.event.pull_request.head.repo.fork"
        run: |
          docker load --input /tmp/runtime-${{ matrix.base_image }}.tar
-      - name: Cache Poetry dependencies
-        uses: actions/cache@v4
-        with:
-          path: |
-            ~/.cache/pypoetry
-            ~/.virtualenvs
-          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
-          restore-keys: |
-            ${{ runner.os }}-poetry-
+      - name: Install poetry via pipx
+        run: pipx install poetry
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
-      - name: Install poetry via pipx
-        run: pipx install poetry
+          cache: 'poetry'
      - name: Install Python dependencies using Poetry
        run: make install-python-dependencies
      - name: Run runtime tests
        run: |
-          # We install pytest-xdist in order to run tests across CPUs
-          poetry run pip install pytest-xdist
-
-          # Install to be able to retry on failures for flaky tests
-          poetry run pip install pytest-rerunfailures
-
          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ github.sha }}-${{ matrix.base_image }}
          image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')

-          SKIP_CONTAINER_LOGS=true \
          TEST_RUNTIME=eventstream \
          SANDBOX_USER_ID=$(id -u) \
-          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
+          SANDBOX_BASE_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
-          RUN_AS_OPENHANDS=false \
-          poetry run pytest -n 3 -raR --reruns 1 --reruns-delay 3 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v4
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-
-  # Run unit tests with the EventStream runtime Docker images as openhands user
-  test_runtime_oh:
-    name: RT Unit Tests (openhands)
-    runs-on: ubuntu-latest
-    needs: [ghcr_build_runtime]
-    strategy:
-      matrix:
-        base_image: ['nikolaik']
-    steps:
-      - uses: actions/checkout@v4
-      # Forked repos can't push to GHCR, so we need to download the image as an artifact
-      - name: Download runtime image for fork
-        if: github.event.pull_request.head.repo.fork
-        uses: actions/download-artifact@v4
-        with:
-          name: runtime-${{ matrix.base_image }}
-          path: /tmp
-      - name: Load runtime image for fork
-        if: github.event.pull_request.head.repo.fork
-        run: |
-          docker load --input /tmp/runtime-${{ matrix.base_image }}.tar
-      - name: Cache Poetry dependencies
-        uses: actions/cache@v4
-        with:
-          path: |
-            ~/.cache/pypoetry
-            ~/.virtualenvs
-          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
-          restore-keys: |
-            ${{ runner.os }}-poetry-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-      - name: Install poetry via pipx
-        run: pipx install poetry
-      - name: Install Python dependencies using Poetry
-        run: make install-python-dependencies
-      - name: Run runtime tests
-        run: |
-          # We install pytest-xdist in order to run tests across CPUs
-          poetry run pip install pytest-xdist
-
-          # Install to be able to retry on failures for flaky tests
-          poetry run pip install pytest-rerunfailures
-
-          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ github.sha }}-${{ matrix.base_image }}
-          image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
-
-          SKIP_CONTAINER_LOGS=true \
-          TEST_RUNTIME=eventstream \
-          SANDBOX_USER_ID=$(id -u) \
-          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
-          TEST_IN_CI=true \
-          RUN_AS_OPENHANDS=true \
-          poetry run pytest -n 3 -raR --reruns 1 --reruns-delay 3 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
+          poetry run pytest --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
@@ -229,41 +156,33 @@ jobs:

  # Run integration tests with the eventstream runtime Docker image
  runtime_integration_tests_on_linux:
-    name: RT Integration Tests (Linux)
+    name: Runtime Integration Tests on Linux
    runs-on: ubuntu-latest
    needs: [ghcr_build_runtime]
    strategy:
      fail-fast: false
      matrix:
-        base_image: ['nikolaik']
+        base_image: ['nikolaik', 'python', 'node']
    steps:
      - uses: actions/checkout@v4
      # Forked repos can't push to GHCR, so we need to download the image as an artifact
      - name: Download runtime image for fork
-        if: github.event.pull_request.head.repo.fork
+        if: "github.event.pull_request.head.repo.fork"
        uses: actions/download-artifact@v4
        with:
          name: runtime-${{ matrix.base_image }}
          path: /tmp
      - name: Load runtime image for fork
-        if: github.event.pull_request.head.repo.fork
+        if: "github.event.pull_request.head.repo.fork"
        run: |
          docker load --input /tmp/runtime-${{ matrix.base_image }}.tar
-      - name: Cache Poetry dependencies
-        uses: actions/cache@v4
-        with:
-          path: |
-            ~/.cache/pypoetry
-            ~/.virtualenvs
-          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
-          restore-keys: |
-            ${{ runner.os }}-poetry-
+      - name: Install poetry via pipx
+        run: pipx install poetry
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
-      - name: Install poetry via pipx
-        run: pipx install poetry
+          cache: 'poetry'
      - name: Install Python dependencies using Poetry
        run: make install-python-dependencies
      - name: Run integration tests
@@ -273,7 +192,7 @@ jobs:

          TEST_RUNTIME=eventstream \
          SANDBOX_USER_ID=$(id -u) \
-          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
+          SANDBOX_BASE_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
          TEST_ONLY=true \
          ./tests/integration/regenerate.sh
@@ -282,26 +201,11 @@ jobs:
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

-  # The two following jobs (named identically) are to check whether all the runtime tests have passed as the
-  # "All Runtime Tests Passed" is a required job for PRs to merge
-  # Due to this bug: https://github.com/actions/runner/issues/2566, we want to create a job that runs when the
-  # prerequisites have been cancelled or failed so merging is disallowed, otherwise Github considers "skipped" as "success"
-  runtime_tests_check_success:
+  # Checks that all runtime tests have passed
+  all_runtime_tests_passed:
    name: All Runtime Tests Passed
-    if: ${{ !cancelled() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}
    runs-on: ubuntu-latest
-    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux]
+    needs: [test_runtime, runtime_integration_tests_on_linux]
    steps:
      - name: All tests passed
        run: echo "All runtime tests have passed successfully!"
-
-  runtime_tests_check_fail:
-    name: All Runtime Tests Passed
-    if: ${{ cancelled() || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') }}
-    runs-on: ubuntu-latest
-    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux]
-    steps:
-      - name: Some tests failed
-        run: |
-          echo "Some runtime tests failed or were cancelled"
-          exit 1
@@ -22,21 +22,13 @@ jobs:
        python-version: ['3.11']
    steps:
      - uses: actions/checkout@v4
+      - name: Install poetry via pipx
+        run: pipx install poetry
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
-      - name: Cache Poetry dependencies
-        uses: actions/cache@v4
-        with:
-          path: |
-            ~/.cache/pypoetry
-            ~/.virtualenvs
-          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
-          restore-keys: |
-            ${{ runner.os }}-poetry-
-      - name: Install poetry via pipx
-        run: pipx install poetry
+          cache: 'poetry'
      - name: Install Python dependencies using Poetry
        run: poetry install --without evaluation,llama-index
      - name: Install & Start Docker
@@ -1,14 +1,9 @@
-# Publishes the OpenHands PyPi package
-name: Publish PyPi Package
+name: Release workflow

-# Triggered manually
 on:
-  workflow_dispatch:
-    inputs:
-      reason:
-        description: 'Reason for manual trigger'
-        required: true
-        default: ''
+  push:
+    tags:
+      - "[0-9]+.[0-9]+.[0-9]+*"

 jobs:
  release:
@@ -3,23 +3,6 @@ name: Regenerate Integration Tests

 on:
  workflow_dispatch:
-    inputs:
-      debug:
-        description: 'Enable debug mode'
-        type: boolean
-        default: true
-      log_to_file:
-        description: 'Enable logging to file'
-        type: boolean
-        default: true
-      force_regenerate_tests:
-        description: 'Force regeneration of tests'
-        type: boolean
-        default: false
-      force_use_llm:
-        description: 'Force use of LLM'
-        type: boolean
-        default: false

 jobs:
  regenerate_integration_tests:
@@ -29,32 +12,21 @@ jobs:
    steps:
    - name: Checkout repository
      uses: actions/checkout@v4
+    - name: Install poetry via pipx
+      run: pipx install poetry
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
-        python-version: "3.11"
-    - name: Cache Poetry dependencies
-      uses: actions/cache@v4
-      with:
-        path: |
-          ~/.cache/pypoetry
-          ~/.virtualenvs
-        key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
-        restore-keys: |
-          ${{ runner.os }}-poetry-
-    - name: Install poetry via pipx
-      run: pipx install poetry
+        python-version: ${{ matrix.python-version }}
+        cache: 'poetry'
    - name: Install Python dependencies using Poetry
-      run: make install-python-dependencies
+      run: poetry install --without evaluation,llama-index
    - name: Build Environment
      run: make build
+
    - name: Regenerate integration tests
-      run: |
-        DEBUG=${{ inputs.debug }} \
-        LOG_TO_FILE=${{ inputs.log_to_file }} \
-        FORCE_REGENERATE_TESTS=${{ inputs.force_regenerate_tests }} \
-        FORCE_USE_LLM=${{ inputs.force_use_llm }} \
-        ./tests/integration/regenerate.sh
+      run: ./tests/integration/regenerate.sh
+
    - name: Commit changes
      run: |
        if git diff --quiet --exit-code; then
@@ -65,6 +37,5 @@ jobs:
        git config --global user.name 'github-actions[bot]'
        git config --global user.email 'github-actions[bot]@users.noreply.github.com'
        git add .
-        # run it twice in case pre-commit makes changes
-        git commit -am "Regenerate integration tests" || git commit -am "Regenerate integration tests"
+        git commit -m "Regenerate integration tests"
        git push
@@ -228,4 +228,3 @@ runtime_*.tar
 # docker build
 containers/runtime/Dockerfile
 containers/runtime/project.tar.gz
-containers/runtime/code
@@ -97,28 +97,3 @@ Please refer to [this README](./tests/integration/README.md) for details.
 ### 9. Add or update dependency
 1. Add your dependency in `pyproject.toml` or use `poetry add xxx`
 2. Update the poetry.lock file via `poetry lock --no-update`
-
-## Develop inside Docker container
-
-TL;DR
-
-```bash
-make docker-dev
-```
-
-See more details [here](./containers/dev/README.md)
-
-If you are just interested in running `OpenHands` without installing all the required tools on your host.
-
-```bash
-make docker-run
-```
-
-If you do not have `make` on your host, run:
-
-```bash
-cd ./containers/dev
-./dev.sh
-```
-
-You do need [Docker](https://docs.docker.com/engine/install/) installed on your host though.
@@ -2,9 +2,8 @@ SHELL=/bin/bash
 # Makefile for OpenHands project

 # Variables
-BACKEND_HOST ?= "127.0.0.1"
 BACKEND_PORT = 3000
-BACKEND_HOST_PORT = "$(BACKEND_HOST):$(BACKEND_PORT)"
+BACKEND_HOST = "127.0.0.1:$(BACKEND_PORT)"
 FRONTEND_PORT = 3001
 DEFAULT_WORKSPACE_DIR = "./workspace"
 DEFAULT_MODEL = "gpt-4o"
@@ -190,12 +189,12 @@ build-frontend:
 # Start backend
 start-backend:
 	@echo "$(YELLOW)Starting backend...$(RESET)"
-	@poetry run uvicorn openhands.server.listen:app --host $(BACKEND_HOST) --port $(BACKEND_PORT) --reload --reload-exclude "workspace/*"
+	@poetry run uvicorn openhands.server.listen:app --port $(BACKEND_PORT) --reload --reload-exclude "workspace/*"

 # Start frontend
 start-frontend:
 	@echo "$(YELLOW)Starting frontend...$(RESET)"
-	@cd frontend && VITE_BACKEND_HOST=$(BACKEND_HOST_PORT) VITE_FRONTEND_PORT=$(FRONTEND_PORT) npm run start
+	@cd frontend && VITE_BACKEND_HOST=$(BACKEND_HOST) VITE_FRONTEND_PORT=$(FRONTEND_PORT) npm run start

 # Common setup for running the app (non-callable)
 _run_setup:
@@ -205,7 +204,7 @@ _run_setup:
 	fi
 	@mkdir -p logs
 	@echo "$(YELLOW)Starting backend server...$(RESET)"
-	@poetry run uvicorn openhands.server.listen:app --host $(BACKEND_HOST) --port $(BACKEND_PORT) &
+	@poetry run uvicorn openhands.server.listen:app --port $(BACKEND_PORT) &
 	@echo "$(YELLOW)Waiting for the backend to start...$(RESET)"
 	@until nc -z localhost $(BACKEND_PORT); do sleep 0.1; done
 	@echo "$(GREEN)Backend started successfully.$(RESET)"
@@ -217,20 +216,6 @@ run:
 	@cd frontend && echo "$(BLUE)Starting frontend with npm...$(RESET)" && npm run start -- --port $(FRONTEND_PORT)
 	@echo "$(GREEN)Application started successfully.$(RESET)"

-# Run the app (in docker)
-docker-run: WORKSPACE_BASE ?= $(PWD)/workspace
-docker-run:
-	@if [ -f /.dockerenv ]; then \
-		echo "Running inside a Docker container. Exiting..."; \
-		exit 0; \
-	else \
-		echo "$(YELLOW)Running the app in Docker $(OPTIONS)...$(RESET)"; \
-		export WORKSPACE_BASE=${WORKSPACE_BASE}; \
-		export SANDBOX_USER_ID=$(shell id -u); \
-		export DATE=$(shell date +%Y%m%d%H%M%S); \
-		docker compose up $(OPTIONS); \
-	fi
-
 # Run the app (WSL mode)
 run-wsl:
 	@echo "$(YELLOW)Running the app in WSL mode...$(RESET)"
@@ -295,16 +280,6 @@ setup-config-prompts:
 		fi


-# Develop in container
-docker-dev:
-	@if [ -f /.dockerenv ]; then \
-		echo "Running inside a Docker container. Exiting..."; \
-		exit 0; \
-	else \
-		echo "$(YELLOW)Build and run in Docker $(OPTIONS)...$(RESET)"; \
-		./containers/dev/dev.sh $(OPTIONS); \
-	fi
-
 # Clean up all caches
 clean:
 	@echo "$(YELLOW)Cleaning up caches...$(RESET)"
@@ -323,10 +298,7 @@ help:
 	@echo "  $(GREEN)start-frontend$(RESET)      - Start the frontend server for the OpenHands project."
 	@echo "  $(GREEN)run$(RESET)                 - Run the OpenHands application, starting both backend and frontend servers."
 	@echo "                        Backend Log file will be stored in the 'logs' directory."
-	@echo "  $(GREEN)docker-dev$(RESET)          - Build and run the OpenHands application in Docker."
-	@echo "  $(GREEN)docker-run$(RESET)          - Run the OpenHands application, starting both backend and frontend servers in Docker."
 	@echo "  $(GREEN)help$(RESET)                - Display this help message, providing information on available targets."

 # Phony targets
 .PHONY: build check-dependencies check-python check-npm check-docker check-poetry install-python-dependencies install-frontend-dependencies install-pre-commit-hooks lint start-backend start-frontend run run-wsl setup-config setup-config-prompts help
-.PHONY: docker-dev docker-run
@@ -1,49 +1,65 @@
 <a name="readme-top"></a>

-<div align="center">
-  <img src="./docs/static/img/logo.png" alt="Logo" width="200">
-  <h1 align="center">OpenHands: Code Less, Make More</h1>
-</div>
+<!--
+*** Thanks for checking out the Best-README-Template. If you have a suggestion
+*** that would make this better, please fork the repo and create a pull request
+*** or simply open an issue with the tag "enhancement".
+*** Don't forget to give the project a star!
+*** Thanks again! Now go create something AMAZING! :D
+-->

+<!-- PROJECT SHIELDS -->
+<!--
+*** I'm using markdown "reference style" links for readability.
+*** Reference links are enclosed in brackets [ ] instead of parentheses ( ).
+*** See the bottom of this document for the declaration of the reference variables
+*** for contributors-url, forks-url, etc. This is an optional, concise syntax you may use.
+*** https://www.markdownguide.org/basic-syntax/#reference-style-links
+-->

 <div align="center">
  <a href="https://github.com/All-Hands-AI/OpenHands/graphs/contributors"><img src="https://img.shields.io/github/contributors/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Contributors"></a>
+  <a href="https://github.com/All-Hands-AI/OpenHands/network/members"><img src="https://img.shields.io/github/forks/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Forks"></a>
  <a href="https://github.com/All-Hands-AI/OpenHands/stargazers"><img src="https://img.shields.io/github/stars/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Stargazers"></a>
-  <a href="https://codecov.io/github/All-Hands-AI/OpenHands?branch=main"><img alt="CodeCov" src="https://img.shields.io/codecov/c/github/All-Hands-AI/OpenHands?style=for-the-badge&color=blue"></a>
+  <a href="https://github.com/All-Hands-AI/OpenHands/issues"><img src="https://img.shields.io/github/issues/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Issues"></a>
  <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE"><img src="https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="MIT License"></a>
+  <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/CREDITS.md"><img src="https://img.shields.io/badge/Project-Credits-blue?style=for-the-badge&color=blue" alt="Credits"></a>
  <br/>
  <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
  <a href="https://discord.gg/ESHStjSjD4"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community"></a>
-  <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/CREDITS.md"><img src="https://img.shields.io/badge/Project-Credits-blue?style=for-the-badge&color=FFE165&logo=github&logoColor=white" alt="Credits"></a>
-  <br/>
-  <a href="https://docs.all-hands.dev/modules/usage/getting-started"><img src="https://img.shields.io/badge/Documentation-000?logo=googledocs&logoColor=FFE165&style=for-the-badge" alt="Check out the documentation"></a>
-  <a href="https://arxiv.org/abs/2407.16741"><img src="https://img.shields.io/badge/Paper%20on%20Arxiv-000?logoColor=FFE165&logo=arxiv&style=for-the-badge" alt="Paper on Arxiv"></a>
-  <a href="https://huggingface.co/spaces/OpenHands/evaluation"><img src="https://img.shields.io/badge/Benchmark%20score-000?logoColor=FFE165&logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark Score"></a>
-  <hr>
+  <a href="https://codecov.io/github/All-Hands-AI/OpenHands?branch=main"><img alt="CodeCov" src="https://img.shields.io/codecov/c/github/All-Hands-AI/OpenHands?style=for-the-badge"></a>
 </div>

-Welcome to OpenHands (formerly OpenDevin), a platform for software development agents powered by AI.
+<!-- PROJECT LOGO -->
+<div align="center">
+  <img src="./docs/static/img/logo.png" alt="Logo" width="200" height="200">
+  <h1 align="center">OpenHands: Code Less, Make More</h1>
+  <a href="https://docs.all-hands.dev/modules/usage/intro"><img src="https://img.shields.io/badge/Documentation-OpenHands-blue?logo=googledocs&logoColor=white&style=for-the-badge" alt="Check out the documentation"></a>
+  <a href="https://arxiv.org/abs/2407.16741"><img src="https://img.shields.io/badge/Paper-%20on%20Arxiv-red?logo=arxiv&style=for-the-badge" alt="Paper on Arxiv"></a>
+  <br/>
+  <a href="https://huggingface.co/spaces/OpenDevin/evaluation"><img src="https://img.shields.io/badge/Evaluation-Benchmark%20on%20HF%20Space-green?logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark"></a>
+</div>
+<hr>

-OpenHands agents can do anything a human developer can: modify code, run commands, browse the web,
-call APIs, and yes—even copy code snippets from StackOverflow.
+Welcome to OpenHands (previously called "OpenDevin"), a platform for autonomous software engineers, powered by AI and LLMs.

-Learn more at [docs.all-hands.dev](https://docs.all-hands.dev), or jump to the [Quick Start](#-quick-start).
+OpenHands agents collaborate with human developers to write code, fix bugs, and ship features.

 ![App screenshot](./docs/static/img/screenshot.png)

-## ⚡ Quick Start
+## ⚡ Getting Started
+OpenHands works best with Docker version 26.0.0+ (Docker Desktop 4.31.0+).
+You must be using Linux, macOS, or WSL on Windows.

-The easiest way to run OpenHands is in Docker. You can change `WORKSPACE_BASE` below to
-point OpenHands to existing code that you'd like to modify.
+To start OpenHands in a docker container, run the following commands in your terminal:

-See the [Getting Started](https://docs.all-hands.dev/modules/usage/getting-started) guide for
-system requirements and more information.
+> [!WARNING]
+> When you run the following command, files in `./workspace` may be modified or deleted.

 ```bash
-export WORKSPACE_BASE=$(pwd)/workspace
-
-docker run -it --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.9-nikolaik \
+WORKSPACE_BASE=$(pwd)/workspace
+docker run -it \
+    --pull=always \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -v $WORKSPACE_BASE:/opt/workspace_base \
@@ -54,23 +70,29 @@ docker run -it --pull=always \
    ghcr.io/all-hands-ai/openhands:0.9
 ```

-You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)!
+> [!NOTE]
+> This command pulls the `0.9` tag, which represents the most recent stable release of OpenHands. You have other options as well:
+> - For a specific release version, use `ghcr.io/all-hands-ai/openhands:<OpenHands_version>` (replace `<OpenHands_version>` with the desired version number).
+> - For the most up-to-date development version, use `ghcr.io/all-hands-ai/openhands:main`. This version may be **unstable** and is recommended for testing or development purposes only.
+>
+> Choose the tag that best suits your needs based on stability requirements and desired features.

-You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode),
-or as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode).
+You'll find OpenHands running at [http://localhost:3000](http://localhost:3000) with access to `./workspace`. To have OpenHands operate on your code, place it in `./workspace`.
+OpenHands will only have access to this workspace folder. The rest of your system will not be affected as it runs in a secured docker sandbox.

-Visit [Getting Started](https://docs.all-hands.dev/modules/usage/getting-started) for more information and setup instructions.
+Upon opening OpenHands, you must select the appropriate `Model` and enter the `API Key` within the settings that should pop up automatically. These can be set at any time by selecting
+the `Settings` button (gear icon) in the UI. If the required `Model` does not exist in the list, you can manually enter it in the text box.

-If you want to modify the OpenHands source code, check out [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).
+For the development workflow, see [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).

-Having issues? The [Troubleshooting Guide](https://docs.all-hands.dev/modules/usage/troubleshooting) can help.
+Are you having trouble? Check out our [Troubleshooting Guide](https://docs.all-hands.dev/modules/usage/troubleshooting).

-## 📖 Documentation
+## 🚀 Documentation

 To learn more about the project, and for tips on using OpenHands,
-**check out our [documentation](https://docs.all-hands.dev/modules/usage/getting-started)**.
+**check out our [documentation](https://docs.all-hands.dev/modules/usage/intro)**.

-There you'll find resources on how to use different LLM providers,
+There you'll find resources on how to use different LLM providers (like ollama and Anthropic's Claude),
 troubleshooting resources, and advanced configuration options.

 ## 🤝 How to Contribute
@@ -105,6 +127,17 @@ Let's make software engineering better together!

 Distributed under the MIT License. See [`LICENSE`](./LICENSE) for more information.

+[contributors-shield]: https://img.shields.io/github/contributors/All-Hands-AI/OpenHands?style=for-the-badge
+[contributors-url]: https://github.com/All-Hands-AI/OpenHands/graphs/contributors
+[forks-shield]: https://img.shields.io/github/forks/All-Hands-AI/OpenHands?style=for-the-badge
+[forks-url]: https://github.com/All-Hands-AI/OpenHands/network/members
+[stars-shield]: https://img.shields.io/github/stars/All-Hands-AI/OpenHands?style=for-the-badge
+[stars-url]: https://github.com/All-Hands-AI/OpenHands/stargazers
+[issues-shield]: https://img.shields.io/github/issues/All-Hands-AI/OpenHands?style=for-the-badge
+[issues-url]: https://github.com/All-Hands-AI/OpenHands/issues
+[license-shield]: https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge
+[license-url]: https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE
+
 ## 🙏 Acknowledgements

 OpenHands is built by a large number of contributors, and every contribution is greatly appreciated! We also build upon other open source projects, and we are deeply thankful for their work.
@@ -65,15 +65,10 @@ In order to accomplish my goal I need to send the information asked back to the
 """


-def get_prompt(
-    error_prefix: str, cur_url: str, cur_axtree_txt: str, prev_action_str: str
-) -> str:
+def get_prompt(error_prefix: str, cur_axtree_txt: str, prev_action_str: str) -> str:
    prompt = f"""\
 {error_prefix}

-# Current Page URL:
-{cur_url}
-
 # Current Accessibility Tree:
 {cur_axtree_txt}

@@ -144,7 +139,6 @@ class BrowsingAgent(Agent):
        """
        messages: list[Message] = []
        prev_actions = []
-        cur_url = ''
        cur_axtree_txt = ''
        error_prefix = ''
        last_obs = None
@@ -185,9 +179,6 @@ class BrowsingAgent(Agent):
                self.error_accumulator += 1
                if self.error_accumulator > 5:
                    return MessageAction('Too many errors encountered. Task failed.')
-
-            cur_url = last_obs.url
-
            try:
                cur_axtree_txt = flatten_axtree_to_str(
                    last_obs.axtree_object,
@@ -213,11 +204,11 @@ class BrowsingAgent(Agent):

        messages.append(Message(role='system', content=[TextContent(text=system_msg)]))

-        prompt = get_prompt(error_prefix, cur_url, cur_axtree_txt, prev_action_str)
+        prompt = get_prompt(error_prefix, cur_axtree_txt, prev_action_str)
        messages.append(Message(role='user', content=[TextContent(text=prompt)]))
-
+        logger.debug(prompt)
        response = self.llm.completion(
-            messages=self.llm.format_messages_for_llm(messages),
+            messages=[message.model_dump() for message in messages],
            temperature=0.0,
            stop=[')```', ')\n```'],
        )
@@ -57,7 +57,7 @@ class Flags:

    @classmethod
    def from_dict(self, flags_dict):
-        """Helper for JSON serializable requirement."""
+        """Helper for JSON serializable requirement."""
        if isinstance(flags_dict, Flags):
            return flags_dict

@@ -354,7 +354,7 @@ and executed by a program, make sure to follow the formatting instructions.
        self._prompt += '\n'.join(
            [
                f"""\
- - [{msg['role']}], {msg['message']}"""
+ - [{msg['role']}] {msg['message']}"""
                for msg in chat_messages
            ]
        )
@@ -24,9 +24,9 @@ class BrowsingResponseParser(ResponseParser):
        if action_str is None:
            return ''
        action_str = action_str.strip()
-        if action_str and not action_str.endswith('```'):
+        if not action_str.endswith('```'):
            action_str = action_str + ')```'
-        logger.debug(action_str)
+        logger.info(action_str)
        return action_str

    def parse_action(self, action_str: str) -> Action:
@@ -6,7 +6,6 @@ from openhands.events.action import (
    AgentDelegateAction,
    AgentFinishAction,
    CmdRunAction,
-    FileEditAction,
    IPythonRunCellAction,
    MessageAction,
 )
@@ -17,7 +16,6 @@ class CodeActResponseParser(ResponseParser):
    - CmdRunAction(command) - bash command to run
    - IPythonRunCellAction(code) - IPython code to run
    - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
-    - FileEditAction(diff_block) - Search/Replace block to edit.
    - MessageAction(content) - Message action to run (e.g. ask for clarification)
    - AgentFinishAction() - end the interaction
    """
@@ -30,7 +28,6 @@ class CodeActResponseParser(ResponseParser):
            CodeActActionParserCmdRun(),
            CodeActActionParserIPythonRunCell(),
            CodeActActionParserAgentDelegate(),
-            CodeActActionParserFileEdit(),
        ]
        self.default_parser = CodeActActionParserMessage()

@@ -42,7 +39,7 @@ class CodeActResponseParser(ResponseParser):
        action = response.choices[0].message.content
        if action is None:
            return ''
-        for lang in ['bash', 'ipython', 'edit', 'browse']:
+        for lang in ['bash', 'ipython', 'browse']:
            if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
                action += f'</execute_{lang}>'
        return action
@@ -161,33 +158,6 @@ class CodeActActionParserAgentDelegate(ActionParser):
        return AgentDelegateAction(agent='BrowsingAgent', inputs={'task': task})


-class CodeActActionParserFileEdit(ActionParser):
-    """Parser action:
-    - FileEditAction(diff_block) - Search/Replace block to edit.
-    """
-
-    def __init__(
-        self,
-    ):
-        self.diff_block = None
-
-    def check_condition(self, action_str: str) -> bool:
-        self.diff_block = re.search(
-            r'<execute_edit>(.*)</execute_edit>', action_str, re.DOTALL
-        )
-        return self.diff_block is not None
-
-    def parse(self, action_str: str) -> Action:
-        assert (
-            self.diff_block is not None
-        ), 'self.diff_block should not be None when parse is called'
-        thought = action_str.replace(self.diff_block.group(0), '').strip()
-        return FileEditAction(
-            diff_block=self.diff_block.group(1).strip(),
-            thought=thought,
-        )
-
-
 class CodeActActionParserMessage(ActionParser):
    """Parser action:
    - MessageAction(content) - Message action to run (e.g. ask for clarification)
@@ -1,28 +1,22 @@
 import os
-from itertools import islice

 from agenthub.codeact_agent.action_parser import CodeActResponseParser
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
-from openhands.core.exceptions import OperationCancelled
-from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import ImageContent, Message, TextContent
 from openhands.events.action import (
    Action,
    AgentDelegateAction,
    AgentFinishAction,
    CmdRunAction,
-    FileEditAction,
    IPythonRunCellAction,
    MessageAction,
 )
 from openhands.events.observation import (
    AgentDelegateObservation,
    CmdOutputObservation,
-    FileEditObservation,
    IPythonRunCellObservation,
-    UserRejectObservation,
 )
 from openhands.events.observation.error import ErrorObservation
 from openhands.events.observation.observation import Observation
@@ -38,7 +32,7 @@ from openhands.utils.prompt import PromptManager


 class CodeActAgent(Agent):
-    VERSION = '1.10'
+    VERSION = '1.9'
    """
    The Code Act Agent is a minimalist agent.
    The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
@@ -106,8 +100,6 @@ class CodeActAgent(Agent):
            return f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
        elif isinstance(action, AgentDelegateAction):
            return f'{action.thought}\n<execute_browse>\n{action.inputs["task"]}\n</execute_browse>'
-        elif isinstance(action, FileEditAction):
-            return f'{action.thought}\n<execute_edit>\n{action.diff_block}\n</execute_edit>'
        elif isinstance(action, MessageAction):
            return action.content
        elif isinstance(action, AgentFinishAction) and action.source == 'agent':
@@ -117,7 +109,6 @@ class CodeActAgent(Agent):
    def get_action_message(self, action: Action) -> Message | None:
        if (
            isinstance(action, AgentDelegateAction)
-            or isinstance(action, FileEditAction)
            or isinstance(action, CmdRunAction)
            or isinstance(action, IPythonRunCellAction)
            or isinstance(action, MessageAction)
@@ -125,11 +116,7 @@ class CodeActAgent(Agent):
        ):
            content = [TextContent(text=self.action_to_str(action))]

-            if (
-                self.llm.vision_is_active()
-                and isinstance(action, MessageAction)
-                and action.images_urls
-            ):
+            if isinstance(action, MessageAction) and action.images_urls:
                content.append(ImageContent(image_urls=action.images_urls))

            return Message(
@@ -139,15 +126,14 @@ class CodeActAgent(Agent):

    def get_observation_message(self, obs: Observation) -> Message | None:
        max_message_chars = self.llm.config.max_message_chars
-        obs_prefix = 'OBSERVATION:\n'
        if isinstance(obs, CmdOutputObservation):
-            text = obs_prefix + truncate_content(obs.content, max_message_chars)
+            text = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
            text += (
                f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
            )
            return Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, IPythonRunCellObservation):
-            text = obs_prefix + obs.content
+            text = 'OBSERVATION:\n' + obs.content
            # replace base64 images with a placeholder
            splitted = text.split('\n')
            for i, line in enumerate(splitted):
@@ -158,23 +144,15 @@ class CodeActAgent(Agent):
            text = '\n'.join(splitted)
            text = truncate_content(text, max_message_chars)
            return Message(role='user', content=[TextContent(text=text)])
-        elif isinstance(obs, FileEditObservation):
-            text = obs_prefix + truncate_content(obs.content, max_message_chars)
-            return Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, AgentDelegateObservation):
-            text = obs_prefix + truncate_content(
-                obs.outputs['content'] if 'content' in obs.outputs else '',
-                max_message_chars,
+            text = 'OBSERVATION:\n' + truncate_content(
+                str(obs.outputs), max_message_chars
            )
            return Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, ErrorObservation):
-            text = obs_prefix + truncate_content(obs.content, max_message_chars)
+            text = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
            text += '\n[Error occurred in processing last action]'
            return Message(role='user', content=[TextContent(text=text)])
-        elif isinstance(obs, UserRejectObservation):
-            text = obs_prefix + truncate_content(obs.content, max_message_chars)
-            text += '\n[Last action has been rejected by the user]'
-            return Message(role='user', content=[TextContent(text=text)])
        else:
            # If an observation message is not returned, it will cause an error
            # when the LLM tries to return the next message
@@ -195,7 +173,6 @@ class CodeActAgent(Agent):
        - CmdRunAction(command) - bash command to run
        - IPythonRunCellAction(code) - IPython code to run
        - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
-        - FileEditAction(diff_block) - Search/Replace block to edit.
        - MessageAction(content) - Message action to run (e.g. ask for clarification)
        - AgentFinishAction() - end the interaction
        """
@@ -206,32 +183,23 @@ class CodeActAgent(Agent):

        # prepare what we want to send to the LLM
        messages = self._get_messages(state)
+
        params = {
-            'messages': self.llm.format_messages_for_llm(messages),
+            'messages': [message.model_dump() for message in messages],
            'stop': [
                '</execute_ipython>',
                '</execute_bash>',
                '</execute_browse>',
-                '</execute_edit>',
            ],
+            'temperature': 0.0,
        }

-        if self.llm.is_caching_prompt_active():
+        if self.llm.supports_prompt_caching:
            params['extra_headers'] = {
                'anthropic-beta': 'prompt-caching-2024-07-31',
            }

-        # TODO: move exception handling to agent_controller
-        try:
-            response = self.llm.completion(**params)
-        except OperationCancelled as e:
-            raise e
-        except Exception as e:
-            logger.error(f'{e}')
-            error_message = '{}: {}'.format(type(e).__name__, str(e).split('\n')[0])
-            return AgentFinishAction(
-                thought=f'Agent encountered an error while processing the last action.\nError: {error_message}\nPlease try again.'
-            )
+        response = self.llm.completion(**params)

        return self.action_parser.parse(response)

@@ -242,7 +210,7 @@ class CodeActAgent(Agent):
                content=[
                    TextContent(
                        text=self.prompt_manager.system_message,
-                        cache_prompt=self.llm.is_caching_prompt_active(),  # Cache system prompt
+                        cache_prompt=self.llm.supports_prompt_caching,  # Cache system prompt
                    )
                ],
            ),
@@ -251,7 +219,7 @@ class CodeActAgent(Agent):
                content=[
                    TextContent(
                        text=self.prompt_manager.initial_user_message,
-                        cache_prompt=self.llm.is_caching_prompt_active(),  # if the user asks the same query,
+                        cache_prompt=self.llm.supports_prompt_caching,  # if the user asks the same query,
                    )
                ],
            ),
@@ -270,14 +238,14 @@ class CodeActAgent(Agent):
            if message:
                # handle error if the message is the SAME role as the previous message
                # litellm.exceptions.BadRequestError: litellm.BadRequestError: OpenAIException - Error code: 400 - {'detail': 'Only supports u/a/u/a/u...'}
-                # there shouldn't be two consecutive messages from the same role
+                # there should not be two consecutive messages from the same role
                if messages and messages[-1].role == message.role:
                    messages[-1].content.extend(message.content)
                else:
                    messages.append(message)

        # Add caching to the last 2 user messages
-        if self.llm.is_caching_prompt_active():
+        if self.llm.supports_prompt_caching:
            user_turns_processed = 0
            for message in reversed(messages):
                if message.role == 'user' and user_turns_processed < 2:
@@ -286,17 +254,14 @@ class CodeActAgent(Agent):
                    ].cache_prompt = True  # Last item inside the message content
                    user_turns_processed += 1

-        # The latest user message is important:
+        # the latest user message is important:
        # we want to remind the agent of the environment constraints
        latest_user_message = next(
-            islice(
-                (
-                    m
-                    for m in reversed(messages)
-                    if m.role == 'user'
-                    and any(isinstance(c, TextContent) for c in m.content)
-                ),
-                1,
+            (
+                m
+                for m in reversed(messages)
+                if m.role == 'user'
+                and any(isinstance(c, TextContent) for c in m.content)
            ),
            None,
        )
@@ -1,62 +1,30 @@
 {% set MINIMAL_SYSTEM_PREFIX %}
-A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed answers to the user's questions.
-The assistant can use a Python environment with <execute_ipython>, e.g.:
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can use an interactive Python (Jupyter Notebook) environment, executing code with <execute_ipython>.
 <execute_ipython>
 print("Hello World!")
 </execute_ipython>
-The assistant can execute bash commands wrapped with <execute_bash>, e.g. <execute_bash> ls </execute_bash>.
-If a bash command returns exit code `-1`, this means the process is not yet finished.
-The assistant must then send a second <execute_bash>. The second <execute_bash> can be empty
-(which will retrieve any additional logs), or it can contain text to be sent to STDIN of the running process,
-or it can contain the text `ctrl+c` to interrupt the process.
+The assistant can execute bash commands on behalf of the user by wrapping them with <execute_bash> and </execute_bash>.

-For commands that may run indefinitely, the output should be redirected to a file and the command run
-in the background, e.g. <execute_bash> python3 app.py > server.log 2>&1 & </execute_bash>
-If a command execution result says "Command timed out. Sending SIGINT to the process",
-the assistant should retry running the command in the background.
+For example, you can list the files in the current directory by <execute_bash> ls </execute_bash>.
+Important, however: do not run interactive commands. You do not have access to stdin.
+Also, you need to handle commands that may run indefinitely and not return a result. For such cases, you should redirect the output to a file and run the command in the background to avoid blocking the execution.
+For example, to run a Python script that might run indefinitely without returning immediately, you can use the following format: <execute_bash> python3 app.py > server.log 2>&1 & </execute_bash>
+Also, if a command execution result saying like: Command: "npm start" timed out. Sending SIGINT to the process, you should also retry with running the command in the background.
 {% endset %}
 {% set BROWSING_PREFIX %}
 The assistant can browse the Internet with <execute_browse> and </execute_browse>.
 For example, <execute_browse> Tell me the usa's president using google search </execute_browse>.
 Or <execute_browse> Tell me what is in http://example.com </execute_browse>.
-{% endset %}
-{% set EDIT_DIFF_PREFIX %}
-The assistant can edit files with <execute_edit> and </execute_edit>. Each change must be described with a SEARCH/REPLACE block.
-Every SEARCH section must EXACTLY MATCH the existing file content, character for character, including all comments, docstrings, etc. SEARCH/REPLACE blocks will replace all matching occurrences. Include enough lines to make the SEARCH blocks uniquely match the lines to change.
-Keep SEARCH/REPLACE blocks as concise as possible. Break large SEARCH/REPLACE blocks into a series of smaller blocks that each change a small portion of the file.
-To move code within a file, use 2 SEARCH/REPLACE blocks: 1 to delete it from its current location, 1 to insert it in the new location.
-If you want to put code in a new file, use a SEARCH/REPLACE block with: a new file path, an empty `SEARCH` section and the new file's contents in the `REPLACE` section.
-
-Every SEARCH/REPLACE block must use this format:
-1. The FULL file path alone on a line, verbatim. No bold asterisks, no quotes around it, no escaping of characters, etc.
-2. The start of search block: <<<<<<< SEARCH
-3. A contiguous chunk of lines to search for in the existing source code
-4. The dividing line: =======
-5. The lines to replace into the source code
-6. The end of the replace block: >>>>>>> REPLACE
-
-For example,
-<execute_edit>
-demo.py
-<<<<<<< SEARCH
-    print("hello")
-=======
-    print("goodbye")
->>>>>>> REPLACE
-</execute_edit>
-
 {% endset %}
 {% set PIP_INSTALL_PREFIX %}
 The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them.
 {% endset %}
-{% set SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + BROWSING_PREFIX + EDIT_DIFF_PREFIX + PIP_INSTALL_PREFIX %}
+{% set SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + BROWSING_PREFIX + PIP_INSTALL_PREFIX %}
 {% set COMMAND_DOCS %}
 Apart from the standard Python library, the assistant can also use the following functions (already imported) in <execute_ipython> environment:
 {{ agent_skills_docs }}
-IMPORTANT:
- `open_file` only returns the first 100 lines of the file by default! The assistant MUST use `scroll_down` repeatedly to read the full file BEFORE making edits!
- Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
- Any code issued should be less than 50 lines to avoid context being cut off!
+Please note that THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRE PROPER INDENTATION. If the assistant would like to add the line '        print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
 {% endset %}
 {% set SYSTEM_SUFFIX %}
 Responses should be concise.
@@ -64,8 +32,7 @@ The assistant should attempt fewer things at a time instead of putting too many
 Include ONLY ONE <execute_ipython>, <execute_bash>, or <execute_browse> per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
 If the assistant is finished with the task you MUST include <finish></finish> in your response.
 IMPORTANT: Execute code using <execute_ipython>, <execute_bash>, or <execute_browse> whenever possible.
-The assistant should utilize full file paths and the `pwd` command to prevent path-related errors.
-The assistant must avoid apologies and thanks in its responses.
+The assistant should utilize full file paths and the 'pwd' command to prevent path-related errors. The assistant should refrain from excessive apologies in its responses.

 {% endset %}
 {# Combine all parts without newlines between them #}
@@ -4,12 +4,23 @@
 USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.

 ASSISTANT:
-Sure! Let me write the Python code for starting a web server and save it to a new Python file `app.py`:
-<execute_edit>
-app.py
-<<<<<<< SEARCH
-=======
-from flask import Flask
+Sure! Let me create a Python file `app.py`:
+<execute_ipython>
+create_file('app.py')
+</execute_ipython>
+
+USER:
+OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+(this is the beginning of the file)
+1|
+(this is the end of the file)
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+<execute_ipython>
+EDITED_CODE="""from flask import Flask
 app = Flask(__name__)

@app.route('/')
@@ -18,17 +29,30 @@ def index():
    return str(numbers)

 if __name__ == '__main__':
-    app.run(port=5000)
->>>>>>> REPLACE
-</execute_edit>
+    app.run(port=5000)"""

+insert_content_at_line(
+  'app.py',
+  1,
+  EDITED_CODE,
+)
+</execute_ipython>

 USER:
 OBSERVATION:
-I updated the file app.py by
-appending:
-from flask import Flask
-app = Flask(__name__)
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return str(numbers)
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]

 ASSISTANT:
 I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
@@ -153,22 +177,30 @@ USER:

 ASSISTANT:
 I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
-<execute_edit>
-app.py
-<<<<<<< SEARCH
-    return str(numbers)
-=======
-    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
->>>>>>> REPLACE
-</execute_edit>
+<execute_ipython>
+edit_file_by_replace(
+  'app.py',
+  to_replace="    return str(numbers)",
+  new_content="    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'",
+)
+</execute_ipython>

 USER:
 Observation:
-I updated the file app.py by
-replacing:
-    return str(numbers)
-with:
-    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+[File: /workspace/app.py (10 lines total after edit)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]

 ASSISTANT:
 Running the updated file:
@@ -94,11 +94,7 @@ class CodeActSWEAgent(Agent):
        ):
            content = [TextContent(text=self.action_to_str(action))]

-            if (
-                self.llm.vision_is_active()
-                and isinstance(action, MessageAction)
-                and action.images_urls
-            ):
+            if isinstance(action, MessageAction) and action.images_urls:
                content.append(ImageContent(image_urls=action.images_urls))

            return Message(
@@ -160,8 +156,9 @@ class CodeActSWEAgent(Agent):

        # prepare what we want to send to the LLM
        messages: list[Message] = self._get_messages(state)
+
        response = self.llm.completion(
-            messages=self.llm.format_messages_for_llm(messages),
+            messages=[message.model_dump() for message in messages],
            stop=[
                '</execute_ipython>',
                '</execute_bash>',
@@ -73,13 +73,10 @@ class MicroAgent(Agent):
            latest_user_message=last_user_message,
        )
        content = [TextContent(text=prompt)]
-        if self.llm.vision_is_active() and last_image_urls:
+        if last_image_urls:
            content.append(ImageContent(image_urls=last_image_urls))
        message = Message(role='user', content=content)
-        resp = self.llm.completion(
-            messages=self.llm.format_messages_for_llm(message),
-            temperature=0.0,
-        )
+        resp = self.llm.completion(messages=[message.model_dump()])
        action_resp = resp['choices'][0]['message']['content']
        action = parse_response(action_resp)
        return action
@@ -46,8 +46,8 @@ class PlannerAgent(Agent):
            state, self.llm.config.max_message_chars
        )
        content = [TextContent(text=prompt)]
-        if self.llm.vision_is_active() and image_urls:
+        if image_urls:
            content.append(ImageContent(image_urls=image_urls))
        message = Message(role='user', content=content)
-        resp = self.llm.completion(messages=self.llm.format_messages_for_llm(message))
+        resp = self.llm.completion(messages=[message.model_dump()])
        return self.response_parser.parse(resp)
@@ -1,22 +0,0 @@
-#
-services:
-  openhands:
-    build:
-      context: ./
-      dockerfile: ./containers/app/Dockerfile
-    image: openhands:latest
-    container_name: openhands-app-${DATE:-}
-    environment:
-      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.9-nikolaik}
-      - SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
-      - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
-    ports:
-      - "3000:3000"
-    extra_hosts:
-      - "host.docker.internal:host-gateway"
-    volumes:
-      - /var/run/docker.sock:/var/run/docker.sock
-      - ${WORKSPACE_BASE:-$PWD/workspace}:/opt/workspace_base
-    pull_policy: build
-    stdin_open: true
-    tty: true
@@ -126,29 +126,21 @@ embedding_model = ""
 # Model to use
 model = "gpt-4o"

-# Number of retries to attempt when an operation fails with the LLM.
-# Increase this value to allow more attempts before giving up
-#num_retries = 8
+# Number of retries to attempt
+#num_retries = 5

-# Maximum wait time (in seconds) between retry attempts
-# This caps the exponential backoff to prevent excessively long
-#retry_max_wait = 120
+# Maximum wait time (in seconds) between retry attempts
+#retry_max_wait = 60

-# Minimum wait time (in seconds) between retry attempts
-# This sets the initial delay before the first retry
-#retry_min_wait = 15
+# Minimum wait time (in seconds) between retry attempts
+#retry_min_wait = 3

-# Multiplier for exponential backoff calculation
-# The wait time increases by this factor after each failed attempt
-# A value of 2.0 means each retry waits twice as long as the previous one
+# Retry multiplier for exponential backoff
 #retry_multiplier = 2.0

 # Drop any unmapped (unsupported) params without causing an exception
 #drop_params = false

-# Using the prompt caching feature provided by the LLM
-#caching_prompt = false
-
 # Base URL for the OLLAMA API
 #ollama_base_url = ""

@@ -159,17 +151,14 @@ model = "gpt-4o"
 #timeout = 0

 # Top p for the API
-#top_p = 1.0
+#top_p = 0.5

-# If model is vision capable, this option allows to disable image processing (useful for cost reduction).
-#disable_vision = true
-
-[llm.gpt4o-mini]
+[llm.gpt3]
 # API key to use
 api_key = "your-api-key"

 # Model to use
-model = "gpt-4o-mini"
+model = "gpt-3.5"

 #################################### Agent ###################################
 # Configuration for agents (group name starts with 'agent')
@@ -1,124 +0,0 @@
-# syntax=docker/dockerfile:1
-
-###
-FROM ubuntu:22.04 AS dind
-
-# https://docs.docker.com/engine/install/ubuntu/
-RUN apt-get update && apt-get install -y \
-	ca-certificates \
-	curl \
-	&& install -m 0755 -d /etc/apt/keyrings \
-	&& curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \
-	&& chmod a+r /etc/apt/keyrings/docker.asc \
-	&& echo \
-		"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
-		$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
-
-RUN apt-get update && apt-get install -y \
-	docker-ce \
-	docker-ce-cli \
-	containerd.io \
-	docker-buildx-plugin \
-	docker-compose-plugin \
-	&& rm -rf /var/lib/apt/lists/* \
-	&& apt-get clean \
-	&& apt-get autoremove -y
-
-###
-FROM dind AS openhands
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-#
-RUN apt-get update && apt-get install -y \
-	bash \
-    build-essential \
-    curl \
-	git \
-	git-lfs \
-	software-properties-common \
-	make \
-    netcat \
-    sudo \
-	wget \
-	&& rm -rf /var/lib/apt/lists/* \
-	&& apt-get clean \
-	&& apt-get autoremove -y
-
-# https://github.com/cli/cli/blob/trunk/docs/install_linux.md
-RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg | dd of=/usr/share/keyrings/githubcli-archive-keyring.gpg \
-	&& chmod go+r /usr/share/keyrings/githubcli-archive-keyring.gpg \
-	&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \
-	&& apt-get update && apt-get -y install \
-    gh \
-  && rm -rf /var/lib/apt/lists/* \
-  && apt-get clean \
-  && apt-get autoremove -y
-
-# Python 3.11
-RUN add-apt-repository ppa:deadsnakes/ppa \
-    && apt-get update \
-    && apt-get install -y python3.11 python3.11-venv python3.11-dev python3-pip \
-    && ln -s /usr/bin/python3.11 /usr/bin/python
-
-# NodeJS >= 18.17.1
-RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
-    && apt-get install -y nodejs
-
-# Poetry >= 1.8
-RUN curl -fsSL https://install.python-poetry.org | python3.11 - \
-    && ln -s ~/.local/bin/poetry /usr/local/bin/poetry
-
-#
-RUN <<EOF
-#!/bin/bash
-printf "#!/bin/bash
-set +x
-uname -a
-docker --version
-gh --version | head -n 1
-git --version
-#
-python --version
-echo node `node --version`
-echo npm `npm --version`
-poetry --version
-netcat -h 2>&1 | head -n 1
-" > /version.sh
-chmod a+x /version.sh
-EOF
-
-###
-FROM openhands AS dev
-
-RUN apt-get update && apt-get install -y \
-	dnsutils \
-	file \
-	iproute2 \
-	jq \
-	lsof \
-	ripgrep \
-	silversearcher-ag \
-	vim \
-	&& rm -rf /var/lib/apt/lists/* \
-	&& apt-get clean \
-	&& apt-get autoremove -y
-
-WORKDIR /app
-
-# cache build dependencies
-RUN \
-  --mount=type=bind,source=./,target=/app/ \
-  <<EOF
-#!/bin/bash
-make -s clean
-make -s check-dependencies
-make -s install-python-dependencies
-
-# NOTE
-# node_modules are .dockerignore-d therefore not mountable
-# make -s install-frontend-dependencies
-EOF
-
-#
-CMD ["bash"]
@@ -1,54 +0,0 @@
-# Develop in Docker
-
-Install [Docker](https://docs.docker.com/engine/install/) on your host machine and run:
-
-```bash
-make docker-dev
-# same as:
-cd ./containers/dev
-./dev.sh
-```
-
-It could take some time if you are running for the first time as Docker will pull all the  tools required for building OpenHands. The next time you run again, it should be instant.
-
-## Build and run
-
-If everything goes well, you should be inside a container after Docker finishes building the `openhands:dev` image similar to the following:
-
-```bash
-Build and run in Docker ...
-root@93fc0005fcd2:/app#
-```
-
-You may now proceed with the normal [build and run](../../Development.md) workflow as if you were on the host.
-
-## Make changes
-
-The source code on the host is mounted as `/app` inside docker. You may edit the files as usual either inside the Docker container or on your host with your favorite IDE/editors.
-
-The following are also mapped as readonly from your host:
-
-```yaml
-# host credentials
- $HOME/.git-credentials:/root/.git-credentials:ro
- $HOME/.gitconfig:/root/.gitconfig:ro
- $HOME/.npmrc:/root/.npmrc:ro
-```
-
-## VSCode
-
-Alternatively, if you use VSCode, you could also [attach to the running container](https://code.visualstudio.com/docs/devcontainers/attach-container).
-
-See details for [developing in docker](https://code.visualstudio.com/docs/devcontainers/containers) or simply ask `OpenHands` ;-)
-
-## Rebuild dev image
-
-You could optionally pass additional options to the build script.
-
-```bash
-make docker-dev OPTIONS="--build"
-# or
-./containers/dev/dev.sh --build
-```
-
-See [docker compose run](https://docs.docker.com/reference/cli/docker/compose/run/) for more options.
@@ -1,38 +0,0 @@
-#
-services:
-  dev:
-    privileged: true
-    build:
-      context: ${OPENHANDS_WORKSPACE:-../../}
-      dockerfile: ./containers/dev/Dockerfile
-    image: openhands:dev
-    container_name: openhands-dev
-    environment:
-      - BACKEND_HOST=${BACKEND_HOST:-"0.0.0.0"}
-      - SANDBOX_API_HOSTNAME=host.docker.internal
-      #
-      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.9-nikolaik}
-      - SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
-      - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
-    ports:
-      - "3000:3000"
-    extra_hosts:
-      - "host.docker.internal:host-gateway"
-    volumes:
-      - /var/run/docker.sock:/var/run/docker.sock
-      - ${WORKSPACE_BASE:-$PWD/workspace}:/opt/workspace_base
-      # source code
-      - ${OPENHANDS_WORKSPACE:-../../}:/app
-      # host credentials
-      - $HOME/.git-credentials:/root/.git-credentials:ro
-      - $HOME/.gitconfig:/root/.gitconfig:ro
-      - $HOME/.npmrc:/root/.npmrc:ro
-      # cache
-      - cache-data:/root/.cache
-    pull_policy: never
-    stdin_open: true
-    tty: true
-
-##
-volumes:
-  cache-data:
@@ -1,39 +0,0 @@
-#!/bin/bash
-set -o pipefail
-
-function get_docker() {
-    echo "Docker is required to build and run OpenHands."
-    echo "https://docs.docker.com/get-started/get-docker/"
-    exit 1
-}
-
-function check_tools() {
-	command -v docker &>/dev/null || get_docker
-}
-
-function exit_if_indocker() {
-    if [ -f /.dockerenv ]; then
-        echo "Running inside a Docker container. Exiting..."
-        exit 1
-    fi
-}
-
-#
-exit_if_indocker
-
-check_tools
-
-##
-OPENHANDS_WORKSPACE=$(git rev-parse --show-toplevel)
-
-cd "$OPENHANDS_WORKSPACE/containers/dev/" || exit 1
-
-##
-export BACKEND_HOST="0.0.0.0"
-#
-export SANDBOX_USER_ID=$(id -u)
-export WORKSPACE_BASE=${WORKSPACE_BASE:-$OPENHANDS_WORKSPACE/workspace}
-
-docker compose run --rm --service-ports "$@" dev
-
-##
@@ -1,12 +1,11 @@
-# Dynamically constructed Dockerfile
+# Dynamic constructed Dockerfile

-This folder builds a runtime image (sandbox), which will use a dynamically generated `Dockerfile`
-that depends on the `base_image` **AND** a [Python source distribution](https://docs.python.org/3.10/distutils/sourcedist.html) that is based on the current commit of `openhands`.
+This folder builds a runtime image (sandbox), which will use a `Dockerfile` that is dynamically generated depending on the `base_image` AND a [Python source distribution](https://docs.python.org/3.10/distutils/sourcedist.html) that's based on the current commit of `openhands`.

-The following command will generate a `Dockerfile` file for `nikolaik/python-nodejs:python3.11-nodejs22` (the default base image), an updated `config.sh` and the runtime source distribution files/folders into `containers/runtime`:
+The following command will generate a `Dockerfile` for `ubuntu:22.04` and the source distribution `.tar` into `containers/runtime`.

 ```bash
 poetry run python3 openhands/runtime/utils/runtime_build.py \
-    --base_image nikolaik/python-nodejs:python3.11-nodejs22 \
+    --base_image ubuntu:22.04 \
    --build_folder containers/runtime
 ```
@@ -4,8 +4,8 @@ import { themes as prismThemes } from "prism-react-renderer";

 const config: Config = {
  title: "OpenHands",
-  tagline: "Code Less, Make More",
-  favicon: "img/logo-square.png",
+  tagline: "An Open Platform for AI Software Developers as Generalist Agents",
+  favicon: "img/logo.png",

  // Set the production url of your site here
  url: "https://docs.all-hands.dev",
@@ -73,28 +73,23 @@ const config: Config = {
          type: "docSidebar",
          sidebarId: "docsSidebar",
          position: "left",
-          label: "User Guides",
+          label: "Docs",
        },
        {
          type: "docSidebar",
          sidebarId: "apiSidebar",
          position: "left",
-          label: "Python API",
-        },
-        {
-          type: 'localeDropdown',
-          position: 'left',
-        },
-        {
-          href: "https://all-hands.dev",
-          label: "Company",
-          position: "right",
+          label: "Codebase",
        },
        {
          href: "https://github.com/All-Hands-AI/OpenHands",
          label: "GitHub",
          position: "right",
        },
+        {
+          type: 'localeDropdown',
+          position: 'left',
+        },
      ],
    },
    prism: {
@@ -59,6 +59,10 @@ Félicitations !

 ## Explication technique

+Le code pertinent est défini dans [ssh_box.py](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/docker/ssh_box.py) et [image_agnostic_util.py](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/docker/image_agnostic_util.py).
+
+En particulier, ssh_box.py vérifie l'objet config pour ```config.sandbox.base_container_image``` et ensuite tente de récupérer l'image à l'aide de [get_od_sandbox_image](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/docker/image_agnostic_util.py#L72), qui est défini dans image_agnostic_util.py.
+
 Lorsqu'une image personnalisée est utilisée pour la première fois, elle ne sera pas trouvée et donc elle sera construite (à l'exécution ultérieure, l'image construite sera trouvée et renvoyée).

 L'image personnalisée est construite avec [_build_sandbox_image()](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/docker/image_agnostic_util.py#L29), qui crée un fichier docker en utilisant votre image personnalisée comme base et configure ensuite l'environnement pour OpenHands, comme ceci:
@@ -41,4 +41,4 @@ ne peut être aussi puissant que les modèles qui le pilotent -- heureusement, l

 Certains LLM ont des limites de taux et peuvent nécessiter des réessais. OpenHands réessaiera automatiquement les demandes s'il reçoit une erreur 429 ou une erreur de connexion API.
 Vous pouvez définir les variables d'environnement `LLM_NUM_RETRIES`, `LLM_RETRY_MIN_WAIT`, `LLM_RETRY_MAX_WAIT` pour contrôler le nombre de réessais et le temps entre les réessais.
-Par défaut, `LLM_NUM_RETRIES` est 8 et `LLM_RETRY_MIN_WAIT`, `LLM_RETRY_MAX_WAIT` sont respectivement de 15 secondes et 120 secondes.
+Par défaut, `LLM_NUM_RETRIES` est 5 et `LLM_RETRY_MIN_WAIT`, `LLM_RETRY_MAX_WAIT` sont respectivement de 3 secondes et 60 secondes.
@@ -43,4 +43,4 @@ OpenHands 将向你配置的 LLM 发出许多提示。大多数这些 LLM 都是

 一些 LLM 有速率限制，可能需要重试操作。OpenHands 会在收到 429 错误或 API 连接错误时自动重试请求。
 你可以设置 `LLM_NUM_RETRIES`，`LLM_RETRY_MIN_WAIT`，`LLM_RETRY_MAX_WAIT` 环境变量来控制重试次数和重试之间的时间。
-默认情况下，`LLM_NUM_RETRIES` 为 8，`LLM_RETRY_MIN_WAIT` 和 `LLM_RETRY_MAX_WAIT` 分别为 15 秒和 120 秒。
+默认情况下，`LLM_NUM_RETRIES` 为 5，`LLM_RETRY_MIN_WAIT` 和 `LLM_RETRY_MAX_WAIT` 分别为 3 秒和 60 秒。
@@ -74,7 +74,7 @@ WORKSPACE_DIR="$(pwd)/workspace"

 如有需要，可以替换您选择的 `LLM_MODEL`。

-完成！现在您可以通过 `make run` 启动 OpenHands 而无需 Docker。现在您应该可以连接到 `http://localhost:3000/`
+完成！现在您可以通过 `make run` 启动 Devin 而无需 Docker。现在您应该可以连接到 `http://localhost:3000/`

 ## 选择您的模型

@@ -5,7 +5,6 @@ sidebar_position: 8
 # 📚 Misc

 ## ⭐️ Research Strategy
-
 Achieving full replication of production-grade applications with LLMs is a complex endeavor. Our strategy involves:

 1. **Core Technical Research:** Focusing on foundational research to understand and improve the technical aspects of code generation and handling
@@ -14,11 +13,9 @@ Achieving full replication of production-grade applications with LLMs is a compl
 4. **Evaluation:** Establishing comprehensive evaluation metrics to better understand and improve our models

 ## 🚧 Default Agent
-
 Our default Agent is currently the [CodeActAgent](agents), which is capable of generating code and handling files.

 ## 🤝 How to Contribute
-
 OpenHands is a community-driven project, and we welcome contributions from everyone. Whether you're a developer, a researcher, or simply enthusiastic about advancing the field of software engineering with AI, there are many ways to get involved:

 - **Code Contributions:** Help us develop the core functionalities, frontend interface, or sandboxing solutions
@@ -28,7 +25,6 @@ OpenHands is a community-driven project, and we welcome contributions from every
 For details, please check [this document](https://github.com/All-Hands-AI/OpenHands/blob/main/CONTRIBUTING.md).

 ## 🤖 Join Our Community
-
 We have both a Slack workspace for collaboration on building OpenHands and a Discord server for discussion about anything related, e.g., this project, LLM, agent, etc.

 - [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA)
@@ -41,7 +37,6 @@ If you would love to contribute, feel free to join our community. Let's simplify
 [![Star History Chart](https://api.star-history.com/svg?repos=All-Hands-AI/OpenHands&type=Date)](https://star-history.com/#All-Hands-AI/OpenHands&Date)

 ## 🛠️ Built With
-
 OpenHands is built using a combination of powerful frameworks and libraries, providing a robust foundation for its development. Here are the key technologies used in the project:

 ![FastAPI](https://img.shields.io/badge/FastAPI-black?style=for-the-badge) ![uvicorn](https://img.shields.io/badge/uvicorn-black?style=for-the-badge) ![LiteLLM](https://img.shields.io/badge/LiteLLM-black?style=for-the-badge) ![Docker](https://img.shields.io/badge/Docker-black?style=for-the-badge) ![Ruff](https://img.shields.io/badge/Ruff-black?style=for-the-badge) ![MyPy](https://img.shields.io/badge/MyPy-black?style=for-the-badge) ![LlamaIndex](https://img.shields.io/badge/LlamaIndex-black?style=for-the-badge) ![React](https://img.shields.io/badge/React-black?style=for-the-badge)
@@ -49,5 +44,4 @@ OpenHands is built using a combination of powerful frameworks and libraries, pro
 Please note that the selection of these technologies is in progress, and additional technologies may be added or existing ones may be removed as the project evolves. We strive to adopt the most suitable and efficient tools to enhance the capabilities of OpenHands.

 ## 📜 License
-
 Distributed under the MIT License. See [our license](https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE) for more information.
@@ -3,11 +3,8 @@ sidebar_position: 3
 ---

 # 🧠 Main Agent and Capabilities
-
 ## CodeActAgent
-
 ### Description
-
 This agent implements the CodeAct idea ([paper](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a
 unified **code** action space for both _simplicity_ and _performance_.

@@ -22,7 +19,6 @@ The conceptual idea is illustrated below. At each turn, the agent can:
 ![image](https://github.com/All-Hands-AI/OpenHands/assets/38853559/92b622e3-72ad-4a61-8f41-8c040b6d5fb3)

 ### Demo
-
 https://github.com/All-Hands-AI/OpenHands/assets/38853559/f592a192-e86c-4f48-ad31-d69282d5f6ac

 _Example of CodeActAgent with `gpt-4-turbo-2024-04-09` performing a data science task (linear regression)_.
@@ -47,8 +47,8 @@ graph TD
 ```

 1. User Input: The user provides a custom base Docker image
-2. Image Building: OpenHands builds a new Docker image (the "OH runtime image") based on the user-provided image. This new image includes OpenHands-specific code, primarily the "runtime client"
-3. Container Launch: When OpenHands starts, it launches a Docker container using the OH runtime image
+2. Image Building: OpenHands builds a new Docker image (the "OD runtime image") based on the user-provided image. This new image includes OpenHands-specific code, primarily the "runtime client"
+3. Container Launch: When OpenHands starts, it launches a Docker container using the OD runtime image
 4. Client Initialization: The runtime client initializes inside the container, setting up necessary components like a bash shell and loading any specified plugins
 5. Communication: The OpenHands backend (`runtime.py`) communicates with the runtime client over RESTful API, sending actions and receiving observations
 6. Action Execution: The runtime client receives actions from the backend, executes them in the sandboxed environment, and sends back observations
@@ -62,7 +62,7 @@ The role of the client:
 - It formats and returns observations to the backend, ensuring a consistent interface for processing results


-## How OpenHands builds and maintains OH Runtime images
+## How OpenHands builds and maintains OD Runtime images

 OpenHands' approach to building and managing runtime images ensures efficiency, consistency, and flexibility in creating and maintaining Docker images for both production and development environments.

@@ -80,9 +80,9 @@ OpenHands uses a dual-tagging system for its runtime images to balance reproduci
   - This ensures reproducibility; the same hash always means the same image contents

 2. Generic tag: `{target_image_repo}:{target_image_tag}`.
-   Example: `runtime:oh_v0.9.3_ubuntu_tag_22.04`
+   Example: `runtime:od_v0.8.3_ubuntu_tag_22.04`

-   - This tag follows the format: `runtime:oh_v{OH_VERSION}_{BASE_IMAGE_NAME}_tag_{BASE_IMAGE_TAG}`
+   - This tag follows the format: `runtime:od_v{OD_VERSION}_{BASE_IMAGE_NAME}_tag_{BASE_IMAGE_TAG}`
   - It represents the latest build for a particular base image and OpenHands version combination
   - This tag is updated whenever a new image is built from the same base image, even if the source code changes

@@ -94,11 +94,11 @@ The hash-based tag ensures reproducibility, while the generic tag provides a sta
   - Hash-based tag: `{target_image_repo}:{target_image_hash_tag}`.
     Example: `runtime:abc123def456`
   - Generic tag: `{target_image_repo}:{target_image_tag}`.
-     Example: `runtime:oh_v0.9.3_ubuntu_tag_22.04`
+     Example: `runtime:od_v0.8.3_ubuntu_tag_22.04`

 2. Build Process:
-   - a. Convert the base image name to an OH runtime image name
-      Example: `ubuntu:22.04` -> `runtime:oh_v0.9.3_ubuntu_tag_22.04`
+   - a. Convert the base image name to an OD runtime image name
+      Example: `ubuntu:22.04` -> `runtime:od_v0.8.3_ubuntu_tag_22.04`
   - b. Generate a build context (Dockerfile and OpenHands source code) and calculate its hash
   - c. Check for an existing image with the calculated hash
   - d. If not found, check for a recent compatible image to use as a base
@@ -108,7 +108,7 @@ The hash-based tag ensures reproducibility, while the generic tag provides a sta
 3. Image Reuse and Rebuilding Logic:
   The system follows these steps to determine whether to build a new image or use an existing one from a user-provided (base) image (e.g., `ubuntu:22.04`):
   - a. If an image exists with the same hash (e.g., `runtime:abc123def456`), it will be reused as is
-   - b. If the exact hash is not found, the system will try to rebuild using the latest generic image (e.g., `runtime:oh_v0.9.3_ubuntu_tag_22.04`) as a base. This saves time by leveraging existing dependencies
+   - b. If the exact hash is not found, the system will try to rebuild using the latest generic image (e.g., `runtime:od_v0.8.3_ubuntu_tag_22.04`) as a base. This saves time by leveraging existing dependencies
   - c. If neither the hash-tagged nor the generic-tagged image is found, the system will build the image completely from scratch

 4. Caching and Efficiency:
@@ -121,10 +121,10 @@ Here's a flowchart illustrating the build process:
 ```mermaid
 flowchart TD
    A[Start] --> B{Convert base image name}
-    B --> |ubuntu:22.04 -> runtime:oh_v0.9.3_ubuntu_tag_22.04| C[Generate build context and hash]
+    B --> |ubuntu:22.04 -> runtime:od_v0.8.3_ubuntu_tag_22.04| C[Generate build context and hash]
    C --> D{Check for existing image with hash}
    D -->|Found runtime:abc123def456| E[Use existing image]
-    D -->|Not found| F{Check for runtime:oh_v0.9.3_ubuntu_tag_22.04}
+    D -->|Not found| F{Check for runtime:od_v0.8.3_ubuntu_tag_22.04}
    F -->|Found| G[Rebuild based on recent image]
    F -->|Not found| H[Build from scratch]
    G --> I[Tag with hash and generic tags]
@@ -137,13 +137,13 @@ This approach ensures that:

 1. Identical source code and Dockerfile always produce the same image (via hash-based tags)
 2. The system can quickly rebuild images when minor changes occur (by leveraging recent compatible images)
-3. The generic tag (e.g., `runtime:oh_v0.9.3_ubuntu_tag_22.04`) always points to the latest build for a particular base image and OpenHands version combination
+3. The generic tag (e.g., `runtime:od_v0.8.3_ubuntu_tag_22.04`) always points to the latest build for a particular base image and OpenHands version combination

 ## Runtime Plugin System

 The OpenHands Runtime supports a plugin system that allows for extending functionality and customizing the runtime environment. Plugins are initialized when the runtime client starts up.

-Check [an example of Jupyter plugin here](https://github.com/All-Hands-AI/OpenHands/blob/ecf4aed28b0cf7c18d4d8ff554883ba182fc6bdd/openhands/runtime/plugins/jupyter/__init__.py#L21-L55) if you want to implement your own plugin.
+Check [an example of Jupyter plugin here](https://github.com/All-Hands-AI/OpenHands/blob/9c44d94cef32e6426ebd8deeeb52963153b2348a/openhands/runtime/plugins/jupyter/__init__.py#L30-L63) if you want to implement your own plugin.

 *More details about the Plugin system are still under construction - contributions are welcomed!*

@@ -1,66 +0,0 @@
---
-sidebar_position: 2
---
-
-# Getting Started
-
-## System Requirements
-
-* Docker version 26.0.0+ or Docker Desktop 4.31.0+
-* You must be using Linux or Mac OS
-  * If you are on Windows, you must use [WSL](https://learn.microsoft.com/en-us/windows/wsl/install)
-
-## Installation
-
-The easiest way to run OpenHands is in Docker. You can change `WORKSPACE_BASE` below to point OpenHands to
-existing code that you'd like to modify.
-
-```bash
-export WORKSPACE_BASE=$(pwd)/workspace
-
-docker run -it --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.9-nikolaik \
-    -e SANDBOX_USER_ID=$(id -u) \
-    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-    -v $WORKSPACE_BASE:/opt/workspace_base \
-    -v /var/run/docker.sock:/var/run/docker.sock \
-    -p 3000:3000 \
-    --add-host host.docker.internal:host-gateway \
-    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    ghcr.io/all-hands-ai/openhands:0.9
-```
-
-You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode),
-or as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode).
-
-## Setup
-
-After running the command above, you'll find OpenHands running at [http://localhost:3000](http://localhost:3000).
-
-The agent will have access to the `./workspace` folder to do its work. You can copy existing code here, or change `WORKSPACE_BASE` in the
-command to point to an existing folder.
-
-Upon launching OpenHands, you'll see a settings modal. You **must** select an `LLM Provider` and `LLM Model` and enter a corresponding `API Key`.
-These can be changed at any time by selecting the `Settings` button (gear icon) in the UI.
-
-If the required `LLM Model` does not exist in the list, you can toggle `Advanced Options` and manually enter it with the correct prefix
-in the `Custom Model` text box.
-The `Advanced Options` also allow you to specify a `Base URL` if required.
-
-<div style={{ display: 'flex', justifyContent: 'center', gap: '20px' }}>
-  <img src="/img/settings-screenshot.png" alt="settings-modal" width="340" />
-  <img src="/img/settings-advanced.png" alt="settings-modal" width="335" />
-</div>
-
-## Versions
-
-The command above pulls the `0.9` tag, which represents the most recent stable release of OpenHands. You have other options as well:
- For a specific release, use `ghcr.io/all-hands-ai/openhands:$VERSION`, replacing $VERSION with the version number.
- We use semver, and release major, minor, and patch tags. So `0.9` will automatically point to the latest `0.9.x` release, and `0` will point to the latest `0.x.x` release.
- For the most up-to-date development version, you can use `ghcr.io/all-hands-ai/openhands:main`. This version is unstable and is recommended for testing or development purposes only.
-
-You can choose the tag that best suits your needs based on stability requirements and desired features.
-
-For the development workflow, see [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).
-
-Are you having trouble? Check out our [Troubleshooting Guide](https://docs.all-hands.dev/modules/usage/troubleshooting).
@@ -1,108 +0,0 @@
-# CLI Mode
-
-OpenHands can be run in an interactive CLI mode, which allows users to start an interactive session via the command line.
-
-This mode is different from the [headless mode](headless-mode), which is non-interactive and better for scripting.
-
-## With Python
-
-To start an interactive OpenHands session via the command line, follow these steps:
-
-1. Ensure you have followed the [Development setup instructions](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md)
-
-2. Run the following command:
-
-```bash
-poetry run python -m openhands.core.cli
-```
-
-This command will start an interactive session where you can input tasks and receive responses from OpenHands.
-
-You'll need to be sure to set your model, API key, and other settings via environment variables
-[or the `config.toml` file](https://github.com/All-Hands-AI/OpenHands/blob/main/config.template.toml).
-
-
-## With Docker
-
-To run OpenHands in CLI mode with Docker, follow these steps:
-
-1. Set `WORKSPACE_BASE` to the directory you want OpenHands to edit:
-
-```bash
-WORKSPACE_BASE=$(pwd)/workspace
-```
-
-2. Set `LLM_MODEL` to the model you want to use:
-
-```bash
-LLM_MODEL="anthropic/claude-3-5-sonnet-20240620"
-```
-
-3. Set `LLM_API_KEY` to your API key:
-
-```bash
-LLM_API_KEY="sk_test_12345"
-```
-
-4. Run the following Docker command:
-
-```bash
-docker run -it \
-    --pull=always \
-    -e SANDBOX_USER_ID=$(id -u) \
-    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-    -e LLM_API_KEY=$LLM_API_KEY \
-    -e LLM_MODEL=$LLM_MODEL \
-    -v $WORKSPACE_BASE:/opt/workspace_base \
-    -v /var/run/docker.sock:/var/run/docker.sock \
-    --add-host host.docker.internal:host-gateway \
-    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    ghcr.io/all-hands-ai/openhands:0.9 \
-    python -m openhands.core.cli
-```
-
-This command will start an interactive session in Docker where you can input tasks and receive responses from OpenHands.
-
-## Examples of CLI Commands and Expected Outputs
-
-Here are some examples of CLI commands and their expected outputs:
-
-### Example 1: Simple Task
-
-```bash
-How can I help? >> Write a Python script that prints "Hello, World!"
-```
-
-Expected Output:
-
-```bash
-🤖 Sure! Here is a Python script that prints "Hello, World!":
-
-❯ print("Hello, World!")
-```
-
-### Example 2: Bash Command
-
-```bash
-How can I help? >> Create a directory named "test_dir"
-```
-
-Expected Output:
-
-```bash
-🤖 Creating a directory named "test_dir":
-
-❯ mkdir test_dir
-```
-
-### Example 3: Error Handling
-
-```bash
-How can I help? >> Delete a non-existent file
-```
-
-Expected Output:
-
-```bash
-🤖 An error occurred. Please try again.
-```
@@ -1,19 +1,26 @@
-# Custom Sandbox
+# Create and Use a Custom Docker Sandbox

-The sandbox is where the agent does its work. Instead of running commands directly on your computer
-(which could be dangerous), the agent runs them inside of a Docker container.
-
-The default OpenHands sandbox (`python-nodejs:python3.11-nodejs22`
-from [nikolaik/python-nodejs](https://hub.docker.com/r/nikolaik/python-nodejs)) comes with some packages installed such
-as python and Node.js but your use case may need additional software installed by default.
+The default OpenHands sandbox comes with a [minimal ubuntu configuration](https://github.com/All-Hands-AI/OpenHands/blob/main/containers/sandbox/Dockerfile).
+Your use case may need additional software installed by default.

 There are two ways you can do so:

-1. Use an existing image from docker hub
+1. Use an existing image from docker hub. For instance, if you want to have `nodejs` installed, you can do so by using the `node:20` image
 2. Creating your own custom docker image and using it

 If you want to take the first approach, you can skip the `Create Your Docker Image` section.

+For a more feature-rich environment, you might consider using pre-built images like **[nikolaik/python-nodejs](https://hub.docker.com/r/nikolaik/python-nodejs)**, which comes with both Python and Node.js pre-installed, along with many other useful tools and libraries, like:
+
+- Node.js: 22.x
+- npm: 10.x
+- yarn: stable
+- Python: latest
+- pip: latest
+- pipenv: latest
+- poetry: latest
+- uv: latest
+
 ## Setup

 Make sure you are able to run OpenHands using the [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) first.
@@ -72,7 +79,7 @@ Run OpenHands by running ```make run``` in the top level directory.

 Navigate to ```localhost:3001``` and check if your desired dependencies are available.

-In the case of the example above, running ```node -v``` in the terminal produces ```v20.15.0```.
+In the case of the example above, running ```node -v``` in the terminal produces ```v20.15.0```

 Congratulations!

@@ -1,4 +1,4 @@
-# Evaluation
+# Contribute to OpenHands Evaluation Harness

 This guide provides an overview of how to integrate your own evaluation benchmark into the OpenHands framework.

@@ -12,7 +12,7 @@ Here's an example configuration file you can use to define and use multiple LLMs
 ```toml
 [llm]
 # IMPORTANT: add your API key here, and set the model to the one you want to evaluate
-model = "claude-3-5-sonnet-20240620"
+model = "gpt-4o-2024-05-13"
 api_key = "sk-XXX"

 [llm.eval_gpt4_1106_preview_llm]
@@ -1,12 +1,9 @@
-# Headless Mode
+# Running in Headless Mode

-You can run OpenHands with a single command, without starting the web application.
-This makes it easy to write scripts and automate tasks with OpenHands.
-
-This is different from [CLI Mode](cli-mode), which is interactive, and better for active development.
+You can run OpenHands via a CLI, without starting the web application. This makes it easy
+to automate tasks with OpenHands.

 ## With Python
-
 To run OpenHands in headless mode with Python,
 [follow the Development setup instructions](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md),
 and then run:
@@ -15,32 +12,19 @@ and then run:
 poetry run python -m openhands.core.main -t "write a bash script that prints hi"
 ```

-You'll need to be sure to set your model, API key, and other settings via environment variables
-[or the `config.toml` file](https://github.com/All-Hands-AI/OpenHands/blob/main/config.template.toml).
-
 ## With Docker
-
-1. Set `WORKSPACE_BASE` to the directory you want OpenHands to edit:
+To run OpenHands in headless mode with Docker, run:

 ```bash
+# Set WORKSPACE_BASE to the directory you want OpenHands to edit
 WORKSPACE_BASE=$(pwd)/workspace
-```

-2. Set `LLM_MODEL` to the model you want to use:
+# Set LLM_API_KEY to an API key, e.g. for OpenAI or Anthropic
+LLM_API_KEY="abcde"

-```bash
-LLM_MODEL="anthropic/claude-3-5-sonnet-20240620"
-```
+# Set LLM_MODEL to the model you want to use
+LLM_MODEL="gpt-4o"

-3. Set `LLM_API_KEY` to your API key:
-
-```bash
-LLM_API_KEY="sk_test_12345"
-```
-
-4. Run the following Docker command:
-
-```bash
 docker run -it \
    --pull=always \
    -e SANDBOX_USER_ID=$(id -u) \
@@ -51,6 +35,7 @@ docker run -it \
    -v /var/run/docker.sock:/var/run/docker.sock \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    ghcr.io/all-hands-ai/openhands:0.9 \
-    python -m openhands.core.main -t "write a bash script that prints hi"
+    ghcr.io/all-hands-ai/openhands:main \
+    python -m openhands.core.main \
+    -t "Write a bash script that prints Hello World"  # TODO: pin a version instead of :main
 ```
@@ -0,0 +1,5 @@
+---
+sidebar_position: 6
+---
+
+# 🔎 How To Section
@@ -1,12 +1,11 @@
-# Kubernetes
+# Use OpenHands in OpenShift/K8S

-There are different ways you might run OpenHands on Kubernetes or OpenShift. This guide goes through one possible way:
+There are different ways this can be accomplished. This guide goes through one possible way:
 1. Create a PV "as a cluster admin" to map workspace_base data and docker directory to the pod through the worker node
 2. Create a PVC to be able to mount those PVs to the pod
 3. Create a pod which contains two containers; the OpenHands and Sandbox containers

 ## Detailed Steps for the Example Above
-
 > Note: Make sure you are logged in to the cluster first with the proper account for each step. PV creation requires cluster administrator!

 > Make sure you have read/write permissions on the hostPath used below (i.e. /tmp/workspace)
@@ -0,0 +1,71 @@
+---
+sidebar_position: 1
+---
+
+# 💻 OpenHands
+
+OpenHands is an **autonomous AI software engineer** capable of executing complex engineering tasks and collaborating actively with users on software development projects.
+This project is fully open-source, so you can use and modify it however you like.
+
+:::tip
+Explore the codebase of OpenHands on [GitHub](https://github.com/All-Hands-AI/OpenHands) or join one of our communities!
+
+<a href="https://github.com/All-Hands-AI/OpenHands/graphs/contributors">
+  <img
+    src="https://img.shields.io/github/contributors/All-Hands-AI/OpenHands?style=for-the-badge"
+    alt="Contributors"
+  />
+</a>
+<a href="https://github.com/All-Hands-AI/OpenHands/network/members">
+  <img
+    src="https://img.shields.io/github/forks/All-Hands-AI/OpenHands?style=for-the-badge"
+    alt="Forks"
+  />
+</a>
+<a href="https://github.com/All-Hands-AI/OpenHands/stargazers">
+  <img
+    src="https://img.shields.io/github/stars/All-Hands-AI/OpenHands?style=for-the-badge"
+    alt="Stargazers"
+  />
+</a>
+<a href="https://github.com/All-Hands-AI/OpenHands/issues">
+  <img
+    src="https://img.shields.io/github/issues/All-Hands-AI/OpenHands?style=for-the-badge"
+    alt="Issues"
+  />
+</a>
+<br></br>
+<a href="https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE">
+  <img
+    src="https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge"
+    alt="MIT License"
+  />
+</a>
+<br></br>
+<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA">
+  <img
+    src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge"
+    alt="Join our Slack community"
+  />
+</a>
+<a href="https://discord.gg/ESHStjSjD4">
+  <img
+    src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge"
+    alt="Join our Discord community"
+  />
+</a>
+:::
+
+## 🛠️ Getting Started
+[Check out the getting started guide on GitHub](https://github.com/All-Hands-AI/OpenHands?tab=readme-ov-file#-getting-started)
+
+[contributors-shield]: https://img.shields.io/github/contributors/All-Hands-AI/OpenHands?style=for-the-badge
+[contributors-url]: https://github.com/All-Hands-AI/OpenHands/graphs/contributors
+[forks-shield]: https://img.shields.io/github/forks/All-Hands-AI/OpenHands?style=for-the-badge
+[forks-url]: https://github.com/All-Hands-AI/OpenHands/network/members
+[stars-shield]: https://img.shields.io/github/stars/All-Hands-AI/OpenHands?style=for-the-badge
+[stars-url]: https://github.com/All-Hands-AI/OpenHands/stargazers
+[issues-shield]: https://img.shields.io/github/issues/All-Hands-AI/OpenHands?style=for-the-badge
+[issues-url]: https://github.com/All-Hands-AI/OpenHands/issues
+[license-shield]: https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge
+[license-url]: https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE
@@ -1,46 +1,55 @@
-# Azure
+# Azure OpenAI LLM

-OpenHands uses LiteLLM to make calls to Azure's chat models. You can find their documentation on using Azure as a provider [here](https://docs.litellm.ai/docs/providers/azure).
+## Completion

-## Azure OpenAI Configuration
+OpenHands uses LiteLLM for completion calls. You can find their documentation on Azure [here](https://docs.litellm.ai/docs/providers/azure).

-When running OpenHands, you'll need to set the following environment variable using `-e` in the
-[docker run command](/modules/usage/getting-started#installation):
+### Azure openai configs
+
+When running the OpenHands Docker image, you'll need to set the following environment variables using `-e`:

 ```
-LLM_API_VERSION="<api-version>"              # e.g. "2023-05-15"
+LLM_BASE_URL="<azure-api-base-url>"          # e.g. "https://openai-gpt-4-test-v-1.openai.azure.com/"
+LLM_API_KEY="<azure-api-key>"
+LLM_MODEL="azure/<your-gpt-deployment-name>"
+LLM_API_VERSION="<api-version>"          # e.g. "2024-02-15-preview"
 ```

 Example:
 ```bash
-docker run -it --pull=always \
-    -e LLM_API_VERSION="2023-05-15"
-    ...
+docker run -it \
+--pull=always \
+-e SANDBOX_USER_ID=$(id -u) \
+-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
+-e LLM_BASE_URL="x.openai.azure.com" \
+-e LLM_API_VERSION="2024-02-15-preview" \
+-v $WORKSPACE_BASE:/opt/workspace_base \
+-v /var/run/docker.sock:/var/run/docker.sock \
+-p 3000:3000 \
+--add-host host.docker.internal:host-gateway \
+--name openhands-app-$(date +%Y%m%d%H%M%S) \
+ghcr.io/all-hands-ai/openhands:main
 ```

-Then set the following in the OpenHands UI through the Settings:
+You can set the LLM_MODEL and LLM_API_KEY in the OpenHands UI itself.

 :::note
-You will need your ChatGPT deployment name which can be found on the deployments page in Azure. This is referenced as
-&lt;deployment-name&gt; below.
+You can find your ChatGPT deployment name on the deployments page in Azure. By default it may be the same as the chat model name (e.g. 'GPT4-1106-preview'), but it doesn't have to be. Run OpenHands, and when you load it in the browser, go to Settings and set the model as above: "azure/&lt;your-actual-gpt-deployment-name&gt;". If it's not in the list, enter your own text and save it.
 :::

-* Enable `Advanced Options`
-* `Custom Model` to azure/&lt;deployment-name&gt;
-* `Base URL` to your Azure API Base URL (e.g. `https://example-endpoint.openai.azure.com`)
-* `API Key` to your Azure API key
-
 ## Embeddings

-OpenHands uses llama-index for embeddings. You can find their documentation on Azure [here](https://docs.llamaindex.ai/en/stable/api_reference/embeddings/azure_openai/).
+OpenHands uses llama-index for embeddings. You can find their documentation on Azure [here](https://docs.llamaindex.ai/en/stable/api_reference/embeddings/azure_openai/).

-### Azure OpenAI Configuration
+### Azure openai configs

-When running OpenHands, set the following environment variables using `-e` in the
-[docker run command](/modules/usage/getting-started#installation):
+The model used for Azure OpenAI embeddings is "text-embedding-ada-002".
+You need the correct deployment name for this model in your Azure account.
+
+When running OpenHands in Docker, set the following environment variables using `-e`:

 ```
 LLM_EMBEDDING_MODEL="azureopenai"
-LLM_EMBEDDING_DEPLOYMENT_NAME="<your-embedding-deployment-name>"   # e.g. "TextEmbedding...<etc>"
-LLM_API_VERSION="<api-version>"                                    # e.g. "2024-02-15-preview"
+LLM_EMBEDDING_DEPLOYMENT_NAME="<your-embedding-deployment-name>"        # e.g. "TextEmbedding...<etc>"
+LLM_API_VERSION="<api-version>"         # e.g. "2024-02-15-preview"
 ```
@@ -1,30 +1,28 @@
-# Google Gemini/Vertex
+# Google Gemini/Vertex LLM

-OpenHands uses LiteLLM to make calls to Google's chat models. You can find their documentation on using Google as a provider:
+## Completion
+
+OpenHands uses LiteLLM for completion calls. The following resources are relevant for using OpenHands with Google's LLMs:

 - [Gemini - Google AI Studio](https://docs.litellm.ai/docs/providers/gemini)
 - [VertexAI - Google Cloud Platform](https://docs.litellm.ai/docs/providers/vertex)

-## Gemini - Google AI Studio Configs
+### Gemini - Google AI Studio Configs

-When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
-* `LLM Provider` to `Gemini`
-* `LLM Model` to the model you will be using.
-If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. gemini/&lt;model-name&gt; like `gemini/gemini-1.5-pro`).
-* `API Key` to your Gemini API key
+To use Gemini through Google AI Studio when running the OpenHands Docker image, you'll need to set the following environment variables using `-e`:

-## VertexAI - Google Cloud Platform Configs
+```
+GEMINI_API_KEY="<your-google-api-key>"
+LLM_MODEL="gemini/gemini-1.5-pro"
+```

-To use Vertex AI through Google Cloud Platform when running OpenHands, you'll need to set the following environment
-variables using `-e` in the [docker run command](/modules/usage/getting-started#installation):
+### Vertex AI - Google Cloud Platform Configs
+
+To use Vertex AI through Google Cloud Platform when running the OpenHands Docker image, you'll need to set the following environment variables using `-e`:

 ```
 GOOGLE_APPLICATION_CREDENTIALS="<json-dump-of-gcp-service-account-json>"
 VERTEXAI_PROJECT="<your-gcp-project-id>"
 VERTEXAI_LOCATION="<your-gcp-location>"
+LLM_MODEL="vertex_ai/<desired-llm-model>"
 ```
-
-Then set the following in the OpenHands UI through the Settings:
-* `LLM Provider` to `VertexAI`
-* `LLM Model` to the model you will be using.
-If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. vertex_ai/&lt;model-name&gt;).
@@ -1,23 +0,0 @@
-# Groq
-
-OpenHands uses LiteLLM to make calls to chat models on Groq. You can find their documentation on using Groq as a provider [here](https://docs.litellm.ai/docs/providers/groq).
-
-## Configuration
-
-When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
-* `LLM Provider` to `Groq`
-* `LLM Model` to the model you will be using. [Visit here to see the list of
-models that Groq hosts](https://console.groq.com/docs/models). If the model is not in the list, toggle
-`Advanced Options`, and enter it in `Custom Model` (e.g. groq/&lt;model-name&gt; like `groq/llama3-70b-8192`)
-* `API key` to your Groq API key. To find or create your Groq API Key, [see here](https://console.groq.com/keys)
-
-
-
-## Using Groq as an OpenAI-Compatible Endpoint
-
-The Groq endpoint for chat completion is [mostly OpenAI-compatible](https://console.groq.com/docs/openai). Therefore, you can access Groq models as you
-would access any OpenAI-compatible endpoint. You can set the following in the OpenHands UI through the Settings:
-* Enable `Advanced Options`
-* `Custom Model` to the prefix `openai/` + the model you will be using (e.g. `openai/llama3-70b-8192`)
-* `Base URL` to `https://api.groq.com/openai/v1`
-* `API Key` to your Groq API key
@@ -1,67 +1,46 @@
 ---
-sidebar_position: 3
+sidebar_position: 2
 ---

 # 🤖 LLM Backends

-OpenHands can connect to any LLM supported by LiteLLM. However, it requires a powerful model to work.
-The following are verified by the community to work with OpenHands:
+OpenHands can connect to many LLMs. However, the recommended models to use are GPT-4 and Claude 3.5.

-* claude-3-5-sonnet
-* gemini-1.5-pro / gemini-1.5-flash
-* gpt-4 / gpt-4o
-* llama-3.1-405b / hermes-3-llama-3.1-405b
-* wizardlm-2-8x22b
-
-:::warning
-OpenHands will issue many prompts to the LLM you configure. Most of these LLMs cost money, so be sure to set spending
-limits and monitor usage.
-:::
-
-If you have successfully run OpenHands with specific LLMs not in the list, please add them to the verified list. We
-also encourage you to open a PR to share your setup process to help others using the same provider and LLM!
-
-For a full list of the providers and models available, please consult the
+Current local and open source models are not nearly as powerful. When using an alternative model, you may see long
+wait times between messages, poor responses, or errors about malformed JSON. OpenHands can only be as powerful as the
+models driving it.
+For a full list of the LLM providers and models available, please consult the
 [litellm documentation](https://docs.litellm.ai/docs/providers).

-:::note
-Most current local and open source models are not as powerful. When using such models, you may see long
-wait times between messages, poor responses, or errors about malformed JSON. OpenHands can only be as powerful as the
-models driving it. However, if you do find ones that work, please add them to the verified list above.
+:::warning
+OpenHands will issue many prompts to the LLM you configure. Most of these LLMs cost money--be sure to set spending limits and monitor usage.
 :::

-## LLM Configuration
+The `LLM_MODEL` environment variable controls which model is used in programmatic interactions.
+But when using the OpenHands UI, you'll need to choose your model in the settings window.

-The following can be set in the OpenHands UI through the Settings:
-* `LLM Provider`
-* `LLM Model`
-* `API Key`
-* `Base URL` (through `Advanced Settings`)
+The following environment variables might be necessary for some LLMs/providers:

-There are some settings that may be necessary for some LLMs/providers that cannot be set through the UI. Instead, these
-can be set through environment variables passed to the [docker run command](/modules/usage/getting-started#installation)
-using `-e`:
-
-* `LLM_API_VERSION`
-* `LLM_EMBEDDING_MODEL`
-* `LLM_EMBEDDING_DEPLOYMENT_NAME`
-* `LLM_DROP_PARAMS`
-* `LLM_DISABLE_VISION`
-* `LLM_CACHING_PROMPT`
+- `LLM_API_KEY`
+- `LLM_BASE_URL`
+- `LLM_EMBEDDING_MODEL`
+- `LLM_EMBEDDING_DEPLOYMENT_NAME`
+- `LLM_API_VERSION`
+- `LLM_DROP_PARAMS`

 We have a few guides for running OpenHands with specific model providers:

-* [Azure](llms/azure-llms)
-* [Google](llms/google-llms)
-* [Groq](llms/groq)
-* [OpenAI](llms/openai-llms)
-* [OpenRouter](llms/openrouter)
+- [OpenAI](llms/openai-llms)
+- [ollama](llms/local-llms)
+- [Azure](llms/azure-llms)
+- [Google](llms/google-llms)

-### API retries and rate limits
+If you're using another provider, we encourage you to open a PR to share your setup!
+
+## API retries and rate limits

 Some LLMs have rate limits and may require retries. OpenHands will automatically retry requests if it receives a 429 error or API connection error.
 You can set the following environment variables to control the number of retries and the time between retries:
-
-* `LLM_NUM_RETRIES` (Default of 8)
-* `LLM_RETRY_MIN_WAIT` (Default of 15 seconds)
-* `LLM_RETRY_MAX_WAIT` (Default of 120 seconds)
+* `LLM_NUM_RETRIES` (Default of 5)
+* `LLM_RETRY_MIN_WAIT` (Default of 3 seconds)
+* `LLM_RETRY_MAX_WAIT` (Default of 60 seconds)
@@ -1,11 +1,7 @@
 # Local LLM with Ollama

-:::warning
-When using a Local LLM, OpenHands may have limited functionality.
-:::
-
 Ensure that you have the Ollama server up and running.
-For detailed startup instructions, refer to [here](https://github.com/ollama/ollama).
+For detailed startup instructions, refer to [here](https://github.com/ollama/ollama).

 This guide assumes you've started ollama with `ollama serve`. If you're running ollama differently (e.g. inside docker), the instructions might need to be modified. Please note that if you're running WSL the default ollama configuration blocks requests from docker containers. See [here](#configuring-ollama-service-wsl-en).

@@ -28,14 +24,17 @@ mistral:7b-instruct-v0.2-q4_K_M eb14864c7427    4.4 GB  2 weeks ago
 starcoder2:latest               f67ae0f64584    1.7 GB  19 hours ago
 ```

-## Run OpenHands with Docker
+## Start OpenHands

-### Start OpenHands
-Use the instructions [here](../getting-started) to start OpenHands using Docker.
+### Docker
+
+Use the instructions [here](../intro) to start OpenHands using Docker.
 But when running `docker run`, you'll need to add a few more arguments:

 ```bash
 --add-host host.docker.internal:host-gateway \
+-e LLM_API_KEY="ollama" \
+-e LLM_BASE_URL="http://host.docker.internal:11434" \
 -e LLM_OLLAMA_BASE_URL="http://host.docker.internal:11434" \
 ```

@@ -52,6 +51,8 @@ docker run \
    --pull=always \
    --add-host host.docker.internal:host-gateway \
    -e SANDBOX_USER_ID=$(id -u) \
+    -e LLM_API_KEY="ollama" \
+    -e LLM_BASE_URL="http://host.docker.internal:11434" \
    -e LLM_OLLAMA_BASE_URL="http://host.docker.internal:11434" \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -v $WORKSPACE_BASE:/opt/workspace_base \
@@ -62,16 +63,6 @@ docker run \

 You should now be able to connect to `http://localhost:3000/`

-### Configure the Web Application
-
-When running `openhands`, you'll need to set the following in the OpenHands UI through the Settings:
- the model to "ollama/&lt;model-name&gt;"
- the base url to `http://host.docker.internal:11434`
- the API key is optional, you can use any string, such as `ollama`.
-
-
-## Run OpenHands in Development Mode
-
 ### Build from Source

 Use the instructions in [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) to build OpenHands.
@@ -82,22 +73,23 @@ Make sure `config.toml` is there by running `make setup-config` which will creat
 workspace_base="./workspace"

 [llm]
+model="ollama/codellama:7b"
+api_key="ollama"
 embedding_model="local"
+base_url="http://localhost:11434"
 ollama_base_url="http://localhost:11434"

 ```

-Done! Now you can start OpenHands by: `make run`. You now should be able to connect to `http://localhost:3000/`
+Replace the `model` value with the model of your choice if you need to.

-### Configure the Web Application
+Done! Now you can start OpenHands by: `make run` without Docker. You now should be able to connect to `http://localhost:3000/`
+
+## Select your Model

 In the OpenHands UI, click on the Settings wheel in the bottom-left corner.
 Then in the `Model` input, enter `ollama/codellama:7b`, or the name of the model you pulled earlier.
-If it doesn’t show up in the dropdown, enable `Advanced Settings` and type it in. Please note: you need the model name as listed by `ollama list`, with the prefix `ollama/`.
-
-In the API Key field, enter `ollama` or any value, since you don't need a particular key.
-
-In the Base URL field, enter `http://localhost:11434`.
+If it doesn’t show up in a dropdown, that’s fine, just type it in. Click Save when you’re done.

 And now you're ready to go!

@@ -204,9 +196,9 @@ base_url="http://localhost:1234/v1"
 custom_llm_provider="openai"
 ```

-Done! Now you can start OpenHands by: `make run` without Docker. You now should be able to connect to `http://localhost:3000/`
+Done! Now you can start OpenHands by: `make run` without Docker. You now should be able to connect to `http://localhost:3000/`

-# Note
+# Note:

 For WSL, run the following commands in cmd to set up the networking mode to mirrored:

@@ -1,24 +1,75 @@
 # OpenAI

-OpenHands uses LiteLLM to make calls to OpenAI's chat models. You can find their documentation on using OpenAI as a provider [here](https://docs.litellm.ai/docs/providers/openai).
+OpenHands uses [LiteLLM](https://www.litellm.ai/) to make calls to OpenAI's chat models. You can find their full documentation on OpenAI chat calls [here](https://docs.litellm.ai/docs/providers/openai).

 ## Configuration

-When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
-* `LLM Provider` to `OpenAI`
-* `LLM Model` to the model you will be using.
-[Visit here to see a full list of OpenAI models that LiteLLM supports.](https://docs.litellm.ai/docs/providers/openai#openai-chat-completion-models)
-If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. openai/&lt;model-name&gt; like `openai/gpt-4o`).
-* `API Key` to your OpenAI API key. To find or create your OpenAI Project API Key, [see here](https://platform.openai.com/api-keys).
+### Manual Configuration
+
+When running the OpenHands Docker image, you'll need to set the following environment variables:
+
+```sh
+LLM_MODEL="openai/<gpt-model-name>" # e.g. "openai/gpt-4o"
+LLM_API_KEY="<your-openai-project-api-key>"
+```
+
+To see a full list of OpenAI models that LiteLLM supports, please visit https://docs.litellm.ai/docs/providers/openai#openai-chat-completion-models.
+
+To find or create your OpenAI Project API Key, please visit https://platform.openai.com/api-keys.
+
+**Example**:
+
+```sh
+export WORKSPACE_BASE=$(pwd)/workspace
+
+docker run -it \
+    --pull=always \
+    -e SANDBOX_USER_ID=$(id -u) \
+    -e LLM_MODEL="openai/<gpt-model-name>" \
+    -e LLM_API_KEY="<your-openai-project-api-key>" \
+    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
+    -v $WORKSPACE_BASE:/opt/workspace_base \
+    -v /var/run/docker.sock:/var/run/docker.sock \
+    -p 3000:3000 \
+    --add-host host.docker.internal:host-gateway \
+    --name openhands-app-$(date +%Y%m%d%H%M%S) \
+    ghcr.io/opendevin/opendevin:0.8
+```
+
+### UI Configuration
+
+You can also directly set the `LLM_MODEL` and `LLM_API_KEY` in the OpenHands client itself. Follow this guide to get up and running with the OpenHands client.
+
+From there, you can set your model and API key in the settings window.

 ## Using OpenAI-Compatible Endpoints

 Just as for OpenAI Chat completions, we use LiteLLM for OpenAI-compatible endpoints. You can find their full documentation on this topic [here](https://docs.litellm.ai/docs/providers/openai_compatible).

-## Using an OpenAI Proxy
+When running the OpenHands Docker image, you'll need to set the following environment variables:

-If you're using an OpenAI proxy, you'll need to set the following in the OpenHands UI through the Settings:
-* Enable `Advanced Options`
-* `Custom Model` to openai/&lt;model-name&gt; (e.g. `openai/gpt-4o` or openai/&lt;proxy-prefix&gt;/&lt;model-name&gt;)
-* `Base URL` to the URL of your OpenAI proxy
-* `API Key` to your OpenAI API key
+```sh
+LLM_BASE_URL="<api-base-url>" # e.g. "http://0.0.0.0:3000"
+LLM_MODEL="openai/<model-name>" # e.g. "openai/mistral"
+LLM_API_KEY="<your-api-key>"
+```
+
+**Example**:
+
+```sh
+export WORKSPACE_BASE=$(pwd)/workspace
+
+docker run -it \
+    --pull=always \
+    -e SANDBOX_USER_ID=$(id -u) \
+    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
+    -e LLM_BASE_URL="<api-base-url>" \
+    -e LLM_MODEL="openai/<model-name>" \
+    -e LLM_API_KEY="<your-api-key>" \
+    -v $WORKSPACE_BASE:/opt/workspace_base \
+    -v /var/run/docker.sock:/var/run/docker.sock \
+    -p 3000:3000 \
+    --add-host host.docker.internal:host-gateway \
+    --name openhands-app-$(date +%Y%m%d%H%M%S) \
+    ghcr.io/opendevin/opendevin:0.8
+```
@@ -1,12 +0,0 @@
-# OpenRouter
-
-OpenHands uses LiteLLM to make calls to chat models on OpenRouter. You can find their documentation on using OpenRouter as a provider [here](https://docs.litellm.ai/docs/providers/openrouter).
-
-## Configuration
-
-When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
-* `LLM Provider` to `OpenRouter`
-* `LLM Model` to the model you will be using.
-[Visit here to see a full list of OpenRouter models](https://openrouter.ai/models).
-If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. openrouter/&lt;model-name&gt; like `openrouter/anthropic/claude-3.5-sonnet`).
-* `API Key` to your OpenRouter API key.
@@ -9,14 +9,13 @@ We'll try to make the install process easier, but for now you can look for your
 If you find more information or a workaround for one of these issues, please open a *PR* to add details to this file.

 :::tip
-OpenHands only supports Windows via [WSL](https://learn.microsoft.com/en-us/windows/wsl/install).
-Please be sure to run all commands inside your WSL terminal.
-Check out [Notes for WSL on Windows Users](troubleshooting/windows) for some troubleshooting guides.
+If you're running on Windows and having trouble, check out our [Notes for Windows and WSL users](troubleshooting/windows).
 :::

 ## Common Issues

 * [Unable to connect to Docker](#unable-to-connect-to-docker)
+* [Unable to connect to SSH box](#unable-to-connect-to-ssh-box)
 * [404 Resource not found](#404-resource-not-found)
 * [`make build` getting stuck on package installations](#make-build-getting-stuck-on-package-installations)
 * [Sessions are not restored](#sessions-are-not-restored)
@@ -46,6 +45,58 @@ OpenHands uses a Docker container to do its work safely, without potentially bre
 * If you are on a Mac, check the [permissions requirements](https://docs.docker.com/desktop/mac/permission-requirements/) and in particular consider enabling the `Allow the default Docker socket to be used` under `Settings > Advanced` in Docker Desktop.
 * In addition, upgrade your Docker to the latest version under `Check for Updates`

+---
+### Unable to connect to SSH box
+
+[GitHub Issue](https://github.com/All-Hands-AI/OpenHands/issues/1156)
+
+**Symptoms**
+
+```python
+self.shell = DockerSSHBox(
+...
+pexpect.pxssh.ExceptionPxssh: Could not establish connection to host
+```
+
+**Details**
+
+By default, OpenHands connects to a running container using SSH. On some machines,
+especially Windows, this seems to fail.
+
+**Workarounds**
+
+* Restart your computer (sometimes it does work)
+* Be sure to have the latest versions of WSL and Docker
+* Check that your distribution in WSL is up to date as well
+* Try [this reinstallation guide](https://github.com/All-Hands-AI/OpenHands/issues/1156#issuecomment-2064549427)
+
+---
+### Unable to connect to LLM
+
+[GitHub Issue](https://github.com/All-Hands-AI/OpenHands/issues/1208)
+
+**Symptoms**
+
+```python
+  File "/app/.venv/lib/python3.12/site-packages/openai/_exceptions.py", line 81, in __init__
+    super().__init__(message, response.request, body=body)
+                              ^^^^^^^^^^^^^^^^
+AttributeError: 'NoneType' object has no attribute 'request'
+```
+
+**Details**
+
+[GitHub Issues](https://github.com/All-Hands-AI/OpenHands/issues?q=is%3Aissue+is%3Aopen+404)
+
+This usually happens with *local* LLM setups, when OpenHands can't connect to the LLM server.
+See our guide for [local LLMs](llms/local-llms) for more information.
+
+**Workarounds**
+
+* Check your `base_url` in your config.toml (if it exists) under the "llm" section
+* Check that ollama (or whatever LLM you're using) is running OK
+* Make sure you're using `--add-host host.docker.internal:host-gateway` when running in Docker
+
 ---
 ### `404 Resource not found`

@@ -87,6 +138,7 @@ the API endpoint you're trying to connect to. Most often this happens for Azure
  * If you're running inside the UI, be sure to set the `model` in the settings modal
  * If you're running headless (via main.py) be sure to set `LLM_MODEL` in your env/config
 * Make sure you've followed any special instructions for your LLM provider
+  * [ollama](/modules/usage/llms/local-llms)
  * [Azure](/modules/usage/llms/azure-llms)
  * [Google](/modules/usage/llms/google-llms)
 * Make sure your API key is correct
@@ -1,4 +1,4 @@
-# Notes for WSL on Windows Users
+# Notes for Windows and WSL Users

 OpenHands only supports Windows via [WSL](https://learn.microsoft.com/en-us/windows/wsl/install).
 Please be sure to run all commands inside your WSL terminal.
@@ -14,7 +14,7 @@
        "@docusaurus/theme-mermaid": "^3.5.2",
        "@mdx-js/react": "^3.0.0",
        "clsx": "^2.0.0",
-        "prism-react-renderer": "^2.4.0",
+        "prism-react-renderer": "^2.3.0",
        "react": "^18.3.1",
        "react-dom": "^18.3.1",
        "react-icons": "^5.3.0",
@@ -24,7 +24,7 @@
        "@docusaurus/module-type-aliases": "^3.5.1",
        "@docusaurus/tsconfig": "^3.5.2",
        "@docusaurus/types": "^3.5.1",
-        "typescript": "~5.6.2"
+        "typescript": "~5.5.4"
      },
      "engines": {
        "node": ">=18.0"
@@ -12640,9 +12640,9 @@
      }
    },
    "node_modules/prism-react-renderer": {
-      "version": "2.4.0",
-      "resolved": "https://registry.npmjs.org/prism-react-renderer/-/prism-react-renderer-2.4.0.tgz",
-      "integrity": "sha512-327BsVCD/unU4CNLZTWVHyUHKnsqcvj2qbPlQ8MiBE2eq2rgctjigPA1Gp9HLF83kZ20zNN6jgizHJeEsyFYOw==",
+      "version": "2.3.1",
+      "resolved": "https://registry.npmjs.org/prism-react-renderer/-/prism-react-renderer-2.3.1.tgz",
+      "integrity": "sha512-Rdf+HzBLR7KYjzpJ1rSoxT9ioO85nZngQEoFIhL07XhtJHlCU3SOz0GJ6+qvMyQe0Se+BV3qpe6Yd/NmQF5Juw==",
      "dependencies": {
        "@types/prismjs": "^1.26.0",
        "clsx": "^2.0.0"
@@ -14853,9 +14853,9 @@
      }
    },
    "node_modules/typescript": {
-      "version": "5.6.2",
-      "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.6.2.tgz",
-      "integrity": "sha512-NW8ByodCSNCwZeghjN3o+JX5OFH0Ojg6sadjEKY4huZ52TqbJTJnDo5+Tw98lSy63NZvi4n+ez5m2u5d4PkZyw==",
+      "version": "5.5.4",
+      "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.4.tgz",
+      "integrity": "sha512-Mtq29sKDAEYP7aljRgtPOpTvOfbwRWlS6dPRzwjdE+C0R4brX/GUyhHSecbHMFLNBLcJIPt9nl9yG5TZ1weH+Q==",
      "bin": {
        "tsc": "bin/tsc",
        "tsserver": "bin/tsserver"
@@ -21,7 +21,7 @@
    "@docusaurus/theme-mermaid": "^3.5.2",
    "@mdx-js/react": "^3.0.0",
    "clsx": "^2.0.0",
-    "prism-react-renderer": "^2.4.0",
+    "prism-react-renderer": "^2.3.0",
    "react": "^18.3.1",
    "react-dom": "^18.3.1",
    "react-icons": "^5.3.0",
@@ -31,7 +31,7 @@
    "@docusaurus/module-type-aliases": "^3.5.1",
    "@docusaurus/tsconfig": "^3.5.2",
    "@docusaurus/types": "^3.5.1",
-    "typescript": "~5.6.2"
+    "typescript": "~5.5.4"
  },
  "browserslist": {
    "production": [
@@ -1,113 +1,8 @@
 import type { SidebarsConfig } from "@docusaurus/plugin-content-docs";

 const sidebars: SidebarsConfig = {
+  docsSidebar: [{ type: "autogenerated", dirName: "usage" }],
  apiSidebar: [require("./modules/python/sidebar.json")],
-  docsSidebar: [
-    {
-      type: 'doc',
-      label: 'Getting Started',
-      id: 'usage/getting-started',
-    },
-    {
-      type: 'category',
-      label: 'LLMs',
-      items: [
-        {
-          type: 'doc',
-          label: 'Overview',
-          id: 'usage/llms/llms',
-        },
-        {
-          type: 'category',
-          label: 'Providers',
-          items: [
-            {
-              type: 'doc',
-              label: 'Azure',
-              id: 'usage/llms/azure-llms',
-            },
-            {
-              type: 'doc',
-              label: 'Google',
-              id: 'usage/llms/google-llms',
-            },
-            {
-              type: 'doc',
-              label: 'Groq',
-              id: 'usage/llms/groq',
-            },
-            {
-              type: 'doc',
-              label: 'OpenAI',
-              id: 'usage/llms/openai-llms',
-            },
-            {
-              type: 'doc',
-              label: 'OpenRouter',
-              id: 'usage/llms/openrouter',
-            },
-          ],
-        },
-      ],
-    },
-    {
-      type: 'doc',
-      label: 'Troubleshooting',
-      id: 'usage/troubleshooting/troubleshooting',
-    },
-    {
-      type: 'doc',
-      label: 'Feedback',
-      id: 'usage/feedback',
-    },
-    {
-      type: 'category',
-      label: 'How-to Guides',
-      items: [
-        {
-          type: 'doc',
-          id: 'usage/how-to/cli-mode',
-        },
-        {
-          type: 'doc',
-          id: 'usage/how-to/headless-mode',
-        },
-        {
-          type: 'doc',
-          id: 'usage/how-to/custom-sandbox-guide',
-        },
-        {
-          type: 'doc',
-          id: 'usage/how-to/evaluation-harness',
-        },
-        {
-          type: 'doc',
-          id: 'usage/how-to/openshift-example',
-        }
-      ]
-    },
-    {
-      type: 'category',
-      label: 'Architecture',
-      items: [
-        {
-          type: 'doc',
-          label: 'Backend',
-          id: 'usage/architecture/backend',
-        },
-        {
-          type: 'doc',
-          label: 'Runtime',
-          id: 'usage/architecture/runtime',
-        }
-      ],
-    },
-    {
-      type: 'doc',
-      label: 'About',
-      id: 'usage/about',
-    }
-  ],
 };

 export default sidebars;
@@ -7,6 +7,17 @@ function CustomFooter() {
  return (
    <footer className="custom-footer">
      <div className="footer-content">
+        <div className="footer-top">
+          <div className="footer-title">
+            <Translate id="footer.title">OpenHands</Translate>
+          </div>
+          <div className="footer-link">
+            <a href="/modules/usage/intro">
+              <Translate id="footer.docs">Docs</Translate>
+            </a>
+          </div>
+        </div>
+
        <div className="footer-icons">
          <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA" target="_blank" rel="noopener noreferrer">
            <FaSlack />
@@ -21,7 +32,7 @@ function CustomFooter() {
        <div className="footer-bottom">
          <p>
            <Translate id="footer.copyright" values={{ year: new Date().getFullYear() }}>
-              {'Copyright © {year} All Hands AI, Inc'}
+              {'Copyright © {year} OpenHands'}
            </Translate>
          </p>
        </div>
@@ -17,19 +17,23 @@ export function HomepageHeader() {

        <p className="header-subtitle">{siteConfig.tagline}</p>

-        <div align="center" className="header-links">
-          <a href="https://github.com/All-Hands-AI/OpenHands/graphs/contributors"><img src="https://img.shields.io/github/contributors/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Contributors" /></a>
-          <a href="https://github.com/All-Hands-AI/OpenHands/stargazers"><img src="https://img.shields.io/github/stars/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Stargazers" /></a>
-          <a href="https://codecov.io/github/All-Hands-AI/OpenHands?branch=main"><img alt="CodeCov" src="https://img.shields.io/codecov/c/github/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" /></a>
-          <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE"><img src="https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="MIT License" /></a>
-          <br/>
-          <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community" /></a>
-          <a href="https://discord.gg/ESHStjSjD4"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community" /></a>
-          <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/CREDITS.md"><img src="https://img.shields.io/badge/Project-Credits-blue?style=for-the-badge&color=FFE165&logo=github&logoColor=white" alt="Credits" /></a>
-          <br/>
-          <a href="https://docs.all-hands.dev/modules/usage/getting-started"><img src="https://img.shields.io/badge/Documentation-000?logo=googledocs&logoColor=FFE165&style=for-the-badge" alt="Check out the documentation" /></a>
-          <a href="https://arxiv.org/abs/2407.16741"><img src="https://img.shields.io/badge/Paper%20on%20Arxiv-000?logoColor=FFE165&logo=arxiv&style=for-the-badge" alt="Paper on Arxiv" /></a>
-          <a href="https://huggingface.co/spaces/OpenHands/evaluation"><img src="https://img.shields.io/badge/Benchmark%20score-000?logoColor=FFE165&logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark Score" /></a>
+        <div className="header-links">
+          <a href="https://github.com/All-Hands-AI/OpenHands">
+            <img src="https://img.shields.io/badge/Code-Github-purple?logo=github&logoColor=white&style=for-the-badge" alt="Code" />
+          </a>
+          <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA">
+            <img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community" />
+          </a>
+          <a href="https://discord.gg/ESHStjSjD4">
+            <img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community" />
+          </a>
+
+          <a href="https://arxiv.org/abs/2407.16741">
+            <img src="https://img.shields.io/badge/Paper-%20on%20Arxiv-red?logo=arxiv&style=for-the-badge" alt="Paper on Arxiv" />
+          </a>
+          <a href="https://huggingface.co/spaces/OpenDevin/evaluation">
+            <img src="https://img.shields.io/badge/Evaluation-Benchmark%20on%20HF%20Space-green?logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark" />
+          </a>
        </div>

        <Demo />
@@ -7,10 +7,8 @@
 /* You can override the default Infima variables here. */

 :root {
+  --ifm-color-primary: #4465db;
  --ifm-code-font-size: 95%;
-  --ifm-color-primary: #000;
-  --ifm-background-color: #F1EAE0;
-  --ifm-navbar-background-color: #F1EAE0;
  --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.1);
  --secondary: #171717;
  --secondary-dark: #0a0a0a;
@@ -19,15 +17,21 @@

 /* For readability concerns, you should choose a lighter palette in dark mode. */
 [data-theme="dark"] {
-  --ifm-color-primary: #FFF;
-  --ifm-background-color: #000;
-  --ifm-navbar-background-color: #000;
+  --ifm-color-primary: #4465db;
  --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.3);
  --secondary: #737373;
  --secondary-dark: #171717;
-  --secondary-light: #ccc;
+  --secondary-light: #d4d4d4;
 }

-article a, .a {
-  text-decoration: underline;
+.footer--dark {
+  background-image: linear-gradient(
+    140deg,
+    var(--secondary) 20%,
+    var(--secondary-light) 100%
+  );
 }
+
+.a {
+  text-decoration: underline;
+}
@@ -2,19 +2,13 @@

 .custom-footer {
    background-color: dark;
-    color: #000;
-    height: 100px;
+    color: white;
+    height: 200px;
    /* background: linear-gradient(to bottom, #1a1a1a, #1a1a1a); */
-    background-color: #F1EAE0;
+    background: linear-gradient(to bottom, #1f2937, #000000);

  }

-[data-theme="dark"] .custom-footer {
-  background-color: #000;
-  color: #fff;
-}
-
-
  .footer-content {
    display: flex;
    flex-direction: column;
@@ -53,6 +47,7 @@
  }

  .footer-community {
+    text-transform: uppercase;
    font-weight: 300;
  }

@@ -70,3 +65,7 @@
  .footer-icons a:hover {
    color: white;
  }
+
+  .footer-bottom {
+    text-transform: uppercase;
+  }
@@ -2,6 +2,8 @@

 .homepage-header {
  height: 800px;
+  color: white;
+  background: linear-gradient(to top, #64748b, #000000);
 }

 .header-content {
@@ -20,7 +20,8 @@ export default function Home(): JSX.Element {
      title={`${siteConfig.title}`}
      description={translate({
        id: 'homepage.description',
-        message: 'Code Less, Make More',
+        message: 'An Open Platform for AI Software Developers as Generalist Agents',
+        description: 'The homepage description',
      })}
    >
    <HomepageHeader />
@@ -9,12 +9,12 @@ To better organize the evaluation folder, we should follow the rules below:
 - Each subfolder contains a specific benchmark or experiment. For example, `evaluation/swe_bench` should contain
 all the preprocessing/evaluation/analysis scripts.
 - Raw data and experimental records should not be stored within this repo.
- For model outputs, they should be stored at [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization.
+- Model outputs should be stored at [this huggingface space](https://huggingface.co/spaces/OpenDevin/evaluation) for visualization.
 - Important data files of manageable size and analysis scripts (e.g., jupyter notebooks) can be directly uploaded to this repo.

 ## Supported Benchmarks

-To learn more about how to integrate your benchmark into OpenHands, check out [tutorial here](https://docs.all-hands.dev/modules/usage/how-to/evaluation-harness).
+To learn more about how to integrate your benchmark into OpenHands, check out the [tutorial here](https://docs.all-hands.dev/modules/usage/evaluation_harness).

 ### Software Engineering

@@ -69,8 +69,8 @@ temperature = 0.0

 ### Result Visualization

-Check [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization of existing experimental results.
+Check [this huggingface space](https://huggingface.co/spaces/OpenDevin/evaluation) for visualization of existing experimental results.

 ### Upload your results

-You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results to our hosted huggingface repo via PR following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
+You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results to our hosted huggingface repo following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
@@ -256,7 +256,6 @@ def process_instance(
 if __name__ == '__main__':
    args = parse_arguments()
    dataset = load_dataset('RajMaheshwari/Exercism-Python')
-    dataset = dataset.shuffle(seed=42)
    aider_bench_tests = dataset['train'].to_pandas()

    llm_config = None
@@ -49,4 +49,4 @@ fact8
 		facts.Smart(Dave, True)
 		facts.Kind(Dave, True)
 	assert
-		facts.Quiet(Dave, True)
+		facts.Quiet(Dave, True)
@@ -52,7 +52,7 @@ def get_config(
            base_container_image='xingyaoww/od-eval-logic-reasoning:v1.0',
            enable_auto_lint=True,
            use_host_network=False,
-            runtime_extra_deps='$OH_INTERPRETER_PATH -m pip install scitools-pyke',
+            runtime_extra_deps='$OD_INTERPRETER_PATH -m pip install scitools-pyke',
        ),
        # do not mount workspace
        workspace_base=None,
@@ -26,7 +26,7 @@ poetry run python evaluation/miniwob/get_success_rate.py evaluation/evaluation_o

 ## Submit your evaluation results

-You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
+You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).


 ## BrowsingAgent V1.0 result
@@ -105,7 +105,7 @@ def get_config(
            base_container_image='xingyaoww/od-eval-mint:v1.0',
            enable_auto_lint=True,
            use_host_network=False,
-            runtime_extra_deps=f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}',
+            runtime_extra_deps=f'$OD_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}',
        ),
        # do not mount workspace
        workspace_base=None,
@@ -14,9 +14,9 @@ To run the tests for OpenHands project, you can use the provided test runner scr
 3. Navigate to the root directory of the project.
 4. Run the test suite using the test runner script with the required arguments:
   ```
-   python evaluation/regression/run_tests.py --OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxx --model=gpt-4o
+   python evaluation/regression/run_tests.py --OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxx --model=gpt-3.5-turbo
   ```
-   Replace `sk-xxxxxxxxxxxxxxxxxxxxxx` with your actual OpenAI API key. The default model is `gpt-4o`, but you can specify a different model if needed.
+   Replace `sk-xxxxxxxxxxxxxxxxxxxxxx` with your actual OpenAI API key. The default model is `gpt-3.5-turbo`, but you can specify a different model if needed.

 The test runner will discover and execute all the test cases in the `cases/` directory, and display the results of the test suite, including the status of each individual test case and the overall summary.

@@ -29,27 +29,21 @@ if __name__ == '__main__':

    if command == 'reverse':
        from commands.reverse import reverse_string
-
        print(reverse_string(input_string))
    elif command == 'uppercase':
        from commands.uppercase import to_uppercase
-
        print(to_uppercase(input_string))
    elif command == 'lowercase':
        from commands.lowercase import to_lowercase
-
        print(to_lowercase(input_string))
    elif command == 'spongebob':
        from commands.spongebob import spongebob_case
-
        print(spongebob_case(input_string))
    elif command == 'length':
        from commands.length import string_length
-
        print(string_length(input_string))
    elif command == 'scramble':
        from commands.scramble import scramble_string
-
        print(scramble_string(input_string))
    else:
        print('Invalid command!')
@@ -10,27 +10,21 @@ if __name__ == '__main__':

    if command == 'reverse':
        from commands.reverse import reverse_string
-
        print(reverse_string(input_string))
    elif command == 'uppercase':
        from commands.uppercase import to_uppercase
-
        print(to_uppercase(input_string))
    elif command == 'lowercase':
        from commands.lowercase import to_lowercase
-
        print(to_lowercase(input_string))
    elif command == 'spongebob':
        from commands.spongebob import spongebob_case
-
        print(spongebob_case(input_string))
    elif command == 'length':
        from commands.length import string_length
-
        print(string_length(input_string))
    elif command == 'scramble':
        from commands.scramble import scramble_string
-
        print(scramble_string(input_string))
    else:
        print('Invalid command!')
@@ -19,16 +19,27 @@ Please follow instruction [here](../README.md#setup) to setup your local develop
 OpenHands now supports using the [official evaluation docker](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md) for both **[inference](#run-inference-on-swe-bench-instances) and [evaluation](#evaluate-generated-patches)**.
 This is now the default behavior.

+### Download Docker Images
+
+**(Recommended for reproducibility)** If you have extra local space (e.g., 100GB), you can pull the [instance-level docker images](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level) we've prepared by running:
+
+```bash
+evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh instance
+```
+
+If you want to save some disk space (e.g., with only ~50GB free) while still speeding up the image pre-build process, you can pull the environment-level docker images instead:
+
+```bash
+evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh env
+```

 ## Run Inference on SWE-Bench Instances

-Make sure your Docker daemon is running, and you have ample disk space (at least 200-500GB, depends on the SWE-Bench set you are running on) for the [instance-level docker image](#openhands-swe-bench-instance-level-docker-support).
-
-When the `run_infer.sh` script is started, it will automatically pull the relevant SWE-Bench images. For example, for instance ID `django_django-11011`, it will try to pull our pre-build docker image `sweb.eval.x86_64.django_s_django-11011` from DockerHub. This image will be used create an OpenHands runtime image where the agent will operate on.
+Make sure your Docker daemon is running, and you have pulled the [instance-level docker image](#openhands-swe-bench-instance-level-docker-support).

 ```bash
-./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
-# e.g., ./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 300 30 1 princeton-nlp/SWE-bench_Lite test
+./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers]
+# e.g., ./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 300
 ```

 where `model_config` is mandatory, and the rest are optional.
@@ -46,8 +57,6 @@ in order to use `eval_limit`, you must also set `agent`.
 default, it is set to 30.
 - `num_workers`, e.g. `3`, is the number of parallel workers to run the evaluation. By
 default, it is set to 1.
- `dataset`, a huggingface dataset name. e.g. `princeton-nlp/SWE-bench` or `princeton-nlp/SWE-bench_Lite`, specifies which dataset to evaluate on.
- `dataset_split`, split for the huggingface dataset. e.g., `test`, `dev`. Default to `test`.

 There are also two optional environment variables you can set.
 ```
@@ -63,22 +72,11 @@ then your command would be:
 ./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10
 ```

-### Run Inference on `RemoteRuntime` (experimental)
-
-This is in limited beta. Contact Xingyao over slack if you want to try this out!
-
+**Evaluate on `RemoteRuntime` (alpha)** (contact Xingyao over slack if you want to try this out!)
 ```bash
-# ./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
-ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" \
-./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test
-# This example runs evaluation on CodeActAgent for 300 instances on "princeton-nlp/SWE-bench_Lite"'s test set, with max 30 iteration per instances, with 16 number of workers running in parallel
-```
-
-To clean-up all existing runtime you've already started, run:
-
-```bash
-ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
+SANDBOX_API_KEY="CONTACT-XINGYAO-TO-GET-A-TESTING-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300
 ```
+Multi-processing is still WIP.

 ### Specify a subset of tasks to run infer

@@ -97,35 +95,13 @@ After running the inference, you will obtain a `output.jsonl` (by default it wil

 ## Evaluate Generated Patches

-### Download Docker Images
-
-**(Recommended for reproducibility)** If you have extra local space (e.g., 200GB), you can try pull the [instance-level docker images](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level) we've prepared by running:
-
-```bash
-evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh instance
-```
-
-If you want to save disk space a bit (e.g., with ~50GB free disk space), while speeding up the image pre-build process, you can pull the environment-level docker images:
-
-```bash
-evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh env
-```
-
-If you want to evaluate on the full SWE-Bench test set:
-
-```bash
-evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh instance full
-```
-
-### Run evaluation
-
 With `output.jsonl` file, you can run `eval_infer.sh` to evaluate generated patches, and produce a fine-grained report.

 **This evaluation is performed using the official dockerized evaluation announced [here](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md).**

 > If you want to evaluate existing results, you should first run this to clone existing outputs
 >```bash
->git clone https://huggingface.co/spaces/OpenHands/evaluation evaluation/evaluation_outputs
+>git clone https://huggingface.co/spaces/OpenDevin/evaluation evaluation/evaluation_outputs
 >```

 NOTE, you should have already pulled the instance-level OR env-level docker images following [this section](#openhands-swe-bench-instance-level-docker-support).
@@ -157,29 +133,12 @@ The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_be
 - `report.json`: a JSON file that contains keys like `"resolved_ids"` pointing to instance IDs that are resolved by the agent.
 - `logs/`: a directory of test logs

-### Run evaluation with `RemoteRuntime` (experimental)
-
-This is in limited beta. Contact Xingyao over slack if you want to try this out!
-
-```bash
-# ./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers]
-ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/outputs/swe_bench_lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
-# This example evaluate patches generated by CodeActAgent on Llama-3.1-70B-Instruct-Turbo on "princeton-nlp/SWE-bench_Lite"'s test set, with 16 number of workers running in parallel
-```
-
-To clean-up all existing runtimes that you've already started, run:
-
-```bash
-ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
-```
-
-
 ## Visualize Results

-First you need to clone `https://huggingface.co/spaces/OpenHands/evaluation` and add your own running results from openhands into the `outputs` of the cloned repo.
+First you need to clone `https://huggingface.co/spaces/OpenDevin/evaluation` and add your own running results from openhands into the `outputs` of the cloned repo.

 ```bash
-git clone https://huggingface.co/spaces/OpenHands/evaluation
+git clone https://huggingface.co/spaces/OpenDevin/evaluation
 ```

 **(optional) setup streamlit environment with conda**:
@@ -196,11 +155,11 @@ Then, in a separate Python environment with `streamlit` library, you can run the
 ```bash
 # Make sure you are inside the cloned `evaluation` repo
 conda activate streamlit # if you follow the optional conda env setup above
-streamlit app.py --server.port 8501 --server.address 0.0.0.0
+streamlit run 0_📊_OpenHands_Benchmark.py --server.port 8501 --server.address 0.0.0.0
 ```

 Then you can access the SWE-Bench trajectory visualizer at `localhost:8501`.

 ## Submit your evaluation results

-You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
+You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
@@ -1,376 +0,0 @@
-import os
-import tempfile
-import time
-
-import pandas as pd
-from swebench.harness.grading import get_eval_report
-from swebench.harness.run_evaluation import (
-    APPLY_PATCH_FAIL,
-    APPLY_PATCH_PASS,
-)
-from swebench.harness.test_spec import SWEbenchInstance, TestSpec, make_test_spec
-from swebench.harness.utils import load_swebench_dataset
-
-from evaluation.swe_bench.run_infer import get_instance_docker_image
-from evaluation.utils.shared import (
-    EvalMetadata,
-    EvalOutput,
-    prepare_dataset,
-    reset_logger_for_multiprocessing,
-    run_evaluation,
-)
-from openhands.core.config import (
-    AppConfig,
-    SandboxConfig,
-    get_parser,
-)
-from openhands.core.logger import openhands_logger as logger
-from openhands.core.main import create_runtime
-from openhands.events.action import CmdRunAction
-from openhands.events.observation import CmdOutputObservation
-
-# TODO: migrate all swe-bench docker to ghcr.io/openhands
-DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/')
-logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')
-
-
-def process_git_patch(patch):
-    if not isinstance(patch, str):
-        return ''
-
-    if not patch.strip():
-        # skip empty patches
-        return ''
-
-    patch = patch.replace('\r\n', '\n')
-    # There might be some weird characters at the beginning of the patch
-    # due to some OpenHands inference command outputs
-
-    # FOR EXAMPLE:
-    # git diff --no-color --cached 895f28f9cbed817c00ab68770433170d83132d90
-    # [A[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[K0
-    # diff --git a/django/db/models/sql/.backup.query.py b/django/db/models/sql/.backup.query.py
-    # new file mode 100644
-    # index 0000000000..fc13db5948
-
-    # We "find" the first line that starts with "diff" and then we remove lines before it
-    lines = patch.split('\n')
-    for i, line in enumerate(lines):
-        if line.startswith('diff --git'):
-            patch = '\n'.join(lines[i:])
-            break
-
-    patch = patch.rstrip() + '\n'  # Make sure the last line ends with a newline
-    return patch
-
-
-def get_config(instance: pd.Series) -> AppConfig:
-    # We use a different instance image for the each instance of swe-bench eval
-    base_container_image = get_instance_docker_image(instance['instance_id'])
-    logger.info(
-        f'Using instance container image: {base_container_image}. '
-        f'Please make sure this image exists. '
-        f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
-    )
-    config = AppConfig(
-        run_as_openhands=False,
-        runtime=os.environ.get('RUNTIME', 'eventstream'),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            use_host_network=False,
-            # large enough timeout, since some testcases take very long to run
-            timeout=1800,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-        ),
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
-    )
-    return config
-
-
-def process_instance(
-    instance: pd.Series,
-    metadata: EvalMetadata | None = None,
-    reset_logger: bool = True,
-) -> EvalOutput:
-    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
-    if reset_logger:
-        global output_file
-        log_dir = output_file.replace('.jsonl', '.logs')
-        os.makedirs(log_dir, exist_ok=True)
-        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
-    else:
-        logger.info(f'Starting evaluation for instance {instance.instance_id}.')
-
-    config = get_config(instance)
-    instance_id = instance.instance_id
-    model_patch = instance['model_patch']
-    test_spec: TestSpec = instance['test_spec']
-    logger.info(f'Starting evaluation for instance {instance_id}.')
-
-    if 'test_result' not in instance.keys():
-        instance['test_result'] = {}
-    instance['test_result']['report'] = {
-        'empty_generation': False,
-        'resolved': False,
-        'failed_apply_patch': False,
-        'error_eval': False,
-        'test_timeout': False,
-    }
-
-    if model_patch == '':
-        instance['test_result']['report']['empty_generation'] = True
-        return EvalOutput(
-            instance_id=instance_id,
-            test_result=instance['test_result'],
-        )
-
-    runtime = create_runtime(config, sid=instance_id)
-
-    # Get patch and save it to /tmp/patch.diff
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Patch file
-        patch_file_path = os.path.join(temp_dir, 'patch.diff')
-        with open(patch_file_path, 'w') as f:
-            f.write(model_patch)
-        runtime.copy_to(patch_file_path, '/tmp')
-        # Eval script
-        eval_script_path = os.path.join(temp_dir, 'eval.sh')
-        with open(eval_script_path, 'w') as f:
-            f.write(test_spec.eval_script)
-        runtime.copy_to(eval_script_path, '/tmp')
-
-    # Set +x
-    action = CmdRunAction(command='chmod +x /tmp/eval.sh')
-    action.timeout = 600
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert obs.exit_code == 0
-
-    # Apply patch
-    exec_command = (
-        'cd /testbed && '
-        "(git apply -v /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
-        "(echo 'Failed to apply patch with git apply, trying with patch command...' && "
-        "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
-        "echo 'APPLY_PATCH_FAIL')))"
-    )
-    action = CmdRunAction(command=exec_command, keep_prompt=False)
-    action.timeout = 600
-    obs = runtime.run_action(action)
-    assert isinstance(obs, CmdOutputObservation)
-    apply_patch_output = obs.content
-    assert isinstance(apply_patch_output, str)
-    instance['test_result']['apply_patch_output'] = apply_patch_output
-
-    try:
-        if 'APPLY_PATCH_FAIL' in apply_patch_output:
-            logger.info(f'[{instance_id}] {APPLY_PATCH_FAIL}:\n{apply_patch_output}')
-            instance['test_result']['report']['failed_apply_patch'] = True
-
-            return EvalOutput(
-                instance_id=instance_id,
-                test_result=instance['test_result'],
-            )
-        elif 'APPLY_PATCH_PASS' in apply_patch_output:
-            logger.info(f'[{instance_id}] {APPLY_PATCH_PASS}:\n{apply_patch_output}')
-
-            # Run eval script in background and save output to log file
-            log_file = '/tmp/eval_output.log'
-            action = CmdRunAction(
-                command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!', keep_prompt=False
-            )
-            action.timeout = 60  # Short timeout just to get the process ID
-            obs = runtime.run_action(action)
-
-            if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0:
-                pid = obs.content.split()[-1].strip()
-                logger.info(
-                    f'[{instance_id}] Evaluation process started with PID: {pid}'
-                )
-
-                # Poll for completion
-                start_time = time.time()
-                timeout = 1800  # 30 minutes
-                while True:
-                    seconds_elapsed = time.time() - start_time
-                    if seconds_elapsed > timeout:
-                        logger.info(
-                            f'[{instance_id}] Evaluation timed out after {timeout} seconds'
-                        )
-                        instance['test_result']['report']['test_timeout'] = True
-                        break
-                    check_action = CmdRunAction(
-                        command=f'ps -p {pid} > /dev/null; echo $?', keep_prompt=False
-                    )
-                    check_action.timeout = 60
-                    check_obs = runtime.run_action(check_action)
-                    if (
-                        isinstance(check_obs, CmdOutputObservation)
-                        and check_obs.content.split()[-1].strip() == '1'
-                    ):
-                        logger.info(
-                            f'[{instance_id}] Evaluation process completed after {seconds_elapsed} seconds'
-                        )
-                        break
-                    logger.info(
-                        f'[{instance_id}] [{seconds_elapsed:.0f}s] Evaluation still running, waiting...'
-                    )
-                    time.sleep(30)  # Wait for 30 seconds before checking again
-
-                # Read the log file
-                cat_action = CmdRunAction(command=f'cat {log_file}', keep_prompt=False)
-                cat_action.timeout = 300
-                cat_obs = runtime.run_action(cat_action)
-
-                # Grade answer
-                if isinstance(cat_obs, CmdOutputObservation) and cat_obs.exit_code == 0:
-                    test_output = cat_obs.content
-                    assert isinstance(test_output, str)
-                    instance['test_result']['test_output'] = test_output
-
-                    # Get report from test output
-                    logger.info(f'[{instance_id}] Grading answer...')
-                    with tempfile.TemporaryDirectory() as temp_dir:
-                        # Create a directory structure that matches the expected format
-                        # NOTE: this is a hack to make the eval report format consistent
-                        # with the original SWE-Bench eval script
-                        log_dir = os.path.join(temp_dir, 'logs', instance_id)
-                        os.makedirs(log_dir, exist_ok=True)
-                        test_output_path = os.path.join(log_dir, 'test_output.txt')
-                        with open(test_output_path, 'w') as f:
-                            f.write(test_output)
-
-                        _report = get_eval_report(
-                            test_spec=test_spec,
-                            prediction={
-                                'model_patch': model_patch,
-                                'instance_id': instance_id,
-                            },
-                            log_path=test_output_path,
-                            include_tests_status=True,
-                        )
-                        report = _report[instance_id]
-                        logger.info(
-                            f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}"
-                        )
-                        instance['test_result']['report']['resolved'] = report[
-                            'resolved'
-                        ]
-            else:
-                logger.info(f'[{instance_id}] Error when starting eval:\n{obs.content}')
-                instance['test_result']['report']['error_eval'] = True
-
-            return EvalOutput(
-                instance_id=instance_id,
-                test_result=instance['test_result'],
-            )
-        else:
-            logger.info(
-                f'[{instance_id}] Unexpected output when applying patch:\n{apply_patch_output}'
-            )
-            raise RuntimeError(
-                instance_id,
-                f'Unexpected output when applying patch:\n{apply_patch_output}',
-                logger,
-            )
-    finally:
-        runtime.close()
-
-
-if __name__ == '__main__':
-    parser = get_parser()
-    parser.add_argument(
-        '--input-file',
-        type=str,
-        help='Path to input predictions file',
-        required=True,
-    )
-    parser.add_argument(
-        '--dataset',
-        type=str,
-        default='princeton-nlp/SWE-bench',
-        help='data set to evaluate on, either full-test or lite-test',
-    )
-    parser.add_argument(
-        '--split',
-        type=str,
-        default='test',
-        help='split to evaluate on',
-    )
-    args, _ = parser.parse_known_args()
-
-    # Load SWE-Bench dataset
-    full_dataset: list[SWEbenchInstance] = load_swebench_dataset(
-        args.dataset, args.split
-    )
-    instance_id_to_instance = {
-        instance['instance_id']: instance for instance in full_dataset
-    }
-    logger.info(
-        f'Loaded dataset {args.dataset} with split {args.split} to run inference on.'
-    )
-
-    # Load predictions
-    assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.'
-    predictions = pd.read_json(args.input_file, lines=True)
-    assert (
-        'instance_id' in predictions.columns
-    ), 'Input file must contain instance_id column.'
-
-    if 'model_patch' not in predictions.columns and (
-        'test_result' in predictions.columns
-        and 'model_patch' in predictions['test_result'].iloc[0]
-    ):
-        raise ValueError(
-            'Input file must contain model_patch column OR test_result column with model_patch field.'
-        )
-    assert len(predictions['instance_id'].unique()) == len(
-        predictions
-    ), 'instance_id column must be unique.'
-
-    if 'model_patch' not in predictions.columns:
-        predictions['model_patch'] = predictions['test_result'].apply(
-            lambda x: x['git_patch']
-        )
-    assert {'instance_id', 'model_patch'}.issubset(
-        set(predictions.columns)
-    ), 'Input file must contain instance_id and model_patch columns.'
-
-    # Process model_patch
-    predictions['model_patch'] = predictions['model_patch'].apply(process_git_patch)
-
-    # Merge predictions with dataset
-    predictions['instance'] = predictions['instance_id'].apply(
-        lambda x: instance_id_to_instance[x]
-    )
-    predictions['test_spec'] = predictions['instance'].apply(make_test_spec)
-
-    # Prepare dataset
-    output_file = args.input_file.replace('.jsonl', '.swebench_eval.jsonl')
-    instances = prepare_dataset(predictions, output_file, args.eval_n_limit)
-
-    run_evaluation(
-        instances,
-        metadata=None,
-        output_file=output_file,
-        num_workers=args.eval_num_workers,
-        process_instance_func=process_instance,
-    )
-
-    # Load evaluated predictions & print number of resolved predictions
-    evaluated_predictions = pd.read_json(output_file, lines=True)
-    fields = ['resolved', 'failed_apply_patch', 'error_eval', 'empty_generation']
-
-    def count_report_field(row, field):
-        return row['test_result']['report'][field]
-
-    for field in fields:
-        count = evaluated_predictions.apply(
-            count_report_field, args=(field,), axis=1
-        ).sum()
-        logger.info(
-            f'# {field}: {count} / {len(evaluated_predictions)}. ({count / len(evaluated_predictions):.2%})'
-        )
@@ -2,6 +2,7 @@ import asyncio
 import json
 import os
 import tempfile
+import time
 from typing import Any

 import pandas as pd
@@ -24,15 +25,14 @@ from openhands.core.config import (
    AppConfig,
    SandboxConfig,
    get_llm_config_arg,
-    get_parser,
+    load_from_env,
+    parse_arguments,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import CmdRunAction
 from openhands.events.observation import CmdOutputObservation, ErrorObservation
-from openhands.events.serialization.event import event_to_dict
 from openhands.runtime.runtime import Runtime
-from openhands.runtime.utils.shutdown_listener import sleep_if_should_continue

 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
 USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
@@ -109,11 +109,6 @@ def get_config(
    if USE_INSTANCE_IMAGE:
        # We use a different instance image for the each instance of swe-bench eval
        base_container_image = get_instance_docker_image(instance['instance_id'])
-        logger.info(
-            f'Using instance container image: {base_container_image}. '
-            f'Please make sure this image exists. '
-            f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
-        )
    else:
        base_container_image = SWE_BENCH_CONTAINER_IMAGE
        logger.info(f'Using swe-bench container image: {base_container_image}')
@@ -123,19 +118,26 @@ def get_config(
        run_as_openhands=False,
        max_budget_per_task=4,
        max_iterations=metadata.max_iterations,
-        runtime=os.environ.get('RUNTIME', 'eventstream'),
        sandbox=SandboxConfig(
            base_container_image=base_container_image,
            enable_auto_lint=True,
            use_host_network=False,
            # large enough timeout, since some testcases take very long to run
            timeout=300,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
        ),
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
    )
+    selected_env_vars = {'runtime', 'sandbox_api_key'}
+    selected_env_vars = {
+        k: v for k, v in os.environ.items() if k.lower() in selected_env_vars
+    }
+    if selected_env_vars:
+        logger.info(
+            f'Loading config keys from env vars: {list(selected_env_vars.keys())}'
+        )
+        load_from_env(config, selected_env_vars)
    config.set_llm_config(metadata.llm_config)
    return config

@@ -158,14 +160,12 @@ def initialize_runtime(
    action = CmdRunAction(
        command=f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc"""
    )
-    action.timeout = 600
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0

    action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
-    action.timeout = 600
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -177,7 +177,6 @@ def initialize_runtime(

        # inject the instance info
        action = CmdRunAction(command='mkdir -p /swe_util/eval_data/instances')
-        action.timeout = 600
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -205,21 +204,18 @@ def initialize_runtime(
            '/swe_util/',
        )
        action = CmdRunAction(command='cat ~/.bashrc')
-        action.timeout = 600
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0

        action = CmdRunAction(command='source ~/.bashrc')
-        action.timeout = 600
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0

        action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
-        action.timeout = 3600
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -235,14 +231,12 @@ def initialize_runtime(
        ), f'Failed to source /swe_util/swe_entry.sh: {obs.content}'

    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
-    action.timeout = 600
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0

    action = CmdRunAction(command='git reset --hard')
-    action.timeout = 600
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -251,7 +245,6 @@ def initialize_runtime(
    action = CmdRunAction(
        command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
    )
-    action.timeout = 600
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -279,21 +272,18 @@ def complete_runtime(
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)

    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
-    action.timeout = 600
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0

    action = CmdRunAction(command='git config --global core.pager ""')
-    action.timeout = 600
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0

    action = CmdRunAction(command='git add -A')
-    action.timeout = 600
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -317,10 +307,10 @@ def complete_runtime(
                break
            else:
                logger.info('Failed to get git diff, retrying...')
-                sleep_if_should_continue(10)
+                time.sleep(10)
        elif isinstance(obs, ErrorObservation):
            logger.error(f'Error occurred: {obs.content}. Retrying...')
-            sleep_if_should_continue(10)
+            time.sleep(10)
        else:
            raise ValueError(f'Unexpected observation type: {type(obs)}')

@@ -384,7 +374,10 @@ def process_instance(
    if state is None:
        raise ValueError('State should not be None.')

-    histories = [event_to_dict(event) for event in state.history.get_events()]
+    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+    # for compatibility with the existing output format, we can remake the pairs here
+    # remove when it becomes unnecessary
+    histories = state.history.compatibility_for_eval_history_pairs()
    metrics = state.metrics.get() if state.metrics else None

    # Save the output
@@ -396,7 +389,6 @@ def process_instance(
        metadata=metadata,
        history=histories,
        metrics=metrics,
-        llm_completions=state.extra_data.get('llm_completions', []),
        error=state.last_error if state and state.last_error else None,
    )
    return output
@@ -419,26 +411,12 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:


 if __name__ == '__main__':
-    parser = get_parser()
-    parser.add_argument(
-        '--dataset',
-        type=str,
-        default='princeton-nlp/SWE-bench',
-        help='data set to evaluate on, either full-test or lite-test',
-    )
-    parser.add_argument(
-        '--split',
-        type=str,
-        default='test',
-        help='split to evaluate on',
-    )
-    args, _ = parser.parse_known_args()
+    args = parse_arguments()

    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
    # so we don't need to manage file uploading to OpenHands's repo
-    dataset = load_dataset(args.dataset, split=args.split)
-    logger.info(f'Loaded dataset {args.dataset} with split {args.split}')
-    swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
+    dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
+    swe_bench_tests = filter_dataset(dataset['test'].to_pandas(), 'instance_id')

    llm_config = None
    if args.llm_config:
@@ -467,12 +445,6 @@ if __name__ == '__main__':
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit)

-    if len(instances) > 0 and not isinstance(
-        instances['PASS_TO_PASS'][instances['PASS_TO_PASS'].index[0]], str
-    ):
-        for col in ['PASS_TO_PASS', 'FAIL_TO_PASS']:
-            instances[col] = instances[col].apply(lambda x: str(x))
-
    run_evaluation(
        instances, metadata, output_file, args.eval_num_workers, process_instance
    )
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-
-# API base URL
-BASE_URL="https://api.all-hands.dev/v0"
-
-# Get the list of runtimes
-response=$(curl --silent --location --request GET "${BASE_URL}/runtime/list" \
-  --header "X-API-Key: ${ALLHANDS_API_KEY}")
-
-n_runtimes=$(echo $response | jq -r '.total')
-echo "Found ${n_runtimes} runtimes. Stopping them..."
-
-runtime_ids=$(echo $response | jq -r '.runtimes | .[].runtime_id')
-# Loop through each runtime and stop it
-counter=1
-for runtime_id in $runtime_ids; do
-  echo "Stopping runtime ${counter}/${n_runtimes}: ${runtime_id}"
-  curl --silent --location --request POST "${BASE_URL}/runtime/stop" \
-    --header "X-API-Key: ${ALLHANDS_API_KEY}" \
-    --header "Content-Type: application/json" \
-    --data-raw "{\"runtime_id\": \"${runtime_id}\"}"
-  echo
-  ((counter++))
-done
-
-echo "All runtimes have been stopped."
@@ -6,33 +6,19 @@ LEVEL=$1
 # - base, keyword "sweb.base"
 # - env, keyword "sweb.env"
 # - instance, keyword "sweb.eval"
-SET=$2

 if [ -z "$LEVEL" ]; then
-    echo "Usage: $0 <cache_level> <set>"
+    echo "Usage: $0 <cache_level>"
    echo "cache_level: base, env, or instance"
-    echo "set: lite, full"
    exit 1
 fi

-if [ -z "$SET" ]; then
-    echo "Usage: $0 <cache_level> <set>"
-    echo "cache_level: base, env, or instance"
-    echo "set: lite, full, default is lite"
-    SET="lite"
-fi
-
 NAMESPACE=$2 # xingyaoww
 if [ -z "$NAMESPACE" ]; then
    echo "Default to namespace: xingyaoww"
    NAMESPACE="xingyaoww"
 fi
-
-if [ "$SET" == "lite" ]; then
-    IMAGE_FILE="$(dirname "$0")/all-swebench-lite-instance-images.txt"
-else
-    IMAGE_FILE="$(dirname "$0")/all-swebench-full-instance-images.txt"
-fi
+IMAGE_FILE="$(dirname "$0")/all-swebench-lite-instance-images.txt"

 # Define a pattern based on the level
 case $LEVEL in
@@ -1,79 +0,0 @@
-"""You should first perform the following steps:
-
-1. Build the docker images. Install SWE-Bench first (https://github.com/princeton-nlp/SWE-bench). Then run:
-```bash
-export DATASET_NAME=princeton-nlp/SWE-bench_Lite
-export SPLIT=test
-export MAX_WORKERS=4
-export RUN_ID=some-random-ID
-python -m swebench.harness.run_evaluation \
-    --dataset_name $DATASET_NAME \
-    --split $SPLIT \
-    --predictions_path gold \
-    --max_workers $MAX_WORKERS \
-    --run_id $RUN_ID \
-    --cache_level instance
-```
-
-2. Then run this script to push the docker images to the docker hub. Some of the docker images might fail to build in the previous step - start an issue in the SWE-Bench repo for possible fixes.
-
-To push the docker images for "princeton-nlp/SWE-bench_Lite" test set to the docker hub (e.g., under `docker.io/xingyaoww/`), run:
-```bash
-EVAL_DOCKER_IMAGE_PREFIX='docker.io/xingyaoww/' python3 evaluation/swe_bench/scripts/docker/push_docker_instance_images.py --dataset princeton-nlp/SWE-bench_Lite --split test
-```
-"""
-
-import argparse
-
-import docker
-from datasets import load_dataset
-from tqdm import tqdm
-
-from openhands.core.logger import openhands_logger as logger
-
-logger.setLevel('ERROR')
-from evaluation.swe_bench.run_infer import get_instance_docker_image  # noqa
-
-parser = argparse.ArgumentParser()
-parser.add_argument('--dataset', type=str, default='princeton-nlp/SWE-bench_Lite')
-parser.add_argument('--split', type=str, default='test')
-args = parser.parse_args()
-
-dataset = load_dataset(args.dataset, split=args.split)
-client = docker.from_env()
-
-pbar = tqdm(total=len(dataset))
-counter = {'success': 0, 'failed': 0}
-
-failed_instances = []
-for instance in dataset:
-    instance_id = instance['instance_id']
-    image_name = f'sweb.eval.x86_64.{instance_id}'
-    target_image_name = get_instance_docker_image(instance_id)
-
-    print('-' * 100)
-    # check if image exists
-    try:
-        image: docker.models.images.Image = client.images.get(image_name)
-        image.tag(target_image_name)
-        print(f'Image {image_name} -- tagging to --> {target_image_name}')
-        ret_push = client.images.push(target_image_name)
-        if isinstance(ret_push, str):
-            print(ret_push)
-        else:
-            for line in ret_push:
-                print(line)
-        print(f'Image {image_name} -- pushed to --> {target_image_name}')
-        counter['success'] += 1
-    except docker.errors.ImageNotFound:
-        print(f'ERROR: Image {image_name} does not exist')
-        counter['failed'] += 1
-        failed_instances.append(instance_id)
-    finally:
-        pbar.update(1)
-        pbar.set_postfix(counter)
-
-print(f'Success: {counter["success"]}, Failed: {counter["failed"]}')
-print('Failed instances IDs:')
-for failed_instance in failed_instances:
-    print(failed_instance)
@@ -0,0 +1,63 @@
+import argparse
+import os
+
+import pandas as pd
+
+parser = argparse.ArgumentParser()
+parser.add_argument('od_output_file', type=str)
+args = parser.parse_args()
+output_filepath = args.od_output_file.replace('.jsonl', '.swebench.jsonl')
+print(f'Converting {args.od_output_file} to {output_filepath}')
+
+od_format = pd.read_json(args.od_output_file, orient='records', lines=True)
+# model name is the folder name of od_output_file
+model_name = os.path.basename(os.path.dirname(args.od_output_file))
+
+
+def process_git_patch(patch):
+    if not isinstance(patch, str):
+        return ''
+
+    if not patch.strip():
+        # skip empty patches
+        return ''
+
+    patch = patch.replace('\r\n', '\n')
+    # There might be some weird characters at the beginning of the patch
+    # due to some OpenHands inference command outputs
+
+    # FOR EXAMPLE:
+    # git diff --no-color --cached 895f28f9cbed817c00ab68770433170d83132d90
+    # [A[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[K0
+    # diff --git a/django/db/models/sql/.backup.query.py b/django/db/models/sql/.backup.query.py
+    # new file mode 100644
+    # index 0000000000..fc13db5948
+
+    # We "find" the first line that starts with "diff" and then we remove lines before it
+    lines = patch.split('\n')
+    for i, line in enumerate(lines):
+        if line.startswith('diff --git'):
+            patch = '\n'.join(lines[i:])
+            break
+
+    patch = patch.rstrip() + '\n'  # Make sure the last line ends with a newline
+    return patch
+
+
+def convert_row_to_swebench_format(row):
+    if 'git_patch' in row:
+        model_patch = row['git_patch']
+    elif 'test_result' in row and 'git_patch' in row['test_result']:
+        model_patch = row['test_result']['git_patch']
+    else:
+        raise ValueError(f'Row {row} does not have a git_patch')
+
+    return {
+        'instance_id': row['instance_id'],
+        'model_patch': process_git_patch(model_patch),
+        'model_name_or_path': model_name,
+    }
+
+
+swebench_format = od_format.apply(convert_row_to_swebench_format, axis=1)
+swebench_format.to_json(output_filepath, lines=True, orient='records')
@@ -1,35 +0,0 @@
-import argparse
-import os
-
-import pandas as pd
-
-from evaluation.swe_bench.eval_infer import process_git_patch
-
-parser = argparse.ArgumentParser()
-parser.add_argument('oh_output_file', type=str)
-args = parser.parse_args()
-output_filepath = args.oh_output_file.replace('.jsonl', '.swebench.jsonl')
-print(f'Converting {args.oh_output_file} to {output_filepath}')
-
-oh_format = pd.read_json(args.oh_output_file, orient='records', lines=True)
-# model name is the folder name of oh_output_file
-model_name = os.path.basename(os.path.dirname(args.oh_output_file))
-
-
-def convert_row_to_swebench_format(row):
-    if 'git_patch' in row:
-        model_patch = row['git_patch']
-    elif 'test_result' in row and 'git_patch' in row['test_result']:
-        model_patch = row['test_result']['git_patch']
-    else:
-        raise ValueError(f'Row {row} does not have a git_patch')
-
-    return {
-        'instance_id': row['instance_id'],
-        'model_patch': process_git_patch(model_patch),
-        'model_name_or_path': model_name,
-    }
-
-
-swebench_format = oh_format.apply(convert_row_to_swebench_format, axis=1)
-swebench_format.to_json(output_filepath, lines=True, orient='records')
@@ -1,27 +0,0 @@
-import argparse
-
-import pandas as pd
-from datasets import load_dataset
-
-parser = argparse.ArgumentParser()
-parser.add_argument('output_filepath', type=str, help='Path to save the output file')
-parser.add_argument(
-    '--dataset_name',
-    type=str,
-    help='Name of the dataset to download',
-    default='princeton-nlp/SWE-bench_Lite',
-)
-parser.add_argument('--split', type=str, help='Split to download', default='test')
-args = parser.parse_args()
-
-dataset = load_dataset(args.dataset_name, split=args.split)
-output_filepath = args.output_filepath
-print(
-    f'Downloading gold patches from {args.dataset_name} (split: {args.split}) to {output_filepath}'
-)
-patches = [
-    {'instance_id': row['instance_id'], 'model_patch': row['patch']} for row in dataset
-]
-print(f'{len(patches)} gold patches loaded')
-pd.DataFrame(patches).to_json(output_filepath, lines=True, orient='records')
-print(f'Patches saved to {output_filepath}')
@@ -28,9 +28,9 @@ FILE_NAME=$(basename $PROCESS_FILEPATH)
 echo "Evaluating $FILE_NAME @ $FILE_DIR"

 # ================================================
-# detect whether PROCESS_FILEPATH is in OH format or in SWE-bench format
+# detect whether PROCESS_FILEPATH is in OD format or in SWE-bench format
 echo "=============================================================="
-echo "Detecting whether PROCESS_FILEPATH is in OH format or in SWE-bench format"
+echo "Detecting whether PROCESS_FILEPATH is in OD format or in SWE-bench format"
 echo "=============================================================="
 # SWE-bench format is a JSONL where every line has three fields: model_name_or_path, instance_id, and model_patch
 function is_swebench_format() {
@@ -56,9 +56,9 @@ if [ $IS_SWEBENCH_FORMAT -eq 0 ]; then
 else
    echo "The file IS NOT in SWE-bench format."

-    # ==== Convert OH format to SWE-bench format ====
+    # ==== Convert OD format to SWE-bench format ====
    echo "Merged output file with fine-grained report will be saved to $FILE_DIR"
-    poetry run python3 evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $PROCESS_FILEPATH
+    poetry run python3 evaluation/swe_bench/scripts/eval/convert_od_output_to_swe_json.py $PROCESS_FILEPATH
    # replace .jsonl with .swebench.jsonl in filename
    SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl}
    echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL"
@@ -1,43 +0,0 @@
-#!/bin/bash
-set -eo pipefail
-
-INPUT_FILE=$1
-NUM_WORKERS=$2
-DATASET=$3
-SPLIT=$4
-
-if [ -z "$INPUT_FILE" ]; then
-  echo "INPUT_FILE not specified (should be a path to a jsonl file)"
-  exit 1
-fi
-
-if [ -z "$DATASET" ]; then
-  echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
-  DATASET="princeton-nlp/SWE-bench_Lite"
-fi
-
-if [ -z "$SPLIT" ]; then
-  echo "SPLIT not specified, use default test"
-  SPLIT="test"
-fi
-
-if [ -z "$NUM_WORKERS" ]; then
-  echo "NUM_WORKERS not specified, use default 1"
-  NUM_WORKERS=1
-fi
-
-echo "... Evaluating on $INPUT_FILE ..."
-
-COMMAND="poetry run python evaluation/swe_bench/eval_infer.py \
-  --eval-num-workers $NUM_WORKERS \
-  --input-file $INPUT_FILE \
-  --dataset $DATASET \
-  --split $SPLIT"
-
-if [ -n "$EVAL_LIMIT" ]; then
-  echo "EVAL_LIMIT: $EVAL_LIMIT"
-  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
-fi
-
-# Run the command
-eval $COMMAND
@@ -9,8 +9,6 @@ AGENT=$3
 EVAL_LIMIT=$4
 MAX_ITER=$5
 NUM_WORKERS=$6
-DATASET=$7
-SPLIT=$8

 if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1
@@ -33,17 +31,6 @@ if [ -z "$USE_INSTANCE_IMAGE" ]; then
  USE_INSTANCE_IMAGE=true
 fi

-
-if [ -z "$DATASET" ]; then
-  echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
-  DATASET="princeton-nlp/SWE-bench_Lite"
-fi
-
-if [ -z "$SPLIT" ]; then
-  echo "SPLIT not specified, use default test"
-  SPLIT="test"
-fi
-
 export USE_INSTANCE_IMAGE=$USE_INSTANCE_IMAGE
 echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"

@@ -52,8 +39,6 @@ get_agent_version
 echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
-echo "DATASET: $DATASET"
-echo "SPLIT: $SPLIT"

 # Default to NOT use Hint
 if [ -z "$USE_HINT_TEXT" ]; then
@@ -66,11 +51,6 @@ if [ "$USE_HINT_TEXT" = false ]; then
  EVAL_NOTE="$EVAL_NOTE-no-hint"
 fi

-if [ -n "$EXP_NAME" ]; then
-  EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
-fi
-echo "EVAL_NOTE: $EVAL_NOTE"
-
 unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push

 COMMAND="poetry run python evaluation/swe_bench/run_infer.py \
@@ -79,9 +59,7 @@ COMMAND="poetry run python evaluation/swe_bench/run_infer.py \
  --max-iterations $MAX_ITER \
  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
-  --eval-note $EVAL_NOTE \
-  --dataset $DATASET \
-  --split $SPLIT"
+  --eval-note $EVAL_NOTE"

 if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
@@ -19,10 +19,10 @@ def extract_modified_files(patch):
    return modified_files


-def process_report(oh_output_file):
+def process_report(od_output_file):
    succ = 0
    fail = 0
-    for line in open(oh_output_file):
+    for line in open(od_output_file):
        line = json.loads(line)
        instance_id = line['instance_id']
        gold_patch = line['swe_instance']['patch']
@@ -48,7 +48,7 @@ def process_report(oh_output_file):

 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
-    parser.add_argument('--oh_output_file', help='Path to the OH output file')
+    parser.add_argument('--od_output_file', help='Path to the OD output file')
    args = parser.parse_args()

-    process_report(args.oh_output_file)
+    process_report(args.od_output_file)
@@ -6,15 +6,15 @@ mkdir -p $EVAL_WORKSPACE

 # 1. Prepare REPO
 echo "==== Prepare SWE-bench repo ===="
-OH_SWE_BENCH_REPO_PATH="https://github.com/All-Hands-AI/SWE-bench.git"
-OH_SWE_BENCH_REPO_BRANCH="eval"
-git clone -b $OH_SWE_BENCH_REPO_BRANCH $OH_SWE_BENCH_REPO_PATH $EVAL_WORKSPACE/OH-SWE-bench
+OD_SWE_BENCH_REPO_PATH="https://github.com/All-Hands-AI/OD-SWE-bench.git"
+OD_SWE_BENCH_REPO_BRANCH="eval"
+git clone -b $OD_SWE_BENCH_REPO_BRANCH $OD_SWE_BENCH_REPO_PATH $EVAL_WORKSPACE/OD-SWE-bench

 # 2. Prepare DATA
 echo "==== Prepare SWE-bench data ===="
 EVAL_IMAGE=ghcr.io/all-hands-ai/eval-swe-bench:builder_with_conda
 EVAL_WORKSPACE=$(realpath $EVAL_WORKSPACE)
-chmod +x $EVAL_WORKSPACE/OH-SWE-bench/swebench/harness/prepare_data.sh
+chmod +x $EVAL_WORKSPACE/OD-SWE-bench/swebench/harness/prepare_data.sh
 if [ -d $EVAL_WORKSPACE/eval_data ]; then
    rm -r $EVAL_WORKSPACE/eval_data
 fi
@@ -24,4 +24,4 @@ docker run \
    -u $(id -u):$(id -g) \
    -e HF_DATASETS_CACHE="/tmp" \
    --rm -it $EVAL_IMAGE \
-    bash -c "cd OH-SWE-bench/swebench/harness && /swe_util/miniforge3/bin/conda run -n swe-bench-eval ./prepare_data.sh && mv eval_data /workspace/"
+    bash -c "cd OD-SWE-bench/swebench/harness && /swe_util/miniforge3/bin/conda run -n swe-bench-eval ./prepare_data.sh && mv eval_data /workspace/"
@@ -60,7 +60,7 @@ conda activate swe-bench-eval

 mkdir -p $SWE_TASK_DIR/reset_testbed_temp
 mkdir -p $SWE_TASK_DIR/reset_testbed_log_dir
-SWE_BENCH_DIR=/swe_util/OH-SWE-bench
+SWE_BENCH_DIR=/swe_util/OD-SWE-bench
 output=$(
    export PYTHONPATH=$SWE_BENCH_DIR && \
    cd $SWE_BENCH_DIR && \
@@ -5,8 +5,8 @@ import os
 import pathlib
 import subprocess
 import time
-import traceback
-from typing import Any, Awaitable, Callable, TextIO
+from concurrent.futures import ProcessPoolExecutor
+from typing import Any, Awaitable, Callable

 import pandas as pd
 from pydantic import BaseModel
@@ -49,20 +49,15 @@ class EvalMetadata(BaseModel):
 class EvalOutput(BaseModel):
    # NOTE: User-specified
    instance_id: str
+    instruction: str
    # output of the evaluation
    # store anything that is needed for the score calculation
    test_result: dict[str, Any]

-    instruction: str | None = None
-
    # Interaction info
-    metadata: EvalMetadata | None = None
-    # list[tuple[dict[str, Any], dict[str, Any]]] - for compatibility with the old format
-    history: (
-        list[dict[str, Any]] | list[tuple[dict[str, Any], dict[str, Any]]] | None
-    ) = None
-    llm_completions: list[dict[str, Any]]
-    metrics: dict[str, Any] | None = None
+    metadata: EvalMetadata
+    history: list[tuple[dict[str, Any], dict[str, Any]]]
+    metrics: dict[str, Any]
    error: str | None = None

    # Optionally save the input test instance
@@ -70,19 +65,15 @@ class EvalOutput(BaseModel):

    def model_dump(self, *args, **kwargs):
        dumped_dict = super().model_dump(*args, **kwargs)
-        # Remove None values
-        dumped_dict = {k: v for k, v in dumped_dict.items() if v is not None}
        # Apply custom serialization for metadata (to avoid leaking sensitive information)
-        if self.metadata is not None:
-            dumped_dict['metadata'] = self.metadata.model_dump()
+        dumped_dict['metadata'] = self.metadata.model_dump()
        return dumped_dict

    def model_dump_json(self, *args, **kwargs):
        dumped = super().model_dump_json(*args, **kwargs)
        dumped_dict = json.loads(dumped)
        # Apply custom serialization for metadata (to avoid leaking sensitive information)
-        if 'metadata' in dumped_dict:
-            dumped_dict['metadata'] = json.loads(self.metadata.model_dump_json())
+        dumped_dict['metadata'] = json.loads(self.metadata.model_dump_json())
        return json.dumps(dumped_dict)


@@ -236,120 +227,56 @@ def prepare_dataset(
    return pd.DataFrame(new_dataset)


-def update_progress(
-    result: EvalOutput,
-    pbar: tqdm,
-    output_fp: TextIO,
-):
-    """Update the progress bar and write the result to the output file."""
-    pbar.update(1)
-    pbar.set_description(f'Instance {result.instance_id}')
-    pbar.set_postfix_str(f'Test Result: {result.test_result}')
-    logger.info(
-        f'Finished evaluation for instance {result.instance_id}: {str(result.test_result)[:300]}...\n'
-    )
-    output_fp.write(json.dumps(result.model_dump()) + '\n')
-    output_fp.flush()
-
-
-def _process_instance_wrapper(
-    process_instance_func: Callable[[pd.Series, EvalMetadata, bool], EvalOutput],
-    instance: pd.Series,
-    metadata: EvalMetadata,
-    use_mp: bool,
-    max_retries: int = 5,
-) -> EvalOutput:
-    """Wrap the process_instance_func to handle retries and errors.
-
-    Retry an instance up to max_retries times if it fails (e.g., due to transient network/runtime issues).
-    """
-    for attempt in range(max_retries + 1):
-        try:
-            result = process_instance_func(instance, metadata, use_mp)
-            return result
-        except Exception as e:
-            error = str(e)
-            stacktrace = traceback.format_exc()
-            if attempt == max_retries:
-                logger.exception(e)
-                msg = (
-                    '-' * 10
-                    + '\n'
-                    + f'Error in instance [{instance.instance_id}]: {error}. Stacktrace:\n{stacktrace}'
-                    + '\n'
-                    + f'[Encountered after {max_retries} retries. Please check the logs and report the issue.]'
-                    + '-' * 10
-                )
-                # Raise an error after all retries & stop the evaluation
-                logger.exception(e)
-                raise RuntimeError(
-                    f'Maximum error retries reached for instance {instance.instance_id}'
-                ) from e
-            msg = (
-                '-' * 10
-                + '\n'
-                + f'Error in instance [{instance.instance_id}]: {error}. Stacktrace:\n{stacktrace}'
-                + '\n'
-                + '-' * 10
-                + f'[The above error occurred. Retrying... (attempt {attempt + 1} of {max_retries})]'
-                + '-' * 10
-                + '\n'
-            )
-            logger.error(msg)
-            if use_mp:
-                print(msg)  # use print to directly print to console
-            time.sleep(5)
-
-
-def _process_instance_wrapper_mp(args):
-    """Wrapper for multiprocessing, especially for imap_unordered."""
-    return _process_instance_wrapper(*args)
-
-
 def run_evaluation(
    dataset: pd.DataFrame,
-    metadata: EvalMetadata | None,
+    metadata: EvalMetadata,
    output_file: str,
    num_workers: int,
    process_instance_func: Callable[
        [pd.Series, EvalMetadata, bool], Awaitable[EvalOutput]
    ],
-    max_retries: int = 5,  # number of retries for each instance
 ):
    use_multiprocessing = num_workers > 1
-
-    if metadata is not None:
-        logger.info(
-            f'Evaluation started with Agent {metadata.agent_class}:\n'
-            f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.\n'
-        )
-    else:
-        logger.info(f'Evaluation started with {num_workers} workers.')
-
-    total_instances = len(dataset)
-    pbar = tqdm(total=total_instances, desc='Instances processed')
+    logger.info(
+        f'Evaluation started with Agent {metadata.agent_class}:\n'
+        f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.\n'
+    )
+    pbar = tqdm(total=len(dataset))
    output_fp = open(output_file, 'a')

+    def update_progress(future):
+        pbar.update(1)
+        output: EvalOutput = future.result() if use_multiprocessing else future
+
+        pbar.set_description(f'Instance {output.instance_id}')
+        pbar.set_postfix_str(f'Test Result: {output.test_result}')
+        logger.info(
+            f'Finished evaluation for instance {output.instance_id}: {str(output.test_result)[:300]}...\n'
+        )
+        output_fp.write(json.dumps(output.model_dump()) + '\n')
+        output_fp.flush()
+
    try:
        if use_multiprocessing:
-            with mp.Pool(num_workers) as pool:
-                args_iter = (
-                    (process_instance_func, instance, metadata, True, max_retries)
-                    for _, instance in dataset.iterrows()
-                )
-                results = pool.imap_unordered(_process_instance_wrapper_mp, args_iter)
-                for result in results:
-                    update_progress(result, pbar, output_fp)
+            with ProcessPoolExecutor(num_workers) as executor:
+                futures = []
+                for _, instance in dataset.iterrows():
+                    future = executor.submit(
+                        process_instance_func,
+                        instance,
+                        metadata,
+                        bool(num_workers > 1),
+                    )
+                    future.add_done_callback(update_progress)
+                    futures.append(future)
+                for future in futures:
+                    future.result()
+        # Use plain for loop for single process for easier debugging
        else:
+            assert num_workers == 1
            for _, instance in dataset.iterrows():
-                result = _process_instance_wrapper(
-                    process_instance_func=process_instance_func,
-                    instance=instance,
-                    metadata=metadata,
-                    use_mp=False,
-                    max_retries=max_retries,
-                )
-                update_progress(result, pbar, output_fp)
+                output = process_instance_func(instance, metadata, False)
+                update_progress(output)

    except KeyboardInterrupt:
        print('\nKeyboardInterrupt received. Cleaning up...\n')
@@ -37,7 +37,7 @@ poetry run python evaluation/webarena/get_success_rate.py evaluation/evaluation_

 ## Submit your evaluation results

-You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
+You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).

 ## BrowsingAgent V1.0 result

@@ -1,6 +1,6 @@
 {
  "name": "openhands-frontend",
-  "version": "0.9.4",
+  "version": "0.9.1",
  "private": true,
  "type": "module",
  "engines": {
@@ -8,7 +8,7 @@
  },
  "dependencies": {
    "@monaco-editor/react": "^4.6.0",
-    "@nextui-org/react": "^2.4.8",
+    "@nextui-org/react": "^2.4.6",
    "@react-types/shared": "^3.24.1",
    "@reduxjs/toolkit": "^2.2.7",
    "@vitejs/plugin-react": "^4.3.1",
@@ -16,23 +16,23 @@
    "@xterm/xterm": "^5.4.0",
    "clsx": "^2.1.1",
    "eslint-config-airbnb-typescript": "^18.0.0",
-    "i18next": "^23.15.1",
+    "i18next": "^23.14.0",
    "i18next-browser-languagedetector": "^8.0.0",
    "i18next-http-backend": "^2.6.1",
-    "jose": "^5.9.3",
-    "monaco-editor": "^0.52.0",
+    "jose": "^5.8.0",
+    "monaco-editor": "^0.51.0",
    "react": "^18.3.1",
    "react-dom": "^18.3.1",
    "react-highlight": "^0.15.0",
    "react-hot-toast": "^2.4.1",
-    "react-i18next": "^15.0.2",
+    "react-i18next": "^15.0.1",
    "react-icons": "^5.3.0",
    "react-markdown": "^9.0.1",
    "react-redux": "^9.1.2",
    "react-syntax-highlighter": "^15.5.0",
    "remark-gfm": "^4.0.0",
    "tailwind-merge": "^2.5.2",
-    "vite": "^5.4.7",
+    "vite": "^5.4.2",
    "web-vitals": "^3.5.2"
  },
  "scripts": {
@@ -64,8 +64,8 @@
    "@testing-library/jest-dom": "^6.5.0",
    "@testing-library/react": "^16.0.1",
    "@testing-library/user-event": "^14.5.2",
-    "@types/node": "^22.6.1",
-    "@types/react": "^18.3.8",
+    "@types/node": "^22.5.1",
+    "@types/react": "^18.3.4",
    "@types/react-dom": "^18.3.0",
    "@types/react-highlight": "^0.12.8",
    "@types/react-syntax-highlighter": "^15.5.13",
@@ -82,13 +82,13 @@
    "eslint-plugin-prettier": "^5.2.1",
    "eslint-plugin-react": "^7.35.0",
    "eslint-plugin-react-hooks": "^4.6.2",
-    "husky": "^9.1.6",
+    "husky": "^9.1.5",
    "jsdom": "^25.0.0",
-    "lint-staged": "^15.2.10",
-    "postcss": "^8.4.47",
+    "lint-staged": "^15.2.9",
+    "postcss": "^8.4.41",
    "prettier": "^3.3.3",
-    "tailwindcss": "^3.4.12",
-    "typescript": "^5.6.2",
+    "tailwindcss": "^3.4.10",
+    "typescript": "^5.5.4",
    "vite-tsconfig-paths": "^5.0.1",
    "vitest": "^1.6.0"
  },
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Robert Brennan	4b743e90e2	dont close runtimes	2024-08-30 12:14:02 -04:00
Robert Brennan	505b47380f	try reuse_ids	2024-08-30 11:40:56 -04:00
Robert Brennan	e9f2959ad2	Merge branch 'main' into rb/parallel-tests	2024-08-30 11:37:20 -04:00
Robert Brennan	848fd38e7f	revert workflow	2024-08-30 11:30:42 -04:00
Robert Brennan	8b97144ae8	try reusing runtimes	2024-08-30 11:30:05 -04:00
Robert Brennan	cde8bddf62	give containers different names	2024-08-29 16:44:42 -04:00
Robert Brennan	06449ba69e	try running tests with 5 workers	2024-08-29 15:51:22 -04:00