Remove dead code

Add tests for agent controller
chore(deps): bump @docusaurus/plugin-content-pages in /docs (#3346)
2026-04-29 03:00:45 -04:00 · 2024-08-12 22:45:46 -04:00 · 2024-08-12 21:26:36 -04:00 · 2024-08-12 18:44:37 +00:00 · 2024-08-12 10:50:09 -07:00 · 2024-08-12 10:49:45 -07:00
695 changed files with 39634 additions and 19810 deletions
--- a/.devcontainer/README.MD
+++ b/.devcontainer/README.MD
@@ -0,0 +1 @@
+The files in this directory configure a development container for GitHub Codespaces.
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,15 @@
+{
+	"name": "OpenDevin Codespaces",
+	"image": "mcr.microsoft.com/devcontainers/universal",
+	"customizations":{
+        "vscode":{
+            "extensions": [
+                "ms-python.python"
+            ]
+        }
+    },
+	"onCreateCommand": "sh ./.devcontainer/on_create.sh",
+	"postCreateCommand": "make build",
+	"postStartCommand": "USE_HOST_NETWORK=True nohup bash -c 'make run &'"
+
+}
--- a/.devcontainer/on_create.sh
+++ b/.devcontainer/on_create.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+sudo apt update
+sudo apt install -y netcat
+sudo add-apt-repository -y ppa:deadsnakes/ppa
+sudo apt install -y python3.11
+curl -sSL https://install.python-poetry.org | python3.11 -
+# chromadb requires SQLite > 3.35 but SQLite in Python3.11.9 comes with 3.31.1
+sudo cp /opt/conda/lib/libsqlite3.so.0 /lib/x86_64-linux-gnu/libsqlite3.so.0
--- a/.github/ISSUE_TEMPLATE/bug_template.yml
+++ b/.github/ISSUE_TEMPLATE/bug_template.yml
@@ -12,7 +12,7 @@ body:
      label: Is there an existing issue for the same bug?
      description: Please check if an issue already exists for the bug you encountered.
      options:
-      - label: I have checked the troubleshooting document at https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting
+      - label: I have checked the troubleshooting document at https://docs.all-hands.dev/modules/usage/troubleshooting
        required: true
      - label: I have checked the existing issues.
        required: true
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,5 +1,11 @@
 **What is the problem that this fixes or functionality that this introduces? Does it fix any open issues?**

-**Give a brief summary of what the PR does, explaining any non-trivial design decisions**

+
+---
+**Give a summary of what the PR does, explaining any non-trivial design decisions**
+
+
+
+---
 **Other references**
--- a/.github/workflows/clean-up.yml
+++ b/.github/workflows/clean-up.yml
@@ -0,0 +1,68 @@
+# Workflow that deletes old workflow runs to avoid running out of disk space
+name: Delete old workflow runs
+
+on:
+  workflow_dispatch:
+    inputs:
+      days:
+        description: 'Days-worth of runs to keep for each workflow'
+        required: true
+        default: '30'
+      minimum_runs:
+        description: 'Minimum runs to keep for each workflow'
+        required: true
+        default: '10'
+      delete_workflow_pattern:
+        description: 'Name or filename of the workflow (if not set, all workflows are targeted)'
+        required: false
+      delete_workflow_by_state_pattern:
+        description: 'Filter workflows by state: active, deleted, disabled_fork, disabled_inactivity, disabled_manually'
+        required: true
+        default: "ALL"
+        type: choice
+        options:
+          - "ALL"
+          - active
+          - deleted
+          - disabled_inactivity
+          - disabled_manually
+      delete_run_by_conclusion_pattern:
+        description: 'Remove runs based on conclusion: action_required, cancelled, failure, skipped, success'
+        required: true
+        default: 'ALL'
+        type: choice
+        options:
+          - 'ALL'
+          - 'Unsuccessful: action_required,cancelled,failure,skipped'
+          - action_required
+          - cancelled
+          - failure
+          - skipped
+          - success
+      dry_run:
+        description: 'Logs simulated changes, no deletions are performed'
+        required: false
+
+jobs:
+  del_runs:
+    runs-on: ubuntu-latest
+    permissions:
+      actions: write
+      contents: read
+    steps:
+      - name: Delete workflow runs
+        uses: Mattraks/delete-workflow-runs@v2
+        with:
+          token: ${{ github.token }}
+          repository: ${{ github.repository }}
+          retain_days: ${{ github.event.inputs.days }}
+          keep_minimum_runs: ${{ github.event.inputs.minimum_runs }}
+          delete_workflow_pattern: ${{ github.event.inputs.delete_workflow_pattern }}
+          delete_workflow_by_state_pattern: ${{ github.event.inputs.delete_workflow_by_state_pattern }}
+          delete_run_by_conclusion_pattern: >-
+            ${{
+              startsWith(github.event.inputs.delete_run_by_conclusion_pattern, 'Unsuccessful:')
+              && 'action_required,cancelled,failure,skipped'
+              || github.event.inputs.delete_run_by_conclusion_pattern
+            }}
+          dry_run: ${{ github.event.inputs.dry_run }}
--- a/.github/workflows/deploy-docs.yml
+++ b/.github/workflows/deploy-docs.yml
@@ -1,3 +1,4 @@
+# Workflow that builds and deploys the documentation website
 name: Deploy Docs to GitHub Pages

 on:
@@ -5,10 +6,13 @@ on:
    branches:
      - main
  pull_request:
+    paths:
+      - 'docs/**'
    branches:
      - main

 jobs:
+  # Build the documentation website
  build:
    name: Build Docusaurus
    runs-on: ubuntu-latest
@@ -25,23 +29,23 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: "3.11"
-
+          python-version: '3.11'
      - name: Generate Python Docs
        run: rm -rf docs/modules/python && pip install pydoc-markdown && pydoc-markdown
      - name: Install dependencies
        run: cd docs && npm ci
      - name: Build website
        run: cd docs && npm run build
-
      - name: Upload Build Artifact
        if: github.ref == 'refs/heads/main'
        uses: actions/upload-pages-artifact@v3
        with:
          path: docs/build

+  # Deploy the documentation website
  deploy:
    name: Deploy to GitHub Pages
+    runs-on: ubuntu-latest
    needs: build
    if: github.ref == 'refs/heads/main' && github.repository == 'OpenDevin/OpenDevin'
    # Grant GITHUB_TOKEN the permissions required to make a Pages deployment
@@ -52,7 +56,6 @@ jobs:
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
-    runs-on: ubuntu-latest
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
--- a/.github/workflows/dummy-agent-test.yml
+++ b/.github/workflows/dummy-agent-test.yml
@@ -1,3 +1,4 @@
+# Workflow that uses the DummyAgent to run a simple task
 name: Run E2E test with dummy agent

 concurrency:
@@ -10,9 +11,6 @@ on:
    - main
  pull_request:

-env:
-  PERSIST_SANDBOX : "false"
-
 jobs:
  test:
    runs-on: ubuntu-latest
@@ -25,7 +23,7 @@ jobs:
      - name: Set up environment
        run: |
          curl -sSL https://install.python-poetry.org | python3 -
-          poetry install --without evaluation
+          poetry install --without evaluation,llama-index
          poetry run playwright install --with-deps chromium
          wget https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json -P /tmp/llama_index/models--BAAI--bge-small-en-v1.5/snapshots/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/1_Pooling/
      - name: Run tests
--- a/.github/workflows/ghcr.yml
+++ b/.github/workflows/ghcr.yml
@@ -1,4 +1,5 @@
-name: Build Publish and Test Docker Image
+# Workflow that builds, tests and then pushes the docker images to the ghcr.io repository
+name: Build Publish and Test Runtime Image

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
@@ -19,25 +20,21 @@ on:
        default: ''

 jobs:
+  # Builds the OpenDevin Docker images
  ghcr_build:
    runs-on: ubuntu-latest
-
    outputs:
      tags: ${{ steps.capture-tags.outputs.tags }}
-
    permissions:
      contents: read
      packages: write
-
    strategy:
      matrix:
-        image: ["sandbox", "opendevin"]
-        platform: ["amd64", "arm64"]
-
+        image: ['opendevin']
+        platform: ['amd64', 'arm64']
    steps:
      - name: Checkout
        uses: actions/checkout@v4
-
      - name: Free Disk Space (Ubuntu)
        uses: jlumbroso/free-disk-space@main
        with:
@@ -52,62 +49,152 @@ jobs:
          large-packages: true
          docker-images: false
          swap-storage: true
-
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
-
      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v3
-
      - name: Build and export image
        id: build
        run: ./containers/build.sh ${{ matrix.image }} ${{ github.repository_owner }} ${{ matrix.platform }}
-
      - name: Capture tags
        id: capture-tags
        run: |
          tags=$(cat tags.txt)
          echo "tags=$tags"
          echo "tags=$tags" >> $GITHUB_OUTPUT
-
      - name: Upload Docker image as artifact
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.image }}-docker-image-${{ matrix.platform }}
          path: /tmp/${{ matrix.image }}_image_${{ matrix.platform }}.tar
+          retention-days: 14

-  test-for-sandbox:
-    name: Test for Sandbox
+  # Builds the runtime Docker images
+  ghcr_build_runtime:
    runs-on: ubuntu-latest
-    needs: ghcr_build
-    env:
-      PERSIST_SANDBOX: "false"
+    outputs:
+      tags: ${{ steps.capture-tags.outputs.tags }}
+    permissions:
+      contents: read
+      packages: write
+    strategy:
+      matrix:
+        image: ['od_runtime']
+        base_image: ['nikolaik/python-nodejs:python3.11-nodejs22']
+        platform: ['amd64', 'arm64']
    steps:
-      - uses: actions/checkout@v4
-
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
      - name: Install poetry via pipx
        run: pipx install poetry
-
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: "3.11"
-          cache: "poetry"
-
+          python-version: '3.11'
+          cache: 'poetry'
      - name: Install Python dependencies using Poetry
        run: make install-python-dependencies
+      - name: Create source distribution and Dockerfile
+        run: poetry run python3 opendevin/runtime/utils/runtime_build.py --base_image ${{ matrix.base_image }} --build_folder containers/runtime --force_rebuild
+      - name: Build and export image
+        id: build
+        run: |
+          if [ -f 'containers/runtime/Dockerfile' ]; then
+            echo 'Dockerfile detected, building runtime image...'
+            ./containers/build.sh ${{ matrix.image }} ${{ github.repository_owner }} ${{ matrix.platform }}
+          else
+            echo 'No Dockerfile detected which means an exact image is already built. Pulling the image and saving it to a tar file...'
+            source containers/runtime/config.sh  # presumably defines the DOCKER_IMAGE* variables used below - confirm
+            echo "$DOCKER_IMAGE_TAG $DOCKER_IMAGE_HASH_TAG" >> tags.txt
+            echo "Pulling image $DOCKER_IMAGE:$DOCKER_IMAGE_HASH_TAG to /tmp/${{ matrix.image }}_image_${{ matrix.platform }}.tar"
+            docker pull "$DOCKER_IMAGE:$DOCKER_IMAGE_HASH_TAG"
+            docker save "$DOCKER_IMAGE:$DOCKER_IMAGE_HASH_TAG" -o /tmp/${{ matrix.image }}_image_${{ matrix.platform }}.tar
+          fi
+      - name: Capture tags
+        id: capture-tags
+        run: |
+          tags=$(cat tags.txt)
+          echo "tags=$tags"
+          echo "tags=$tags" >> $GITHUB_OUTPUT
+      - name: Upload Docker image as artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ matrix.image }}-docker-image-${{ matrix.platform }}
+          path: /tmp/${{ matrix.image }}_image_${{ matrix.platform }}.tar
+          retention-days: 14

-      - name: Download sandbox Docker image
+  # Run unit tests with the EventStream and Server runtime Docker images
+  test_runtime:
+    name: Test Runtime
+    runs-on: ubuntu-latest
+    needs: [ghcr_build_runtime, ghcr_build]
+    strategy:
+      matrix:
+        runtime_type: ['eventstream']
+    steps:
+      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # when set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          swap-storage: true
+      - name: Install poetry via pipx
+        run: pipx install poetry
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+          cache: 'poetry'
+      - name: Install Python dependencies using Poetry
+        run: make install-python-dependencies
+      - name: Download Runtime Docker image
+        if: matrix.runtime_type == 'eventstream'
+        uses: actions/download-artifact@v4
+        with:
+          name: od_runtime-docker-image-amd64
+          path: /tmp/
+      - name: Download Sandbox Docker image
+        if: matrix.runtime_type == 'server'
        uses: actions/download-artifact@v4
        with:
          name: sandbox-docker-image-amd64
          path: /tmp/
-
-      - name: Load sandbox image and run sandbox tests
+      - name: Load Runtime image and run runtime tests
        run: |
          # Load the Docker image and capture the output
-          output=$(docker load -i /tmp/sandbox_image_amd64.tar)
+          if [ "${{ matrix.runtime_type }}" == "eventstream" ]; then
+            output=$(docker load -i /tmp/od_runtime_image_amd64.tar)
+          else
+            output=$(docker load -i /tmp/sandbox_image_amd64.tar)
+          fi

          # Extract the first image name from the output
          image_name=$(echo "$output" | grep -oP 'Loaded image: \K.*' | head -n 1)
@@ -115,51 +202,48 @@ jobs:
          # Print the full name of the image
          echo "Loaded Docker image: $image_name"

-          SANDBOX_CONTAINER_IMAGE=$image_name TEST_IN_CI=true poetry run pytest --cov=agenthub --cov=opendevin --cov-report=xml -s ./tests/unit/test_sandbox.py
-
+          TEST_RUNTIME=${{ matrix.runtime_type }} SANDBOX_USER_ID=$(id -u) SANDBOX_CONTAINER_IMAGE=$image_name TEST_IN_CI=true poetry run pytest --cov=agenthub --cov=opendevin --cov-report=xml -s ./tests/unit/test_runtime.py
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

-  integration-tests-on-linux:
-    name: Integration Tests on Linux
+  # Run integration tests with the eventstream runtime Docker image
+  runtime_integration_tests_on_linux:
+    name: Runtime Integration Tests on Linux
    runs-on: ubuntu-latest
-    needs: ghcr_build
-    env:
-      PERSIST_SANDBOX: "false"
+    needs: [ghcr_build_runtime]
    strategy:
      fail-fast: false
      matrix:
-        python-version: ["3.11"]
-        sandbox: ["ssh", "local"]
+        python-version: ['3.11']
+        # server is tested in a separate workflow
+        runtime_type: ['eventstream']
    steps:
      - uses: actions/checkout@v4
-
      - name: Install poetry via pipx
        run: pipx install poetry
-
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'poetry'
-
      - name: Install Python dependencies using Poetry
        run: make install-python-dependencies
-
-      - name: Download sandbox Docker image
+      - name: Download Runtime Docker image
        uses: actions/download-artifact@v4
        with:
-          name: sandbox-docker-image-amd64
+          name: od_runtime-docker-image-amd64
          path: /tmp/
-
-      - name: Load sandbox image and run integration tests
-        env:
-          SANDBOX_BOX_TYPE: ${{ matrix.sandbox }}
+      - name: Load runtime image and run integration tests
        run: |
          # Load the Docker image and capture the output
-          output=$(docker load -i /tmp/sandbox_image_amd64.tar)
+          if [ "${{ matrix.runtime_type }}" == "eventstream" ]; then
+            output=$(docker load -i /tmp/od_runtime_image_amd64.tar)
+          else
+            echo "No Runtime Docker image to load"
+            exit 1
+          fi

          # Extract the first image name from the output
          image_name=$(echo "$output" | grep -oP 'Loaded image: \K.*' | head -n 1)
@@ -167,48 +251,40 @@ jobs:
          # Print the full name of the image
          echo "Loaded Docker image: $image_name"

-          SANDBOX_CONTAINER_IMAGE=$image_name TEST_IN_CI=true TEST_ONLY=true ./tests/integration/regenerate.sh
-
+          TEST_RUNTIME=${{ matrix.runtime_type }} SANDBOX_USER_ID=$(id -u) SANDBOX_CONTAINER_IMAGE=$image_name TEST_IN_CI=true TEST_ONLY=true ./tests/integration/regenerate.sh
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

+  # Push the OpenDevin Docker images to the ghcr.io repository
  ghcr_push:
    runs-on: ubuntu-latest
-    # don't push if integration tests or sandbox tests fail
-    needs: [ghcr_build, integration-tests-on-linux, test-for-sandbox]
+    needs: [ghcr_build]
    if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')
-
    env:
      tags: ${{ needs.ghcr_build.outputs.tags }}
-
    permissions:
      contents: read
      packages: write
-
    strategy:
      matrix:
-        image: ["sandbox", "opendevin"]
-        platform: ["amd64", "arm64"]
-
+        image: ['opendevin']
+        platform: ['amd64', 'arm64']
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
-
      - name: Login to GHCR
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
-
      - name: Download Docker images
        uses: actions/download-artifact@v4
        with:
          name: ${{ matrix.image }}-docker-image-${{ matrix.platform }}
          path: /tmp/${{ matrix.platform }}
-
      - name: Load images and push to registry
        run: |
          mv /tmp/${{ matrix.platform }}/${{ matrix.image }}_image_${{ matrix.platform }}.tar .
@@ -223,33 +299,124 @@ jobs:
            docker push $image_name:${tag}_${{ matrix.platform }}
          done

-  create_manifest:
+  # Push the runtime Docker images to the ghcr.io repository
+  ghcr_push_runtime:
    runs-on: ubuntu-latest
-    needs: [ghcr_build, ghcr_push]
+    needs: [ghcr_build_runtime, test_runtime, runtime_integration_tests_on_linux]
    if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')
-
    env:
-      tags: ${{ needs.ghcr_build.outputs.tags }}
-
-    strategy:
-      matrix:
-        image: ["sandbox", "opendevin"]
-
+      RUNTIME_TAGS: ${{ needs.ghcr_build_runtime.outputs.tags }}
    permissions:
      contents: read
      packages: write
-
+    strategy:
+      matrix:
+        image: ['od_runtime']
+        platform: ['amd64', 'arm64']
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
-
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          tool-cache: true
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
      - name: Login to GHCR
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Download Docker images
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ matrix.image }}-docker-image-${{ matrix.platform }}
+          path: /tmp/${{ matrix.platform }}
+      - name: List downloaded files
+        run: |
+          ls -la /tmp/${{ matrix.platform }}
+          file /tmp/${{ matrix.platform }}/*
+      - name: Load images and push to registry
+        run: |
+          mv /tmp/${{ matrix.platform }}/${{ matrix.image }}_image_${{ matrix.platform }}.tar ./${{ matrix.image }}_image_${{ matrix.platform }}.tar
+          if ! loaded_image=$(docker load -i ${{ matrix.image }}_image_${{ matrix.platform }}.tar | grep "Loaded image:" | head -n 1 | awk '{print $3}') || [ -z "$loaded_image" ]; then  # no pipefail here, so also treat an empty result as failure
+            echo "Failed to load Docker image"
+            exit 1
+          fi
+          echo "loaded image = $loaded_image"
+          image_name=$(echo "ghcr.io/${{ github.repository_owner }}/${{ matrix.image }}" | tr '[:upper:]' '[:lower:]')  # lowercase the owner/image path
+          echo "image name = $image_name"
+          echo "$RUNTIME_TAGS" | tr ' ' '\n' | while read -r tag; do  # one tag per line
+            echo "tag = $tag"
+            if [ -n "$image_name" ] && [ -n "$tag" ]; then
+              docker tag "$loaded_image" "$image_name:${tag}_${{ matrix.platform }}"
+              docker push "$image_name:${tag}_${{ matrix.platform }}"
+            else
+              echo "Skipping tag and push due to empty image_name or tag"
+            fi
+          done

+  # Creates and pushes the OpenDevin Docker image manifest
+  create_manifest:
+    runs-on: ubuntu-latest
+    needs: [ghcr_build, ghcr_push]
+    if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')
+    env:
+      tags: ${{ needs.ghcr_build.outputs.tags }}
+    strategy:
+      matrix:
+        image: ['opendevin']
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+      - name: Login to GHCR
+        uses: docker/login-action@v2
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Create and push multi-platform manifest
+        run: |
+          image_name=$(echo "ghcr.io/${{ github.repository_owner }}/${{ matrix.image }}" | tr '[:upper:]' '[:lower:]')  # lowercase the owner/image path
+          echo "image name = $image_name"
+          tags=$(echo ${tags} | tr ' ' '\n')
+          for tag in $tags; do
+            echo "tag = $tag"
+            docker buildx imagetools create --tag $image_name:$tag \
+              $image_name:${tag}_amd64 \
+              $image_name:${tag}_arm64
+          done
+
+  # Creates and pushes the runtime Docker image manifest
+  create_manifest_runtime:
+    runs-on: ubuntu-latest
+    needs: [ghcr_build_runtime, ghcr_push_runtime]
+    if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')
+    env:
+      tags: ${{ needs.ghcr_build_runtime.outputs.tags }}
+    strategy:
+      matrix:
+        image: ['od_runtime']
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+      - name: Login to GHCR
+        uses: docker/login-action@v2
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Create and push multi-platform manifest
        run: |
          image_name=$(echo "ghcr.io/${{ github.repository_owner }}/${{ matrix.image }}" | tr '[:upper:]' '[:lower:]')
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -1,3 +1,4 @@
+# Workflow that runs lint on the frontend and python code
 name: Lint

 concurrency:
@@ -11,27 +12,26 @@ on:
  pull_request:

 jobs:
+  # Run lint on the frontend code
  lint-frontend:
    name: Lint frontend
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
-
      - name: Install Node.js 20
        uses: actions/setup-node@v4
        with:
          node-version: 20
-
      - name: Install dependencies
        run: |
          cd frontend
          npm install --frozen-lockfile
-
      - name: Lint
        run: |
          cd frontend
          npm run lint

+  # Run lint on the python code
  lint-python:
    name: Lint python
    runs-on: ubuntu-latest
--- a/.github/workflows/review-pr.yml
+++ b/.github/workflows/review-pr.yml
@@ -1,3 +1,4 @@
+# Workflow that uses OpenDevin to review a pull request. PR must be labeled 'review-this'
 name: Use OpenDevin to Review Pull Request

 on:
@@ -12,29 +13,28 @@ jobs:
  dogfood:
    if: contains(github.event.pull_request.labels.*.name, 'review-this')
    runs-on: ubuntu-latest
-    container:
-      image: ghcr.io/opendevin/opendevin
-      volumes:
-        - /var/run/docker.sock:/var/run/docker.sock
-
    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.11'
    - name: install git, github cli
      run: |
-        apt-get install -y git gh
+        sudo apt-get install -y git gh
        git config --global --add safe.directory $PWD
-
    - name: Checkout Repository
      uses: actions/checkout@v4
      with:
        ref: ${{ github.event.pull_request.base.ref }} # check out the target branch
-
    - name: Download Diff
      run: |
        curl -O "${{ github.event.pull_request.diff_url }}" -L
-
    - name: Write Task File
      run: |
-        echo "Your coworker wants to apply a pull request to this project. Read and review ${{ github.event.pull_request.number }}.diff file. Create a review-${{ github.event.pull_request.number }}.txt and write your concise comments and suggestions there." > task.txt
+        echo "Your coworker wants to apply a pull request to this project." > task.txt
+        echo "Read and review ${{ github.event.pull_request.number }}.diff file. Create a review-${{ github.event.pull_request.number }}.txt and write your concise comments and suggestions there." >> task.txt
+        echo "Do not ask me for confirmation at any point." >> task.txt
        echo "" >> task.txt
        echo "Title" >> task.txt
        echo "${{ github.event.pull_request.title }}" >> task.txt
@@ -43,27 +43,25 @@ jobs:
        echo "${{ github.event.pull_request.body }}" >> task.txt
        echo "" >> task.txt
        echo "Diff file is: ${{ github.event.pull_request.number }}.diff" >> task.txt
-
    - name: Set up environment
      run: |
        curl -sSL https://install.python-poetry.org | python3 -
        export PATH="/github/home/.local/bin:$PATH"
-        poetry install --without evaluation
+        poetry install --without evaluation,llama-index
        poetry run playwright install --with-deps chromium
-
    - name: Run OpenDevin
      env:
-        LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-        SANDBOX_BOX_TYPE: ssh
+        LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+        LLM_MODEL: ${{ vars.LLM_MODEL }}
      run: |
        # Append path to launch poetry
        export PATH="/github/home/.local/bin:$PATH"
        # Append path to correctly import package, note: must set pwd at first
        export PYTHONPATH=$(pwd):$PYTHONPATH
-        WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE poetry run python ./opendevin/core/main.py -i 50 -f task.txt -d $GITHUB_WORKSPACE
+        export WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE
+        export WORKSPACE_BASE=$GITHUB_WORKSPACE
+        echo -e "/exit\n" | poetry run python opendevin/core/main.py -i 50 -f task.txt
        rm task.txt
-
    - name: Check if review file is non-empty
      id: check_file
      run: |
@@ -72,7 +70,6 @@ jobs:
          echo "non_empty=true" >> $GITHUB_OUTPUT
        fi
      shell: bash
-
    - name: Create PR review if file is non-empty
      env:
        GH_TOKEN: ${{ github.token }}
--- a/.github/workflows/run-unit-tests.yml
+++ b/.github/workflows/run-unit-tests.yml
@@ -1,3 +1,4 @@
+# Workflow that runs frontend and python unit tests
 name: Run Unit Tests

 concurrency:
@@ -15,123 +16,133 @@ on:
      - 'evaluation/**'
  pull_request:

-env:
-  PERSIST_SANDBOX : "false"

 jobs:
+  # Run frontend unit tests
  fe-test:
    runs-on: ubuntu-latest
-
    strategy:
      matrix:
        node-version: [20]
-
    steps:
      - name: Checkout
        uses: actions/checkout@v4
-
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: ${{ matrix.node-version }}
-
      - name: Install dependencies
        working-directory: ./frontend
        run: npm ci
-
      - name: Run tests and collect coverage
        working-directory: ./frontend
        run: npm run test:coverage
-
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

+  # Run python unit tests on macOS
  test-on-macos:
    name: Test on macOS
    runs-on: macos-12
    env:
-      INSTALL_DOCKER: "1" # Set to '0' to skip Docker installation
+      INSTALL_DOCKER: '1' # Set to '0' to skip Docker installation
    strategy:
      matrix:
-        python-version: ["3.11"]
-
+        python-version: ['3.11']
    steps:
      - uses: actions/checkout@v4
-
      - name: Install poetry via pipx
        run: pipx install poetry
-
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
-          cache: "poetry"
-
+          cache: 'poetry'
      - name: Install Python dependencies using Poetry
-        run: poetry install
-
+        run: poetry install --without evaluation,llama-index
      - name: Install & Start Docker
        if: env.INSTALL_DOCKER == '1'
        run: |
+          INSTANCE_NAME="colima-${GITHUB_RUN_ID}"
+
          # Uninstall colima to upgrade to the latest version
          if brew list colima &>/dev/null; then
-              brew uninstall colima
-              # unlinking colima dependency: go
-              brew uninstall go@1.21
+            brew uninstall colima
+            # unlinking colima dependency: go
+            brew uninstall go@1.21
          fi
          rm -rf ~/.colima ~/.lima
          brew install --HEAD colima
-          brew services start colima
          brew install docker
-          colima delete
-          colima start  --network-address --arch x86_64 --cpu=1 --memory=1
+
+          start_colima() {
+            # Pick a random SSH port in the range 10000-20000 (availability is not checked)
+            RANDOM_PORT=$((RANDOM % 10001 + 10000))
+
+            # Start Colima with SSH bound to that port; return 1 so the caller can retry
+            if ! colima start --network-address --arch x86_64 --cpu=1 --memory=1 --verbose --ssh-port $RANDOM_PORT; then
+              echo "Failed to start Colima."
+              return 1
+            fi
+            return 0
+          }
+
+          # Attempt to start Colima for 5 total attempts:
+          ATTEMPT_LIMIT=5
+          for ((i=1; i<=ATTEMPT_LIMIT; i++)); do
+
+            if start_colima; then
+              echo "Colima started successfully."
+              break
+            else
+              colima stop -f
+              sleep 10
+              colima delete -f
+              if [ $i -eq $ATTEMPT_LIMIT ]; then
+                exit 1
+              fi
+              sleep 10
+            fi
+          done

          # For testcontainers to find the Colima socket
          # https://github.com/abiosoft/colima/blob/main/docs/FAQ.md#cannot-connect-to-the-docker-daemon-at-unixvarrundockersock-is-the-docker-daemon-running
          sudo ln -sf $HOME/.colima/default/docker.sock /var/run/docker.sock
-
      - name: Build Environment
        run: make build
-
      - name: Run Tests
-        run: poetry run pytest --forked --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox"
-
+        run: poetry run pytest --forked --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_runtime.py"
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+
+  # Run python unit tests on Linux
  test-on-linux:
    name: Test on Linux
    runs-on: ubuntu-latest
    env:
-      INSTALL_DOCKER: "0" # Set to '0' to skip Docker installation
+      INSTALL_DOCKER: '0' # Set to '0' to skip Docker installation
    strategy:
      matrix:
-        python-version: ["3.11"]
-
+        python-version: ['3.11']
    steps:
      - uses: actions/checkout@v4
-
      - name: Install poetry via pipx
        run: pipx install poetry
-
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
-          cache: "poetry"
-
+          cache: 'poetry'
      - name: Install Python dependencies using Poetry
-        run: poetry install --without evaluation
-
+        run: poetry install --without evaluation,llama-index
      - name: Build Environment
        run: make build
-
      - name: Run Tests
-        run: poetry run pytest --forked --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox"
-
+        run: poetry run pytest --forked --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_runtime.py"
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
--- a/.github/workflows/solve-issue.yml
+++ b/.github/workflows/solve-issue.yml
@@ -1,3 +1,4 @@
+# Workflow that uses OpenDevin to resolve a GitHub issue. Issue must be labeled 'solve-this'
 name: Use OpenDevin to Resolve GitHub Issue

 on:
@@ -17,14 +18,11 @@ jobs:
      image: ghcr.io/opendevin/opendevin
      volumes:
        - /var/run/docker.sock:/var/run/docker.sock
-
    steps:
    - name: install git, github cli
      run: apt-get install -y git gh
-
    - name: Checkout Repository
      uses: actions/checkout@v4
-
    - name: Write Task File
      env:
        ISSUE_TITLE: ${{ github.event.issue.title }}
@@ -35,22 +33,18 @@ jobs:
        echo "" >> task.txt
        echo "BODY:" >> task.txt
        echo "${ISSUE_BODY}" >> task.txt
-
    - name: Set up environment
      run: |
        curl -sSL https://install.python-poetry.org | python3 -
        export PATH="/github/home/.local/bin:$PATH"
-        poetry install --without evaluation
+        poetry install --without evaluation,llama-index
        poetry run playwright install --with-deps chromium
-
-
    - name: Run OpenDevin
      env:
        ISSUE_TITLE: ${{ github.event.issue.title }}
        ISSUE_BODY: ${{ github.event.issue.body }}
        LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-        SANDBOX_BOX_TYPE: ssh
      run: |
        # Append path to launch poetry
        export PATH="/github/home/.local/bin:$PATH"
@@ -58,7 +52,6 @@ jobs:
        export PYTHONPATH=$(pwd):$PYTHONPATH
        WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE poetry run python ./opendevin/core/main.py -i 50 -f task.txt -d $GITHUB_WORKSPACE
        rm task.txt
-
    - name: Setup Git, Create Branch, and Commit Changes
      run: |
        # Setup Git configuration
@@ -84,7 +77,6 @@ jobs:

        # Push changes
        git push --set-upstream origin $BRANCH_NAME
-
    - name: Fetch Default Branch
      env:
        GH_TOKEN: ${{ github.token }}
@@ -93,7 +85,6 @@ jobs:
        DEFAULT_BRANCH=$(gh repo view --json defaultBranchRef --jq .defaultBranchRef.name)
        echo "Default branch is $DEFAULT_BRANCH"
        echo "DEFAULT_BRANCH=$DEFAULT_BRANCH" >> $GITHUB_ENV
-
    - name: Generate PR
      env:
        GH_TOKEN: ${{ github.token }}
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -1,4 +1,6 @@
+# Workflow that marks issues and PRs with no activity for 30 days with "Stale" and closes them after 7 more days of no activity
 name: 'Close stale issues'
+
 on:
  schedule:
    - cron: '30 1 * * *'
@@ -9,21 +11,9 @@ jobs:
    steps:
      - uses: actions/stale@v9
        with:
-          # Aggressively close issues that have been explicitly labeled `age-out`
-          any-of-labels: age-out
-          stale-issue-message: 'This issue is stale because it has been open for 7 days with no activity. Remove stale label or comment or this will be closed in 1 day.'
-          close-issue-message: 'This issue was closed because it has been stalled for over 7 days with no activity.'
-          stale-pr-message: 'This PR is stale because it has been open for 7 days with no activity. Remove stale label or comment or this will be closed in 1 days.'
-          close-pr-message: 'This PR was closed because it has been stalled for over 7 days with no activity.'
-          days-before-stale: 7
-          days-before-close: 1
-
-      - uses: actions/stale@v9
-        with:
-          # Be more lenient with other issues
          stale-issue-message: 'This issue is stale because it has been open for 30 days with no activity. Remove stale label or comment or this will be closed in 7 days.'
-          close-issue-message: 'This issue was closed because it has been stalled for over 30 days with no activity.'
          stale-pr-message: 'This PR is stale because it has been open for 30 days with no activity. Remove stale label or comment or this will be closed in 7 days.'
-          close-pr-message: 'This PR was closed because it has been stalled for over 30 days with no activity.'
          days-before-stale: 30
+          close-issue-message: 'This issue was closed because it has been stalled for over 30 days with no activity.'
+          close-pr-message: 'This PR was closed because it has been stalled for over 30 days with no activity.'
          days-before-close: 7
--- a/.github/workflows/update-pyproject-version.yml
+++ b/.github/workflows/update-pyproject-version.yml
@@ -1,48 +0,0 @@
-name: Update pyproject.toml Version and Tags
-
-on:
-  release:
-    types:
-      - published
-
-jobs:
-  update-pyproject-and-tags:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0  # Fetch all history for all branches and tags
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install toml
-
-      - name: Get release tag
-        id: get_release_tag
-        run: echo "RELEASE_TAG=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV
-
-      - name: Update pyproject.toml with release tag
-        run: |
-          python -c "
-          import toml
-          with open('pyproject.toml', 'r') as f:
-              data = toml.load(f)
-          data['tool']['poetry']['version'] = '${{ env.RELEASE_TAG }}'
-          with open('pyproject.toml', 'w') as f:
-              toml.dump(data, f)
-          "
-
-      - name: Commit and push pyproject.toml changes
-        uses: stefanzweifel/git-auto-commit-action@v4
-        with:
-          commit_message: "Update pyproject.toml version to ${{ env.RELEASE_TAG }}"
-          branch: main
-          file_pattern: pyproject.toml
--- a/.gitignore
+++ b/.gitignore
@@ -169,6 +169,10 @@ evaluation/outputs
 evaluation/swe_bench/eval_workspace*
 evaluation/SWE-bench/data
 evaluation/webarena/scripts/webarena_env.sh
+evaluation/bird/data
+evaluation/gaia/data
+evaluation/gorilla/data
+evaluation/toolqa/data

 # frontend

@@ -210,6 +214,7 @@ cache

 # configuration
 config.toml
+config.toml_
 config.toml.bak

 containers/agnostic_sandbox
@@ -217,3 +222,9 @@ containers/agnostic_sandbox
 # swe-bench-eval
 image_build_logs
 run_instance_logs
+
+od_runtime_*.tar
+
+# docker build
+containers/runtime/Dockerfile
+containers/runtime/project.tar.gz
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,13 +1,13 @@
 # Contributing

-Thanks for your interest in contributing to OpenDevin! We welcome and appreciate contributions. 
+Thanks for your interest in contributing to OpenDevin! We welcome and appreciate contributions.

 ## How Can I Contribute?

 There are many ways that you can contribute:

 1. **Download and use** OpenDevin, and send [issues](https://github.com/OpenDevin/OpenDevin/issues) when you encounter something that isn't working or a feature that you'd like to see.
-2. **Send feedback** after each session by [clicking the thumbs-up thumbs-down buttons](https://opendevin.github.io/OpenDevin/modules/usage/feedback), so we can see where things are working and failing, and also build an open dataset for training code agents.
+2. **Send feedback** after each session by [clicking the thumbs-up thumbs-down buttons](https://docs.all-hands.dev/modules/usage/feedback), so we can see where things are working and failing, and also build an open dataset for training code agents.
 3. **Improve the Codebase** by sending PRs (see details below). In particular, we have some [good first issue](https://github.com/OpenDevin/OpenDevin/labels/good%20first%20issue) issues that may be ones to start on.

 ## Understanding OpenDevin's CodeBase
@@ -83,7 +83,7 @@ git push origin my_branch
   - Set `base repository` to `OpenDevin/OpenDevin`
   - Set `base` to `main`
   - Click `Create pull request`
-  
+
 The PR should appear in [OpenDevin PRs](https://github.com/OpenDevin/OpenDevin/pulls).

 Then the OpenDevin team will review your code.
@@ -114,4 +114,3 @@ You may also check out previous PRs in the [PR list](https://github.com/OpenDevi
 ### 2. Pull Request description
 - If your PR is small (such as a typo fix), you can go brief.
 - If it contains a lot of changes, it's better to write more details.
-
--- a/Development.md
+++ b/Development.md
@@ -39,18 +39,18 @@ make build
 OpenDevin supports a diverse array of Language Models (LMs) through the powerful [litellm](https://docs.litellm.ai) library. By default, we've chosen the mighty GPT-4 from OpenAI as our go-to model, but the world is your oyster! You can unleash the potential of Anthropic's suave Claude, the enigmatic Llama, or any other LM that piques your interest.

 To configure the LM of your choice, run:
-       
+
   ```bash
   make setup-config
   ```
-   
+
   This command will prompt you to enter the LLM API key, model name, and other variables ensuring that OpenDevin is tailored to your specific needs. Note that the model name will apply only when you run headless. If you use the UI, please set the model in the UI.
-   
+
   Note: If you have previously run OpenDevin using the docker command, you may have already set some environmental variables in your terminal. The final configurations are set from highest to lowest priority:
   Environment variables > config.toml variables > default variables

 **Note on Alternative Models:**
-Some alternative models may prove more challenging to tame than others. Fear not, brave adventurer! We shall soon unveil LLM-specific documentation to guide you on your quest. 
+Some alternative models may prove more challenging to tame than others. Fear not, brave adventurer! We shall soon unveil LLM-specific documentation to guide you on your quest.
 And if you've already mastered the art of wielding a model other than OpenAI's GPT, we encourage you to share your setup instructions with us by creating instructions and adding it [to our documentation](https://github.com/OpenDevin/OpenDevin/tree/main/docs/modules/usage/llms).

 For a full list of the LM providers and models available, please consult the [litellm documentation](https://docs.litellm.ai/docs/providers).
@@ -84,10 +84,11 @@ make help
 ```

 ### 8. Testing
+To run tests, refer to the following:
 #### Unit tests

 ```bash
-poetry run pytest ./tests/unit/test_sandbox.py
+poetry run pytest ./tests/unit/test_*.py
 ```

 #### Integration tests
--- a/ISSUE_TRIAGE.md
+++ b/ISSUE_TRIAGE.md
@@ -0,0 +1,25 @@
+# Issue Triage
+These are the procedures and guidelines on how issues are triaged in this repo by the maintainers.
+
+## General
+* Most issues must be tagged with **enhancement** or **bug**
+* Issues may be tagged with what they relate to (**backend**, **frontend**, **agent quality**, etc.)
+
+## Severity
+* **Low**: Minor issues, single user report
+* **Medium**: Affecting multiple users
+* **Critical**: Affecting all users or potential security issues
+
+## Effort
+* Issues may be estimated with effort required (**small effort**, **medium effort**, **large effort**)
+
+## Difficulty
+* Issues with low implementation difficulty may be tagged with **good first issue**
+
+## Not Enough Information
+* User is asked to provide more information (logs, how to reproduce, etc.) when the issue is not clear
+* If an issue is unclear and the author does not provide more information or respond to a request, the issue may be closed as **not planned** (usually after a week)
+
+## Multiple Requests/Fixes in One Issue
+* These issues will be narrowed down to one request/fix so the issue is more easily tracked and fixed
+* Issues may be broken down into multiple issues if required
--- a/Makefile
+++ b/Makefile
@@ -23,9 +23,6 @@ RESET=$(shell tput -Txterm sgr0)
 build:
 	@echo "$(GREEN)Building project...$(RESET)"
 	@$(MAKE) -s check-dependencies
-ifeq ($(INSTALL_DOCKER),)
-	@$(MAKE) -s pull-docker-image
-endif
 	@$(MAKE) -s install-python-dependencies
 	@$(MAKE) -s install-frontend-dependencies
 	@$(MAKE) -s install-pre-commit-hooks
@@ -124,11 +121,6 @@ check-poetry:
 		exit 1; \
 	fi

-pull-docker-image:
-	@echo "$(YELLOW)Pulling Docker image...$(RESET)"
-	@docker pull $(DOCKER_IMAGE)
-	@echo "$(GREEN)Docker image pulled successfully.$(RESET)"
-
 install-python-dependencies:
 	@echo "$(GREEN)Installing Python dependencies...$(RESET)"
 	@if [ -z "${TZ}" ]; then \
@@ -141,7 +133,7 @@ install-python-dependencies:
 		export HNSWLIB_NO_NATIVE=1; \
 		poetry run pip install chroma-hnswlib; \
 	fi
-	@poetry install
+	@poetry install --without llama-index
 	@if [ -f "/etc/manjaro-release" ]; then \
 		echo "$(BLUE)Detected Manjaro Linux. Installing Playwright dependencies...$(RESET)"; \
 		poetry run pip install playwright; \
@@ -162,11 +154,8 @@ install-frontend-dependencies:
 	@echo "$(YELLOW)Setting up frontend environment...$(RESET)"
 	@echo "$(YELLOW)Detect Node.js version...$(RESET)"
 	@cd frontend && node ./scripts/detect-node-version.js
-	@cd frontend && \
-		echo "$(BLUE)Installing frontend dependencies with npm...$(RESET)" && \
-		npm install && \
-		echo "$(BLUE)Running make-i18n with npm...$(RESET)" && \
-		npm run make-i18n
+	echo "$(BLUE)Installing frontend dependencies with npm...$(RESET)"
+	@cd frontend && npm install
 	@echo "$(GREEN)Frontend dependencies installed successfully.$(RESET)"

 install-pre-commit-hooks:
@@ -249,16 +238,6 @@ setup-config-prompts:
 	 workspace_dir=$${workspace_dir:-$(DEFAULT_WORKSPACE_DIR)}; \
 	 echo "workspace_base=\"$$workspace_dir\"" >> $(CONFIG_FILE).tmp

-	@read -p "Do you want to persist the sandbox container? [true/false] [default: false]: " persist_sandbox; \
-	 persist_sandbox=$${persist_sandbox:-false}; \
-	 if [ "$$persist_sandbox" = "true" ]; then \
-		 read -p "Enter a password for the sandbox container: " ssh_password; \
-		 echo "ssh_password=\"$$ssh_password\"" >> $(CONFIG_FILE).tmp; \
-		 echo "persist_sandbox=$$persist_sandbox" >> $(CONFIG_FILE).tmp; \
-	 else \
-		echo "persist_sandbox=$$persist_sandbox" >> $(CONFIG_FILE).tmp; \
-	 fi
-
 	@echo "" >> $(CONFIG_FILE).tmp

 	@echo "[llm]" >> $(CONFIG_FILE).tmp
@@ -319,4 +298,4 @@ help:
 	@echo "  $(GREEN)help$(RESET)                - Display this help message, providing information on available targets."

 # Phony targets
-.PHONY: build check-dependencies check-python check-npm check-docker check-poetry pull-docker-image install-python-dependencies install-frontend-dependencies install-pre-commit-hooks lint start-backend start-frontend run run-wsl setup-config setup-config-prompts help
+.PHONY: build check-dependencies check-python check-npm check-docker check-poetry install-python-dependencies install-frontend-dependencies install-pre-commit-hooks lint start-backend start-frontend run run-wsl setup-config setup-config-prompts help
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@
  <a href="https://github.com/OpenDevin/OpenDevin/issues"><img src="https://img.shields.io/github/issues/opendevin/opendevin?style=for-the-badge&color=blue" alt="Issues"></a>
  <a href="https://github.com/OpenDevin/OpenDevin/blob/main/LICENSE"><img src="https://img.shields.io/github/license/opendevin/opendevin?style=for-the-badge&color=blue" alt="MIT License"></a>
  <br/>
-  <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2i1iqdag6-bVmvamiPA9EZUu7oCO6KhA"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
+  <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
  <a href="https://discord.gg/ESHStjSjD4"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community"></a>
  <a href="https://codecov.io/github/opendevin/opendevin?branch=main"><img alt="CodeCov" src="https://img.shields.io/codecov/c/github/opendevin/opendevin?style=for-the-badge"></a>
 </div>
@@ -33,8 +33,10 @@
 <div align="center">
  <img src="./docs/static/img/logo.png" alt="Logo" width="200" height="200">
  <h1 align="center">OpenDevin: Code Less, Make More</h1>
-  <a href="https://opendevin.github.io/OpenDevin/modules/usage/intro"><img src="https://img.shields.io/badge/Documentation-OpenDevin-blue?logo=googledocs&logoColor=white&style=for-the-badge" alt="Check out the documentation"></a>
-  <a href="https://huggingface.co/spaces/OpenDevin/evaluation"><img src="https://img.shields.io/badge/Evaluation-Benchmark%20on%20HF%20Space-green?style=for-the-badge" alt="Evaluation Benchmark"></a>
+  <a href="https://docs.all-hands.dev/modules/usage/intro"><img src="https://img.shields.io/badge/Documentation-OpenDevin-blue?logo=googledocs&logoColor=white&style=for-the-badge" alt="Check out the documentation"></a>
+  <a href="https://arxiv.org/abs/2407.16741"><img src="https://img.shields.io/badge/Paper-%20on%20Arxiv-red?logo=arxiv&style=for-the-badge" alt="Paper on Arxiv"></a>
+  <br/>
+  <a href="https://huggingface.co/spaces/OpenDevin/evaluation"><img src="https://img.shields.io/badge/Evaluation-Benchmark%20on%20HF%20Space-green?logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark"></a>
 </div>
 <hr>

@@ -45,7 +47,7 @@ OpenDevin agents collaborate with human developers to write code, fix bugs, and
 ![App screenshot](./docs/static/img/screenshot.png)

 ## ⚡ Getting Started
-OpenDevin works best with the most recent version of Docker, `26.0.0`.
+OpenDevin works best with Docker version 26.0.0+ (Docker Desktop 4.31.0+).
 You must be using Linux, Mac OS, or WSL on Windows.

 To start OpenDevin in a docker container, run the following commands in your terminal:
@@ -64,14 +66,14 @@ docker run -it \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name opendevin-app-$(date +%Y%m%d%H%M%S) \
-    ghcr.io/opendevin/opendevin
+    ghcr.io/opendevin/opendevin:0.8
 ```

 > [!NOTE]
 > By default, this command pulls the `latest` tag, which represents the most recent release of OpenDevin. You have other options as well:
 > - For a specific release version, use `ghcr.io/opendevin/opendevin:<OpenDevin_version>` (replace <OpenDevin_version> with the desired version number).
 > - For the most up-to-date development version, use `ghcr.io/opendevin/opendevin:main`. This version may be **(unstable!)** and is recommended for testing or development purposes only.
-> 
+>
 > Choose the tag that best suits your needs based on stability requirements and desired features.

 You'll find OpenDevin running at [http://localhost:3000](http://localhost:3000) with access to `./workspace`. To have OpenDevin operate on your code, place it in `./workspace`.
@@ -82,12 +84,12 @@ the `Settings` button (gear icon) in the UI. If the required `Model` does not ex

 For the development workflow, see [Development.md](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md).

-Are you having trouble? Check out our [Troubleshooting Guide](https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting).
+Are you having trouble? Check out our [Troubleshooting Guide](https://docs.all-hands.dev/modules/usage/troubleshooting).

 ## 🚀 Documentation

 To learn more about the project, and for tips on using OpenDevin,
-**check out our [documentation](https://opendevin.github.io/OpenDevin/modules/usage/intro)**.
+**check out our [documentation](https://docs.all-hands.dev/modules/usage/intro)**.

 There you'll find resources on how to use different LLM providers (like ollama and Anthropic's Claude),
 troubleshooting resources, and advanced configuration options.
@@ -109,7 +111,7 @@ For details, please check [CONTRIBUTING.md](./CONTRIBUTING.md).
 Whether you're a developer, a researcher, or simply enthusiastic about OpenDevin, we'd love to have you in our community.
 Let's make software engineering better together!

- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA) - Here we talk about research, architecture, and future development.
+- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw) - Here we talk about research, architecture, and future development.
 - [Discord server](https://discord.gg/ESHStjSjD4) - This is a community-run server for general discussion, questions, and feedback.

 ## 📈 Progress
@@ -138,12 +140,13 @@ Distributed under the MIT License. See [`LICENSE`](./LICENSE) for more informati
 ## 📚 Cite

 ```
-@misc{opendevin2024,
-  author       = {{OpenDevin Team}},
-  title        = {{OpenDevin: An Open Platform for AI Software Developers as Generalist Agents}},
-  year         = {2024},
-  version      = {v1.0},
-  howpublished = {\url{https://github.com/OpenDevin/OpenDevin}},
-  note         = {Accessed: ENTER THE DATE YOU ACCESSED THE PROJECT}
+@misc{opendevin,
+      title={{OpenDevin: An Open Platform for AI Software Developers as Generalist Agents}},
+      author={Xingyao Wang and Boxuan Li and Yufan Song and Frank F. Xu and Xiangru Tang and Mingchen Zhuge and Jiayi Pan and Yueqi Song and Bowen Li and Jaskirat Singh and Hoang H. Tran and Fuqiang Li and Ren Ma and Mingzhang Zheng and Bill Qian and Yanjun Shao and Niklas Muennighoff and Yizhe Zhang and Binyuan Hui and Junyang Lin and Robert Brennan and Hao Peng and Heng Ji and Graham Neubig},
+      year={2024},
+      eprint={2407.16741},
+      archivePrefix={arXiv},
+      primaryClass={cs.SE},
+      url={https://arxiv.org/abs/2407.16741},
 }
 ```
--- a/agenthub/README.md
+++ b/agenthub/README.md
@@ -1,4 +1,4 @@
-# Agent Framework Research
+# Agent Hub

 In this folder, there may exist multiple implementations of `Agent` that will be used by the framework.

--- a/agenthub/__init__.py
+++ b/agenthub/__init__.py
@@ -14,12 +14,10 @@ from . import (  # noqa: E402
    codeact_swe_agent,
    delegator_agent,
    dummy_agent,
-    monologue_agent,
    planner_agent,
 )

 __all__ = [
-    'monologue_agent',
    'codeact_agent',
    'codeact_swe_agent',
    'planner_agent',
--- a/agenthub/browsing_agent/browsing_agent.py
+++ b/agenthub/browsing_agent/browsing_agent.py
@@ -7,6 +7,7 @@ from agenthub.browsing_agent.response_parser import BrowsingResponseParser
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
 from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.message import Message, TextContent
 from opendevin.events.action import (
    Action,
    AgentFinishAction,
@@ -99,8 +100,7 @@ class BrowsingAgent(Agent):
        self,
        llm: LLM,
    ) -> None:
-        """
-        Initializes a new instance of the BrowsingAgent class.
+        """Initializes a new instance of the BrowsingAgent class.

        Parameters:
        - llm (LLM): The llm to be used by this agent
@@ -120,16 +120,13 @@ class BrowsingAgent(Agent):
        self.reset()

    def reset(self) -> None:
-        """
-        Resets the Browsing Agent.
-        """
+        """Resets the Browsing Agent."""
        super().reset()
        self.cost_accumulator = 0
        self.error_accumulator = 0

    def step(self, state: State) -> Action:
-        """
-        Performs one step using the Browsing Agent.
+        """Performs one step using the Browsing Agent.
        This includes gathering information on previous steps and prompting the model to make a browsing command to execute.

        Parameters:
@@ -140,7 +137,7 @@ class BrowsingAgent(Agent):
        - MessageAction(content) - Message action to run (e.g. ask for clarification)
        - AgentFinishAction() - end the interaction
        """
-        messages = []
+        messages: list[Message] = []
        prev_actions = []
        cur_axtree_txt = ''
        error_prefix = ''
@@ -195,20 +192,23 @@ class BrowsingAgent(Agent):
                )
                return MessageAction('Error encountered when browsing.')

-        if (goal := state.get_current_user_intent()) is None:
+        goal, _ = state.get_current_user_intent()
+
+        if goal is None:
            goal = state.inputs['task']
+
        system_msg = get_system_message(
            goal,
            self.action_space.describe(with_long_description=False, with_examples=True),
        )

-        messages.append({'role': 'system', 'content': system_msg})
+        messages.append(Message(role='system', content=[TextContent(text=system_msg)]))

        prompt = get_prompt(error_prefix, cur_axtree_txt, prev_action_str)
-        messages.append({'role': 'user', 'content': prompt})
+        messages.append(Message(role='user', content=[TextContent(text=prompt)]))
        logger.debug(prompt)
        response = self.llm.completion(
-            messages=messages,
+            messages=[message.model_dump() for message in messages],
            temperature=0.0,
            stop=[')```', ')\n```'],
        )
--- a/agenthub/browsing_agent/prompt.py
+++ b/agenthub/browsing_agent/prompt.py
@@ -75,7 +75,8 @@ class PromptElement:
    Prompt elements are used to build the prompt. Use flags to control which
    prompt elements are visible. We use class attributes as a convenient way
    to implement static prompts, but feel free to override them with instance
-    attributes or @property decorator."""
+    attributes or @property decorator.
+    """

    _prompt = ''
    _abstract_ex = ''
@@ -200,11 +201,10 @@ def fit_tokens(
    model_name : str, optional
        The name of the model used when tokenizing.

-    Returns
+    Returns:
    -------
    str : the prompt after shrinking.
    """
-
    if max_prompt_chars is None:
        return shrinkable.prompt

@@ -579,8 +579,8 @@ the form is not visible yet or some fields are disabled. I need to replan.
 def diff(previous, new):
    """Return a string showing the difference between original and new.

-    If the difference is above diff_threshold, return the diff string."""
-
+    If the difference is above diff_threshold, return the diff string.
+    """
    if previous == new:
        return 'Identical', []

--- a/agenthub/browsing_agent/response_parser.py
+++ b/agenthub/browsing_agent/response_parser.py
@@ -37,9 +37,8 @@ class BrowsingResponseParser(ResponseParser):


 class BrowsingActionParserMessage(ActionParser):
-    """
-    Parser action:
-        - BrowseInteractiveAction(browser_actions) - unexpected response format, message back to user
+    """Parser action:
+    - BrowseInteractiveAction(browser_actions) - unexpected response format, message back to user
    """

    def __init__(
@@ -60,9 +59,8 @@ class BrowsingActionParserMessage(ActionParser):


 class BrowsingActionParserBrowseInteractive(ActionParser):
-    """
-    Parser action:
-        - BrowseInteractiveAction(browser_actions) - handle send message to user function call in BrowserGym
+    """Parser action:
+    - BrowseInteractiveAction(browser_actions) - handle send message to user function call in BrowserGym
    """

    def __init__(
--- a/agenthub/browsing_agent/utils.py
+++ b/agenthub/browsing_agent/utils.py
@@ -7,7 +7,6 @@ import yaml

 def yaml_parser(message):
    """Parse a yaml message for the retry function."""
-
    # saves gpt-3.5 from some yaml parsing errors
    message = re.sub(r':\s*\n(?=\S|\n)', ': ', message)

@@ -47,7 +46,6 @@ def _compress_chunks(text, identifier, skip_list, split_regex='\n\n+'):

 def compress_string(text):
    """Compress a string by replacing redundant paragraphs and lines with identifiers."""
-
    # Perform paragraph-level compression
    def_dict, compressed_text = _compress_chunks(
        text, identifier='§', skip_list=[], split_regex='\n\n+'
@@ -79,12 +77,12 @@ def extract_html_tags(text, keys):
    keys : list of str
        The HTML tags to extract the content from.

-    Returns
+    Returns:
    -------
    dict
        A dictionary mapping each key to a list of subset in `text` that match the key.

-    Notes
+    Notes:
    -----
    All text and keys will be converted to lowercase before matching.

@@ -126,7 +124,7 @@ def parse_html_tags(text, keys=(), optional_keys=(), merge_multiple=False):
    optional_keys : list of str
        The HTML tags to extract the content from, but are optional.

-    Returns
+    Returns:
    -------
    dict
        A dictionary mapping each key to subset of `text` that match the key.
--- a/agenthub/codeact_agent/action_parser.py
+++ b/agenthub/codeact_agent/action_parser.py
@@ -12,13 +12,12 @@ from opendevin.events.action import (


 class CodeActResponseParser(ResponseParser):
-    """
-    Parser action:
-        - CmdRunAction(command) - bash command to run
-        - IPythonRunCellAction(code) - IPython code to run
-        - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
-        - MessageAction(content) - Message action to run (e.g. ask for clarification)
-        - AgentFinishAction() - end the interaction
+    """Parser action:
+    - CmdRunAction(command) - bash command to run
+    - IPythonRunCellAction(code) - IPython code to run
+    - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
+    - MessageAction(content) - Message action to run (e.g. ask for clarification)
+    - AgentFinishAction() - end the interaction
    """

    def __init__(self):
@@ -53,9 +52,8 @@ class CodeActResponseParser(ResponseParser):


 class CodeActActionParserFinish(ActionParser):
-    """
-    Parser action:
-        - AgentFinishAction() - end the interaction
+    """Parser action:
+    - AgentFinishAction() - end the interaction
    """

    def __init__(
@@ -76,10 +74,9 @@ class CodeActActionParserFinish(ActionParser):


 class CodeActActionParserCmdRun(ActionParser):
-    """
-    Parser action:
-        - CmdRunAction(command) - bash command to run
-        - AgentFinishAction() - end the interaction
+    """Parser action:
+    - CmdRunAction(command) - bash command to run
+    - AgentFinishAction() - end the interaction
    """

    def __init__(
@@ -101,14 +98,13 @@ class CodeActActionParserCmdRun(ActionParser):
        # a command was found
        command_group = self.bash_command.group(1).strip()
        if command_group.strip() == 'exit':
-            return AgentFinishAction()
+            return AgentFinishAction(thought=thought)
        return CmdRunAction(command=command_group, thought=thought)


 class CodeActActionParserIPythonRunCell(ActionParser):
-    """
-    Parser action:
-        - IPythonRunCellAction(code) - IPython code to run
+    """Parser action:
+    - IPythonRunCellAction(code) - IPython code to run
    """

    def __init__(
@@ -137,9 +133,8 @@ class CodeActActionParserIPythonRunCell(ActionParser):


 class CodeActActionParserAgentDelegate(ActionParser):
-    """
-    Parser action:
-        - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
+    """Parser action:
+    - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
    """

    def __init__(
@@ -164,9 +159,8 @@ class CodeActActionParserAgentDelegate(ActionParser):


 class CodeActActionParserMessage(ActionParser):
-    """
-    Parser action:
-        - MessageAction(content) - Message action to run (e.g. ask for clarification)
+    """Parser action:
+    - MessageAction(content) - Message action to run (e.g. ask for clarification)
    """

    def __init__(
--- a/agenthub/codeact_agent/codeact_agent.py
+++ b/agenthub/codeact_agent/codeact_agent.py
@@ -8,7 +8,7 @@ from agenthub.codeact_agent.prompt import (
 )
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import config
+from opendevin.core.message import ImageContent, Message, TextContent
 from opendevin.events.action import (
    Action,
    AgentDelegateAction,
@@ -22,6 +22,7 @@ from opendevin.events.observation import (
    CmdOutputObservation,
    IPythonRunCellObservation,
 )
+from opendevin.events.observation.observation import Observation
 from opendevin.events.serialization.event import truncate_content
 from opendevin.llm.llm import LLM
 from opendevin.runtime.plugins import (
@@ -34,62 +35,6 @@ from opendevin.runtime.tools import RuntimeTool
 ENABLE_GITHUB = True


-def action_to_str(action: Action) -> str:
-    if isinstance(action, CmdRunAction):
-        return f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
-    elif isinstance(action, IPythonRunCellAction):
-        return f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
-    elif isinstance(action, AgentDelegateAction):
-        return f'{action.thought}\n<execute_browse>\n{action.inputs["task"]}\n</execute_browse>'
-    elif isinstance(action, MessageAction):
-        return action.content
-    return ''
-
-
-def get_action_message(action: Action) -> dict[str, str] | None:
-    if (
-        isinstance(action, AgentDelegateAction)
-        or isinstance(action, CmdRunAction)
-        or isinstance(action, IPythonRunCellAction)
-        or isinstance(action, MessageAction)
-    ):
-        return {
-            'role': 'user' if action.source == 'user' else 'assistant',
-            'content': action_to_str(action),
-        }
-    return None
-
-
-def get_observation_message(obs) -> dict[str, str] | None:
-    max_message_chars = config.get_llm_config_from_agent(
-        'CodeActAgent'
-    ).max_message_chars
-    if isinstance(obs, CmdOutputObservation):
-        content = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
-        content += (
-            f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
-        )
-        return {'role': 'user', 'content': content}
-    elif isinstance(obs, IPythonRunCellObservation):
-        content = 'OBSERVATION:\n' + obs.content
-        # replace base64 images with a placeholder
-        splitted = content.split('\n')
-        for i, line in enumerate(splitted):
-            if '![image](data:image/png;base64,' in line:
-                splitted[i] = (
-                    '![image](data:image/png;base64, ...) already displayed to user'
-                )
-        content = '\n'.join(splitted)
-        content = truncate_content(content, max_message_chars)
-        return {'role': 'user', 'content': content}
-    elif isinstance(obs, AgentDelegateObservation):
-        content = 'OBSERVATION:\n' + truncate_content(
-            str(obs.outputs), max_message_chars
-        )
-        return {'role': 'user', 'content': content}
-    return None
-
-
 # FIXME: We can tweak these two settings to create MicroAgents specialized toward different area
 def get_system_message() -> str:
    if ENABLE_GITHUB:
@@ -110,7 +55,7 @@ class CodeActAgent(Agent):

    ### Overview

-    This agent implements the CodeAct idea ([paper](https://arxiv.org/abs/2402.13463), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a unified **code** action space for both *simplicity* and *performance* (see paper for more details).
+    This agent implements the CodeAct idea ([paper](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a unified **code** action space for both *simplicity* and *performance* (see paper for more details).

    The conceptual idea is illustrated below. At each turn, the agent can:

@@ -158,8 +103,7 @@ class CodeActAgent(Agent):
        self,
        llm: LLM,
    ) -> None:
-        """
-        Initializes a new instance of the CodeActAgent class.
+        """Initializes a new instance of the CodeActAgent class.

        Parameters:
        - llm (LLM): The llm to be used by this agent
@@ -167,15 +111,72 @@ class CodeActAgent(Agent):
        super().__init__(llm)
        self.reset()

+    def action_to_str(self, action: Action) -> str:
+        """Render an action as text; action types not handled here become ''."""
+        if isinstance(action, CmdRunAction):
+            return f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
+        if isinstance(action, IPythonRunCellAction):
+            return f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
+        if isinstance(action, AgentDelegateAction):
+            return f'{action.thought}\n<execute_browse>\n{action.inputs["task"]}\n</execute_browse>'
+        if isinstance(action, MessageAction):
+            return action.content
+        # a finish action is rendered only when its source is the agent
+        if isinstance(action, AgentFinishAction) and action.source == 'agent':
+            return action.thought
+        return ''
+
+    def get_action_message(self, action: Action) -> Message | None:
+        """Wrap an action in a Message; returns None for actions that are not rendered."""
+        rendered_types = (
+            AgentDelegateAction,
+            CmdRunAction,
+            IPythonRunCellAction,
+            MessageAction,
+        )
+        own_finish = isinstance(action, AgentFinishAction) and action.source == 'agent'
+        if not (isinstance(action, rendered_types) or own_finish):
+            return None
+
+        parts = [TextContent(text=self.action_to_str(action))]
+        # image URLs are only attached for message actions that carry them
+        if isinstance(action, MessageAction) and action.images_urls:
+            parts.append(ImageContent(image_urls=action.images_urls))
+        return Message(role='user' if action.source == 'user' else 'assistant', content=parts)
+
+    def get_observation_message(self, obs: Observation) -> Message | None:
+        """Turn an observation into a user-role Message; unhandled kinds give None.
+
+        Text is capped with truncate_content using the LLM config's max_message_chars.
+        """
+        limit = self.llm.config.max_message_chars
+
+        if isinstance(obs, CmdOutputObservation):
+            text = 'OBSERVATION:\n' + truncate_content(obs.content, limit)
+            text += f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
+        elif isinstance(obs, IPythonRunCellObservation):
+            # replace base64 images with a placeholder
+            marker = '![image](data:image/png;base64,'
+            shown = '![image](data:image/png;base64, ...) already displayed to user'
+            lines = [
+                shown if marker in line else line
+                for line in ('OBSERVATION:\n' + obs.content).split('\n')
+            ]
+            text = truncate_content('\n'.join(lines), limit)
+        elif isinstance(obs, AgentDelegateObservation):
+            text = 'OBSERVATION:\n' + truncate_content(str(obs.outputs), limit)
+        else:
+            # observation kinds not listed above produce no message
+            return None
+
+        return Message(role='user', content=[TextContent(text=text)])
+
    def reset(self) -> None:
-        """
-        Resets the CodeAct Agent.
-        """
+        """Resets the CodeAct Agent by delegating to the base class reset."""
        super().reset()

    def step(self, state: State) -> Action:
-        """
-        Performs one step using the CodeAct Agent.
+        """Performs one step using the CodeAct Agent.
        This includes gathering info on previous steps and prompting the model to make a command to execute.

        Parameters:
@@ -188,17 +189,16 @@ class CodeActAgent(Agent):
        - MessageAction(content) - Message action to run (e.g. ask for clarification)
        - AgentFinishAction() - end the interaction
        """
-
        # if we're done, go back
        latest_user_message = state.history.get_last_user_message()
        if latest_user_message and latest_user_message.strip() == '/exit':
            return AgentFinishAction()

        # prepare what we want to send to the LLM
-        messages: list[dict[str, str]] = self._get_messages(state)
+        messages = self._get_messages(state)

        response = self.llm.completion(
-            messages=messages,
+            messages=[message.model_dump() for message in messages],
            stop=[
                '</execute_ipython>',
                '</execute_bash>',
@@ -208,34 +208,61 @@ class CodeActAgent(Agent):
        )
        return self.action_parser.parse(response)

-    def _get_messages(self, state: State) -> list[dict[str, str]]:
-        messages = [
-            {'role': 'system', 'content': self.system_message},
-            {'role': 'user', 'content': self.in_context_example},
+    def _get_messages(self, state: State) -> list[Message]:
+        """Build the prompt: system and example messages, event history, then a turn reminder."""
+        messages: list[Message] = [
+            Message(role='system', content=[TextContent(text=self.system_message)]),
+            Message(role='user', content=[TextContent(text=self.in_context_example)]),
        ]

        for event in state.history.get_events():
            # create a regular message from an event
-            message = (
-                get_action_message(event)
-                if isinstance(event, Action)
-                else get_observation_message(event)
-            )
+            if isinstance(event, Action):
+                message = self.get_action_message(event)
+            elif isinstance(event, Observation):
+                message = self.get_observation_message(event)
+            else:
+                raise ValueError(f'Unknown event type: {type(event)}')

            # add regular message
            if message:
-                messages.append(message)
+                # Merge into the previous message when the role repeats; consecutive
+                # same-role messages were rejected with litellm.exceptions.BadRequestError:
+                # OpenAIException - Error code: 400 - {'detail': 'Only supports u/a/u/a/u...'}
+                if messages and messages[-1].role == message.role:
+                    messages[-1].content.extend(message.content)
+                else:
+                    messages.append(message)

        # the latest user message is important:
        # we want to remind the agent of the environment constraints
        latest_user_message = next(
-            (m for m in reversed(messages) if m['role'] == 'user'), None
+            (
+                m
+                for m in reversed(messages)
+                if m.role == 'user'
+                and any(isinstance(c, TextContent) for c in m.content)
+            ),
+            None,
        )

-        # add a reminder to the prompt
+        # Get the last user text inside content
        if latest_user_message:
-            latest_user_message['content'] += (
-                f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>'
+            # default to None (not StopIteration) so the append branch below stays reachable
+            latest_user_message_text = next(
+                (
+                    t for t in reversed(latest_user_message.content)
+                    if isinstance(t, TextContent)
+                ),
+                None,
            )
+            # add a reminder to the prompt
+            reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
+
+            if latest_user_message_text:
+                latest_user_message_text.text += reminder_text
+            else:
+                latest_user_message_text = TextContent(text=reminder_text)
+                latest_user_message.content.append(latest_user_message_text)

        return messages
--- a/agenthub/codeact_swe_agent/action_parser.py
+++ b/agenthub/codeact_swe_agent/action_parser.py
@@ -11,9 +11,8 @@ from opendevin.events.action import (


 class CodeActSWEActionParserFinish(ActionParser):
-    """
-    Parser action:
-        - AgentFinishAction() - end the interaction
+    """Parser action:
+    - AgentFinishAction() - end the interaction
    """

    def __init__(
@@ -34,10 +33,9 @@ class CodeActSWEActionParserFinish(ActionParser):


 class CodeActSWEActionParserCmdRun(ActionParser):
-    """
-    Parser action:
-        - CmdRunAction(command) - bash command to run
-        - AgentFinishAction() - end the interaction
+    """Parser action:
+    - CmdRunAction(command) - bash command to run
+    - AgentFinishAction() - end the interaction
    """

    def __init__(
@@ -64,9 +62,8 @@ class CodeActSWEActionParserCmdRun(ActionParser):


 class CodeActSWEActionParserIPythonRunCell(ActionParser):
-    """
-    Parser action:
-        - IPythonRunCellAction(code) - IPython code to run
+    """Parser action:
+    - IPythonRunCellAction(code) - IPython code to run
    """

    def __init__(
@@ -95,9 +92,8 @@ class CodeActSWEActionParserIPythonRunCell(ActionParser):


 class CodeActSWEActionParserMessage(ActionParser):
-    """
-    Parser action:
-        - MessageAction(content) - Message action to run (e.g. ask for clarification)
+    """Parser action:
+    - MessageAction(content) - Message action to run (e.g. ask for clarification)
    """

    def __init__(
--- a/agenthub/codeact_swe_agent/codeact_swe_agent.py
+++ b/agenthub/codeact_swe_agent/codeact_swe_agent.py
@@ -7,7 +7,7 @@ from agenthub.codeact_swe_agent.prompt import (
 from agenthub.codeact_swe_agent.response_parser import CodeActSWEResponseParser
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import config
+from opendevin.core.message import ImageContent, Message, TextContent
 from opendevin.events.action import (
    Action,
    AgentFinishAction,
@@ -19,6 +19,7 @@ from opendevin.events.observation import (
    CmdOutputObservation,
    IPythonRunCellObservation,
 )
+from opendevin.events.observation.observation import Observation
 from opendevin.events.serialization.event import truncate_content
 from opendevin.llm.llm import LLM
 from opendevin.runtime.plugins import (
@@ -29,54 +30,6 @@ from opendevin.runtime.plugins import (
 from opendevin.runtime.tools import RuntimeTool


-def action_to_str(action: Action) -> str:
-    if isinstance(action, CmdRunAction):
-        return f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
-    elif isinstance(action, IPythonRunCellAction):
-        return f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
-    elif isinstance(action, MessageAction):
-        return action.content
-    return ''
-
-
-def get_action_message(action: Action) -> dict[str, str] | None:
-    if (
-        isinstance(action, CmdRunAction)
-        or isinstance(action, IPythonRunCellAction)
-        or isinstance(action, MessageAction)
-    ):
-        return {
-            'role': 'user' if action.source == 'user' else 'assistant',
-            'content': action_to_str(action),
-        }
-    return None
-
-
-def get_observation_message(obs) -> dict[str, str] | None:
-    max_message_chars = config.get_llm_config_from_agent(
-        'CodeActSWEAgent'
-    ).max_message_chars
-    if isinstance(obs, CmdOutputObservation):
-        content = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
-        content += (
-            f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
-        )
-        return {'role': 'user', 'content': content}
-    elif isinstance(obs, IPythonRunCellObservation):
-        content = 'OBSERVATION:\n' + obs.content
-        # replace base64 images with a placeholder
-        splitted = content.split('\n')
-        for i, line in enumerate(splitted):
-            if '![image](data:image/png;base64,' in line:
-                splitted[i] = (
-                    '![image](data:image/png;base64, ...) already displayed to user'
-                )
-        content = '\n'.join(splitted)
-        content = truncate_content(content, max_message_chars)
-        return {'role': 'user', 'content': content}
-    return None
-
-
 def get_system_message() -> str:
    return f'{SYSTEM_PREFIX}\n\n{COMMAND_DOCS}\n\n{SYSTEM_SUFFIX}'

@@ -113,8 +66,7 @@ class CodeActSWEAgent(Agent):
        self,
        llm: LLM,
    ) -> None:
-        """
-        Initializes a new instance of the CodeActAgent class.
+        """Initializes a new instance of the CodeActAgent class.

        Parameters:
        - llm (LLM): The llm to be used by this agent
@@ -122,15 +74,62 @@ class CodeActSWEAgent(Agent):
        super().__init__(llm)
        self.reset()

+    def action_to_str(self, action: Action) -> str:
+        """Render an action as text; action types not handled here become ''."""
+        if isinstance(action, CmdRunAction):
+            return f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
+        if isinstance(action, IPythonRunCellAction):
+            return f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
+        if isinstance(action, MessageAction):
+            return action.content
+        # any other action type has no textual form here
+        return ''
+
+    def get_action_message(self, action: Action) -> Message | None:
+        """Wrap an action in a Message; returns None for actions that are not rendered."""
+        rendered_types = (
+            CmdRunAction,
+            IPythonRunCellAction,
+            MessageAction,
+        )
+        if not isinstance(action, rendered_types):
+            return None
+
+        parts = [TextContent(text=self.action_to_str(action))]
+        # image URLs are only attached for message actions that carry them
+        if isinstance(action, MessageAction) and action.images_urls:
+            parts.append(ImageContent(image_urls=action.images_urls))
+        sender = 'user' if action.source == 'user' else 'assistant'
+        return Message(role=sender, content=parts)
+
+    def get_observation_message(self, obs: Observation) -> Message | None:
+        """Turn an observation into a user-role Message; unhandled kinds give None."""
+        limit = self.llm.config.max_message_chars
+
+        if isinstance(obs, CmdOutputObservation):
+            text = 'OBSERVATION:\n' + truncate_content(obs.content, limit)
+            text += f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
+        elif isinstance(obs, IPythonRunCellObservation):
+            # replace base64 images with a placeholder
+            marker = '![image](data:image/png;base64,'
+            shown = '![image](data:image/png;base64, ...) already displayed to user'
+            lines = [
+                shown if marker in line else line
+                for line in ('OBSERVATION:\n' + obs.content).split('\n')
+            ]
+            text = truncate_content('\n'.join(lines), limit)
+        else:
+            # observation kinds not listed above produce no message
+            return None
+
+        return Message(role='user', content=[TextContent(text=text)])
+
    def reset(self) -> None:
-        """
-        Resets the CodeAct Agent.
-        """
+        """Resets the CodeActSWE Agent by delegating to the base class reset."""
        super().reset()

    def step(self, state: State) -> Action:
-        """
-        Performs one step using the CodeAct Agent.
+        """Performs one step using the CodeAct Agent.
        This includes gathering info on previous steps and prompting the model to make a command to execute.

        Parameters:
@@ -142,17 +141,16 @@ class CodeActSWEAgent(Agent):
        - MessageAction(content) - Message action to run (e.g. ask for clarification)
        - AgentFinishAction() - end the interaction
        """
-
        # if we're done, go back
        latest_user_message = state.history.get_last_user_message()
        if latest_user_message and latest_user_message.strip() == '/exit':
            return AgentFinishAction()

        # prepare what we want to send to the LLM
-        messages: list[dict[str, str]] = self._get_messages(state)
+        messages: list[Message] = self._get_messages(state)

        response = self.llm.completion(
-            messages=messages,
+            messages=[message.model_dump() for message in messages],
            stop=[
                '</execute_ipython>',
                '</execute_bash>',
@@ -162,34 +160,55 @@ class CodeActSWEAgent(Agent):

        return self.response_parser.parse(response)

-    def _get_messages(self, state: State) -> list[dict[str, str]]:
-        messages = [
-            {'role': 'system', 'content': self.system_message},
-            {'role': 'user', 'content': self.in_context_example},
+    def _get_messages(self, state: State) -> list[Message]:
+        """Build the prompt: system and example messages, event history, then a turn reminder."""
+        messages: list[Message] = [
+            Message(role='system', content=[TextContent(text=self.system_message)]),
+            Message(role='user', content=[TextContent(text=self.in_context_example)]),
        ]

        for event in state.history.get_events():
            # create a regular message from an event
-            message = (
-                get_action_message(event)
-                if isinstance(event, Action)
-                else get_observation_message(event)
-            )
+            if isinstance(event, Action):
+                message = self.get_action_message(event)
+            elif isinstance(event, Observation):
+                message = self.get_observation_message(event)
+            else:
+                raise ValueError(f'Unknown event type: {type(event)}')

            # add regular message
            if message:
-                messages.append(message)
+                # Merge into the previous message when the role repeats; consecutive
+                # same-role messages were rejected with litellm.exceptions.BadRequestError:
+                # OpenAIException - Error code: 400 - {'detail': 'Only supports u/a/u/a/u...'}
+                if messages and messages[-1].role == message.role:
+                    messages[-1].content.extend(message.content)
+                else:
+                    messages.append(message)

        # the latest user message is important:
        # we want to remind the agent of the environment constraints
        latest_user_message = next(
-            (m for m in reversed(messages) if m['role'] == 'user'), None
+            (m for m in reversed(messages) if m.role == 'user'), None
        )

-        # add a reminder to the prompt
+        # Get the last user text inside content
        if latest_user_message:
-            latest_user_message['content'] += (
-                f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task.'
+            # default to None (not StopIteration) so a message without text gets one appended
+            latest_user_message_text = next(
+                (
+                    t for t in reversed(latest_user_message.content)
+                    if isinstance(t, TextContent)
+                ),
+                None,
            )
+            # add a reminder to the prompt
+            reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
+
+            if latest_user_message_text:
+                latest_user_message_text.text += reminder_text
+            else:
+                latest_user_message_text = TextContent(text=reminder_text)
+                latest_user_message.content.append(latest_user_message_text)

        return messages
--- a/agenthub/codeact_swe_agent/response_parser.py
+++ b/agenthub/codeact_swe_agent/response_parser.py
@@ -9,12 +9,11 @@ from opendevin.events.action import Action


 class CodeActSWEResponseParser(ResponseParser):
-    """
-    Parser action:
-        - CmdRunAction(command) - bash command to run
-        - IPythonRunCellAction(code) - IPython code to run
-        - MessageAction(content) - Message action to run (e.g. ask for clarification)
-        - AgentFinishAction() - end the interaction
+    """Parser action:
+    - CmdRunAction(command) - bash command to run
+    - IPythonRunCellAction(code) - IPython code to run
+    - MessageAction(content) - Message action to run (e.g. ask for clarification)
+    - AgentFinishAction() - end the interaction
    """

    def __init__(self):
--- a/agenthub/delegator_agent/agent.py
+++ b/agenthub/delegator_agent/agent.py
@@ -14,8 +14,7 @@ class DelegatorAgent(Agent):
    current_delegate: str = ''

    def __init__(self, llm: LLM):
-        """
-        Initialize the Delegator Agent with an LLM
+        """Initialize the Delegator Agent with an LLM

        Parameters:
        - llm (LLM): The llm to be used by this agent
@@ -23,8 +22,7 @@ class DelegatorAgent(Agent):
        super().__init__(llm)

    def step(self, state: State) -> Action:
-        """
-        Checks to see if current step is completed, returns AgentFinishAction if True.
+        """Checks to see if current step is completed, returns AgentFinishAction if True.
        Otherwise, delegates the task to the next agent in the pipeline.

        Parameters:
@@ -36,7 +34,7 @@ class DelegatorAgent(Agent):
        """
        if self.current_delegate == '':
            self.current_delegate = 'study'
-            task = state.get_current_user_intent()
+            task, _ = state.get_current_user_intent()
            return AgentDelegateAction(
                agent='StudyRepoForTaskAgent', inputs={'task': task}
            )
@@ -47,7 +45,7 @@ class DelegatorAgent(Agent):
        if not isinstance(last_observation, AgentDelegateObservation):
            raise Exception('Last observation is not an AgentDelegateObservation')

-        goal = state.get_current_user_intent()
+        goal, _ = state.get_current_user_intent()
        if self.current_delegate == 'study':
            self.current_delegate = 'coder'
            return AgentDelegateAction(
--- a/agenthub/dummy_agent/agent.py
+++ b/agenthub/dummy_agent/agent.py
@@ -1,8 +1,8 @@
-import time
-from typing import TypedDict
+from typing import TypedDict, Union

 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
+from opendevin.core.schema import AgentState
 from opendevin.events.action import (
    Action,
    AddTaskAction,
@@ -17,6 +17,7 @@ from opendevin.events.action import (
    ModifyTaskAction,
 )
 from opendevin.events.observation import (
+    AgentStateChangedObservation,
    CmdOutputObservation,
    FileReadObservation,
    FileWriteObservation,
@@ -48,32 +49,40 @@ class DummyAgent(Agent):
        super().__init__(llm)
        self.steps: list[ActionObs] = [
            {
-                'action': AddTaskAction(parent='0', goal='check the current directory'),
-                'observations': [NullObservation('')],
+                'action': AddTaskAction(
+                    parent='None', goal='check the current directory'
+                ),
+                'observations': [],
            },
            {
-                'action': AddTaskAction(parent='0.0', goal='run ls'),
-                'observations': [NullObservation('')],
+                'action': AddTaskAction(parent='0', goal='run ls'),
+                'observations': [],
            },
            {
-                'action': ModifyTaskAction(task_id='0.0', state='in_progress'),
-                'observations': [NullObservation('')],
+                'action': ModifyTaskAction(task_id='0', state='in_progress'),
+                'observations': [],
            },
            {
                'action': MessageAction('Time to get started!'),
-                'observations': [NullObservation('')],
+                'observations': [],
            },
            {
                'action': CmdRunAction(command='echo "foo"'),
                'observations': [
-                    CmdOutputObservation('foo', command_id=-1, command='echo "foo"')
+                    CmdOutputObservation(
+                        'foo', command_id=-1, command='echo "foo"', exit_code=0
+                    )
                ],
            },
            {
                'action': FileWriteAction(
                    content='echo "Hello, World!"', path='hello.sh'
                ),
-                'observations': [FileWriteObservation('', path='hello.sh')],
+                'observations': [
+                    FileWriteObservation(
+                        content='echo "Hello, World!"', path='hello.sh'
+                    )
+                ],
            },
            {
                'action': FileReadAction(path='hello.sh'),
@@ -85,14 +94,17 @@ class DummyAgent(Agent):
                'action': CmdRunAction(command='bash hello.sh'),
                'observations': [
                    CmdOutputObservation(
-                        'Hello, World!', command_id=-1, command='bash hello.sh'
+                        'bash: hello.sh: No such file or directory',
+                        command_id=-1,
+                        command='bash workspace/hello.sh',
+                        exit_code=127,
                    )
                ],
            },
            {
                'action': BrowseURLAction(url='https://google.com'),
                'observations': [
-                    # BrowserOutputObservation('<html></html>', url='https://google.com', screenshot=""),
+                    # BrowserOutputObservation('<html><body>Simulated Google page</body></html>',url='https://google.com',screenshot=''),
                ],
            },
            {
@@ -100,47 +112,99 @@ class DummyAgent(Agent):
                    browser_actions='goto("https://google.com")'
                ),
                'observations': [
-                    # BrowserOutputObservation('<html></html>', url='https://google.com', screenshot=""),
+                    # BrowserOutputObservation('<html><body>Simulated Google page after interaction</body></html>',url='https://google.com',screenshot=''),
                ],
            },
            {
-                'action': AgentFinishAction(),
-                'observations': [],
+                'action': AgentRejectAction(),
+                'observations': [NullObservation('')],
            },
            {
-                'action': AgentRejectAction(),
-                'observations': [],
+                'action': AgentFinishAction(
+                    outputs={}, thought='Task completed', action='finish'
+                ),
+                'observations': [AgentStateChangedObservation('', AgentState.FINISHED)],
            },
        ]

    def step(self, state: State) -> Action:
-        time.sleep(0.1)
+        if state.iteration >= len(self.steps):
+            return AgentFinishAction()
+
+        current_step = self.steps[state.iteration]
+        action = current_step['action']
+
+        # If the action is AddTaskAction or ModifyTaskAction, update the parent ID or task_id
+        if isinstance(action, AddTaskAction):
+            if action.parent == 'None':
+                action.parent = ''  # Root task has no parent
+            elif action.parent == '0':
+                action.parent = state.root_task.id
+            elif action.parent.startswith('0.'):
+                action.parent = f'{state.root_task.id}{action.parent[1:]}'
+        elif isinstance(action, ModifyTaskAction):
+            if action.task_id == '0':
+                action.task_id = state.root_task.id
+            elif action.task_id.startswith('0.'):
+                action.task_id = f'{state.root_task.id}{action.task_id[1:]}'
+            # Ensure the task_id doesn't start with a dot
+            if action.task_id.startswith('.'):
+                action.task_id = action.task_id[1:]
+        elif isinstance(action, (BrowseURLAction, BrowseInteractiveAction)):
+            try:
+                return self.simulate_browser_action(action)
+            except (
+                Exception
+            ):  # This could be a specific exception for browser unavailability
+                return self.handle_browser_unavailable(action)
+
        if state.iteration > 0:
            prev_step = self.steps[state.iteration - 1]

-            # a step is (action, observations list)
-            if 'observations' in prev_step:
-                # one obs, at most
+            if 'observations' in prev_step and prev_step['observations']:
                expected_observations = prev_step['observations']
-
-                # check if the history matches the expected observations
                hist_events = state.history.get_last_events(len(expected_observations))
-                for i in range(len(expected_observations)):
+
+                if len(hist_events) < len(expected_observations):
+                    print(
+                        f'Warning: Expected {len(expected_observations)} observations, but got {len(hist_events)}'
+                    )
+
+                for i in range(min(len(expected_observations), len(hist_events))):
                    hist_obs = event_to_dict(hist_events[i])
                    expected_obs = event_to_dict(expected_observations[i])
-                    if (
-                        'command_id' in hist_obs['extras']
-                        and hist_obs['extras']['command_id'] != -1
-                    ):
-                        del hist_obs['extras']['command_id']
-                        hist_obs['content'] = ''
-                    if (
-                        'command_id' in expected_obs['extras']
-                        and expected_obs['extras']['command_id'] != -1
-                    ):
-                        del expected_obs['extras']['command_id']
-                        expected_obs['content'] = ''
-                    assert (
-                        hist_obs == expected_obs
-                    ), f'Expected observation {expected_obs}, got {hist_obs}'
-        return self.steps[state.iteration]['action']
+
+                    # Remove dynamic fields for comparison
+                    for obs in [hist_obs, expected_obs]:
+                        obs.pop('id', None)
+                        obs.pop('timestamp', None)
+                        obs.pop('cause', None)
+                        obs.pop('source', None)
+                        if 'extras' in obs:
+                            obs['extras'].pop('command_id', None)
+
+                    if hist_obs != expected_obs:
+                        print(
+                            f'Warning: Observation mismatch. Expected {expected_obs}, got {hist_obs}'
+                        )
+
+        return action
+
+    def simulate_browser_action(
+        self, action: Union[BrowseURLAction, BrowseInteractiveAction]
+    ) -> Action:
+        # Instead of simulating, we'll reject the browser action
+        return self.handle_browser_unavailable(action)
+
+    def handle_browser_unavailable(
+        self, action: Union[BrowseURLAction, BrowseInteractiveAction]
+    ) -> Action:
+        # Create a message action to inform that browsing is not available
+        message = 'Browser actions are not available in the DummyAgent environment.'
+        if isinstance(action, BrowseURLAction):
+            message += f' Unable to browse URL: {action.url}'
+        elif isinstance(action, BrowseInteractiveAction):
+            message += (
+                f' Unable to perform interactive browsing: {action.browser_actions}'
+            )
+        return MessageAction(content=message)
--- a/agenthub/micro/agent.py
+++ b/agenthub/micro/agent.py
@@ -2,7 +2,7 @@ from jinja2 import BaseLoader, Environment

 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import config
+from opendevin.core.message import ImageContent, Message, TextContent
 from opendevin.core.utils import json
 from opendevin.events.action import Action
 from opendevin.events.serialization.action import action_from_dict
@@ -23,40 +23,37 @@ def parse_response(orig_response: str) -> Action:


 def to_json(obj, **kwargs):
-    """
-    Serialize an object to str format
-    """
+    """Serialize an object to str format"""
    return json.dumps(obj, **kwargs)


-def history_to_json(history: ShortTermHistory, max_events=20, **kwargs):
-    """
-    Serialize and simplify history to str format
-    """
-    # TODO: get agent specific llm config
-    llm_config = config.get_llm_config()
-    max_message_chars = llm_config.max_message_chars
-
-    processed_history = []
-    event_count = 0
-
-    for event in history.get_events(reverse=True):
-        if event_count >= max_events:
-            break
-        processed_history.append(event_to_memory(event, max_message_chars))
-        event_count += 1
-
-    # history is in reverse order, let's fix it
-    processed_history.reverse()
-
-    return json.dumps(processed_history, **kwargs)
-
-
 class MicroAgent(Agent):
    VERSION = '1.0'
    prompt = ''
    agent_definition: dict = {}

+    def history_to_json(
+        self, history: ShortTermHistory, max_events: int = 20, **kwargs
+    ):
+        """
+        Serialize and simplify history to str format
+        """
+        processed_history = []
+        event_count = 0
+
+        for event in history.get_events(reverse=True):
+            if event_count >= max_events:
+                break
+            processed_history.append(
+                event_to_memory(event, self.llm.config.max_message_chars)
+            )
+            event_count += 1
+
+        # history is in reverse order, let's fix it
+        processed_history.reverse()
+
+        return json.dumps(processed_history, **kwargs)
+
    def __init__(self, llm: LLM):
        super().__init__(llm)
        if 'name' not in self.agent_definition:
@@ -66,16 +63,20 @@ class MicroAgent(Agent):
        del self.delegates[self.agent_definition['name']]

    def step(self, state: State) -> Action:
+        last_user_message, last_image_urls = state.get_current_user_intent()
        prompt = self.prompt_template.render(
            state=state,
            instructions=instructions,
            to_json=to_json,
-            history_to_json=history_to_json,
+            history_to_json=self.history_to_json,
            delegates=self.delegates,
-            latest_user_message=state.get_current_user_intent(),
+            latest_user_message=last_user_message,
        )
-        messages = [{'content': prompt, 'role': 'user'}]
-        resp = self.llm.completion(messages=messages)
+        content = [TextContent(text=prompt)]
+        if last_image_urls:
+            content.append(ImageContent(image_urls=last_image_urls))
+        message = Message(role='user', content=content)
+        resp = self.llm.completion(messages=[message.model_dump()])
        action_resp = resp['choices'][0]['message']['content']
        action = parse_response(action_resp)
        return action
--- a/agenthub/micro/commit_writer/README.md
+++ b/agenthub/micro/commit_writer/README.md
@@ -3,7 +3,7 @@
 CommitWriterAgent can help write git commit message. Example:

 ```bash
-WORKSPACE_MOUNT_PATH="`PWD`" SANDBOX_BOX_TYPE="ssh" \
+WORKSPACE_MOUNT_PATH="`PWD`" \
  poetry run python opendevin/core/main.py -t "dummy task" -c CommitWriterAgent -d ./
 ```

--- a/agenthub/monologue_agent/.dockerfileignore
+++ b/agenthub/monologue_agent/.dockerfileignore
--- a/agenthub/monologue_agent/.dockerignore
+++ b/agenthub/monologue_agent/.dockerignore
@@ -1,2 +0,0 @@
-.envrc
-workspace
--- a/agenthub/monologue_agent/README.md
+++ b/agenthub/monologue_agent/README.md
@@ -1,8 +0,0 @@
-# LLM control loop
-This is currently a standalone utility. It will need to be integrated into OpenDevin's backend.
-
-## Usage
-```bash
-# Run this in project root
-./agenthub/monologue_agent/build-and-run.sh "write a bash script that prints 'hello world'"
-```
--- a/agenthub/monologue_agent/TODO.md
+++ b/agenthub/monologue_agent/TODO.md
@@ -1,8 +0,0 @@
-# TODO
-There's a lot of low-hanging fruit for this agent:
-
-* Strip `<script>`, `<style>`, and other non-text tags from the HTML before sending it to the LLM
-* Keep track of the working directory when the agent uses `cd`
-* Improve memory condensing--condense earlier memories more aggressively
-* Limit the time that `run` can wait (in case agent runs an interactive command and it's hanging)
-* Figure out how to run background processes, e.g. `node server.js` to start a server
--- a/agenthub/monologue_agent/__init__.py
+++ b/agenthub/monologue_agent/__init__.py
@@ -1,5 +0,0 @@
-from opendevin.controller.agent import Agent
-
-from .agent import MonologueAgent
-
-Agent.register('MonologueAgent', MonologueAgent)
--- a/agenthub/monologue_agent/agent.py
+++ b/agenthub/monologue_agent/agent.py
@@ -1,191 +0,0 @@
-import agenthub.monologue_agent.utils.prompts as prompts
-from agenthub.monologue_agent.response_parser import MonologueResponseParser
-from agenthub.monologue_agent.utils.prompts import INITIAL_THOUGHTS
-from opendevin.controller.agent import Agent
-from opendevin.controller.state.state import State
-from opendevin.core.config import config
-from opendevin.core.exceptions import AgentNoInstructionError
-from opendevin.core.schema import ActionType
-from opendevin.events.action import (
-    Action,
-    BrowseURLAction,
-    CmdRunAction,
-    FileReadAction,
-    FileWriteAction,
-    MessageAction,
-    NullAction,
-)
-from opendevin.events.observation import (
-    BrowserOutputObservation,
-    CmdOutputObservation,
-    FileReadObservation,
-    NullObservation,
-    Observation,
-)
-from opendevin.events.serialization.event import event_to_memory
-from opendevin.llm.llm import LLM
-from opendevin.memory.condenser import MemoryCondenser
-from opendevin.runtime.tools import RuntimeTool
-
-if config.get_agent_config('MonologueAgent').memory_enabled:
-    from opendevin.memory.memory import LongTermMemory
-
-
-class MonologueAgent(Agent):
-    VERSION = '1.0'
-    """
-    The Monologue Agent utilizes long and short term memory to complete tasks.
-    Long term memory is stored as a LongTermMemory object and the model uses it to search for examples from the past.
-    Short term memory is stored as a Monologue object and the model can condense it as necessary.
-    """
-
-    _initialized = False
-    initial_thoughts: list[dict[str, str]]
-    memory: 'LongTermMemory | None'
-    memory_condenser: MemoryCondenser
-    runtime_tools: list[RuntimeTool] = [RuntimeTool.BROWSER]
-    response_parser = MonologueResponseParser()
-
-    def __init__(self, llm: LLM):
-        """
-        Initializes the Monologue Agent with an llm.
-
-        Parameters:
-        - llm (LLM): The llm to be used by this agent
-        """
-        super().__init__(llm)
-
-    def _initialize(self, task: str):
-        """
-        Utilizes the INITIAL_THOUGHTS list to give the agent a context for its capabilities
-        and how to navigate the WORKSPACE_MOUNT_PATH_IN_SANDBOX in `config` (e.g., /workspace by default).
-        Short circuited to return when already initialized.
-        Will execute again when called after reset.
-
-        Parameters:
-        - task: The initial goal statement provided by the user
-
-        Raises:
-        - AgentNoInstructionError: If task is not provided
-        """
-
-        if self._initialized:
-            return
-
-        if task is None or task == '':
-            raise AgentNoInstructionError()
-
-        self.initial_thoughts = []
-        if config.get_agent_config('MonologueAgent').memory_enabled:
-            self.memory = LongTermMemory()
-        else:
-            self.memory = None
-
-        self.memory_condenser = MemoryCondenser()
-
-        self._add_initial_thoughts(task)
-        self._initialized = True
-
-    def _add_initial_thoughts(self, task):
-        max_message_chars = config.get_llm_config_from_agent(
-            'MonologueAgent'
-        ).max_message_chars
-        previous_action = ''
-        for thought in INITIAL_THOUGHTS:
-            thought = thought.replace('$TASK', task)
-            if previous_action != '':
-                observation: Observation = NullObservation(content='')
-                if previous_action in {ActionType.RUN, ActionType.PUSH}:
-                    observation = CmdOutputObservation(
-                        content=thought, command_id=0, command=''
-                    )
-                elif previous_action == ActionType.READ:
-                    observation = FileReadObservation(content=thought, path='')
-                elif previous_action == ActionType.BROWSE:
-                    observation = BrowserOutputObservation(
-                        content=thought, url='', screenshot=''
-                    )
-                self.initial_thoughts.append(
-                    event_to_memory(observation, max_message_chars)
-                )
-                previous_action = ''
-            else:
-                action: Action = NullAction()
-                if thought.startswith('RUN'):
-                    command = thought.split('RUN ')[1]
-                    action = CmdRunAction(command)
-                    previous_action = ActionType.RUN
-                elif thought.startswith('WRITE'):
-                    parts = thought.split('WRITE ')[1].split(' > ')
-                    path = parts[1]
-                    content = parts[0]
-                    action = FileWriteAction(path=path, content=content)
-                elif thought.startswith('READ'):
-                    path = thought.split('READ ')[1]
-                    action = FileReadAction(path=path)
-                    previous_action = ActionType.READ
-                elif thought.startswith('BROWSE'):
-                    url = thought.split('BROWSE ')[1]
-                    action = BrowseURLAction(url=url)
-                    previous_action = ActionType.BROWSE
-                else:
-                    action = MessageAction(thought)
-                self.initial_thoughts.append(event_to_memory(action, max_message_chars))
-
-    def step(self, state: State) -> Action:
-        """
-        Modifies the current state by adding the most recent actions and observations, then prompts the model to think about it's next action to take using monologue, memory, and hint.
-
-        Parameters:
-        - state (State): The current state based on previous steps taken
-
-        Returns:
-        - Action: The next action to take based on LLM response
-        """
-        max_message_chars = config.get_llm_config_from_agent(
-            'MonologueAgent'
-        ).max_message_chars
-        goal = state.get_current_user_intent()
-        self._initialize(goal)
-
-        recent_events: list[dict[str, str]] = []
-
-        # add the events from state.history
-        for event in state.history.get_events():
-            recent_events.append(event_to_memory(event, max_message_chars))
-
-        # add the last messages to long term memory
-        if self.memory is not None:
-            last_action = state.history.get_last_action()
-            last_observation = state.history.get_last_observation()
-
-            # this should still work
-            # we will need to do this differently: find out if there really is an action or an observation in this step
-            if last_action:
-                self.memory.add_event(event_to_memory(last_action, max_message_chars))
-            if last_observation:
-                self.memory.add_event(
-                    event_to_memory(last_observation, max_message_chars)
-                )
-
-        # the action prompt with initial thoughts and recent events
-        prompt = prompts.get_request_action_prompt(
-            goal, self.initial_thoughts, recent_events
-        )
-
-        messages: list[dict[str, str]] = [
-            {'role': 'user', 'content': prompt},
-        ]
-
-        # format all as a single message, a monologue
-        resp = self.llm.completion(messages=messages)
-
-        action = self.response_parser.parse(resp)
-        self.latest_action = action
-        return action
-
-    def reset(self) -> None:
-        super().reset()
-
-        # Reset the initial monologue and memory
-        self._initialized = False
--- a/agenthub/monologue_agent/utils/prompts.py
+++ b/agenthub/monologue_agent/utils/prompts.py
@@ -1,212 +0,0 @@
-from opendevin.core.config import config
-from opendevin.core.utils import json
-from opendevin.events.action import (
-    Action,
-)
-from opendevin.events.serialization.action import action_from_dict
-
-ACTION_PROMPT = """
-You're a thoughtful robot. Your main task is this:
-%(task)s
-
-Don't expand the scope of your task--just complete it as written.
-
-This is your internal monologue, in JSON format:
-
-%(monologue)s
-
-Your most recent thought is at the bottom of that monologue. Continue your train of thought.
-What is your next single thought or action? Your response must be in JSON format.
-It must be a single object, and it must contain two fields:
-* `action`, which is one of the actions below
-* `args`, which is a map of key-value pairs, specifying the arguments for that action
-
-Here are the possible actions:
-* `read` - reads the content of a file. Arguments:
-  * `path` - the path of the file to read
-* `write` - writes the content to a file. Arguments:
-  * `path` - the path of the file to write
-  * `content` - the content to write to the file
-* `run` - runs a command. Arguments:
-  * `command` - the command to run
-* `browse` - opens a web page. Arguments:
-  * `url` - the URL to open
-* `push` - Push a branch from the current repo to github:
-  * `owner` - the owner of the repo to push to
-  * `repo` - the name of the repo to push to
-  * `branch` - the name of the branch to push
-* `message` - make a plan, set a goal, record your thoughts, or ask for more input from the user. Arguments:
-  * `content` - the message to record
-  * `wait_for_response` - set to `true` to wait for the user to respond before proceeding
-* `finish` - if you're absolutely certain that you've completed your task and have tested your work, use the finish action to stop working.
-
-You MUST take time to think in between read, write, run, browse, and push actions--do this with the `message` action.
-You should never act twice in a row without thinking. But if your last several
-actions are all `message` actions, you should consider taking a different action.
-
-Notes:
-* you are logged in as %(user)s, but sudo will always work without a password.
-* all non-background commands will be forcibly stopped if they remain running for over %(timeout)s seconds.
-* your environment is Debian Linux. You can install software with `sudo apt-get`, but remember to use -y.
-* don't run interactive commands, or commands that don't return (e.g. `node server.js`). You may run commands in the background (e.g. `node server.js &`)
-* don't run interactive text editors (e.g. `nano` or 'vim'), instead use the 'write' or 'read' action.
-* don't run gui applications (e.g. software IDEs (like vs code or codium), web browsers (like firefox or chromium), or other complex software packages). Use non-interactive cli applications, or special actions instead.
-* whenever an action fails, always send a `message` about why it may have happened before acting again.
-
-What is your next single thought or action? Again, you must reply with JSON, and only with JSON. You must respond with exactly one 'action' object.
-
-%(hint)s
-"""
-
-MONOLOGUE_SUMMARY_PROMPT = """
-Below is the internal monologue of an automated LLM agent. Each
-thought is an item in a JSON array. The thoughts may be memories,
-actions taken by the agent, or outputs from those actions.
-Please return a new, smaller JSON array, which summarizes the
-internal monologue. You can summarize individual thoughts, and
-you can condense related thoughts together with a description
-of their content.
-
-%(monologue)s
-
-Make the summaries as pithy and informative as possible.
-Be specific about what happened and what was learned. The summary
-will be used as keywords for searching for the original memory.
-Be sure to preserve any key words or important information.
-
-Your response must be in JSON format. It must be an object with the
-key `new_monologue`, which is a JSON array containing the summarized monologue.
-Each entry in the array must have an `action` key, and an `args` key.
-The action key may be `summarize`, and `args.summary` should contain the summary.
-You can also use the same action and args from the source monologue.
-"""
-
-INITIAL_THOUGHTS = [
-    'I exist!',
-    'Hmm...looks like I can type in a command line prompt',
-    'Looks like I have a web browser too!',
-    "Here's what I want to do: $TASK",
-    'How am I going to get there though?',
-    'It seems like I have some kind of short term memory.',
-    'Each of my thoughts seems to be stored in a JSON array.',
-    'It seems whatever I say next will be added as an object to the list.',
-    "It looks like it's easy for me to use the command line too! I just have to perform a run action and include the command I want to run in the command argument. The command output just jumps into my head!",
-    'RUN echo "hello world"',
-    'hello world',
-    'Cool! I bet I can write files too using the write action.',
-    'WRITE echo "console.log(\'hello world\')" > test.js',
-    '',
-    "I just created test.js. I'll try and run it now.",
-    'RUN node test.js',
-    'hello world',
-    'It works!',
-    "I'm going to try reading it now using the read action.",
-    'READ test.js',
-    "console.log('hello world')",
-    'Nice! I can read files too!',
-    'And if I want to use the browser, I just need to use the browse action and include the url I want to visit in the url argument',
-    "Let's try that...",
-    'BROWSE google.com',
-    '<form><input type="text"></input><button type="submit"></button></form>',
-    'I can browse the web too!',
-    'And once I have completed my task, I can use the finish action to stop working.',
-    "But I should only use the finish action when I'm absolutely certain that I've completed my task and have tested my work.",
-    'Very cool. Now to accomplish my task.',
-    "I'll need a strategy. And as I make progress, I'll need to keep refining that strategy. I'll need to set goals, and break them into sub-goals.",
-    'In between actions, I must always take some time to think, strategize, and set new goals. I should never take two actions in a row.',
-    "OK so my task is to $TASK. I haven't made any progress yet. Where should I start?",
-    'It seems like there might be an existing project here. I should probably start by running `pwd` and `ls` to orient myself.',
-]
-
-
-def get_summarize_monologue_prompt(thoughts: list[dict]):
-    """
-    Gets the prompt for summarizing the monologue
-
-    Returns:
-    - str: A formatted string with the current monologue within the prompt
-    """
-    return MONOLOGUE_SUMMARY_PROMPT % {
-        'monologue': json.dumps({'old_monologue': thoughts}, indent=2),
-    }
-
-
-def get_request_action_prompt(
-    task: str,
-    thoughts: list[dict],
-    recent_events: list[dict],
-):
-    """
-    Gets the action prompt formatted with appropriate values.
-
-    Parameters:
-    - task (str): The current task the agent is trying to accomplish
-    - thoughts (list[dict]): The agent's current thoughts
-
-    Returns:
-    - str: Formatted prompt string with hint, task, monologue, and background commands included
-    """
-
-    hint = ''
-    if len(recent_events) > 0:
-        latest_event = recent_events[-1]
-        if 'action' in latest_event:
-            if (
-                latest_event['action'] == 'message'
-                and 'source' in latest_event
-                and latest_event['source'] == 'agent'
-            ):
-                hint = (
-                    "You've been thinking a lot lately. Maybe it's time to take action?"
-                )
-            elif latest_event['action'] == 'error':
-                hint = 'Looks like that last command failed. Maybe you need to fix it, or try something else.'
-    else:
-        hint = "You're just getting started! What should you do first?"
-
-    user = 'opendevin' if config.run_as_devin else 'root'
-
-    monologue = thoughts + recent_events
-
-    return ACTION_PROMPT % {
-        'task': task,
-        'monologue': json.dumps(monologue, indent=2),
-        'hint': hint,
-        'user': user,
-        'timeout': config.sandbox.timeout,
-        'WORKSPACE_MOUNT_PATH_IN_SANDBOX': config.workspace_mount_path_in_sandbox,
-    }
-
-
-def parse_action_response(orig_response: str) -> Action:
-    """
-    Parses a string to find an action within it
-
-    Parameters:
-    - response (str): The string to be parsed
-
-    Returns:
-    - Action: The action that was found in the response string
-    """
-    # attempt to load the JSON dict from the response
-    action_dict = json.loads(orig_response)
-
-    if 'content' in action_dict:
-        # The LLM gets confused here. Might as well be robust
-        action_dict['contents'] = action_dict.pop('content')
-
-    return action_from_dict(action_dict)
-
-
-def parse_summary_response(response: str) -> list[dict]:
-    """
-    Parses a summary of the monologue
-
-    Parameters:
-    - response (str): The response string to be parsed
-
-    Returns:
-    - list[dict]: The list of summaries output by the model
-    """
-    parsed = json.loads(response)
-    return parsed['new_monologue']
--- a/agenthub/planner_agent/agent.py
+++ b/agenthub/planner_agent/agent.py
@@ -1,11 +1,12 @@
-from agenthub.monologue_agent.response_parser import MonologueResponseParser
+from agenthub.planner_agent.response_parser import PlannerResponseParser
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
+from opendevin.core.message import ImageContent, Message, TextContent
 from opendevin.events.action import Action, AgentFinishAction
 from opendevin.llm.llm import LLM
 from opendevin.runtime.tools import RuntimeTool

-from .prompt import get_prompt
+from .prompt import get_prompt_and_images


 class PlannerAgent(Agent):
@@ -15,11 +16,10 @@ class PlannerAgent(Agent):
    The agent is given its previous action-observation pairs, current task, and hint based on last action taken at every step.
    """
    runtime_tools: list[RuntimeTool] = [RuntimeTool.BROWSER]
-    response_parser = MonologueResponseParser()
+    response_parser = PlannerResponseParser()

    def __init__(self, llm: LLM):
-        """
-        Initialize the Planner Agent with an LLM
+        """Initialize the Planner Agent with an LLM

        Parameters:
        - llm (LLM): The llm to be used by this agent
@@ -27,8 +27,7 @@ class PlannerAgent(Agent):
        super().__init__(llm)

    def step(self, state: State) -> Action:
-        """
-        Checks to see if current step is completed, returns AgentFinishAction if True.
+        """Checks to see if current step is completed, returns AgentFinishAction if True.
        Otherwise, creates a plan prompt and sends to model for inference, returning the result as the next action.

        Parameters:
@@ -38,14 +37,19 @@ class PlannerAgent(Agent):
        - AgentFinishAction: If the last state was 'completed', 'verified', or 'abandoned'
        - Action: The next action to take based on llm response
        """
-
        if state.root_task.state in [
            'completed',
            'verified',
            'abandoned',
        ]:
            return AgentFinishAction()
-        prompt = get_prompt(state)
-        messages = [{'content': prompt, 'role': 'user'}]
-        resp = self.llm.completion(messages=messages)
+
+        prompt, image_urls = get_prompt_and_images(
+            state, self.llm.config.max_message_chars
+        )
+        content = [TextContent(text=prompt)]
+        if image_urls:
+            content.append(ImageContent(image_urls=image_urls))
+        message = Message(role='user', content=content)
+        resp = self.llm.completion(messages=[message.model_dump()])
        return self.response_parser.parse(resp)
--- a/agenthub/planner_agent/prompt.py
+++ b/agenthub/planner_agent/prompt.py
@@ -1,5 +1,4 @@
 from opendevin.controller.state.state import State
-from opendevin.core.config import config
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.schema import ActionType
 from opendevin.core.utils import json
@@ -101,7 +100,6 @@ What is your next thought or action? Again, you must reply with JSON, and only w

 def get_hint(latest_action_id: str) -> str:
    """Returns action type hint based on given action_id"""
-
    hints = {
        '': "You haven't taken any actions yet. Start by using `ls` to check out what files you're working with.",
        ActionType.RUN: 'You should think about the command you just ran, what output it gave, and how that affects your plan.',
@@ -117,9 +115,11 @@ def get_hint(latest_action_id: str) -> str:
    return hints.get(latest_action_id, '')


-def get_prompt(state: State) -> str:
-    """
-    Gets the prompt for the planner agent.
+def get_prompt_and_images(
+    state: State, max_message_chars: int
+) -> tuple[str, list[str]]:
+    """Gets the prompt for the planner agent.
+
    Formatted with the most recent action-observation pairs, current task, and hint based on last action

    Parameters:
@@ -128,10 +128,6 @@ def get_prompt(state: State) -> str:
    Returns:
    - tuple[str, list[str]]: The formatted string prompt with historical values, and the image URLs from the current user intent
    """
-    max_message_chars = config.get_llm_config_from_agent(
-        'PlannerAgent'
-    ).max_message_chars
-
    # the plan
    plan_str = json.dumps(state.root_task.to_dict(), indent=2)

@@ -167,23 +163,23 @@ def get_prompt(state: State) -> str:
    logger.info('HINT:\n' + hint, extra={'msg_type': 'DETAIL'})

    # the last relevant user message (the task)
-    task = state.get_current_user_intent()
+    message, image_urls = state.get_current_user_intent()

    # finally, fill in the prompt
    return prompt % {
-        'task': task,
+        'task': message,
        'plan': plan_str,
        'history': history_str,
        'hint': hint,
        'plan_status': plan_status,
-    }
+    }, image_urls


 def parse_response(response: str) -> Action:
-    """
-    Parses the model output to find a valid action to take
+    """Parses the model output to find a valid action to take
    Parameters:
    - response (str): A response from the model that potentially contains an Action.
+
    Returns:
    - Action: A valid next action to perform from model output
    """
--- a/agenthub/monologue_agent/response_parser.py
+++ b/agenthub/monologue_agent/response_parser.py
@@ -6,7 +6,7 @@ from opendevin.events.action import (
 from opendevin.events.serialization.action import action_from_dict


-class MonologueResponseParser(ResponseParser):
+class PlannerResponseParser(ResponseParser):
    def __init__(self):
        super().__init__()

@@ -19,8 +19,7 @@ class MonologueResponseParser(ResponseParser):
        return response['choices'][0]['message']['content']

    def parse_action(self, action_str: str) -> Action:
-        """
-        Parses a string to find an action within it
+        """Parses a string to find an action within it

        Parameters:
        - response (str): The string to be parsed
--- a/config.template.toml
+++ b/config.template.toml
@@ -25,9 +25,6 @@ workspace_base = "./workspace"
 # Disable color in terminal output
 #disable_color = false

-# Enable auto linting after editing
-#enable_auto_lint = false
-
 # Enable saving and restoring the session when run from CLI
 #enable_cli_session = false

@@ -58,26 +55,11 @@ workspace_base = "./workspace"
 # Path to rewrite the workspace mount path to
 #workspace_mount_rewrite = ""

-# Persist the sandbox
-persist_sandbox = false
-
 # Run as devin
 #run_as_devin = true

 # Runtime environment
-#runtime = "server"
-
-# SSH hostname for the sandbox
-#ssh_hostname = "localhost"
-
-# SSH password for the sandbox
-#ssh_password = ""
-
-# SSH port for the sandbox
-#ssh_port = 63710
-
-# Use host network
-#use_host_network = false
+#runtime = "eventstream"

 # Name of the default agent
 #default_agent = "CodeActAgent"
@@ -188,14 +170,17 @@ llm_config = 'gpt3'
 # Sandbox timeout in seconds
 #timeout = 120

-# Sandbox type (ssh, e2b, local)
-#box_type = "ssh"
-
 # Sandbox user ID
 #user_id = 1000

 # Container image to use for the sandbox
-#container_image = "ghcr.io/opendevin/sandbox:main"
+#container_image = "nikolaik/python-nodejs:python3.11-nodejs22"
+
+# Use host network
+#use_host_network = false
+
+# Enable auto linting after editing
+#enable_auto_lint = false

 #################################### Eval ####################################
 # Configuration for the evaluation, please refer to the specific evaluation
--- a/containers/app/Dockerfile
+++ b/containers/app/Dockerfile
@@ -1,5 +1,5 @@
 ARG OPEN_DEVIN_BUILD_VERSION=dev
-FROM node:21.7.2-bookworm-slim as frontend-builder
+FROM node:21.7.2-bookworm-slim AS frontend-builder

 WORKDIR /app

@@ -10,10 +10,10 @@ RUN npm ci
 COPY ./frontend ./
 RUN npm run make-i18n && npm run build

-FROM python:3.12.3-slim as backend-builder
+FROM python:3.12.3-slim AS backend-builder

 WORKDIR /app
-ENV PYTHONPATH '/app'
+ENV PYTHONPATH='/app'

 ENV POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_IN_PROJECT=1 \
@@ -26,17 +26,19 @@ RUN apt-get update -y \

 COPY ./pyproject.toml ./poetry.lock ./
 RUN touch README.md
-RUN poetry install --without evaluation --no-root && rm -rf $POETRY_CACHE_DIR
+RUN export POETRY_CACHE_DIR && poetry install --without evaluation,llama-index --no-root && rm -rf $POETRY_CACHE_DIR

-FROM python:3.12.3-slim as runtime
+FROM python:3.12.3-slim AS runtime

 WORKDIR /app

+ARG OPEN_DEVIN_BUILD_VERSION #re-declare for this section
+
 ENV RUN_AS_DEVIN=true
 # A random number--we need this to be different from the user's UID on the host machine
 ENV OPENDEVIN_USER_ID=42420
+ENV SANDBOX_API_HOSTNAME=host.docker.internal
 ENV USE_HOST_NETWORK=false
-ENV SSH_HOSTNAME=host.docker.internal
 ENV WORKSPACE_BASE=/opt/workspace_base
 ENV OPEN_DEVIN_BUILD_VERSION=$OPEN_DEVIN_BUILD_VERSION
 RUN mkdir -p $WORKSPACE_BASE
@@ -44,8 +46,10 @@ RUN mkdir -p $WORKSPACE_BASE
 RUN apt-get update -y \
    && apt-get install -y curl ssh sudo

-RUN sed -i 's/^UID_MIN.*/UID_MIN 499/' /etc/login.defs # Default is 1000, but OSX is often 501
-RUN sed -i 's/^UID_MAX.*/UID_MAX 1000000/' /etc/login.defs # Default is 60000, but we've seen up to 200000
+# Default is 1000, but OSX is often 501
+RUN sed -i 's/^UID_MIN.*/UID_MIN 499/' /etc/login.defs
+# Default is 60000, but we've seen up to 200000
+RUN sed -i 's/^UID_MAX.*/UID_MAX 1000000/' /etc/login.defs

 RUN groupadd app
 RUN useradd -l -m -u $OPENDEVIN_USER_ID -s /bin/bash opendevin && \
@@ -66,6 +70,9 @@ RUN playwright install --with-deps chromium
 COPY --chown=opendevin:app --chmod=770 ./opendevin ./opendevin
 COPY --chown=opendevin:app --chmod=777 ./opendevin/runtime/plugins ./opendevin/runtime/plugins
 COPY --chown=opendevin:app --chmod=770 ./agenthub ./agenthub
+COPY --chown=opendevin:app --chmod=770 ./pyproject.toml ./pyproject.toml
+COPY --chown=opendevin:app --chmod=770 ./poetry.lock ./poetry.lock
+COPY --chown=opendevin:app --chmod=770 ./README.md ./README.md

 RUN python opendevin/core/download.py # No-op to download assets
 RUN chown -R opendevin:app /app/logs && chmod -R 770 /app/logs # This gets created by the download.py script
--- a/containers/app/entrypoint.sh
+++ b/containers/app/entrypoint.sh
@@ -22,7 +22,9 @@ if [[ "$SANDBOX_USER_ID" -eq 0 ]]; then
  echo "Running OpenDevin as root"
  export RUN_AS_DEVIN=false
  mkdir -p /root/.cache/ms-playwright/
-  mv /home/opendevin/.cache/ms-playwright/ /root/.cache/
+  if [ -d "/home/opendevin/.cache/ms-playwright/" ]; then
+    mv /home/opendevin/.cache/ms-playwright/ /root/.cache/
+  fi
  "$@"
 else
  echo "Setting up enduser with id $SANDBOX_USER_ID"
@@ -52,7 +54,9 @@ else

  mkdir -p /home/enduser/.cache/huggingface/hub/
  mkdir -p /home/enduser/.cache/ms-playwright/
-  mv /home/opendevin/.cache/ms-playwright/ /home/enduser/.cache/
+  if [ -d "/home/opendevin/.cache/ms-playwright/" ]; then
+    mv /home/opendevin/.cache/ms-playwright/ /home/enduser/.cache/
+  fi

  usermod -aG $DOCKER_SOCKET_GID enduser
  echo "Running as enduser"
--- a/containers/build.sh
+++ b/containers/build.sh
@@ -27,11 +27,14 @@ echo "Tags: ${tags[@]}"

 if [[ "$image_name" == "opendevin" ]]; then
  dir="./containers/app"
+elif [[ "$image_name" == "od_runtime" ]]; then
+  dir="./containers/runtime"
 else
  dir="./containers/$image_name"
 fi

-if [[ ! -f "$dir/Dockerfile" ]]; then
+if [[ (! -f "$dir/Dockerfile") && "$image_name" != "od_runtime" ]]; then
+  # Allow runtime to be built without a Dockerfile
  echo "No Dockerfile found"
  exit 1
 fi
@@ -46,6 +49,16 @@ if [[ -n "$org_name" ]]; then
  DOCKER_ORG="$org_name"
 fi

+# If $DOCKER_IMAGE_TAG is set, add it to the tags
+if [[ -n "$DOCKER_IMAGE_TAG" ]]; then
+  tags+=("$DOCKER_IMAGE_TAG")
+fi
+# If $DOCKER_IMAGE_HASH_TAG is set, add it to the tags
+if [[ -n "$DOCKER_IMAGE_HASH_TAG" ]]; then
+  tags+=("$DOCKER_IMAGE_HASH_TAG")
+fi
+
+
 DOCKER_REPOSITORY="$DOCKER_REGISTRY/$DOCKER_ORG/$DOCKER_IMAGE"
 DOCKER_REPOSITORY=${DOCKER_REPOSITORY,,} # lowercase
 echo "Repo: $DOCKER_REPOSITORY"
--- a/containers/runtime/README.md
+++ b/containers/runtime/README.md
@@ -0,0 +1,11 @@
+# Dynamically constructed Dockerfile
+
+This folder builds the runtime image (sandbox) using a `Dockerfile` that is dynamically generated based on the `base_image` AND a [Python source distribution](https://docs.python.org/3.10/distutils/sourcedist.html) that's based on the current commit of `opendevin`.
+
+The following command will generate a `Dockerfile` for `ubuntu:22.04` and the source distribution `.tar` into `containers/runtime`.
+
+```bash
+poetry run python3 opendevin/runtime/utils/runtime_build.py \
+    --base_image ubuntu:22.04 \
+    --build_folder containers/runtime
+```
--- a/containers/runtime/config.sh
+++ b/containers/runtime/config.sh
@@ -0,0 +1,6 @@
+DOCKER_REGISTRY=ghcr.io
+DOCKER_ORG=opendevin
+DOCKER_BASE_DIR="./containers/runtime"
+# These two variables will be appended by the runtime_build.py script
+# DOCKER_IMAGE=
+# DOCKER_IMAGE_TAG=
--- a/dev_config/python/mypy.ini
+++ b/dev_config/python/mypy.ini
@@ -7,5 +7,3 @@ warn_unreachable = True
 warn_redundant_casts = True
 no_implicit_optional = True
 strict_optional = True
-
-exclude = agenthub/monologue_agent/regression
--- a/dev_config/python/ruff.toml
+++ b/dev_config/python/ruff.toml
@@ -1,7 +1,3 @@
-exclude = [
-    "agenthub/monologue_agent/regression/",
-]
-
 [lint]
 select = [
    "E",
--- a/docs/docusaurus.config.ts
+++ b/docs/docusaurus.config.ts
@@ -4,7 +4,7 @@ import { themes as prismThemes } from "prism-react-renderer";

 const config: Config = {
  title: "OpenDevin",
-  tagline: "Code Less, Make More",
+  tagline: "An Open Platform for AI Software Developers as Generalist Agents",
  favicon: "img/logo.png",

  // Set the production url of your site here
@@ -32,6 +32,10 @@ const config: Config = {
    },
  },

+  markdown: {
+    mermaid: true,
+  },
+  themes: ['@docusaurus/theme-mermaid'],
  presets: [
    [
      "classic",
@@ -77,7 +81,6 @@ const config: Config = {
          position: "left",
          label: "Codebase",
        },
-        { to: "/faq", label: "FAQ", position: "left" },
        {
          href: "https://github.com/OpenDevin/OpenDevin",
          label: "GitHub",
--- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/about.md
+++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/about.md
@@ -31,7 +31,7 @@ Pour plus de détails, veuillez consulter [ce document](https://github.com/OpenD

 Nous avons maintenant à la fois un espace de travail Slack pour la collaboration sur la construction d'OpenDevin et un serveur Discord pour discuter de tout ce qui est lié, par exemple, à ce projet, aux LLM, aux agents, etc.

- [Espace de travail Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA)
+- [Espace de travail Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw)
 - [Serveur Discord](https://discord.gg/ESHStjSjD4)

 Si vous souhaitez contribuer, n'hésitez pas à rejoindre notre communauté. Simplifions l'ingénierie logicielle ensemble !
--- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/agents.md
+++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/agents.md
@@ -61,42 +61,6 @@ _Exemple de CodeActAgent avec `gpt-4-turbo-2024-04-09` effectuant une tâche de
 [] Support de la navigation sur le web
 [] Compléter le workflow pour l'agent CodeAct afin de soumettre des PRs Github

-## Agent Monologue
-
-### Description
-
-L'agent Monologue utilise la mémoire à long terme et à court terme pour accomplir des tâches.
-La mémoire à long terme est stockée en tant qu'objet LongTermMemory et le modèle l'utilise pour rechercher des exemples du passé.
-La mémoire à court terme est stockée en tant qu'objet Monologue et le modèle peut la condenser si nécessaire.
-
-### Actions
-
-`Action`,
-`NullAction`,
-`CmdRunAction`,
-`FileWriteAction`,
-`FileReadAction`,
-`BrowseURLAction`,
-`GithubPushAction`,
-`AgentThinkAction`
-
-### Observations
-
-`Observation`,
-`NullObservation`,
-`CmdOutputObservation`,
-`FileReadObservation`,
-`BrowserOutputObservation`
-
-### Méthodes
-
-| Méthode         | Description                                                                                                                                   |
-| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
-| `__init__`      | Initialise l'agent avec une mémoire à long terme et un monologue interne                                                                      |
-| `_add_event`    | Ajoute des événements au monologue de l'agent et condense avec un résumé automatiquement si le monologue est trop long                            |
-| `_initialize`   | Utilise la liste `INITIAL_THOUGHTS` pour donner à l'agent un contexte pour ses capacités et comment naviguer dans le `/workspace`                    |
-| `step`          | Modifie l'état actuel en ajoutant les actions et observations les plus récentes, puis invite le modèle à réfléchir à la prochaine action à entreprendre. |
-
 ## Agent Planificateur

 ### Description
--- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/custom_sandbox_guide.md
+++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/custom_sandbox_guide.md
@@ -41,7 +41,6 @@ Créez un fichier ```config.toml``` dans le répertoire OpenDevin et entrez ces
 ```toml
 [core]
 workspace_base="./workspace"
-persist_sandbox=false
 run_as_devin=true
 sandbox_container_image="image_personnalisée"
 ```
@@ -92,7 +91,6 @@ Si vous voyez cette erreur dans la sortie de la console, il s'agit du fait que O
 ```toml
 [core]
 workspace_base="./workspace"
-persist_sandbox=false
 run_as_devin=true
 sandbox_container_image="image_personnalisée"
 sandbox_user_id="1001"
@@ -104,4 +102,4 @@ Si vous voyez un message d'erreur indiquant que le port est utilisé ou indispon

 ## Discuter

-Pour d'autres problèmes ou questions rejoignez le [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA) ou le [Discord](https://discord.gg/ESHStjSjD4) et demandez!
+Pour d'autres problèmes ou questions rejoignez le [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw) ou le [Discord](https://discord.gg/ESHStjSjD4) et demandez!
--- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/intro.mdx
+++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/intro.mdx
@@ -42,7 +42,7 @@ Explorez le code source d'OpenDevin sur [GitHub](https://github.com/OpenDevin/Op
  />
 </a>
 <br></br>
-<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA">
+<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw">
  <img
    src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge"
    alt="Join our Slack community"
@@ -72,8 +72,6 @@ WORKSPACE_BASE=$(pwd)/workspace
 docker run -it \
    --pull=always \
    -e SANDBOX_USER_ID=$(id -u) \
-    -e PERSIST_SANDBOX="true" \
-    -e SSH_PASSWORD="make something up here" \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -v $WORKSPACE_BASE:/opt/workspace_base \
    -v /var/run/docker.sock:/var/run/docker.sock \
@@ -93,7 +91,7 @@ Si vous souhaitez utiliser la version **(instable !)** la plus récente, vous po

 Pour le workflow de développement, consultez [Development.md](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md).

-Avez-vous des problèmes ? Consultez notre [Guide de dépannage](https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting).
+Avez-vous des problèmes ? Consultez notre [Guide de dépannage](https://docs.all-hands.dev/modules/usage/troubleshooting).

 :::warning
 OpenDevin est actuellement en cours de développement, mais vous pouvez déjà exécuter la version alpha pour voir le système de bout en bout en action.
--- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/troubleshooting/troubleshooting.md
+++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/troubleshooting/troubleshooting.md
@@ -25,7 +25,7 @@ Si vous utilisez Windows et que vous rencontrez des problèmes, consultez notre
 ### Symptômes

 ```bash
-Erreur lors de la création du contrôleur. Veuillez vérifier que Docker est en cours d'exécution et visitez `https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting` pour plus d'informations sur le débogage.
+Erreur lors de la création du contrôleur. Veuillez vérifier que Docker est en cours d'exécution et visitez `https://docs.all-hands.dev/modules/usage/troubleshooting` pour plus d'informations sur le débogage.
 ```

 ```bash
--- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/about.md
+++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/about.md
@@ -31,7 +31,7 @@ OpenDevin 是一个社区驱动的项目，我们欢迎每个人的贡献。无

 我们现在有一个 Slack 工作区，用于合作建设 OpenDevin，还设有一个 Discord 服务器，用于讨论与该项目、LLM、代理等相关的任何事情。

- [Slack 工作区](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA)
+- [Slack 工作区](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw)
 - [Discord 服务器](https://discord.gg/ESHStjSjD4)

 如果您愿意贡献，请随时加入我们的社区。让我们一起简化软件工程！
--- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/agents.md
+++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/agents.md
@@ -61,42 +61,6 @@ _CodeActAgent使用`gpt-4-turbo-2024-04-09`执行数据科学任务（线性回
 [] 支持Web浏览
 [] 完成CodeAct agent提交Github PR的工作流程

-## Monologue Agent
-
-### 描述
-
-Monologue Agent利用长短期记忆来完成任务。
-长期记忆存储为LongTermMemory对象，模型使用它来搜索过去的示例。
-短期记忆存储为Monologue对象，模型可以根据需要进行压缩。
-
-### 动作
-
-`Action`,
-`NullAction`,
-`CmdRunAction`,
-`FileWriteAction`,
-`FileReadAction`,
-`BrowseURLAction`,
-`GithubPushAction`,
-`AgentThinkAction`
-
-### 观测
-
-`Observation`,
-`NullObservation`,
-`CmdOutputObservation`,
-`FileReadObservation`,
-`BrowserOutputObservation`
-
-### 方法
-
-| 方法           | 描述                                                                                                                                       |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
-| `__init__`     | 使用长期记忆和内部独白初始化Agent                                                                                                            |
-| `_add_event`   | 将事件附加到Agent的独白中，如独白过长自动与摘要一起压缩                                                                                    |
-| `_initialize`  | 使用`INITIAL_THOUGHTS`列表为agent提供其能力的上下文以及如何导航`/workspace`                                                                 |
-| `step`         | 通过添加最近的动作和观测修改当前状态，然后提示模型考虑其接下来的动作。                                                                     |
-
 ## Planner Agent

 ### 描述
--- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/custom_sandbox_guide.md
+++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/custom_sandbox_guide.md
@@ -40,7 +40,6 @@ docker build -t custom_image .
 ```
 [core]
 workspace_base="./workspace"
-persist_sandbox=false
 run_as_devin=true
 sandbox_container_image="custom_image"
 ```
@@ -92,7 +91,6 @@ dockerfile_content = (
 ```
 [core]
 workspace_base="./workspace"
-persist_sandbox=false
 run_as_devin=true
 sandbox_container_image="custom_image"
 sandbox_user_id="1001"
@@ -104,4 +102,4 @@ sandbox_user_id="1001"

 ## 讨论

-对于其他问题或疑问，请加入 [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA) 或 [Discord](https://discord.gg/ESHStjSjD4)，并提问！
+对于其他问题或疑问，请加入 [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw) 或 [Discord](https://discord.gg/ESHStjSjD4)，并提问！
--- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/intro.mdx
+++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/intro.mdx
@@ -42,7 +42,7 @@ OpenDevin 是一个**自主 AI 软件工程师**，能够执行复杂的工程
  />
 </a>
 <br></br>
-<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA">
+<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw">
  <img
    src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge"
    alt="Join our Slack community"
@@ -72,8 +72,6 @@ WORKSPACE_BASE=$(pwd)/workspace
 docker run -it \
    --pull=always \
    -e SANDBOX_USER_ID=$(id -u) \
-    -e PERSIST_SANDBOX="true" \
-    -e SSH_PASSWORD="make something up here" \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -v $WORKSPACE_BASE:/opt/workspace_base \
    -v /var/run/docker.sock:/var/run/docker.sock \
@@ -93,7 +91,7 @@ OpenDevin 只会访问这个工作区文件夹。它在一个安全的 docker

 有关开发工作流程，请参阅 [Development.md](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md)。

-遇到问题了吗？查看我们的 [故障排除指南](https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting)。
+遇到问题了吗？查看我们的 [故障排除指南](https://docs.all-hands.dev/modules/usage/troubleshooting)。

 :::warning
 OpenDevin 目前正在开发中，但你已经可以运行 alpha 版本来查看端到端系统的运作情况。
--- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/troubleshooting/troubleshooting.md
+++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/troubleshooting/troubleshooting.md
@@ -23,7 +23,7 @@ sidebar_position: 5
 ### 症状

 ```bash
-创建控制器时出错。请检查 Docker 是否正在运行，并访问 `https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting` 获取更多调试信息。
+创建控制器时出错。请检查 Docker 是否正在运行，并访问 `https://docs.all-hands.dev/modules/usage/troubleshooting` 获取更多调试信息。
 ```

 ```bash
--- a/docs/modules/usage/about.md
+++ b/docs/modules/usage/about.md
@@ -31,7 +31,7 @@ For details, please check [this document](https://github.com/OpenDevin/OpenDevin

 We have both Slack workspace for the collaboration on building OpenDevin and Discord server for discussion about anything related, e.g., this project, LLM, agent, etc.

- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA)
+- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw)
 - [Discord server](https://discord.gg/ESHStjSjD4)

 If you would love to contribute, feel free to join our community. Let's simplify software engineering together!
--- a/docs/modules/usage/agents.md
+++ b/docs/modules/usage/agents.md
@@ -56,42 +56,6 @@ _Example of CodeActAgent with `gpt-4-turbo-2024-04-09` performing a data science
 | `__init__`      | Initializes an agent with `llm` and a list of messages `list[Mapping[str, str]]`                                                                |
 | `step`          | Performs one step using the CodeAct Agent. This includes gathering info on previous steps and prompting the model to make a command to execute. |

-## Monologue Agent
-
-### Description
-
-The Monologue Agent utilizes long and short term memory to complete tasks.
-Long term memory is stored as a LongTermMemory object and the model uses it to search for examples from the past.
-Short term memory is stored as a Monologue object and the model can condense it as necessary.
-
-### Actions
-
-`Action`,
-`NullAction`,
-`CmdRunAction`,
-`FileWriteAction`,
-`FileReadAction`,
-`BrowseURLAction`,
-`GithubPushAction`,
-`AgentThinkAction`
-
-### Observations
-
-`Observation`,
-`NullObservation`,
-`CmdOutputObservation`,
-`FileReadObservation`,
-`BrowserOutputObservation`
-
-### Methods
-
-| Method          | Description                                                                                                                                   |
-| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
-| `__init__`      | Initializes the agent with a long term memory, and an internal monologue                                                                      |
-| `_add_event`    | Appends events to the monologue of the agent and condenses with summary automatically if the monologue is too long                            |
-| `_initialize`   | Utilizes the `INITIAL_THOUGHTS` list to give the agent a context for its capabilities and how to navigate the `/workspace`                    |
-| `step`          | Modifies the current state by adding the most recent actions and observations, then prompts the model to think about its next action to take. |
-
 ## Planner Agent

 ### Description
--- a/docs/modules/usage/custom_sandbox_guide.md
+++ b/docs/modules/usage/custom_sandbox_guide.md
@@ -4,22 +4,35 @@ sidebar_position: 6

 # 💿 How to Create and Use a Custom Docker Sandbox

-The default OpenDevin sandbox comes with a [minimal ubuntu configuration](https://github.com/OpenDevin/OpenDevin/blob/main/containers/sandbox/Dockerfile). 
+The default OpenDevin sandbox comes with a [minimal ubuntu configuration](https://github.com/OpenDevin/OpenDevin/blob/main/containers/sandbox/Dockerfile).

 Your use case may need additional software installed by default.

 There are two ways you can do so:
+
 1. Use an existing image from docker hub. For instance, if you want to have `nodejs` installed, you can do so by using the `node:20` image
 2. Creating your own custom docker image and using it

 If you want to take the first approach, you can skip the `Create Your Docker Image` section.

+For a more feature-rich environment, you might consider using pre-built images like **[nikolaik/python-nodejs](https://hub.docker.com/r/nikolaik/python-nodejs)**, which comes with both Python and Node.js pre-installed, along with many other useful tools and libraries, like:
+
+- Node.js: 22.x
+- npm: 10.x
+- yarn: stable
+- Python: latest
+- pip: latest
+- pipenv: latest
+- poetry: latest
+- uv: latest
+
 ## Setup

 Make sure you are able to run OpenDevin using the [Development.md](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) first.

 ## Create Your Docker Image
-To create a custom docker image, it must be debian/ubuntu based. 
+
+To create a custom docker image, it must be debian/ubuntu based.

 For example, if we want OpenDevin to have access to the `node` binary, we would use the following Dockerfile:

@@ -34,7 +47,7 @@ RUN apt-get update && apt-get install -y
 RUN apt-get install -y nodejs
 ```

-Next build your docker image with the name of your choice, for example `custom_image`. 
+Next build your docker image with the name of your choice, for example `custom_image`.

 To do this you can create a directory and put your file inside it with the name `Dockerfile`, and inside the directory run the following command:

@@ -50,19 +63,19 @@ This will produce a new image called ```custom_image``` that will be available i

 ## Specify your sandbox image in config.toml file

-OpenDevin configuration occurs via the top-level `config.toml` file. 
+OpenDevin configuration occurs via the top-level `config.toml` file.

 Create a `config.toml` file in the OpenDevin directory and enter these contents:

 ```toml
 [core]
 workspace_base="./workspace"
-persist_sandbox=false
 run_as_devin=true
 sandbox_container_image="custom_image"
 ```

 For `sandbox_container_image`, you can specify either:
+
 1. The name of your custom image that you built in the previous step (e.g., `"custom_image"`)
 2. A pre-existing image from Docker Hub (e.g., `"node:20"` if you want a sandbox with Node.js pre-installed)

@@ -79,7 +92,7 @@ Congratulations!

 The relevant code is defined in [ssh_box.py](https://github.com/OpenDevin/OpenDevin/blob/main/opendevin/runtime/docker/ssh_box.py) and [image_agnostic_util.py](https://github.com/OpenDevin/OpenDevin/blob/main/opendevin/runtime/docker/image_agnostic_util.py).

-In particular, ssh_box.py checks the config object for ```config.sandbox_container_image``` and then attempts to retrieve the image using [get_od_sandbox_image](https://github.com/OpenDevin/OpenDevin/blob/main/opendevin/runtime/docker/image_agnostic_util.py#L72) which is defined in image_agnostic_util.py.
+In particular, `ssh_box.py` checks the config object for ```config.sandbox_container_image``` and then attempts to retrieve the image using [get_od_sandbox_image](https://github.com/OpenDevin/OpenDevin/blob/main/opendevin/runtime/docker/image_agnostic_util.py#L72) which is defined in image_agnostic_util.py.

 When first using a custom image, it will not be found and thus it will be built (on subsequent runs the built image will be found and returned).

@@ -109,12 +122,12 @@ dockerfile_content = (
 ## Troubleshooting / Errors

 ### Error: ```useradd: UID 1000 is not unique```
+
 If you see this error in the console output it is because OpenDevin is trying to create the opendevin user in the sandbox with a UID of 1000, however this UID is already being used in the image (for some reason). To fix this change the sandbox_user_id field in the config.toml file to a different value:

 ```toml
 [core]
 workspace_base="./workspace"
-persist_sandbox=false
 run_as_devin=true
 sandbox_container_image="custom_image"
 sandbox_user_id="1001"
@@ -122,8 +135,8 @@ sandbox_user_id="1001"

 ### Port use errors

-If you see an error about a port being in use or unavailable, try deleting all running Docker Containers (run `docker ps` and `docker rm` relevant containers) and then re-running ```make run```
+If you see an error about a port being in use or unavailable, try deleting all running Docker Containers (run `docker ps` and `docker rm` relevant containers) and then re-running ```make run```.

 ## Discuss

-For other issues or questions join the [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA) or [Discord](https://discord.gg/ESHStjSjD4) and ask!
+For other issues or questions join the [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw) or [Discord](https://discord.gg/ESHStjSjD4) and ask!
--- a/docs/modules/usage/evaluation_harness.md
+++ b/docs/modules/usage/evaluation_harness.md
@@ -0,0 +1,257 @@
+---
+sidebar_position: 6
+---
+
+# 📈 How to contribute to OpenDevin Evaluation Harness
+
+This guide provides an overview of how to integrate your own evaluation benchmark into the OpenDevin framework.
+
+## Before everything begins: Setup Environment and LLM Configuration
+
+Please follow the instructions [here](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up your local development environment and LLM.
+
+OpenDevin in development mode uses `config.toml` to keep track of most configurations.
+
+Here's an example configuration file you can use to define and use multiple LLMs:
+
+```toml
+[llm]
+# IMPORTANT: add your API key here, and set the model to the one you want to evaluate
+model = "gpt-4o-2024-05-13"
+api_key = "sk-XXX"
+
+[llm.eval_gpt4_1106_preview_llm]
+model = "gpt-4-1106-preview"
+api_key = "XXX"
+temperature = 0.0
+
+[llm.eval_some_openai_compatible_model_llm]
+model = "openai/MODEL_NAME"
+base_url = "https://OPENAI_COMPATIBLE_URL/v1"
+api_key = "XXX"
+temperature = 0.0
+```
+
+
+## How to use OpenDevin in the command line
+
+OpenDevin can be run from the command line using the following format:
+
+```bash
+poetry run python ./opendevin/core/main.py \
+        -i <max_iterations> \
+        -t "<task_description>" \
+        -c <agent_class> \
+        -l <llm_config>
+```
+
+For example:
+
+```bash
+poetry run python ./opendevin/core/main.py \
+        -i 10 \
+        -t "Write me a bash script that prints hello world." \
+        -c CodeActAgent \
+        -l llm
+```
+
+This command runs OpenDevin with:
+- A maximum of 10 iterations
+- The specified task description
+- Using the CodeActAgent
+- With the LLM configuration defined in the `llm` section of your `config.toml` file
+
+## How does OpenDevin work
+
+The main entry point for OpenDevin is in `opendevin/core/main.py`. Here's a simplified flow of how it works:
+
+1. Parse command-line arguments and load the configuration.
+2. Create a runtime environment using `create_runtime()`.
+3. Initialize the specified agent.
+4. Run the controller using `run_controller()`, which:
+   - Attaches the runtime to the agent
+   - Executes the agent's task
+   - Returns a final state when complete
+
+The `run_controller()` function is the core of OpenDevin's execution. It manages the interaction between the agent, the runtime, and the task, handling things like user input simulation and event processing.
+
+
+## Easiest way to get started: Exploring Existing Benchmarks
+
+We encourage you to review the various evaluation benchmarks available in the [`evaluation/` directory](https://github.com/OpenDevin/OpenDevin/blob/main/evaluation) of our repository.
+
+To integrate your own benchmark, we suggest starting with the one that most closely resembles your needs. This approach can significantly streamline your integration process, allowing you to build upon existing structures and adapt them to your specific requirements.
+
+## How to create an evaluation workflow
+
+To create an evaluation workflow for your benchmark, follow these steps:
+
+1. Create a configuration:
+   ```python
+   def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig:
+       config = AppConfig(
+           default_agent=metadata.agent_class,
+           runtime='eventstream',
+           max_iterations=metadata.max_iterations,
+           sandbox=SandboxConfig(
+               container_image='your_container_image',
+               enable_auto_lint=True,
+               timeout=300,
+           ),
+       )
+       config.set_llm_config(metadata.llm_config)
+       return config
+   ```
+
+2. Initialize the runtime and set up the evaluation environment:
+   ```python
+   async def initialize_runtime(runtime: Runtime, instance: pd.Series):
+       # Set up your evaluation environment here
+       # For example, setting environment variables, preparing files, etc.
+       pass
+   ```
+
+3. Create a function to process each instance:
+   ```python
+   async def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput:
+       config = get_config(instance, metadata)
+       runtime = await create_runtime(config, sid=instance.instance_id)
+       await initialize_runtime(runtime, instance)
+
+       instruction = get_instruction(instance, metadata)
+
+       state = await run_controller(
+           config=config,
+           task_str=instruction,
+           runtime=runtime,
+           fake_user_response_fn=your_user_response_function,
+       )
+
+       # Evaluate the agent's actions
+       evaluation_result = await evaluate_agent_actions(runtime, instance)
+
+       return EvalOutput(
+           instance_id=instance.instance_id,
+           instruction=instruction,
+           test_result=evaluation_result,
+           metadata=metadata,
+           history=state.history.compatibility_for_eval_history_pairs(),
+           metrics=state.metrics.get() if state.metrics else None,
+           error=state.last_error if state and state.last_error else None,
+       )
+   ```
+
+4. Run the evaluation:
+   ```python
+   metadata = make_metadata(llm_config, dataset_name, agent_class, max_iterations, eval_note, eval_output_dir)
+   output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
+   instances = prepare_dataset(your_dataset, output_file, eval_n_limit)
+
+   await run_evaluation(
+       instances,
+       metadata,
+       output_file,
+       num_workers,
+       process_instance
+   )
+   ```
+
+This workflow sets up the configuration, initializes the runtime environment, processes each instance by running the agent and evaluating its actions, and then collects the results into an `EvalOutput` object. The `run_evaluation` function handles parallelization and progress tracking.
+
+Remember to customize the `get_instruction`, `your_user_response_function`, and `evaluate_agent_actions` functions according to your specific benchmark requirements.
+
+By following this structure, you can create a robust evaluation workflow for your benchmark within the OpenDevin framework.
+
+The following section explains the `user_response_fn` and describes how it fits into the evaluation workflow.
+
+
+## Understanding the `user_response_fn`
+
+The `user_response_fn` is a crucial component in OpenDevin's evaluation workflow. It simulates user interaction with the agent, allowing for automated responses during the evaluation process. This function is particularly useful when you want to provide consistent, predefined responses to the agent's queries or actions.
+
+
+### Workflow and Interaction
+
+The correct workflow for handling actions and the `user_response_fn` is as follows:
+
+1. Agent receives a task and starts processing
+2. Agent emits an Action
+3. If the Action is executable (e.g., CmdRunAction, IPythonRunCellAction):
+   - The Runtime processes the Action
+   - Runtime returns an Observation
+4. If the Action is not executable (typically a MessageAction):
+   - The `user_response_fn` is called
+   - It returns a simulated user response
+5. The agent receives either the Observation or the simulated response
+6. Steps 2-5 repeat until the task is completed or max iterations are reached
+
+Here's a visual representation of this workflow:
+
+```
+                 [Agent]
+                    |
+                    v
+               [Emit Action]
+                    |
+                    v
+            [Is Action Executable?]
+           /                       \
+         Yes                        No
+          |                          |
+          v                          v
+     [Runtime]               [user_response_fn]
+          |                          |
+          v                          v
+  [Return Observation]    [Simulated Response]
+           \                        /
+            \                      /
+             v                    v
+           [Agent receives feedback]
+                    |
+                    v
+         [Continue or Complete Task]
+```
+
+In this workflow:
+
+- Executable actions (like running commands or executing code) are handled directly by the Runtime.
+- Non-executable actions (typically when the agent wants to communicate or ask for clarification) are handled by the `user_response_fn`.
+- The agent then processes the feedback, whether it's an Observation from the Runtime or a simulated response from the `user_response_fn`.
+
+This approach allows for automated handling of both concrete actions and simulated user interactions, making it suitable for evaluation scenarios where you want to test the agent's ability to complete tasks with minimal human intervention.
+
+### Example Implementation
+
+Here's an example of a `user_response_fn` used in the SWE-Bench evaluation:
+
+```python
+def codeact_user_response(state: State | None) -> str:
+    msg = (
+        'Please continue working on the task on whatever approach you think is suitable.\n'
+        'If you think you have solved the task, please first send your answer to user through message and then <execute_bash> exit </execute_bash>.\n'
+        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
+    )
+
+    if state and state.history:
+        # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
+        user_msgs = [
+            event
+            for event in state.history.get_events()
+            if isinstance(event, MessageAction) and event.source == 'user'
+        ]
+        if len(user_msgs) >= 2:
+            # let the agent know that it can give up when it has tried 3 times
+            return (
+                msg
+                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
+            )
+    return msg
+```
+
+This function does the following:
+
+1. Provides a standard message encouraging the agent to continue working.
+2. Checks how many times the agent has attempted to communicate with the user.
+3. If the agent has made multiple attempts, it provides an option to give up.
+
+By using this function, you can ensure consistent behavior across multiple evaluation runs and prevent the agent from getting stuck waiting for human input.
--- a/docs/modules/usage/intro.mdx
+++ b/docs/modules/usage/intro.mdx
@@ -42,7 +42,7 @@ Explore the codebase of OpenDevin on [GitHub](https://github.com/OpenDevin/OpenD
  />
 </a>
 <br></br>
-<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA">
+<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw">
  <img
    src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge"
    alt="Join our Slack community"
--- a/docs/modules/usage/llms/llms.md
+++ b/docs/modules/usage/llms/llms.md
@@ -15,13 +15,14 @@ OpenDevin will issue many prompts to the LLM you configure. Most of these LLMs c
 The `LLM_MODEL` environment variable controls which model is used in programmatic interactions.
 But when using the OpenDevin UI, you'll need to choose your model in the settings window.

-The following environment variables might be necessary for some LLMs:
+The following environment variables might be necessary for some LLMs/providers:

 - `LLM_API_KEY`
 - `LLM_BASE_URL`
 - `LLM_EMBEDDING_MODEL`
 - `LLM_EMBEDDING_DEPLOYMENT_NAME`
 - `LLM_API_VERSION`
+- `LLM_DROP_PARAMS`

 We have a few guides for running OpenDevin with specific model providers:

--- a/docs/modules/usage/llms/localLLMs.md
+++ b/docs/modules/usage/llms/localLLMs.md
@@ -172,9 +172,9 @@ docker run \
    -it \
    --pull=always \
    -e SANDBOX_USER_ID=$(id -u) \
-    -e LLM_MODEL="openai/lmstudio"
+    -e LLM_MODEL="openai/lmstudio" \
    -e LLM_BASE_URL="http://host.docker.internal:1234/v1" \
-    -e CUSTOM_LLM_PROVIDER="openai"
+    -e CUSTOM_LLM_PROVIDER="openai" \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -v $WORKSPACE_BASE:/opt/workspace_base \
    -v /var/run/docker.sock:/var/run/docker.sock \
--- a/docs/modules/usage/openshift-example.md
+++ b/docs/modules/usage/openshift-example.md
@@ -0,0 +1,300 @@
+---
+sidebar_position: 6
+---
+
+# 💿 How to use OpenDevin in OpenShift/K8S
+
+There are several ways to deploy OpenDevin on OpenShift/K8S; this guide walks through one example:
+1. Create a PV "as a cluster admin" to map workspace_base data and docker directory to the pod through the worker node.
+2. Create a PVC to be able to mount those PVs to the POD
+3. Create a POD which contains two containers; the OpenDevin and Sandbox containers.
+
+## Steps to follow the above example.
+
+> Note: Make sure you are logged in to the cluster first with the proper account for each step. PV creation requires cluster administrator privileges!
+
+> Make sure you have read/write permissions on the hostPath used below (i.e. /tmp/workspace)
+
+1. Create the PV:
+Sample yaml file below can be used by a cluster admin to create the PV.
+- workspace-pv.yaml
+
+```yamlfile
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: workspace-pv
+spec:
+  capacity:
+    storage: 2Gi
+  accessModes:
+    - ReadWriteOnce
+  persistentVolumeReclaimPolicy: Retain
+  hostPath:
+    path: /tmp/workspace
+```
+
+```bash
+# apply yaml file
+$ oc create -f workspace-pv.yaml
+persistentvolume/workspace-pv created
+
+# review:
+$ oc get pv
+NAME                                       CAPACITY   ACCESS MODES   RECLAIM POLICY   STATUS      CLAIM                STORAGECLASS     REASON   AGE
+workspace-pv                               2Gi        RWO            Retain           Available                                                  7m23s
+```
+
+- docker-pv.yaml
+
+```yamlfile
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: docker-pv
+spec:
+  capacity:
+    storage: 2Gi
+  accessModes:
+    - ReadWriteOnce
+  persistentVolumeReclaimPolicy: Retain
+  hostPath:
+    path: /var/run/docker.sock
+```
+
+```bash
+# apply yaml file
+$ oc create -f docker-pv.yaml
+persistentvolume/docker-pv created
+
+# review:
+$ oc get pv
+NAME                                       CAPACITY   ACCESS MODES   RECLAIM POLICY   STATUS      CLAIM                STORAGECLASS     REASON   AGE
+docker-pv                                  2Gi        RWO            Retain           Available                                                  6m55s
+workspace-pv                               2Gi        RWO            Retain           Available                                                  7m23s
+```
+
+2. Create the PVC:
+Sample PVC yaml file below:
+
+- workspace-pvc.yaml
+
+```yamlfile
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: workspace-pvc
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 1Gi
+```
+
+```bash
+# create the pvc
+$ oc create -f workspace-pvc.yaml
+persistentvolumeclaim/workspace-pvc created
+
+# review
+$ oc get pvc
+NAME            STATUS    VOLUME   CAPACITY   ACCESS MODES   STORAGECLASS     AGE
+workspace-pvc   Pending                                      hcloud-volumes   4s
+
+$ oc get events
+LAST SEEN   TYPE     REASON                 OBJECT                                MESSAGE
+8s          Normal   WaitForFirstConsumer   persistentvolumeclaim/workspace-pvc   waiting for first consumer to be created before binding
+```
+
+- docker-pvc.yaml
+
+```yamlfile
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: docker-pvc
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 1Gi
+```
+
+```bash
+# create pvc
+$ oc create -f docker-pvc.yaml
+persistentvolumeclaim/docker-pvc created
+
+# review
+$ oc get pvc
+NAME            STATUS    VOLUME   CAPACITY   ACCESS MODES   STORAGECLASS     AGE
+docker-pvc      Pending                                      hcloud-volumes   4s
+workspace-pvc   Pending                                      hcloud-volumes   2m53s
+
+$ oc get events
+LAST SEEN   TYPE     REASON                 OBJECT                                MESSAGE
+10s         Normal   WaitForFirstConsumer   persistentvolumeclaim/docker-pvc      waiting for first consumer to be created before binding
+10s         Normal   WaitForFirstConsumer   persistentvolumeclaim/workspace-pvc   waiting for first consumer to be created before binding
+```
+
+3. Create the POD yaml file:
+Sample POD yaml file below:
+
+- pod.yaml
+
+```yamlfile
+apiVersion: v1
+kind: Pod
+metadata:
+  name: opendevin-app-2024
+  labels:
+    app: opendevin-app-2024
+spec:
+  containers:
+  - name: opendevin-app-2024
+    image: ghcr.io/opendevin/opendevin:0.7.1
+    env:
+    - name: SANDBOX_USER_ID
+      value: "1000"
+    - name: WORKSPACE_MOUNT_PATH
+      value: "/opt/workspace_base"
+    volumeMounts:
+    - name: workspace-volume
+      mountPath: /opt/workspace_base
+    - name: docker-sock
+      mountPath: /var/run/docker.sock
+    ports:
+    - containerPort: 3000
+  - name: opendevin-sandbox-2024
+    image: ghcr.io/opendevin/sandbox:main
+    ports:
+    - containerPort: 51963
+    command: ["/usr/sbin/sshd", "-D", "-p 51963", "-o", "PermitRootLogin=yes"]
+  volumes:
+  - name: workspace-volume
+    persistentVolumeClaim:
+      claimName: workspace-pvc
+  - name: docker-sock
+    persistentVolumeClaim:
+      claimName: docker-pvc
+```
+
+```bash
+# create the pod
+$ oc create -f pod.yaml
+W0716 11:22:07.776271  107626 warnings.go:70] would violate PodSecurity "restricted:v1.24": allowPrivilegeEscalation != false (containers "opendevin-app-2024", "opendevin-sandbox-2024" must set securityContext.allowPrivilegeEscalation=false), unrestricted capabilities (containers "opendevin-app-2024", "opendevin-sandbox-2024" must set securityContext.capabilities.drop=["ALL"]), runAsNonRoot != true (pod or containers "opendevin-app-2024", "opendevin-sandbox-2024" must set securityContext.runAsNonRoot=true), seccompProfile (pod or containers "opendevin-app-2024", "opendevin-sandbox-2024" must set securityContext.seccompProfile.type to "RuntimeDefault" or "Localhost")
+pod/opendevin-app-2024 created
+
+# Above warning can be ignored for now as we will not modify SCC restrictions.
+
+# review
+$ oc get pods
+NAME                 READY   STATUS    RESTARTS   AGE
+opendevin-app-2024   0/2     Pending   0          5s
+
+$ oc get pods
+NAME                 READY   STATUS              RESTARTS   AGE
+opendevin-app-2024   0/2     ContainerCreating   0          15s
+
+$ oc get events
+LAST SEEN   TYPE     REASON                   OBJECT                                MESSAGE
+38s         Normal   WaitForFirstConsumer     persistentvolumeclaim/docker-pvc      waiting for first consumer to be created before binding
+23s         Normal   ExternalProvisioning     persistentvolumeclaim/docker-pvc      waiting for a volume to be created, either by external provisioner "csi.hetzner.cloud" or manually created by system administrator
+27s         Normal   Provisioning             persistentvolumeclaim/docker-pvc      External provisioner is provisioning volume for claim "opendevin/docker-pvc"
+17s         Normal   ProvisioningSucceeded    persistentvolumeclaim/docker-pvc      Successfully provisioned volume pvc-2b1d223a-1c8f-4990-8e3d-68061a9ae252
+16s         Normal   Scheduled                pod/opendevin-app-2024                Successfully assigned opendevin/opendevin-app-2024 to worker1.hub.internal.blakane.com
+9s          Normal   SuccessfulAttachVolume   pod/opendevin-app-2024                AttachVolume.Attach succeeded for volume "pvc-2b1d223a-1c8f-4990-8e3d-68061a9ae252"
+9s          Normal   SuccessfulAttachVolume   pod/opendevin-app-2024                AttachVolume.Attach succeeded for volume "pvc-31f15b25-faad-4665-a25f-201a530379af"
+6s          Normal   AddedInterface           pod/opendevin-app-2024                Add eth0 [10.128.2.48/23] from openshift-sdn
+6s          Normal   Pulled                   pod/opendevin-app-2024                Container image "ghcr.io/opendevin/opendevin:0.7.1" already present on machine
+6s          Normal   Created                  pod/opendevin-app-2024                Created container opendevin-app-2024
+6s          Normal   Started                  pod/opendevin-app-2024                Started container opendevin-app-2024
+6s          Normal   Pulled                   pod/opendevin-app-2024                Container image "ghcr.io/opendevin/sandbox:main" already present on machine
+5s          Normal   Created                  pod/opendevin-app-2024                Created container opendevin-sandbox-2024
+5s          Normal   Started                  pod/opendevin-app-2024                Started container opendevin-sandbox-2024
+83s         Normal   WaitForFirstConsumer     persistentvolumeclaim/workspace-pvc   waiting for first consumer to be created before binding
+27s         Normal   Provisioning             persistentvolumeclaim/workspace-pvc   External provisioner is provisioning volume for claim "opendevin/workspace-pvc"
+17s         Normal   ProvisioningSucceeded    persistentvolumeclaim/workspace-pvc   Successfully provisioned volume pvc-31f15b25-faad-4665-a25f-201a530379af
+
+$ oc get pods
+NAME                 READY   STATUS    RESTARTS   AGE
+opendevin-app-2024   2/2     Running   0          23s
+
+$ oc get pvc
+NAME            STATUS   VOLUME                                     CAPACITY   ACCESS MODES   STORAGECLASS     AGE
+docker-pvc      Bound    pvc-2b1d223a-1c8f-4990-8e3d-68061a9ae252   10Gi       RWO            hcloud-volumes   10m
+workspace-pvc   Bound    pvc-31f15b25-faad-4665-a25f-201a530379af   10Gi       RWO            hcloud-volumes   13m
+
+```
+
+4. Create a NodePort service.
+Sample service creation command below:
+
+```bash
+# create the service of type NodePort
+$ oc create svc nodeport  opendevin-app-2024  --tcp=3000:3000
+service/opendevin-app-2024 created
+
+# review
+
+$ oc get svc
+NAME                 TYPE       CLUSTER-IP      EXTERNAL-IP   PORT(S)          AGE
+opendevin-app-2024   NodePort   172.30.225.42   <none>        3000:30495/TCP   4s
+
+$ oc describe svc opendevin-app-2024
+Name:                     opendevin-app-2024
+Namespace:                opendevin
+Labels:                   app=opendevin-app-2024
+Annotations:              <none>
+Selector:                 app=opendevin-app-2024
+Type:                     NodePort
+IP Family Policy:         SingleStack
+IP Families:              IPv4
+IP:                       172.30.225.42
+IPs:                      172.30.225.42
+Port:                     3000-3000  3000/TCP
+TargetPort:               3000/TCP
+NodePort:                 3000-3000  30495/TCP
+Endpoints:                10.128.2.48:3000
+Session Affinity:         None
+External Traffic Policy:  Cluster
+Events:                   <none>
+```
+
+5. Connect to the OpenDevin UI, configure the Agent, then test:
+
+![image](https://github.com/user-attachments/assets/12f94804-a0c7-4744-b873-e003c9caf40e)
+
+
+## Challenges
+Some of the challenges that still need to be addressed:
+
+1. Install GIT into the container:
+   This can be resolved by building a custom image that includes Git and using that image during pod deployment.
+
+Example below: "to be tested!"
+
+```dockerfile
+FROM ghcr.io/opendevin/opendevin:0.7.1
+
+# Install Git
+RUN apt-get update && apt-get install -y git
+
+# Ensure /opt/workspace_base is writable
+RUN mkdir -p /opt/workspace_base && chown -R 1000:1000 /opt/workspace_base
+
+# Verify Git installation
+RUN git --version
+```
+
+2. Mount a shared development directory "i.e. one hosted in EC2 instance" to the POD:
+   This can also be done by sharing the development directory with the worker node through file-sharing software (e.g. NFS), then creating a PV and PVC as described above to access that directory.
+
+3. Not all Agents work yet. So far only CoderAgent has been tested (with an OpenAI API key), and it produced results.
+
+
+## Discuss
+
+For other issues or questions join the [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw) or [Discord](https://discord.gg/ESHStjSjD4) and ask!
--- a/docs/modules/usage/runtime.md
+++ b/docs/modules/usage/runtime.md
@@ -0,0 +1,181 @@
+---
+sidebar_position: 4
+---
+
+# 📦 EventStream Runtime
+
+The OpenDevin EventStream Runtime is the core component that enables secure and flexible execution of an AI agent's actions.
+It creates a sandboxed environment using Docker, where arbitrary code can be run safely without risking the host system.
+
+
+## Why do we need a sandboxed runtime?
+
+OpenDevin needs to execute arbitrary code in a secure, isolated environment for several reasons:
+
+1. Security: Executing untrusted code can pose significant risks to the host system. A sandboxed environment prevents malicious code from accessing or modifying the host system's resources.
+
+2. Consistency: A sandboxed environment ensures that code execution is consistent across different machines and setups, eliminating "it works on my machine" issues.
+
+3. Resource Control: Sandboxing allows for better control over resource allocation and usage, preventing runaway processes from affecting the host system.
+
+4. Isolation: Different projects or users can work in isolated environments without interfering with each other or the host system.
+
+5. Reproducibility: Sandboxed environments make it easier to reproduce bugs and issues, as the execution environment is consistent and controllable.
+
+## How does our Runtime work?
+
+The OpenDevin Runtime system uses a client-server architecture implemented with Docker containers. Here's an overview of how it works:
+
+```mermaid
+graph TD
+    A[User-provided Custom Docker Image] --> B[OpenDevin Backend]
+    B -->|Builds| C[OD Runtime Image]
+    C -->|Launches| D[Runtime Client]
+    D -->|Initializes| E[Browser]
+    D -->|Initializes| F[Bash Shell]
+    D -->|Initializes| G[Plugins]
+    G -->|Initializes| L[Jupyter Server]
+
+    B -->|Spawn| H[Agent]
+    B -->|Spawn| I[EventStream]
+    I <--->|Execute Action to
+    Get Observation
+    via REST API
+    | D
+
+    H -->|Generate Action| I
+    I -->|Obtain Observation| H
+
+    subgraph "Docker Container"
+    D
+    E
+    F
+    G
+    L
+    end
+```
+
+1. User Input: The user provides a custom base Docker image.
+
+2. Image Building: OpenDevin builds a new Docker image (the "OD runtime image") based on the user-provided image. This new image includes OpenDevin-specific code, primarily the "runtime client."
+
+3. Container Launch: When OpenDevin starts, it launches a Docker container using the OD runtime image.
+
+4. Client Initialization: The runtime client initializes inside the container, setting up necessary components like a bash shell and loading any specified plugins.
+
+5. Communication: The OpenDevin backend (`runtime.py`) communicates with the runtime client over a RESTful API, sending actions and receiving observations.
+
+6. Action Execution: The runtime client receives actions from the backend, executes them in the sandboxed environment, and sends back observations.
+
+7. Observation Return: The client sends execution results back to the OpenDevin backend as observations.
+
+
+The role of the client is crucial:
+- It acts as an intermediary between the OpenDevin backend and the sandboxed environment.
+- It executes various types of actions (shell commands, file operations, Python code, etc.) safely within the container.
+- It manages the state of the sandboxed environment, including the current working directory and loaded plugins.
+- It formats and returns observations to the backend, ensuring a consistent interface for processing results.
+
+
+## Advanced: How OpenDevin builds and maintains OD Runtime images
+
+OpenDevin uses a sophisticated approach to build and manage runtime images. This process ensures efficiency, consistency, and flexibility in creating and maintaining Docker images for both production and development environments.
+
+Check out the [relevant code](https://github.com/OpenDevin/OpenDevin/blob/main/opendevin/runtime/utils/runtime_build.py) if you are interested in more details.
+
+### Image Tagging System
+
+OpenDevin uses a dual-tagging system for its runtime images to balance reproducibility with flexibility:
+
+1. Hash-based tag: `{target_image_repo}:{target_image_hash_tag}`
+   Example: `od_runtime:abc123def456`
+
+   - This tag is based on the MD5 hash of the Docker build folder, which includes the source code (of runtime client and related dependencies) and Dockerfile.
+   - Identical hash tags guarantee that the images were built with exactly the same source code and Dockerfile.
+   - This ensures reproducibility: the same hash always means the same image contents.
+
+2. Generic tag: `{target_image_repo}:{target_image_tag}`
+   Example: `od_runtime:od_v0.8.3_ubuntu_tag_22.04`
+
+   - This tag follows the format: `od_runtime:od_v{OD_VERSION}_{BASE_IMAGE_NAME}_tag_{BASE_IMAGE_TAG}`
+   - It represents the latest build for a particular base image and OpenDevin version combination.
+   - This tag is updated whenever a new image is built from the same base image, even if the source code changes.
+
+The hash-based tag ensures exact reproducibility, while the generic tag provides a stable reference to the latest version of a particular configuration. This dual-tagging approach allows OpenDevin to efficiently manage both development and production environments.
+
+### Build Process
+
+1. Image Naming Convention:
+   - Hash-based tag: `{target_image_repo}:{target_image_hash_tag}`
+     Example: `od_runtime:abc123def456`
+   - Generic tag: `{target_image_repo}:{target_image_tag}`
+     Example: `od_runtime:od_v0.8.3_ubuntu_tag_22.04`
+
+2. Build Process:
+   - a. Convert the base image name to an OD runtime image name.
+      Example: `ubuntu:22.04` -> `od_runtime:od_v0.8.3_ubuntu_tag_22.04`
+   - b. Generate a build context (Dockerfile and OpenDevin source code) and calculate its hash.
+   - c. Check for an existing image with the calculated hash.
+   - d. If not found, check for a recent compatible image to use as a base.
+   - e. If no compatible image exists, build from scratch using the original base image.
+   - f. Tag the new image with both hash-based and generic tags.
+
+3. Image Reuse and Rebuilding Logic:
+   The system follows these steps to determine whether to build a new image or use an existing one from a user-provided (base) image (e.g., `ubuntu:22.04`):
+
+   a. If an image exists with the same hash (e.g., `od_runtime:abc123def456`), it will be reused as is.
+
+   b. If the exact hash is not found, the system will try to rebuild using the latest generic image (e.g., `od_runtime:od_v0.8.3_ubuntu_tag_22.04`) as a base. This saves time by leveraging existing dependencies.
+
+   c. If neither the hash-tagged nor the generic-tagged image is found, the system will build the image completely from scratch.
+
+4. Caching and Efficiency:
+   - The system attempts to reuse existing images when possible to save build time.
+   - If an exact match (by hash) is found, it's used without rebuilding.
+   - If a compatible image is found, it's used as a base for rebuilding, saving time on dependency installation.
+
+Here's a flowchart illustrating the build process:
+
+```mermaid
+flowchart TD
+    A[Start] --> B{Convert base image name}
+    B --> |ubuntu:22.04 -> od_runtime:od_v0.8.3_ubuntu_tag_22.04| C[Generate build context and hash]
+    C --> D{Check for existing image with hash}
+    D -->|Found od_runtime:abc123def456| E[Use existing image]
+    D -->|Not found| F{Check for od_runtime:od_v0.8.3_ubuntu_tag_22.04}
+    F -->|Found| G[Rebuild based on recent image]
+    F -->|Not found| H[Build from scratch]
+    G --> I[Tag with hash and generic tags]
+    H --> I
+    E --> J[End]
+    I --> J
+```
+
+This approach ensures that:
+
+1. Identical source code and Dockerfile always produce the same image (via hash-based tags).
+2. The system can quickly rebuild images when minor changes occur (by leveraging recent compatible images).
+3. The generic tag (e.g., `od_runtime:od_v0.8.3_ubuntu_tag_22.04`) always points to the latest build for a particular base image and OpenDevin version combination.
+
+By using this method, OpenDevin maintains an efficient and flexible system for building and managing runtime images, adapting to both development needs and production requirements.
+
+
+## Advanced: Runtime Plugin System
+
+The OpenDevin Runtime supports a plugin system that allows for extending functionality and customizing the runtime environment. Plugins are initialized when the runtime client starts up.
+
+Check [an example of Jupyter plugin here](https://github.com/OpenDevin/OpenDevin/blob/9c44d94cef32e6426ebd8deeeb52963153b2348a/opendevin/runtime/plugins/jupyter/__init__.py#L30-L63) if you want to implement your own plugin.
+
+*More details about the Plugin system are still under construction - contributions are welcomed!*
+
+Key aspects of the plugin system:
+
+1. Plugin Definition: Plugins are defined as Python classes that inherit from a base `Plugin` class.
+
+2. Plugin Registration: Available plugins are registered in an `ALL_PLUGINS` dictionary.
+
+3. Plugin Specification: Plugins are associated with `Agent.sandbox_plugins: list[PluginRequirement]`. Users can specify which plugins to load when initializing the runtime.
+
+4. Initialization: Plugins are initialized asynchronously when the runtime client starts.
+
+5. Usage: The runtime client can use initialized plugins to extend its capabilities (e.g., the JupyterPlugin for running IPython cells).
--- a/docs/modules/usage/troubleshooting/troubleshooting.md
+++ b/docs/modules/usage/troubleshooting/troubleshooting.md
@@ -34,7 +34,7 @@ If you're running on Windows and having trouble, check out our [guide for Window
 **Symptoms**

 ```bash
-Error creating controller. Please check Docker is running and visit `https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting` for more debugging information.
+Error creating controller. Please check Docker is running and visit `https://docs.all-hands.dev/modules/usage/troubleshooting` for more debugging information.
 ```

 ```bash
--- a/docs/package-lock.json
+++ b/docs/package-lock.json
--- a/docs/package.json
+++ b/docs/package.json
@@ -16,21 +16,22 @@
  },
  "dependencies": {
    "@docusaurus/core": "^3.4.0",
-    "@docusaurus/plugin-content-pages": "^3.4.0",
+    "@docusaurus/plugin-content-pages": "^3.5.1",
    "@docusaurus/preset-classic": "^3.4.0",
+    "@docusaurus/theme-mermaid": "^3.4.0",
    "@mdx-js/react": "^3.0.0",
    "clsx": "^2.0.0",
    "prism-react-renderer": "^2.3.0",
    "react": "^18.3.1",
    "react-dom": "^18.3.1",
    "react-icons": "^5.2.1",
-    "react-use": "^17.5.0"
+    "react-use": "^17.5.1"
  },
  "devDependencies": {
-    "@docusaurus/module-type-aliases": "^3.4.0",
+    "@docusaurus/module-type-aliases": "^3.5.1",
    "@docusaurus/tsconfig": "^3.4.0",
    "@docusaurus/types": "^3.4.0",
-    "typescript": "~5.5.3"
+    "typescript": "~5.5.4"
  },
  "browserslist": {
    "production": [
--- a/docs/src/components/CustomFooter.tsx
+++ b/docs/src/components/CustomFooter.tsx
@@ -17,11 +17,9 @@ function CustomFooter() {
            </a>
          </div>
        </div>
-        <div className="footer-community">
-          <Translate id="footer.community">Community</Translate>
-        </div>
+
        <div className="footer-icons">
-          <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA" target="_blank" rel="noopener noreferrer">
+          <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw" target="_blank" rel="noopener noreferrer">
            <FaSlack />
          </a>
          <a href="https://discord.gg/ESHStjSjD4" target="_blank" rel="noopener noreferrer">
--- a/docs/src/components/Demo/Demo.tsx
+++ b/docs/src/components/Demo/Demo.tsx
@@ -6,7 +6,7 @@ export function Demo() {

  return (
    <div
-      style={{ paddingBottom: "30px", paddingTop: "20px", textAlign: "center" }}
+      style={{ paddingBottom: "10px", paddingTop: "10px", textAlign: "center" }}
    >
      <video
        playsInline
--- a/docs/src/components/HomepageHeader/HomepageHeader.tsx
+++ b/docs/src/components/HomepageHeader/HomepageHeader.tsx
@@ -14,15 +14,28 @@ export function HomepageHeader() {
        <Heading as="h1" className="header-title">
          {siteConfig.title}
        </Heading>
+
        <p className="header-subtitle">{siteConfig.tagline}</p>
-        <div className="header-buttons">
-          <Link
-            className="button button--secondary button--lg"
-            to="/modules/usage/intro"
-          >
-            <Translate id="homepage.getStarted">Get Started</Translate>
-          </Link>
+
+        <div className="header-links">
+          <a href="https://github.com/OpenDevin/OpenDevin">
+            <img src="https://img.shields.io/badge/Code-Github-purple?logo=github&logoColor=white&style=for-the-badge" alt="Code" />
+          </a>
+          <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2i1iqdag6-bVmvamiPA9EZUu7oCO6KhA">
+            <img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community" />
+          </a>
+          <a href="https://discord.gg/ESHStjSjD4">
+            <img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community" />
+          </a>
+
+          <a href="https://arxiv.org/abs/2407.16741">
+            <img src="https://img.shields.io/badge/Paper-%20on%20Arxiv-red?logo=arxiv&style=for-the-badge" alt="Paper on Arxiv" />
+          </a>
+          <a href="https://huggingface.co/spaces/OpenDevin/evaluation">
+            <img src="https://img.shields.io/badge/Evaluation-Benchmark%20on%20HF%20Space-green?logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark" />
+          </a>
        </div>
+
        <Demo />
      </div>
    </div>
--- a/docs/src/components/Welcome/Welcome.tsx
+++ b/docs/src/components/Welcome/Welcome.tsx
@@ -1,20 +0,0 @@
-import "../../css/welcome.css";
-import Translate from '@docusaurus/Translate';
-
-export function Welcome() {
-  return (
-    <div className="text-white">
-      <div className="welcome-container">
-        <img src="img/logo.png" className="welcome-logo" />
-        <p className="welcome-text">
-          <Translate id="welcome.message">
-          Welcome to OpenDevin, an open-source autonomous AI software engineer
-          that is capable of executing
-          complex engineering tasks and collaborating actively with users on
-          software development projects.
-          </Translate>
-        </p>
-      </div>
-    </div>
-  );
-}
--- a/docs/src/css/faq.css
+++ b/docs/src/css/faq.css
@@ -1,66 +0,0 @@
-/* faq.css */
-
-.faq-container {
-    margin: auto;
-    padding: 24px;
-    display: flex;
-    flex-direction: column;
-    gap: 8px;
-    margin-bottom: 24px;
-  }
-  
-  .faq-title {
-    display: flex;
-    align-items: center;
-    justify-content: center;
-    font-size: 2rem;
-    padding: 8px;
-    text-transform: uppercase;
-    font-weight: bold;
-  }
-  
-  @media (min-width: 1024px) {
-    .faq-title {
-      font-size: 6rem;
-    }
-  }
-  
-  .faq-section {
-    display: flex;
-    flex-direction: column;
-    gap: 8px;
-    width: 100%;
-    margin-bottom: 24px;
-  }
-  
-  .faq-section-title {
-    text-transform: uppercase;
-    font-weight: bold;
-    font-size: 2rem;
-    letter-spacing: 0.1em;
-  }
-  
-  .highlight {
-    font-weight: 600;
-    color: var(--logo);
-  }
-  
-  .faq-steps ol {
-    padding-left: 24px;
-  }
-  
-  .command-box {
-    display: flex;
-    flex-direction: column;
-    padding: 8px;
-    background-color: #e0e0e0;
-    border-radius: 0.375rem;
-    height: 6vh;
-    text-transform: uppercase;
-    color: #4a5568;
-  }
-  
-  .command-box + .command-box {
-    height: 8vh;
-  }
-  
--- a/docs/src/css/footer.css
+++ b/docs/src/css/footer.css
@@ -3,12 +3,12 @@
 .custom-footer {
    background-color: dark;
    color: white;
-    height: 25vh;
+    height: 200px;
    /* background: linear-gradient(to bottom, #1a1a1a, #1a1a1a); */
    background: linear-gradient(to bottom, #1f2937, #000000);

  }
-  
+
  .footer-content {
    display: flex;
    flex-direction: column;
@@ -17,56 +17,55 @@
    padding: 8px;
    height: 100%;
  }
-  
+
  .footer-top {
    display: flex;
    gap: 8px;
    align-items: center;
  }
-  
+
  .footer-title {
    font-weight: bold;
    font-size: 1.125rem;
  }
-  
+
  @media (min-width: 768px) {
    .footer-title {
      font-size: 1.875rem;
    }
  }
-  
+
  .footer-link a {
    font-size: 0.875rem;
    text-decoration: none;
    color: gray;
    transition: color 0.3s ease;
  }
-  
+
  .footer-link a:hover {
    color: white;
  }
-  
+
  .footer-community {
    text-transform: uppercase;
    font-weight: 300;
  }
-  
+
  .footer-icons {
    display: flex;
    gap: 24px;
    font-size: 1.875rem;
  }
-  
+
  .footer-icons a {
    color:gray;
    transition: color 0.3s ease;
  }
-  
+
  .footer-icons a:hover {
    color: white;
  }
-  
+
  .footer-bottom {
    text-transform: uppercase;
  }
-  
--- a/docs/src/css/homepageHeader.css
+++ b/docs/src/css/homepageHeader.css
@@ -1,36 +1,47 @@
 /* homepageHeader.css */

 .homepage-header {
-    height: 100vh;
-    color: white;
-    background: linear-gradient(to top, #64748b, #000000);
-  }
-  
-  .header-content {
-    display: flex;
-    flex-direction: column;
-    gap: 8px;
-    align-items: center;
-    padding: 24px;
-    font-weight: 300;
-    width: 100%;
-  }
-  
+  height: 800px;
+  color: white;
+  background: linear-gradient(to top, #64748b, #000000);
+}
+
+.header-content {
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+  padding: 2rem;
+  font-weight: 300;
+  width: 100%;
+}
+
+.header-title {
+  font-size: 3rem;
+}
+
+@media (min-width: 768px) {
  .header-title {
-    font-size: 3rem;
+    font-size: 4rem;
  }
-  
-  @media (min-width: 768px) {
-    .header-title {
-      font-size: 5rem;
-    }
-  }
-  
-  .header-subtitle {
-    font-size: 1.25rem;
-  }
-  
-  .header-buttons {
-    margin-top: 24px;
-  }
-  
+}
+
+.header-subtitle {
+  font-size: 1.5rem;
+}
+
+.header-links {
+  display: flex;
+  flex-wrap: wrap;
+  justify-content: center;
+  gap: 10px;
+  max-width: 680px;
+}
+
+.header-links a {
+  display: inline-block;
+  transition: transform 0.2s ease-in-out;
+}
+
+.header-links a:hover {
+  transform: translateY(-2px);
+}
--- a/docs/src/css/welcome.css
+++ b/docs/src/css/welcome.css
@@ -1,53 +0,0 @@
-/* welcome.css */
-
-.text-white {
-    color: white;
-  }
-
-  .welcome-container {
-    display: flex;
-    justify-content: center;
-    align-items: center;
-    flex-direction: column;
-    background: linear-gradient(to bottom, #64748b, #1f2937);
-  }
-
-  @media (min-width: 768px) {
-    .welcome-container {
-      flex-direction: row;
-      background: linear-gradient(to bottom, #64748b, #1f2937);
-    }
-  }
-
-  .welcome-logo {
-    height: 45vh;
-    width: 45vw;
-  }
-
-  @media (max-width: 640px) {
-    .welcome-logo {
-      height: 40vw;
-      width: 40vw;
-    }
-  }
-
-  @media (min-width: 768px) {
-    .welcome-logo {
-      height: auto;
-      width: 350px;
-    }
-  }
-
-  .welcome-text {
-    padding: 24px;
-    margin-bottom: 24px;
-    font-weight: 300;
-    font-size: 1.125rem;
-  }
-
-  @media (min-width: 768px) {
-    .welcome-text {
-      padding: 8px;
-      font-size: 1.5rem;
-    }
-  }
--- a/docs/src/pages/faq.tsx
+++ b/docs/src/pages/faq.tsx
@@ -1,129 +0,0 @@
-import Layout from '@theme/Layout';
-import '../css/faq.css';
-import Translate, { translate } from '@docusaurus/Translate';
-
-export default function FAQ() {
-  const githubLink = (
-    <a href="https://github.com/OpenDevin/OpenDevin/issues" target="_blank">GitHub</a>
-  );
-  const discordLink = (
-    <a href="https://discord.gg/mBuDGRzzES" target="_blank">Discord</a>
-  );
-  const slackLink = (
-    <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA" target="_blank">Slack</a>
-  );
-
-  return (
-    <Layout
-      title={translate({ id: 'faq.title', message: 'FAQ' })}
-      description={translate({ id: 'faq.description', message: 'Frequently Asked Questions' })}
-    >
-      <div id="faq" className="faq-container">
-        <div className="faq-title">
-          <Translate id="faq.title" description="FAQ Title">Frequently Asked Questions</Translate>
-        </div>
-        <div className="faq-section">
-          <div className="faq-section-title">
-            <Translate id="faq.section.title.1" description="First Section Title">What is OpenDevin?</Translate>
-          </div>
-          <p>
-            <span className="highlight"><Translate id="faq.section.highlight" description="Highlight Text">OpenDevin</Translate></span>{" "}
-            <Translate id="faq.section.description.1" description="Description for OpenDevin">
-              is an autonomous software engineer that can solve software engineering
-              and web-browsing tasks end-to-end. It can perform data science queries, such
-              as "Find the number of pull requests to the OpenDevin repository in the last
-              month," and software engineering tasks, such as "Please add tests to this
-              file and verify that all the tests pass. If they don't fix the file."
-            </Translate>
-          </p>
-          <p>
-            <Translate id="faq.section.description.2" description="Further Description for OpenDevin">
-              At the same time, OpenDevin is a platform and community for agent developers
-              to test out and evaluate new agents.
-            </Translate>
-          </p>
-        </div>
-        <div className="faq-section">
-          <div className="faq-section-title">
-            <Translate id="faq.section.title.2" description="Support Section Title">Support</Translate>
-          </div>
-          <div>
-            <Translate
-              id="faq.section.support.answer"
-              description="Support Answer"
-              values={{
-                githubLink: githubLink,
-                discordLink: discordLink,
-                slackLink: slackLink,
-              }}
-            >
-              {`Please file a bug on {githubLink} if you notice a problem that likely affects others. If you're having trouble installing, or have general questions, reach out on {discordLink} or {slackLink}.`}
-            </Translate>
-          </div>
-        </div>
-        <div className="faq-section">
-          <div className="faq-section-title">
-            <Translate id="faq.section.title.3" description="GitHub Issue Section Title">How to fix a GitHub issue with OpenDevin?</Translate>
-          </div>
-          <div className="faq-steps">
-            <Translate id="faq.section.github.steps.intro" description="GitHub Steps Introduction">
-              To fix an issue on GitHub using OpenDevin, send a prompt to OpenDevin asking it to follow
-              steps like the following:
-            </Translate>
-            <ol>
-              <li><Translate id="faq.section.github.step1" description="GitHub Step 1">Read the issue https://github.com/OpenDevin/OpenDevin/issues/1611</Translate></li>
-              <li><Translate id="faq.section.github.step2" description="GitHub Step 2">Clone the repository and check out a new branch</Translate></li>
-              <li><Translate id="faq.section.github.step3" description="GitHub Step 3">Based on the instructions in the issue description, modify files to fix the issue</Translate></li>
-              <li><Translate id="faq.section.github.step4" description="GitHub Step 4">Push the resulting output to GitHub using the GITHUB_TOKEN environment variable</Translate></li>
-              <li><Translate id="faq.section.github.step5" description="GitHub Step 5">Tell me the link that I need to go to to send a pull request</Translate></li>
-            </ol>
-            <Translate id="faq.section.github.steps.preRun" description="GitHub Steps Pre-Run">
-              Before you run OpenDevin, you can do:
-            </Translate>
-            <div className="command-box">
-              export SANDBOX_ENV_GITHUB_TOKEN=XXX
-            </div>
-            <Translate id="faq.section.github.steps.tokenInfo" description="GitHub Steps Token Info">
-              where XXX is a GitHub token that you created that has permissions to push to the OpenDevin repo. If you don’t have write permission to the OpenDevin repo, you might need to change that to:
-            </Translate>
-            <div className="command-box">
-              Push the resulting output to my fork at https://github.com/USERNAME/OpenDevin/ using the GITHUB_TOKEN environment variable
-            </div>
-            <Translate id="faq.section.github.steps.usernameInfo" description="GitHub Steps Username Info">
-              where USERNAME is your GitHub username.
-            </Translate>
-          </div>
-        </div>
-        <div className="faq-section">
-          <div className="faq-section-title">
-            <Translate id="faq.section.title.4" description="Devin Section Title">How is OpenDevin different from Devin?</Translate>
-          </div>
-          <p>
-            <a href="https://www.cognition.ai/blog/introducing-devin"><Translate id="faq.section.devin.linkText" description="Devin Link Text">Devin</Translate></a>&nbsp;
-            <Translate id="faq.section.devin.description" description="Devin Description">
-              is a commercial product by Cognition Inc., that served as the initial
-              inspiration for OpenDevin. They both aim to do a good job at solving software
-              engineering tasks, but OpenDevin you can download, use, and modify, while Devin
-              you can only use through the Cognition site. In addition, OpenDevin has evolved
-              beyond the initial inspiration, and now serves as a community-driven ecosystem for
-              agent development in general, and we'd love to have you join and
-            </Translate>
-            <a href="https://github.com/OpenDevin/OpenDevin/blob/main/CONTRIBUTING.md"><Translate id="faq.section.devin.contribute" description="Contribute Link">contribute</Translate></a>!
-          </p>
-        </div>
-        <div className="faq-section">
-          <div className="faq-section-title">
-            <Translate id="faq.section.title.5" description="ChatGPT Section Title">How is OpenDevin different from ChatGPT?</Translate>
-          </div>
-          <p>
-            <Translate id="faq.section.chatgpt.description" description="ChatGPT Description">
-              ChatGPT you can access online, it does not interface with local files, and
-              its ability to execute code is limited. So it can write code, but it is not
-              easy to test or execute it.
-            </Translate>
-          </p>
-        </div>
-      </div>
-    </Layout>
-  );
-}
--- a/docs/src/pages/index.tsx
+++ b/docs/src/pages/index.tsx
@@ -4,12 +4,11 @@ import { HomepageHeader } from "../components/HomepageHeader/HomepageHeader";
 import { Welcome } from "../components/Welcome/Welcome";
 import { translate } from '@docusaurus/Translate';

-export function Header({ title, summary, description }): JSX.Element {
+export function Header({ title, summary }): JSX.Element {
  return (
    <div>
      <h1>{title}</h1>
-      <h2 style={{ fontSize: "40px" }}>{summary}</h2>
-      <h3 className="headerDescription">{description}</h3>
+      <h2 style={{ fontSize: "3rem" }}>{summary}</h2>
    </div>
  );
 }
@@ -17,22 +16,15 @@ export function Header({ title, summary, description }): JSX.Element {
 export default function Home(): JSX.Element {
  const { siteConfig } = useDocusaurusContext();
  return (
-    <>
    <Layout
      title={`${siteConfig.title}`}
      description={translate({
        id: 'homepage.description',
-        message: 'AI-powered code generation for software engineering.',
+        message: 'An Open Platform for AI Software Developers as Generalist Agents',
        description: 'The homepage description',
      })}
    >
-      <div>
-        <HomepageHeader />
-        <div>
-          <Welcome />
-        </div>
-      </div>
+    <HomepageHeader />
    </Layout>
-    </>
  );
 }
--- a/docs/static/img/system_architecture_overview.png
+++ b/docs/static/img/system_architecture_overview.png
--- a/docs/static/img/teaser.mp4
+++ b/docs/static/img/teaser.mp4
--- a/evaluation/EDA/README.md
+++ b/evaluation/EDA/README.md
@@ -2,9 +2,10 @@

 This folder contains evaluation harness for evaluating agents on the Entity-deduction-Arena Benchmark, from the paper [Probing the Multi-turn Planning Capabilities of LLMs via 20 Question Games](https://arxiv.org/abs/2310.01468), presented in ACL 2024 main conference.

-## Configure OpenDevin and your LLM
+## Setup Environment and LLM Configuration
+
+Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.

-Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md) for how to set this up.

 ## Start the evaluation

--- a/evaluation/EDA/run_infer.py
+++ b/evaluation/EDA/run_infer.py
@@ -1,29 +1,27 @@
 import asyncio
-import logging
 import os

 import pandas as pd
-
-# import huggingface_hub
 from datasets import load_dataset

 from evaluation.EDA.game import Q20Game, Q20GameCelebrity
 from evaluation.utils.shared import (
    EvalMetadata,
+    EvalOutput,
    make_metadata,
-    monologue_user_response,
    prepare_dataset,
+    reset_logger_for_multiprocessing,
    run_evaluation,
 )
-from opendevin.controller.agent import Agent
-
-# from evaluation.EDA.scorer import question_scorer
 from opendevin.controller.state.state import State
-from opendevin.core.config import config, get_llm_config_arg, get_parser
-from opendevin.core.logger import get_console_handler
+from opendevin.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    get_parser,
+)
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import run_agent_controller
-from opendevin.llm.llm import LLM
+from opendevin.core.main import create_runtime, run_controller

 game = None

@@ -48,7 +46,6 @@ def codeact_user_response_eda(state: State) -> str:

 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response_eda,
-    'MonologueAgent': monologue_user_response,
 }

 AGENT_CLS_TO_INST_SUFFIX = {
@@ -56,39 +53,44 @@ AGENT_CLS_TO_INST_SUFFIX = {
 }


-def process_instance(
+def get_config(
+    metadata: EvalMetadata,
+) -> AppConfig:
+    config = AppConfig(  # built fresh on every call from the given evaluation metadata
+        default_agent=metadata.agent_class,
+        run_as_devin=False,
+        runtime='eventstream',
+        max_iterations=metadata.max_iterations,
+        sandbox=SandboxConfig(
+            container_image='python:3.11-bookworm',
+            enable_auto_lint=False,
+            use_host_network=False,
+        ),
+        # do not mount workspace: both workspace paths are explicitly left unset
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    config.set_llm_config(metadata.llm_config)  # apply the LLM settings carried by the metadata
+    return config
+
+
+async def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
-):
-    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
+) -> EvalOutput:
+    config = get_config(metadata)
+    instance_id = instance['text'].strip()
+
    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
-    eval_output_dir = metadata.eval_output_dir
    if reset_logger:
-        # Set up logger
-        log_file = os.path.join(
-            eval_output_dir, 'logs', f'instance_{instance["text"].strip()}.log'
-        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        # add back the console handler to print ONE line
-        logger.addHandler(get_console_handler())
-        logger.info(
-            f'Starting evaluation for instance {instance["text"].strip()}.\nLOG:   tail -f {log_file}'
-        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        file_handler = logging.FileHandler(log_file)
-        file_handler.setFormatter(
-            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
-        )
-        logger.addHandler(file_handler)
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, instance_id, log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance_id}.')

    # Prepare instruction
-    _game_class = {'things': Q20Game, 'celebs': Q20GameCelebrity}
+    _game_class = {'eda-things': Q20Game, 'eda-celebs': Q20GameCelebrity}

    guesser_kargs = {
        'max_new_tokens': 64,
@@ -112,23 +114,16 @@ def process_instance(

    instruction = f'{game.first_user_utterance}'
    logger.info(f'Instruction: {instruction}')
-
-    # instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
-    # NOTE: You can actually set slightly different instruction for different agents
-    instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
+    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
+    runtime = await create_runtime(config, sid=instance['text'].strip())

-    state: State | None = asyncio.run(
-        run_agent_controller(
-            agent,
-            instruction,
-            max_iterations=metadata.max_iterations,
-            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
-                agent.__class__.__name__
-            ],
-            sid=instance['text'].strip(),
-        )
+    state: State | None = await run_controller(
+        config=config,
+        task_str=instruction,
+        runtime=runtime,
+        fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[metadata.agent_class],
    )
    # ======= Attempt to evaluate the agent's edits =======
    # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
@@ -149,21 +144,20 @@ def process_instance(
    histories = state.history.compatibility_for_eval_history_pairs()

    # Save the output
-    output = {
-        'instance_id': instance['text'].strip(),
-        'instance': instance,
-        'instruction': instruction,
-        'metadata': metadata.model_dump(),
-        'history': histories,
-        'metrics': metrics,
-        'error': state.last_error if state and state.last_error else None,
-        'test_result': {
+    output = EvalOutput(
+        instance_id=instance_id,
+        instance=instance.to_dict(),
+        instruction=instruction,
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+        test_result={
            'success': test_result,
            'final_message': final_message,
            'ground_truth': instance['text'],
        },
-    }
-
+    )
    return output


@@ -190,12 +184,16 @@ if __name__ == '__main__':
    )
    args, _ = parser.parse_known_args()

-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
-    logger.info(f'Config for evaluation: {config}')
-
    eda_dataset = load_dataset(
        'yizheapple/entity-deduction-arena', name=args.dataset, split=args.data_split
    )
+    eda_dataset.rename(columns={'text': 'instance_id'}, inplace=True)
+
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    metadata = make_metadata(
        llm_config,
@@ -213,16 +211,15 @@ if __name__ == '__main__':

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    prepared_dataset = prepare_dataset(
-        eda_dataset.to_pandas(), output_file, args.eval_n_limit, 'text'
+        eda_dataset.to_pandas(), output_file, args.eval_n_limit
    )

-    agent = Agent.get_cls(args.agent_cls)(llm=LLM(config.llm))
-
-    run_evaluation(
-        prepared_dataset,
-        metadata,
-        output_file,
-        args.eval_num_workers,
-        process_instance,
-        'text',
+    asyncio.run(
+        run_evaluation(
+            prepared_dataset,
+            metadata,
+            output_file,
+            args.eval_num_workers,
+            process_instance,
+        )
    )
--- a/evaluation/EDA/scripts/run_infer.sh
+++ b/evaluation/EDA/scripts/run_infer.sh
--- a/evaluation/README.md
+++ b/evaluation/README.md
@@ -12,15 +12,59 @@ all the preprocessing/evaluation/analysis scripts.

 ## Supported Benchmarks

+To learn more about how to integrate your benchmark into OpenDevin, check out the [tutorial here](https://docs.all-hands.dev/modules/usage/evaluation_harness).
+
+### Software Engineering
+
 - SWE-Bench: [`evaluation/swe_bench`](./swe_bench)
- ML-Bench: [`evaluation/ml_bench`](./ml_bench)
 - HumanEvalFix: [`evaluation/humanevalfix`](./humanevalfix)
- GAIA: [`evaluation/gaia`](./gaia)
- Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA)
- MINT: [`evaluation/mint`](./mint)
- AgentBench: [`evaluation/agent_bench`](./agent_bench)
 - BIRD: [`evaluation/bird`](./bird)
- LogicReasoning: [`evaluation/logic_reasoning`](./logic_reasoning)
+- BioCoder: [`evaluation/biocoder`](./biocoder)
+- ML-Bench: [`evaluation/ml_bench`](./ml_bench)
+- APIBench: [`evaluation/gorilla`](./gorilla/)
+- ToolQA: [`evaluation/toolqa`](./toolqa/)
+
+### Web Browsing
+
+- WebArena: [`evaluation/webarena`](./webarena/)
+- MiniWob++: [`evaluation/miniwob`](./miniwob/)
+
+### Misc. Assistance
+
+- GAIA: [`evaluation/gaia`](./gaia)
+- GPQA: [`evaluation/gpqa`](./gpqa)
+- AgentBench: [`evaluation/agent_bench`](./agent_bench)
+- MINT: [`evaluation/mint`](./mint)
+- Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA)
+- ProofWriter: [`evaluation/logic_reasoning`](./logic_reasoning)
+
+
+## Before everything begins: Setup Environment and LLM Configuration
+
+Please follow the instructions [here](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up your local development environment and LLM.
+
+OpenDevin in development mode uses `config.toml` to keep track of most configurations.
+
+Here's an example configuration file you can use to define and use multiple LLMs:
+
+```toml
+[llm]
+# IMPORTANT: add your API key here, and set the model to the one you want to evaluate
+model = "gpt-4o-2024-05-13"
+api_key = "sk-XXX"
+
+[llm.eval_gpt4_1106_preview_llm]
+model = "gpt-4-1106-preview"
+api_key = "XXX"
+temperature = 0.0
+
+[llm.eval_some_openai_compatible_model_llm]
+model = "openai/MODEL_NAME"
+base_url = "https://OPENAI_COMPATIBLE_URL/v1"
+api_key = "XXX"
+temperature = 0.0
+```
+

 ### Result Visualization

--- a/evaluation/TUTORIAL.md
+++ b/evaluation/TUTORIAL.md
@@ -1,184 +0,0 @@
-# Tutorial: How to add a New Evaluation Benchmark to OpenDevin
-
-This tutorial provides a general guide on how to integrate your own evaluation benchmark into the OpenDevin framework.
-
-You can read this for details, and also learn by example by looking at our existing evaluations:
- [swe_bench](swe_bench/)
-
-
-## A quick walk-through of OpenDevin architecture
-
-### Before everything begins
-
-Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to setup local develop environment for OpenDevin.
-
-### Configuration file
-
-OpenDevin uses `config.toml` to keep track of most configurations.
-
-Here's an example configuration file you can use:
-
-```toml
-[core]
-max_iterations = 100
-cache_dir = "/tmp/cache"
-
-# IMPORTANT: You should set these two paths to YOUR WORKSPACE directory,
-# which will be mounted into Sandbox for agent to interact with!
-# The OpenDevin agent will be able to read/write files whatever they like (even rm -rf)
-# in this directory, so be careful!!
-workspace_base = "/path/to/your/workspace"
-workspace_mount_path = "/path/to/your/workspace"
-# ==========================
-
-ssh_hostname = "localhost"
-
-# SWEBench eval specific - but you can tweak it to your needs
-use_host_network = false
-run_as_devin = false
-# linting python after editing helps LLM fix indentations
-enable_auto_lint = true
-
-[sandbox]
-box_type = "ssh"
-timeout = 120
-
-[llm]
-# IMPORTANT: add your API key here, and set the model to the one you want to evaluate
-model = "gpt-4o-2024-05-13"
-api_key = "sk-XXX"
-```
-
-### How to use OpenDevin programmatically
-
-In this section, for the purpose of building an evaluation task, we don't use the standard OpenDevin web-based GUI, but rather run OpenDevin backend from CLI.
-
-For example, you can run the following, which performs the specified task `-t`, with a particular model config `-l` and agent `-c`, for a maximum number of iterations `-i`:
-
-```bash
-poetry run python ./opendevin/core/main.py \
-        -i 10 \
-        -t "Write me a bash script that print hello world." \
-        -c CodeActAgent \
-        -l llm
-```
-
-After running the script, you will observe the following:
-
-![](./static/example_task_1.png)
-
-You can see the agent uses bash to write a script, makes it executable, and then tests it by running it to make sure it is working.
-
-At the end of the above screenshot, OpenDevin actually requests user inputs when it think it finishes the task. This will cause issues in evaluation, since most evaluation don't assume additional user input. To fix this, we introduce the functionality of `fake_user_response_fn` in the `main` function, which we describe in the next section.
-
-## The `main` function
-
-The signature of `main` (in file [[`opendevin/core/main.py`](../opendevin/core/main.py)]) is as follows:
-
-```python
-async def main(
-    task_str: str = '',
-    exit_on_message: bool = False,
-    fake_user_response_fn: Optional[Callable[[Optional[State]], str]] = None,
-    sandbox: Optional[Sandbox] = None,
-) -> Optional[State]:
-```
-
- `task_str`: The task instruction to run. In the above example, it is "Write me a bash script that print hello world."
- `exit_on_message`: whether to quit if the agent asks for a message from user
- `fake_user_response_fn`: An optional function that receives the current state (could be None) and returns a fake user response.
- `sandbox`: An optional sandbox to run the agent in.
-
-### `fake_user_response_fn`
-
-Here's an example of `fake_user_response_fn` in the implementation for SWE-Bench in [`evaluation/swe_bench/run_infer.py`](swe_bench/run_infer.py):
-
-```python
-def codeact_user_response(state: State) -> str:
-    msg = (
-        'Please continue working on the task on whatever approach you think is suitable.\n'
-        'If you think you have modified the code in a way that fixes the issue, please run the following command: <execute_bash> exit </execute_bash>.\n'
-        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
-    )
-    # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
-    if state.history:
-        user_msgs = [
-            event
-            for event in state.history.get_events()
-            if isinstance(action, MessageAction) and action.source == 'user'
-        ]
-        if len(user_msgs) > 2:
-            # let the agent know that it can give up when it has tried 3 times
-            return (
-                msg
-                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
-            )
-    return msg
-```
-
-
-### Return value
-
-The main function returns a `State`, which is defined in [`opendevin/controller/state/state.py`](../opendevin/controller/state/state.py). We are mainly using `state.history` here, which is the most important field of data. You can imagine it is being a more structured version of OpenAI's chat completion [messages](https://platform.openai.com/docs/guides/text-generation/chat-completions-api).
-
-`history: list[tuple[Action, Observation]] = field(default_factory=list)` is a list of (action, observation) tuple. All the actions are defined at [`opendevin/events/action`](../opendevin/events/action) and observations are defined at [`opendevin/events/observation`](../opendevin/events/action).
-
-The agent can emit different actions like `CmdRunAction`  (`opendevin/events/action/commands.py`) to execute bash commands and receive `CmdOutputObservation` (`opendevin/events/observation/commands.py`), `IPythonRunCellAction` to receive `IPythonRunCellObservation`, `BrowseInteractiveAction` (`opendevin/events/action/browse.py`) to browse the web and receive `BrowserOutputObservation` (`opendevin/events/observation/browse.py`).
-
-The action we used in this example is `MessageAction` (`opendevin/events/action/message.py`), which actually denotes a message from either `agent` or `user`. In the [CodeAct agent example](https://github.com/OpenDevin/OpenDevin/blob/7ca560471bd262f22513f3863995d0a8e6121c07/agenthub/codeact_agent/codeact_agent.py#L239-L273), an agent is considered to emit a `MessageAction` when it does not trigger a `CmdRunAction`, `IPythonRunCellAction`, and/or `BrowseInteractiveAction`.
-
-Typically, the agent returns `MessageAction` when it is confused about the task, and want to ask human for follow-up clarification, which is a good thing in real-world task, but not necessarily in evaluation. So in this example, we provide a dummy prompt to tell the agent "Please continue working on the task on whatever approach you think is suitable[...]".
-
-If you see something like this, you can consider adding this to your evaluation pipeline as well.
-
-### `sandbox`
-
-Sandbox is a fully functioning docker container where the agent can perform all sorts of tasks, e.g., using bash, calling Python, install packages, and more. You can leave `sandbox` to `None` if you don't need to do anything special to pre-configure the `Sandbox`.
-
-In SWE-Bench, we need to copy the proper repository directory to the workspace and activate the right python virtual environment before the agent can start performing the task, so we actually defined a custom [`SWEBenchSSHBox`](https://github.com/OpenDevin/OpenDevin/blob/7ca560471bd262f22513f3863995d0a8e6121c07/evaluation/swe_bench/swe_env_box.py#L12-L118) that inherit from the default sandbox [`SSHBox`](https://github.com/OpenDevin/OpenDevin/blob/7ca560471bd262f22513f3863995d0a8e6121c07/opendevin/runtime/docker/ssh_box.py#L188) and handles all these initial setup. If you need to configure the `sandbox` for your evaluation, check `SWEBenchSSHBox` for a reference of implementation.
-
-## How to put together an evaluation script?
-
-Now we know how to start running the agent end-to-end, and how `fake_user_response_fn` and `sandbox` work. We will walk through a piece of dummy code (simplified version of SWE-Bench's [`run_infer.py`](https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/run_infer.py)) that outline the general workflow:
-
- Load the dataset and prepare the evaluation configuration.
- Filter out any instances that have already been processed.
- For each instance in the dataset:
-  - Set up the sandbox environment.
-  - Run the agent to generate a solution.
-  - Apply the solution to the instance and execute the test command.
-  - Collect the results and write them to the output file.
- Perform cleanup after the evaluation is complete.
-
-You can see the [swe_bench/run_infer.py](swe_bench/run_infer.py) file for an example.
-
-When you fully understand the `run_infer.py`, you can be ready to actually starting the evaluation!
-
-
-## Run the evaluation!
-
-You can write your `run_infer.sh` script mimicking SWE-Bench's [`run_infer.sh`](https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/scripts/run_infer.sh).
-
-
-You can start the evaluation by running:
-
-```bash
-./run_infer.sh eval_gpt_4o_2024_05_13
-```
-Where `eval_gpt_4o_2024_05_13` is the model config you defined on the config.toml.
-Like this:
-
-```toml
-[core]
-...
-
-[llm]
-model="gpt-4-32k"
-...
-
-[eval_gpt_4o_2024_05_13]
-model="gpt-4o-2024-05-13"
-api_key="sk-xxx"
-```
-
-If `[eval_gpt_4o_2024_05_13]` is not present, it will default to using the model configured in `[llm]`.
--- a/evaluation/agent_bench/README.md
+++ b/evaluation/agent_bench/README.md
@@ -1,44 +1,10 @@
 # AgentBench Evaluation

-This folder contains evaluation harness for evaluating agents on
-the [AgentBench: Evaluating LLMs as Agents](https://arxiv.org/abs/2308.03688).
+This folder contains the evaluation harness for evaluating agents on [AgentBench: Evaluating LLMs as Agents](https://arxiv.org/abs/2308.03688). We currently only support running on the `osbench` subset.

-## Configure OpenDevin and your LLM
+## Setup Environment and LLM Configuration

-Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md)
-for how to set this up.
-
-Here is an example `config.toml` file:
-
-```toml
-[core]
-max_iterations = 100
-cache_dir = "/path/to/cache"
-
-workspace_base = "/path/to/workspace"
-workspace_mount_path = "/path/to/workspace"
-
-ssh_hostname = "localhost"
-
-use_host_network = false
-# AgentBench specific
-run_as_devin = true
-enable_auto_lint = true
-
-[sandbox]
-box_type = "ssh"
-timeout = 120
-
-[llm.eval_gpt35_turbo]
-model = "gpt-3.5-turbo"
-api_key = "sk-123"
-temperature = 0.0
-
-[llm.eval_gpt4o]
-model = "gpt-4o"
-api_key = "sk-123"
-temperature = 0.0
-```
+Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.

 ## Start the evaluation

@@ -46,7 +12,18 @@ temperature = 0.0
 ./evaluation/agent_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
 ```

-Following is the basic command to start the evaluation. Here we are only evaluating the `osbench` for now.
+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
+LLM settings, as defined in your `config.toml`.
+- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
+like to evaluate. It could also be a release tag like `0.6.2`.
+- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
+to `CodeActAgent`.
+- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By
+default, the script evaluates every instance in the `osbench` subset. Note:
+in order to use `eval_limit`, you must also set `agent`.
+
+
+Following is the basic command to start the evaluation.

 You can update the arguments in the script `evaluation/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers` and so on.

@@ -57,5 +34,5 @@ You can update the arguments in the script `evaluation/agent_bench/scripts/run_i
 - `--eval-n-limit`: the number of examples to evaluate. For example, `100`.

 ```bash
-./evaluation/agent_bench/scripts/run_infer.sh eval_gpt35_turbo 0.6.2 CodeActAgent 1
+./evaluation/agent_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1
 ```
--- a/evaluation/agent_bench/helper.py
+++ b/evaluation/agent_bench/helper.py
@@ -14,7 +14,7 @@ def try_parse_answer(act) -> str | None:
        raw_ans = act.thought
    else:
        return None
-    agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans)
+    agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans, re.DOTALL)
    if not agent_answer:
        return None
    return agent_answer[0].strip()
--- a/evaluation/agent_bench/run_infer.py
+++ b/evaluation/agent_bench/run_infer.py
@@ -1,10 +1,9 @@
 import asyncio
-import logging
 import os
 import re
-import shutil
+import tempfile
+from typing import Any

-import docker
 import pandas as pd
 from datasets import load_dataset

@@ -16,62 +15,175 @@ from evaluation.agent_bench.helper import (
 )
 from evaluation.utils.shared import (
    EvalMetadata,
+    EvalOutput,
    make_metadata,
    prepare_dataset,
+    reset_logger_for_multiprocessing,
    run_evaluation,
 )
-from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import config, get_llm_config_arg, parse_arguments
-from opendevin.core.logger import get_console_handler
+from opendevin.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    parse_arguments,
+)
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import run_agent_controller
-from opendevin.events.action import CmdRunAction, MessageAction
-from opendevin.llm.llm import LLM
-from opendevin.runtime.docker.ssh_box import DockerSSHBox
+from opendevin.core.main import create_runtime, run_controller
+from opendevin.events.action import AgentFinishAction, CmdRunAction, MessageAction
+from opendevin.events.observation import CmdOutputObservation
+from opendevin.runtime.runtime import Runtime


-def process_instance(
+def get_config(
+    metadata: EvalMetadata,
+) -> AppConfig:
+    """Build the `AppConfig` used to evaluate one AgentBench instance.
+
+    The agent class, the iteration limit and the LLM config are taken from
+    `metadata`. The sandbox uses the `python:3.11-bookworm` image, and no
+    host workspace is mounted.
+    """
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_devin=False,
+        runtime='eventstream',
+        max_iterations=metadata.max_iterations,
+        sandbox=SandboxConfig(
+            container_image='python:3.11-bookworm',
+            enable_auto_lint=True,
+            use_host_network=False,
+        ),
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    config.set_llm_config(metadata.llm_config)
+    return config
+
+
+async def initialize_runtime(
+    runtime: Runtime,
+    instance: pd.Series,  # read for `init` and `instance_id`
+):
+    """Initialize the runtime for the agent.
+
+    This function is called before the runtime is used to run the agent.
+    It creates `/workspace` and changes into it. When the instance defines
+    an `init` command, that command is passed to `create_sh_file`, the
+    resulting script is copied into `/workspace`, and the script is run.
+    Each of these commands is asserted to exit with code 0.
+    """
+    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
+    obs: CmdOutputObservation
+
+    # Create the working directory and make it the current directory
+    action = CmdRunAction(command='mkdir -p /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    action = CmdRunAction(command='cd /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    init_cmd = instance.init
+    if init_cmd is not None:
+        script_name = f'{instance.instance_id}_init.sh'
+
+        # The script only needs to exist on the host until it has been copied
+        # into the runtime, so a temporary directory is sufficient.
+        with tempfile.TemporaryDirectory() as tmpdir:
+            host_script_path = os.path.join(tmpdir, script_name)
+            create_sh_file(host_script_path, init_cmd)
+            await runtime.copy_to(
+                host_script_path,
+                '/workspace',
+            )
+
+        logger.info(f'Running init script: {script_name}')
+        action = CmdRunAction(command=f'chmod +x ./{script_name} && ./{script_name}')
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = await runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.exit_code == 0
+
+    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
+
+
+async def complete_runtime(
+    runtime: Runtime,
+    instance: pd.Series,  # read for `get_agent_result`, `ground_truth`, `get_ground_truth`
+) -> dict[str, Any]:
+    """Complete the runtime for the agent.
+
+    This function is called after the agent has run (`process_instance`
+    awaits it once `run_controller` has returned).
+    If you need to do something in the sandbox to get the correctness metric after
+    the agent has run, modify this function.
+
+    Returns a dict with two keys:
+    - `agent_answer`: output of the instance's `get_agent_result` command, or
+      None when the instance defines no such command.
+    - `final_ans`: `instance.ground_truth` when it is set, otherwise the output
+      of the instance's `get_ground_truth` command, or None when neither exists.
+    """
+    logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
+    obs: CmdOutputObservation
+
+    agent_answer = None
+    get_agent_result_cmd = instance.get_agent_result
+    if get_agent_result_cmd is not None:
+        script_name = 'get_agent_result.sh'
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            host_script_path = os.path.join(tmpdir, script_name)
+            create_sh_file(host_script_path, get_agent_result_cmd)
+            await runtime.copy_to(
+                host_script_path,
+                '/workspace',
+            )
+            logger.info(f'Running get agent result cmd: {script_name}')
+
+        action = CmdRunAction(
+            command=f'chmod +x ./{script_name} && ./{script_name}',
+            keep_prompt=False,
+        )
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = await runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.exit_code == 0
+        agent_answer = obs.content
+    # When the instance has no `get_agent_result` command, `agent_answer` stays
+    # None here; `process_instance` then extracts the answer from the history.
+
+    final_ans = None
+    if instance.ground_truth is not None:
+        final_ans = instance.ground_truth
+    else:
+        get_ground_truth_cmd = instance.get_ground_truth
+        if get_ground_truth_cmd is not None:
+            script_name = 'get_ground_truth.sh'
+            with tempfile.TemporaryDirectory() as tmpdir:
+                host_script_path = os.path.join(tmpdir, script_name)
+                create_sh_file(host_script_path, get_ground_truth_cmd)
+                await runtime.copy_to(
+                    host_script_path,
+                    '/workspace',
+                )
+            logger.info(f'Running get ground truth cmd: {script_name}')
+
+            action = CmdRunAction(
+                command=f'chmod +x ./{script_name} && ./{script_name}',
+                keep_prompt=False,
+            )
+            logger.info(action, extra={'msg_type': 'ACTION'})
+            obs = await runtime.run_action(action)
+            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+            # NOTE(review): unlike the agent-result script above, the exit code
+            # is not asserted here -- confirm whether that is intentional.
+            final_ans = obs.content
+
+    logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
+    return {
+        'final_ans': final_ans,
+        'agent_answer': agent_answer,
+    }
+
+
+async def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
-):
-    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
+) -> EvalOutput:
+    config = get_config(metadata)

-    inst_id = instance.instance_id
-    question = instance.description
-    # create a directory for the instance's workspace
-    instance_workspace = str(os.path.join(config.workspace_base, inst_id))
-    container_inst_workspace = str(
-        os.path.join(config.workspace_mount_path_in_sandbox, inst_id)
-    )
-    if os.path.exists(instance_workspace):
-        shutil.rmtree(instance_workspace)
-    os.makedirs(instance_workspace, exist_ok=True)
-
-    # Set up the logger properly, so you can run multiprocessing to parallel the evaluation
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
-        # Set up logger
-        log_file = os.path.join(
-            metadata.eval_output_dir, 'logs', f'instance_{inst_id}.log'
-        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        # add back the console handler to print ONE line
-        logger.addHandler(get_console_handler())
-        logger.info(
-            f'Starting evaluation for instance {inst_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
-        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        file_handler = logging.FileHandler(log_file)
-        file_handler.setFormatter(
-            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
-        )
-        logger.addHandler(file_handler)
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    # =============================================
    # build instruction
@@ -84,96 +196,68 @@ def process_instance(
        'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
        'For example: The answer to the question is <solution> 42 </solution>.\n'
        '# Problem \n'
-        f'{question}\n\n'
+        f'{instance.description}\n\n'
    )
    instruction += (
        'IMPORTANT: You should ONLY interact with the environment provided '
        'to you AND NEVER ASK FOR HUMAN HELP.\n'
    )
    # NOTE: You can actually set slightly different instruction for different agents
-    instruction += INST_SUFFIXES[agent.__class__.__name__]
+    instruction += INST_SUFFIXES[metadata.agent_class]

    # =============================================
    # create sandbox and run the agent
    # =============================================

-    sandbox = DockerSSHBox()
-    sandbox.execute(f'cd {inst_id}')
+    runtime: Runtime = await create_runtime(config, sid=instance.instance_id)

-    init_cmd = instance.init
-    if init_cmd is not None:
-        scpt_name = f'{instance.instance_id}_init.sh'
-        scpt_path = os.path.join(container_inst_workspace, scpt_name)
-        host_scpt_path = os.path.join(instance_workspace, scpt_name)
-        create_sh_file(host_scpt_path, init_cmd)
-        logger.info(f'Running init script: {scpt_path}')
-        _, init_res = sandbox.execute(scpt_path)
-        logger.info(f'Init script result: {init_res}')
+    await initialize_runtime(runtime, instance=instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    state: State | None = asyncio.run(
-        run_agent_controller(
-            agent,
-            instruction,
-            max_iterations=metadata.max_iterations,
-            fake_user_response_fn=FAKE_RESPONSES[agent.__class__.__name__],
-            sandbox=sandbox,
-            sid=inst_id,
-        )
+    state: State | None = await run_controller(
+        config=config,
+        task_str=instruction,
+        runtime=runtime,
+        fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
    )
-
    if state is None:
        raise ValueError('State should not be None.')

-    # get the ground truth
-    # OSBenchSSHBox.get_ground_truth(instance, state)
-
    # =============================================
    # result evaluation
    # =============================================

-    agent_answer = ''
-    get_agent_result_cmd = instance.get_agent_result
-    if get_agent_result_cmd is not None:
-        scpt_name = f'{instance.instance_id}_get_agent_result.sh'
-        scpt_path = os.path.join(container_inst_workspace, scpt_name)
-        host_scpt_path = os.path.join(instance_workspace, scpt_name)
-        create_sh_file(host_scpt_path, get_agent_result_cmd)
-        logger.info(f'Running get agent result cmd: {scpt_path}')
-        _, agent_answer = sandbox.execute(scpt_path)
-    else:
+    return_val = await complete_runtime(runtime, instance)
+    agent_answer = return_val['agent_answer']
+    final_ans = return_val['final_ans']
+
+    # If the agent answer is not found, retrieve it from the history
+    if agent_answer is None:
+        agent_answer = ''
        logger.info('Retrieving agent answer from history.')
        raw_ans = ''

        # retrieve the last agent message or thought
        for event in state.history.get_events(reverse=True):
-            if isinstance(event, MessageAction) and event.source == 'agent':
-                raw_ans = event.content
-            elif isinstance(event, CmdRunAction) and event.source == 'agent':
-                raw_ans = event.thought
+            if event.source == 'agent':
+                if isinstance(event, AgentFinishAction):
+                    raw_ans = event.thought
+                    break
+                elif isinstance(event, MessageAction):
+                    raw_ans = event.content
+                    break
+                elif isinstance(event, CmdRunAction):
+                    raw_ans = event.thought
+                    break

        # parse the answer for a solution tag
-        agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans)
+        agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans, re.DOTALL)
        if len(agent_answer) == 0:
            logger.warning(f'Failed to parse model answer: {raw_ans}')
            agent_answer = raw_ans
        else:
            agent_answer = agent_answer[0]

-    final_ans = ''
-    if instance.ground_truth is not None:
-        final_ans = instance.ground_truth
-    else:
-        get_ground_truth_cmd = instance.get_ground_truth
-        if get_ground_truth_cmd is not None:
-            scpt_name = f'{instance.instance_id}_get_ground_truth.sh'
-            scpt_path = os.path.join(container_inst_workspace, scpt_name)
-            host_scpt_path = os.path.join(instance_workspace, scpt_name)
-            create_sh_file(host_scpt_path, get_ground_truth_cmd)
-            logger.info(f'Running get ground truth cmd: {scpt_path}')
-            sandbox.execute(f'cd {container_inst_workspace}')
-            _, final_ans = sandbox.execute(scpt_path)
-
    comparison_method = instance.comparison_method
    logger.info(
        f'Final message: {agent_answer} | Ground truth: {final_ans} | Comparison method: {comparison_method}'
@@ -188,58 +272,49 @@ def process_instance(
    metrics = state.metrics.get() if state.metrics else None

    # Save the output
-    output = {
-        'instance_id': inst_id,
-        'instance': instance.to_dict(),
-        'instruction': instruction,
-        'metadata': metadata.model_dump(),
-        'history': histories,
-        'metrics': metrics,
-        'error': state.last_error if state and state.last_error else None,
-        'test_result': {
+    output = EvalOutput(
+        instance_id=instance.instance_id,
+        instance=instance.to_dict(),
+        instruction=instruction,
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+        test_result={
            'agent_answer': agent_answer,
            'final_answer': final_ans,
            'check_method': comparison_method,
            'result': test_result,
        },
-    }
-
-    # clean up
-    if os.path.exists(instance_workspace):
-        shutil.rmtree(instance_workspace)
-    # Close the sandbox
-    try:
-        sandbox.close()
-    except docker.errors.NotFound as e:
-        logger.error(f'Failed to close sandbox: {e}')
+    )
    return output


 if __name__ == '__main__':
-    id_column = 'instance_id'
    args = parse_arguments()
    dataset = load_dataset('iFurySt/AgentBench')
    agent_bench_tests = dataset['osbench'].to_pandas()

-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
-    logger.info(f'Config for evaluation: {config}')
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    metadata = make_metadata(
        llm_config,
-        args.dataset_name,
+        'AgentBench-OS',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
-    instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
+    instances = prepare_dataset(agent_bench_tests, output_file, args.eval_n_limit)

-    run_evaluation(
-        instances,
-        metadata,
-        output_file,
-        args.eval_num_workers,
-        process_instance,
-        id_column,
+    asyncio.run(
+        run_evaluation(
+            instances, metadata, output_file, args.eval_num_workers, process_instance
+        )
    )
--- a/evaluation/agent_bench/scripts/run_infer.sh
+++ b/evaluation/agent_bench/scripts/run_infer.sh
--- a/evaluation/biocoder/README.md
+++ b/evaluation/biocoder/README.md
@@ -2,15 +2,12 @@

 Implements evaluation of agents on BioCoder from the BioCoder benchmark introduced in [BioCoder: A Benchmark for Bioinformatics Code Generation with Large Language Models](https://arxiv.org/abs/2308.16458). Please see [here](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/tasks/humanevalpack.py) for the reference implementation used in the paper.

-## Setup Environment
+## Setup Environment and LLM Configuration

-Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to setup local develop environment for OpenDevin.
-
-
-## Configure OpenDevin and your LLM
-Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md) for how to set this up.
+Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.

 ## BioCoder Docker Image
+
 In the opendevin branch of the Biocoder repository, we have slightly modified our original Docker image to work with the OpenDevin environment. In the Docker image are testing scripts (`/testing/start_test_opendevin.py` and aux files in `/testing_files/`) to assist with evaluation. Additionally, we have installed all dependencies, including OpenJDK, mamba (with Python 3.6), and many system libraries. Notably, we have **not** packaged all repositories into the image, so they are downloaded at runtime.

 **Before first execution, pull our Docker image with the following command**
@@ -41,12 +38,12 @@ to `CodeActAgent`.
 - `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default it infers all instances.

 Let's say you'd like to run 1 instance using `eval_gpt4o_2024_05_13` and CodeActAgent
-with OpenDevin version 0.6.2, then your command would be:
+with the current OpenDevin version, then your command would be:

 ## Examples

 ```bash
-./evaluation/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 0.6.2 CodeActAgent 1
+./evaluation/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 HEAD CodeActAgent 1
 ```

 ## Reference
--- a/evaluation/biocoder/biocoder_env_box.py
+++ b/evaluation/biocoder/biocoder_env_box.py
@@ -1,385 +0,0 @@
-import json
-import os
-import re
-import sys
-from collections import defaultdict
-from dataclasses import dataclass
-
-from datasets import load_dataset
-
-from opendevin.core.config import config
-from opendevin.core.logger import opendevin_logger as logger
-from opendevin.runtime.docker.ssh_box import DockerSSHBox
-from opendevin.runtime.plugins import (
-    JupyterRequirement,
-    PluginRequirement,
-    SWEAgentCommandsRequirement,
-)
-
-BIOCODER_BENCH_CONTAINER_IMAGE = 'public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0'
-
-
-@dataclass
-class BiocoderData:
-    filePath: str
-    numLines: int
-    lineStart: int
-    lineEnd: int
-    signature: str
-    comment: str
-    content: str
-    repository: str
-    promptSummaryOnly: str
-    contextCode: str
-    goldenCode: str
-    test_case_id: str
-    language: str
-
-    def to_dict(self):
-        return {
-            'filePath': self.filePath,
-            'numLines': self.numLines,
-            'lineStart': self.lineStart,
-            'lineEnd': self.lineEnd,
-            'signature': self.signature,
-            'comment': self.comment,
-            'content': self.content,
-            'repository': self.repository,
-            'promptSummaryOnly': self.promptSummaryOnly,
-            'contextCode': self.contextCode,
-            'goldenCode': self.goldenCode,
-            'test_case_id': self.test_case_id,
-            'language': self.language,
-        }
-
-
-def get_likely_indent_size(array_of_tabs) -> int:
-    sizes = defaultdict(int)
-
-    for i in range(len(array_of_tabs) - 1):
-        diff = array_of_tabs[i + 1] - array_of_tabs[i]
-        if diff > 0:
-            sizes[diff] += 1
-    if len(sizes) == 0:
-        return 4
-    return int(max(sizes, key=sizes.get))
-
-
-class BiocoderSSHBox(DockerSSHBox):
-    def __init__(
-        self,
-        container_image: str,
-        timeout: int = 120,
-        sid: str | None = None,
-        biocoder_instance_id: str | None = None,
-        biocoder_instance: BiocoderData | None = None,
-        skip_workspace_mount: bool = True,
-        sandbox_plugins: list[PluginRequirement] = [],  # noqa: B006
-        biocoder_cache_folder: str = 'biocoder_cache',
-        workspace_dir_name: str | None = None,
-    ):
-        if biocoder_instance_id is None:
-            raise ValueError('biocoder_instance_id must be provided')
-        self.biocoder_instance_id = biocoder_instance_id
-        self.biocoder_instance = biocoder_instance
-        self.skip_workspace_mount = skip_workspace_mount
-        self.biocoder_cache_folder = biocoder_cache_folder
-        self.first_line_after_removed = None
-        self.workspace_dir_name = workspace_dir_name
-        self.workspace_base = config.workspace_base
-        self.workspace_mount_path = config.workspace_mount_path
-        # self.workspace_dir_name_host = os.path.join(config.workspace_base, workspace_dir_name)
-
-        self.context_path = None
-        self.generated_path = None
-        self.golden_path = None
-
-        assert (
-            container_image is not None
-        ), 'container_image is required for BiocoderBenchSSHBox!'
-        super().__init__(container_image, timeout, sid)
-        self.init_plugins(sandbox_plugins)
-
-    @property
-    def volumes(self):
-        if self.skip_workspace_mount:
-            return {
-                k: v
-                for k, v in super().volumes.items()
-                if not v['bind'] == self.sandbox_workspace_dir
-            }
-        return super().volumes
-
-    def get_target_filepath(self):
-        target_filepath = os.path.join(
-            self.workspace_mount_path,
-            self.biocoder_instance.repository.split('/')[1],
-            self.biocoder_instance.filePath,
-        )
-        return target_filepath
-
-    def get_changed_code(self, include_signature=False):
-        # copies changed code into /testing_files/
-        # Note that this does NOT copy the function signature
-        target_filepath = self.get_target_filepath()
-        selected_lines = []
-        offset = 1 if include_signature else 0
-        if self.first_line_after_removed is None:
-            logger.warning('First line after removed is None')
-        with open(target_filepath, 'r') as f:
-            lines = f.read().split('\n')
-            for i in range(self.biocoder_instance.lineStart - offset, len(lines)):
-                if lines[i].strip() == self.first_line_after_removed.strip():
-                    break
-                selected_lines.append(lines[i])
-        text = '\n'.join(selected_lines)
-        return text
-
-    def copy_changed_code(self):
-        changed_code = self.get_changed_code(include_signature=True)
-        with open(self.generated_path, 'w') as f:
-            f.write(changed_code)
-        exit_code, output = self.execute_and_check(
-            f'cp -r /workspace/{self.biocoder_cache_folder}/* /testing_files',
-            'Failed to copy the files',
-        )
-
-    def remove_code(self):
-        comment_prefix = {'python': '#', 'java': '//'}
-
-        target_filepath = self.get_target_filepath()
-        line_start = self.biocoder_instance.lineStart
-        line_end = self.biocoder_instance.lineEnd
-        with open(target_filepath, 'r') as f:
-            lines = f.read().split('\n')
-            # print("="*10+"ORIGINAL"+"="*10)
-            # print("\n".join(lines))
-            signature_line = lines[line_start - 1]
-
-            # get the number of tabs
-            def get_indent_size(s: str):
-                return len(re.match(r'\s*', s).group())
-
-            indent_sizes = list(map(get_indent_size, lines))
-            indent_size = get_likely_indent_size(indent_sizes)
-            comment_indent_size = get_indent_size(signature_line) + indent_size
-            lines = (
-                lines[:line_start]
-                + [
-                    f"{' '*comment_indent_size+comment_prefix[self.biocoder_instance.language.lower()]}TODO: replace with your code here"
-                ]
-                + ([''] * 2)
-                + lines[line_end:]
-            )
-        first_line_after_removed_index = line_start
-        while len(
-            lines[first_line_after_removed_index].strip()
-        ) == 0 and first_line_after_removed_index < len(lines):
-            first_line_after_removed_index += 1
-        self.first_line_after_removed = lines[first_line_after_removed_index]
-        # print("FIRST LINE AFTER REMOVED: ", self.first_line_after_removed)
-
-        with open(target_filepath, 'w') as f:
-            f.write('\n'.join(lines))
-
-        # with open(target_filepath, 'r') as f:
-        #     print("="*10+"MODIFIED"+"="*10)
-        #     print(f.read())
-
-    def execute_and_check(self, cmd: str, error_msg: str) -> tuple[int, str]:
-        exit_code, output = self.execute(cmd)
-        if exit_code != 0:
-            logger.error(error_msg)
-            sys.exit(1)
-        return exit_code, output
-
-    @classmethod
-    def get_box_for_instance(
-        cls,
-        instance,
-        workspace_dir_name=None,
-        skip_workspace_mount: bool = False,
-        workspace_mount_path: str | None = None,
-        sandbox_plugins: list[PluginRequirement] = [],  # noqa: B006
-    ) -> 'BiocoderSSHBox':
-        """This method initializes a container image, then runs some initialization commands"""
-        if workspace_dir_name is None:
-            workspace_dir_name = f'{instance.repository}__{instance.test_case_id[:10]}__{os.getpid()}'.replace(
-                '/', '__'
-            )
-
-        workspace_base = str(os.path.join(config.workspace_base, workspace_dir_name))
-        old_workspace_base = config.workspace_base
-        old_workspace_mount_path = config.workspace_mount_path
-
-        try:
-            config.workspace_base = workspace_base
-            config.workspace_mount_path = workspace_base
-
-            # linting python after editing helps LLM fix indentations
-            config.enable_auto_lint = True
-
-            # create folder for transferring files back/forth
-            biocoder_cache_folder = 'biocoder_cache'
-            if not os.path.exists(os.path.join(workspace_base, biocoder_cache_folder)):
-                os.makedirs(
-                    os.path.join(workspace_base, biocoder_cache_folder), exist_ok=True
-                )
-
-            file_ext = {
-                'python': 'py',
-                'java': 'java',
-                'c': 'c',
-                'cpp': 'cpp',
-                'javascript': 'js',
-                'typescript': 'ts',
-            }[instance.language.lower()]
-
-            context_path = os.path.join(
-                workspace_base, biocoder_cache_folder, 'context.' + file_ext
-            )
-            generated_path = os.path.join(
-                workspace_base, biocoder_cache_folder, 'generated.' + file_ext
-            )
-            golden_path = os.path.join(
-                workspace_base, biocoder_cache_folder, 'golden.' + file_ext
-            )
-
-            # print(instance.contextCode)
-            with open(context_path, 'w') as f:
-                f.write(instance.contextCode)
-            with open(generated_path, 'w') as f:
-                f.write(instance.goldenCode)
-            with open(golden_path, 'w') as f:
-                f.write(instance.goldenCode)
-
-            testcase_json = {
-                'test_case_id': instance.test_case_id,
-                'num_cases': 1000,
-                'language': instance.language.lower(),
-            }
-
-            with open(
-                os.path.join(
-                    workspace_base, biocoder_cache_folder, 'testcase_biocoder.json'
-                ),
-                'w',
-            ) as f:
-                f.write(json.dumps(testcase_json, indent=4))
-
-            # linting python after editing helps LLM fix indentations
-            config.enable_auto_lint = True
-
-            sandbox = cls(
-                container_image=BIOCODER_BENCH_CONTAINER_IMAGE,
-                biocoder_instance_id=instance.test_case_id,
-                biocoder_instance=instance,
-                skip_workspace_mount=skip_workspace_mount,
-                sandbox_plugins=sandbox_plugins,
-                biocoder_cache_folder=biocoder_cache_folder,
-                workspace_dir_name=workspace_dir_name,
-            )
-        except Exception:
-            raise
-        finally:
-            config.workspace_base = old_workspace_base
-            config.workspace_mount_path = old_workspace_mount_path
-
-        sandbox.context_path = context_path
-        sandbox.generated_path = generated_path
-        sandbox.golden_path = golden_path
-
-        logger.info(f'SSH box started for instance {instance.test_case_id}.')
-        # cd to the workspace
-        exit_code, output = sandbox.execute_and_check(
-            'cd /workspace', 'Failed to cd to workspace'
-        )
-        logger.info(f'cd to workspace: {output}')
-
-        # download repository archive
-        repository_url = f"https://biocoder.lilbillbiscuit.com/repos/{instance.repository.split('/')[1]}.zip"
-        exit_code, output = sandbox.execute_and_check(
-            'wget -O repo.zip ' + repository_url, 'Failed to download the repository'
-        )
-        logger.info(f'Downloaded the repository: {output}')
-        exit_code, output = sandbox.execute_and_check(
-            'unzip -o -q repo.zip', 'Failed to unzip the repository'
-        )
-        logger.info(f'Unzipped the repository: {output}')
-
-        # copy the context, generated and golden files to the /testing_files folder
-        exit_code, output = sandbox.execute_and_check(
-            f'cp -r /workspace/{biocoder_cache_folder}/* /testing_files',
-            'Failed to copy the files',
-        )
-
-        # chmod 777
-        exit_code, output = sandbox.execute_and_check(
-            'chmod -R 777 /workspace',
-            'Failed to chmod the files',
-        )
-
-        return sandbox
-
-
-if __name__ == '__main__':
-    biocoder_dataset = load_dataset('Lilbillbiscuit/biocoder_public')
-    EXAMPLE_INSTANCE = biocoder_dataset['test'][0]
-    EXAMPLE_INSTANCE = BiocoderData(**EXAMPLE_INSTANCE)
-
-    sandbox = BiocoderSSHBox.get_box_for_instance(
-        instance=EXAMPLE_INSTANCE,
-        workspace_mount_path='/home/ubuntu/OpenDevinBioCoder/workspace',
-        skip_workspace_mount=False,
-        sandbox_plugins=[JupyterRequirement(), SWEAgentCommandsRequirement()],
-    )
-
-    # PRE TEST
-    exit_code, output = sandbox.execute_and_check(
-        'cd /testing',
-        'Failed to cd /testing',
-    )
-    logger.info(f'cd $REPO_PATH: {output}')
-
-    exit_code, output = sandbox.execute_and_check(
-        'whoami',
-        'Failed to run whoami',
-    )
-    logger.info(f'whoami: {output}')
-
-    # TEST
-    exit_code, output = sandbox.execute(
-        '/home/devin/mambaforge/bin/mamba run -n test python3 /testing/start_test_opendevin.py'
-    )
-    assert exit_code == 0, 'Expected exit code 0 (this should have passed)'
-    logger.info(f'$TEST_CMD:\n{output}')
-
-    exit_code, output = sandbox.execute_and_check(
-        'cat /testing_files/results_biocoder.json', 'Failed to read the result file'
-    )
-
-    print(output)
-    json_obj = json.loads(output)
-    if json_obj['result'] == 'pass':
-        print('PASS')
-    else:
-        print('FAIL')
-
-    sys.stdout.flush()
-    try:
-        while True:
-            try:
-                user_input = input('>>> ')
-            except EOFError:
-                logger.info('Exiting...')
-                break
-            if user_input.lower() == 'exit':
-                logger.info('Exiting...')
-                break
-            exit_code, output = sandbox.execute(user_input)
-            logger.info('exit code: %d', exit_code)
-            logger.info(output)
-            sys.stdout.flush()
-    except KeyboardInterrupt:
-        logger.info('Exiting...')
-    sandbox.close()
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`The files in this directory configure a development container for GitHub Codespaces.`