mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-04-29 03:00:45 -04:00
Compare commits
184 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 2e60d25eae | |||
| 0627af8578 | |||
| 21ea9953b3 | |||
| 70dd705418 | |||
| b049bc9688 | |||
| 1d49ef253b | |||
| 938ed027c2 | |||
| 0cf4e1ecf3 | |||
| 93433fa849 | |||
| 9fc522a610 | |||
| d41699c133 | |||
| a562a7ac7d | |||
| 5f177b6f88 | |||
| bd68249fba | |||
| c8fd039173 | |||
| c0687db1af | |||
| 2953438c98 | |||
| 29258cd62a | |||
| 0efd1c7a87 | |||
| 34072102d6 | |||
| b9b5cf7a61 | |||
| 84a6e90dc2 | |||
| 2533efabbb | |||
| 7cefd32fd0 | |||
| 570fd6e483 | |||
| a52df6a272 | |||
| 563ebd406d | |||
| 1eb3bdea95 | |||
| eb182f492e | |||
| b1ea204c5b | |||
| 8b77e8a0ff | |||
| 3328669b89 | |||
| 9ed95abf83 | |||
| b5d3fcaba8 | |||
| a29c795418 | |||
| f07280153a | |||
| 437e0c76bf | |||
| c0adca1e30 | |||
| 1c813a2fa0 | |||
| 275ea706cf | |||
| 3301beffec | |||
| 6c4cce01a7 | |||
| e62a7c08f8 | |||
| f6317a3607 | |||
| c4eb8e9fc8 | |||
| 618b124e0c | |||
| 1f6e86c932 | |||
| 99f6f8899d | |||
| f28a6db2c6 | |||
| 22c7bca556 | |||
| d0217b84ef | |||
| 056b66df65 | |||
| 422c3194c4 | |||
| e3e942370e | |||
| 006842bb88 | |||
| e3b180e702 | |||
| f586911ecf | |||
| da4dc15e76 | |||
| cef0d13c43 | |||
| fcaf0d2a40 | |||
| 1e2d5b57fd | |||
| 2702c09477 | |||
| b71dc38855 | |||
| 9576916a49 | |||
| 6f345e82f2 | |||
| 690290697a | |||
| 98276cf733 | |||
| 4862661732 | |||
| caece67ef8 | |||
| 4a1111d497 | |||
| fe7b5f12a7 | |||
| 625de5668f | |||
| d50a8447ad | |||
| 405c8a0456 | |||
| 547c510848 | |||
| 800e25eac1 | |||
| d92bbd97d7 | |||
| 549032176c | |||
| e4319da3f5 | |||
| 61a07a5864 | |||
| 4ed5967442 | |||
| c2c4e9f37f | |||
| 723d2e7c36 | |||
| 8a5bd21d77 | |||
| 79e76e9053 | |||
| 606a4b41d5 | |||
| 445f290beb | |||
| 41a8bb3cf1 | |||
| da17665cab | |||
| 4099e48122 | |||
| c3d4f6495f | |||
| a0f1cd2cdb | |||
| aeed1ea871 | |||
| cc6128522d | |||
| 78e700ef94 | |||
| 6c8aae0d12 | |||
| 784f644b1d | |||
| 669fe40229 | |||
| 733d4f5924 | |||
| 812e5d1dc9 | |||
| ce8a11a62f | |||
| f3c23e8039 | |||
| a61ac5a214 | |||
| 04877f8caf | |||
| 15697bed5a | |||
| be6e6e3add | |||
| 6b16a5da0b | |||
| dada004fac | |||
| 9cf2b5b74b | |||
| 3a21198424 | |||
| 71cb8b02dc | |||
| 08b44f0d60 | |||
| 82f94b99c4 | |||
| fb9ad04362 | |||
| 4bb92bdd02 | |||
| 46c9c9d5c6 | |||
| b004678345 | |||
| 93b9fd028d | |||
| 23493a2e36 | |||
| 8bfa61f3e4 | |||
| fa6c12473e | |||
| b5e4cddce3 | |||
| ac27ded81f | |||
| c555fb6840 | |||
| 452da5663d | |||
| b60890c064 | |||
| 1761b88af5 | |||
| ff6ddc831f | |||
| d6642c26be | |||
| 3bff8cf88a | |||
| bf39af895e | |||
| de74c7a0a1 | |||
| cf910dfa9d | |||
| 692fe21d60 | |||
| 9d41314d1a | |||
| f70c5afb6e | |||
| 2250947919 | |||
| 5a5713009f | |||
| 728131ff1d | |||
| 135da0ea2b | |||
| f689d5dcc3 | |||
| f80ecec772 | |||
| cf3d2298da | |||
| b04c69858c | |||
| 5c438432d6 | |||
| 70b2238f5e | |||
| f991069b00 | |||
| a66ede2ee6 | |||
| d97e92e714 | |||
| 9f12c77bac | |||
| 2c02ab9586 | |||
| c897791024 | |||
| 01ce1e35b5 | |||
| 88d53e781f | |||
| 257698e89b | |||
| 7111e8ee14 | |||
| 2c982582d7 | |||
| 0b0952547d | |||
| a2ec1ded26 | |||
| dc45b14720 | |||
| 4b6a2ff3c4 | |||
| f45a2ff04e | |||
| f5a4fb80a3 | |||
| 59d05f3934 | |||
| 29483c0620 | |||
| 42abc727d7 | |||
| 1fd2e511f8 | |||
| e3e437fcc2 | |||
| 8f76587e5c | |||
| 149dac8e5b | |||
| ec2535c57c | |||
| 471703bea6 | |||
| 4b4fa1c390 | |||
| 3c0975d71d | |||
| 17b2eb58e4 | |||
| 7cd3431beb | |||
| cf531518a5 | |||
| 8ea66a82c8 | |||
| 59042bb0a9 | |||
| b501083425 | |||
| 653a3c0f11 | |||
| b834b354e5 | |||
| 214f728d32 | |||
| 9d7adefe0c |
@@ -0,0 +1 @@
|
||||
The files in this directory configure a development container for GitHub Codespaces.
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"name": "OpenDevin Codespaces",
|
||||
"image": "mcr.microsoft.com/devcontainers/universal",
|
||||
"customizations":{
|
||||
"vscode":{
|
||||
"extensions": [
|
||||
"ms-python.python"
|
||||
]
|
||||
}
|
||||
},
|
||||
"onCreateCommand": "sh ./.devcontainer/on_create.sh",
|
||||
"postCreateCommand": "make build",
|
||||
"postStartCommand": "USE_HOST_NETWORK=True nohup bash -c 'make run &'"
|
||||
|
||||
}
|
||||
@@ -0,0 +1,8 @@
|
||||
#!/usr/bin/env bash
|
||||
sudo apt update
|
||||
sudo apt install -y netcat
|
||||
sudo add-apt-repository -y ppa:deadsnakes/ppa
|
||||
sudo apt install -y python3.11
|
||||
curl -sSL https://install.python-poetry.org | python3.11 -
|
||||
# chromadb requires SQLite > 3.35 but SQLite in Python3.11.9 comes with 3.31.1
|
||||
sudo cp /opt/conda/lib/libsqlite3.so.0 /lib/x86_64-linux-gnu/libsqlite3.so.0
|
||||
@@ -12,7 +12,7 @@ body:
|
||||
label: Is there an existing issue for the same bug?
|
||||
description: Please check if an issue already exists for the bug you encountered.
|
||||
options:
|
||||
- label: I have checked the troubleshooting document at https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting
|
||||
- label: I have checked the troubleshooting document at https://docs.all-hands.dev/modules/usage/troubleshooting
|
||||
required: true
|
||||
- label: I have checked the existing issues.
|
||||
required: true
|
||||
|
||||
@@ -1,5 +1,11 @@
|
||||
**What is the problem that this fixes or functionality that this introduces? Does it fix any open issues?**
|
||||
|
||||
**Give a brief summary of what the PR does, explaining any non-trivial design decisions**
|
||||
|
||||
|
||||
---
|
||||
**Give a summary of what the PR does, explaining any non-trivial design decisions**
|
||||
|
||||
|
||||
|
||||
---
|
||||
**Other references**
|
||||
|
||||
@@ -25,7 +25,7 @@ jobs:
|
||||
- name: Set up environment
|
||||
run: |
|
||||
curl -sSL https://install.python-poetry.org | python3 -
|
||||
poetry install --without evaluation
|
||||
poetry install --without evaluation,llama-index
|
||||
poetry run playwright install --with-deps chromium
|
||||
wget https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json -P /tmp/llama_index/models--BAAI--bge-small-en-v1.5/snapshots/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/1_Pooling/
|
||||
- name: Run tests
|
||||
|
||||
+231
-13
@@ -1,4 +1,4 @@
|
||||
name: Build Publish and Test Docker Image
|
||||
name: Build Publish and Test Runtime Image
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
@@ -77,14 +77,47 @@ jobs:
|
||||
name: ${{ matrix.image }}-docker-image-${{ matrix.platform }}
|
||||
path: /tmp/${{ matrix.image }}_image_${{ matrix.platform }}.tar
|
||||
|
||||
test-for-sandbox:
|
||||
name: Test for Sandbox
|
||||
ghcr_build_runtime:
|
||||
runs-on: ubuntu-latest
|
||||
needs: ghcr_build
|
||||
env:
|
||||
PERSIST_SANDBOX: "false"
|
||||
|
||||
outputs:
|
||||
tags: ${{ steps.capture-tags.outputs.tags }}
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
image: ["od_runtime"]
|
||||
base_image: ["ubuntu:22.04"]
|
||||
platform: ["amd64", "arm64"]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Free Disk Space (Ubuntu)
|
||||
uses: jlumbroso/free-disk-space@main
|
||||
with:
|
||||
# this might remove tools that are actually needed,
|
||||
# if set to "true" but frees about 6 GB
|
||||
tool-cache: true
|
||||
# all of these default to true, but feel free to set to
|
||||
# "false" if necessary for your workflow
|
||||
android: true
|
||||
dotnet: true
|
||||
haskell: true
|
||||
large-packages: true
|
||||
docker-images: false
|
||||
swap-storage: true
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
id: buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Install poetry via pipx
|
||||
run: pipx install poetry
|
||||
@@ -98,16 +131,89 @@ jobs:
|
||||
- name: Install Python dependencies using Poetry
|
||||
run: make install-python-dependencies
|
||||
|
||||
- name: Download sandbox Docker image
|
||||
- name: Create source distribution and Dockerfile
|
||||
run: poetry run python3 opendevin/runtime/utils/runtime_build.py --base_image ${{ matrix.base_image }} --build_folder containers/runtime
|
||||
|
||||
- name: Build and export image
|
||||
id: build
|
||||
run: ./containers/build.sh ${{ matrix.image }} ${{ github.repository_owner }} ${{ matrix.platform }}
|
||||
|
||||
- name: Capture tags
|
||||
id: capture-tags
|
||||
run: |
|
||||
tags=$(cat tags.txt)
|
||||
echo "tags=$tags"
|
||||
echo "tags=$tags" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Upload Docker image as artifact
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ matrix.image }}-docker-image-${{ matrix.platform }}
|
||||
path: /tmp/${{ matrix.image }}_image_${{ matrix.platform }}.tar
|
||||
|
||||
test_runtime:
|
||||
name: Test Runtime
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ghcr_build_runtime, ghcr_build]
|
||||
env:
|
||||
PERSIST_SANDBOX: "false"
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
runtime_type: ["eventstream", "server"]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Free Disk Space (Ubuntu)
|
||||
uses: jlumbroso/free-disk-space@main
|
||||
with:
|
||||
# this might remove tools that are actually needed,
|
||||
# when set to "true" but frees about 6 GB
|
||||
tool-cache: true
|
||||
|
||||
# all of these default to true, but feel free to set to
|
||||
# "false" if necessary for your workflow
|
||||
android: true
|
||||
dotnet: true
|
||||
haskell: true
|
||||
large-packages: true
|
||||
swap-storage: true
|
||||
|
||||
- name: Install poetry via pipx
|
||||
run: pipx install poetry
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
cache: "poetry"
|
||||
|
||||
- name: Install Python dependencies using Poetry
|
||||
run: make install-python-dependencies
|
||||
|
||||
- name: Download Runtime Docker image
|
||||
if: matrix.runtime_type == 'eventstream'
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: od_runtime-docker-image-amd64
|
||||
path: /tmp/
|
||||
|
||||
- name: Download Sandbox Docker image
|
||||
if: matrix.runtime_type == 'server'
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: sandbox-docker-image-amd64
|
||||
path: /tmp/
|
||||
|
||||
- name: Load sandbox image and run sandbox tests
|
||||
- name: Load Runtime image and run runtime tests
|
||||
run: |
|
||||
# Load the Docker image and capture the output
|
||||
output=$(docker load -i /tmp/sandbox_image_amd64.tar)
|
||||
if [ "${{ matrix.runtime_type }}" == "eventstream" ]; then
|
||||
output=$(docker load -i /tmp/od_runtime_image_amd64.tar)
|
||||
else
|
||||
output=$(docker load -i /tmp/sandbox_image_amd64.tar)
|
||||
fi
|
||||
|
||||
# Extract the first image name from the output
|
||||
image_name=$(echo "$output" | grep -oP 'Loaded image: \K.*' | head -n 1)
|
||||
@@ -115,14 +221,14 @@ jobs:
|
||||
# Print the full name of the image
|
||||
echo "Loaded Docker image: $image_name"
|
||||
|
||||
SANDBOX_CONTAINER_IMAGE=$image_name TEST_IN_CI=true poetry run pytest --cov=agenthub --cov=opendevin --cov-report=xml -s ./tests/unit/test_sandbox.py
|
||||
TEST_RUNTIME=${{ matrix.runtime_type }} SANDBOX_USER_ID=$(id -u) SANDBOX_CONTAINER_IMAGE=$image_name TEST_IN_CI=true poetry run pytest --cov=agenthub --cov=opendevin --cov-report=xml -s ./tests/unit/test_runtime.py
|
||||
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v4
|
||||
env:
|
||||
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
||||
|
||||
integration-tests-on-linux:
|
||||
integration_tests_on_linux:
|
||||
name: Integration Tests on Linux
|
||||
runs-on: ubuntu-latest
|
||||
needs: ghcr_build
|
||||
@@ -174,10 +280,11 @@ jobs:
|
||||
env:
|
||||
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
||||
|
||||
|
||||
ghcr_push:
|
||||
runs-on: ubuntu-latest
|
||||
# don't push if integration tests or sandbox tests fail
|
||||
needs: [ghcr_build, integration-tests-on-linux, test-for-sandbox]
|
||||
needs: [ghcr_build, test_runtime, integration_tests_on_linux]
|
||||
if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')
|
||||
|
||||
env:
|
||||
@@ -223,6 +330,78 @@ jobs:
|
||||
docker push $image_name:${tag}_${{ matrix.platform }}
|
||||
done
|
||||
|
||||
ghcr_push_runtime:
|
||||
runs-on: ubuntu-latest
|
||||
# don't push if runtime tests fail
|
||||
needs: [ghcr_build_runtime, test_runtime, integration_tests_on_linux]
|
||||
if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')
|
||||
|
||||
env:
|
||||
tags: ${{ needs.ghcr_build_runtime.outputs.tags }}
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
image: ["od_runtime"]
|
||||
platform: ["amd64", "arm64"]
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Free Disk Space (Ubuntu)
|
||||
uses: jlumbroso/free-disk-space@main
|
||||
with:
|
||||
tool-cache: true
|
||||
android: true
|
||||
dotnet: true
|
||||
haskell: true
|
||||
large-packages: true
|
||||
docker-images: false
|
||||
swap-storage: true
|
||||
|
||||
- name: Login to GHCR
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.repository_owner }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Download Docker images
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: ${{ matrix.image }}-docker-image-${{ matrix.platform }}
|
||||
path: /tmp/${{ matrix.platform }}
|
||||
|
||||
- name: List downloaded files
|
||||
run: |
|
||||
ls -la /tmp/${{ matrix.platform }}
|
||||
file /tmp/${{ matrix.platform }}/*
|
||||
|
||||
- name: Load images and push to registry
|
||||
run: |
|
||||
mv /tmp/${{ matrix.platform }}/${{ matrix.image }}_image_${{ matrix.platform }}.tar ./${{ matrix.image }}_image_${{ matrix.platform }}.tar
|
||||
if ! loaded_image=$(docker load -i ${{ matrix.image }}_image_${{ matrix.platform }}.tar | grep "Loaded image:" | head -n 1 | awk '{print $3}'); then
|
||||
echo "Failed to load Docker image"
|
||||
exit 1
|
||||
fi
|
||||
echo "loaded image = $loaded_image"
|
||||
tags=$(echo ${tags} | tr ' ' '\n')
|
||||
image_name=$(echo "ghcr.io/${{ github.repository_owner }}/${{ matrix.image }}" | tr '[:upper:]' '[:lower:]')
|
||||
echo "image name = $image_name"
|
||||
for tag in $tags; do
|
||||
echo "tag = $tag"
|
||||
if [ -n "$image_name" ]; then
|
||||
docker tag $loaded_image $image_name:${tag}_${{ matrix.platform }}
|
||||
docker push $image_name:${tag}_${{ matrix.platform }}
|
||||
else
|
||||
echo "Skipping tag and push due to empty image_name"
|
||||
fi
|
||||
done
|
||||
|
||||
create_manifest:
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ghcr_build, ghcr_push]
|
||||
@@ -261,3 +440,42 @@ jobs:
|
||||
$image_name:${tag}_amd64 \
|
||||
$image_name:${tag}_arm64
|
||||
done
|
||||
|
||||
create_manifest_runtime:
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ghcr_build_runtime, ghcr_push_runtime]
|
||||
if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')
|
||||
|
||||
env:
|
||||
tags: ${{ needs.ghcr_build_runtime.outputs.tags }}
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
image: ["od_runtime"]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Login to GHCR
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.repository_owner }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Create and push multi-platform manifest
|
||||
run: |
|
||||
image_name=$(echo "ghcr.io/${{ github.repository_owner }}/${{ matrix.image }}" | tr '[:upper:]' '[:lower:]')
|
||||
echo "image name = $image_name"
|
||||
tags=$(echo ${tags} | tr ' ' '\n')
|
||||
for tag in $tags; do
|
||||
echo 'tag = $tag'
|
||||
docker buildx imagetools create --tag $image_name:$tag \
|
||||
$image_name:${tag}_amd64 \
|
||||
$image_name:${tag}_arm64
|
||||
done
|
||||
|
||||
@@ -12,15 +12,15 @@ jobs:
|
||||
dogfood:
|
||||
if: contains(github.event.pull_request.labels.*.name, 'review-this')
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: ghcr.io/opendevin/opendevin
|
||||
volumes:
|
||||
- /var/run/docker.sock:/var/run/docker.sock
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
- name: install git, github cli
|
||||
run: |
|
||||
apt-get install -y git gh
|
||||
sudo apt-get install -y git gh
|
||||
git config --global --add safe.directory $PWD
|
||||
|
||||
- name: Checkout Repository
|
||||
@@ -34,7 +34,9 @@ jobs:
|
||||
|
||||
- name: Write Task File
|
||||
run: |
|
||||
echo "Your coworker wants to apply a pull request to this project. Read and review ${{ github.event.pull_request.number }}.diff file. Create a review-${{ github.event.pull_request.number }}.txt and write your concise comments and suggestions there." > task.txt
|
||||
echo "Your coworker wants to apply a pull request to this project." > task.txt
|
||||
echo "Read and review ${{ github.event.pull_request.number }}.diff file. Create a review-${{ github.event.pull_request.number }}.txt and write your concise comments and suggestions there." >> task.txt
|
||||
echo "Do not ask me for confirmation at any point." >> task.txt
|
||||
echo "" >> task.txt
|
||||
echo "Title" >> task.txt
|
||||
echo "${{ github.event.pull_request.title }}" >> task.txt
|
||||
@@ -48,20 +50,22 @@ jobs:
|
||||
run: |
|
||||
curl -sSL https://install.python-poetry.org | python3 -
|
||||
export PATH="/github/home/.local/bin:$PATH"
|
||||
poetry install --without evaluation
|
||||
poetry install --without evaluation,llama-index
|
||||
poetry run playwright install --with-deps chromium
|
||||
|
||||
- name: Run OpenDevin
|
||||
env:
|
||||
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
|
||||
LLM_MODEL: ${{ vars.LLM_MODEL }}
|
||||
SANDBOX_BOX_TYPE: ssh
|
||||
run: |
|
||||
# Append path to launch poetry
|
||||
export PATH="/github/home/.local/bin:$PATH"
|
||||
# Append path to correctly import package, note: must set pwd at first
|
||||
export PYTHONPATH=$(pwd):$PYTHONPATH
|
||||
WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE poetry run python ./opendevin/core/main.py -i 50 -f task.txt -d $GITHUB_WORKSPACE
|
||||
export WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE
|
||||
export WORKSPACE_BASE=$GITHUB_WORKSPACE
|
||||
echo -e "/exit\n" | poetry run python opendevin/core/main.py -i 50 -f task.txt
|
||||
rm task.txt
|
||||
|
||||
- name: Check if review file is non-empty
|
||||
|
||||
@@ -70,23 +70,52 @@ jobs:
|
||||
cache: "poetry"
|
||||
|
||||
- name: Install Python dependencies using Poetry
|
||||
run: poetry install
|
||||
run: poetry install --without evaluation,llama-index
|
||||
|
||||
- name: Install & Start Docker
|
||||
if: env.INSTALL_DOCKER == '1'
|
||||
run: |
|
||||
INSTANCE_NAME="colima-${GITHUB_RUN_ID}"
|
||||
|
||||
# Uninstall colima to upgrade to the latest version
|
||||
if brew list colima &>/dev/null; then
|
||||
brew uninstall colima
|
||||
# unlinking colima dependency: go
|
||||
brew uninstall go@1.21
|
||||
brew uninstall colima
|
||||
# unlinking colima dependency: go
|
||||
brew uninstall go@1.21
|
||||
fi
|
||||
rm -rf ~/.colima ~/.lima
|
||||
brew install --HEAD colima
|
||||
brew services start colima
|
||||
brew install docker
|
||||
colima delete
|
||||
colima start --network-address --arch x86_64 --cpu=1 --memory=1
|
||||
|
||||
start_colima() {
|
||||
# Pick a random port in the range 10000-20000 (availability is not checked)
|
||||
RANDOM_PORT=$((RANDOM % 10001 + 10000))
|
||||
|
||||
# Original line:
|
||||
if ! colima start --network-address --arch x86_64 --cpu=1 --memory=1 --verbose --ssh-port $RANDOM_PORT; then
|
||||
echo "Failed to start Colima."
|
||||
return 1
|
||||
fi
|
||||
return 0
|
||||
}
|
||||
|
||||
# Attempt to start Colima for 5 total attempts:
|
||||
ATTEMPT_LIMIT=5
|
||||
for ((i=1; i<=ATTEMPT_LIMIT; i++)); do
|
||||
|
||||
if start_colima; then
|
||||
echo "Colima started successfully."
|
||||
break
|
||||
else
|
||||
colima stop -f
|
||||
sleep 10
|
||||
colima delete -f
|
||||
if [ $i -eq $ATTEMPT_LIMIT ]; then
|
||||
exit 1
|
||||
fi
|
||||
sleep 10
|
||||
fi
|
||||
done
|
||||
|
||||
# For testcontainers to find the Colima socket
|
||||
# https://github.com/abiosoft/colima/blob/main/docs/FAQ.md#cannot-connect-to-the-docker-daemon-at-unixvarrundockersock-is-the-docker-daemon-running
|
||||
@@ -96,7 +125,7 @@ jobs:
|
||||
run: make build
|
||||
|
||||
- name: Run Tests
|
||||
run: poetry run pytest --forked --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox"
|
||||
run: poetry run pytest --forked --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox.py and not test_runtime.py"
|
||||
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v4
|
||||
@@ -124,13 +153,13 @@ jobs:
|
||||
cache: "poetry"
|
||||
|
||||
- name: Install Python dependencies using Poetry
|
||||
run: poetry install --without evaluation
|
||||
run: poetry install --without evaluation,llama-index
|
||||
|
||||
- name: Build Environment
|
||||
run: make build
|
||||
|
||||
- name: Run Tests
|
||||
run: poetry run pytest --forked --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox"
|
||||
run: poetry run pytest --forked --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox.py and not test_runtime.py"
|
||||
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v4
|
||||
|
||||
@@ -40,7 +40,7 @@ jobs:
|
||||
run: |
|
||||
curl -sSL https://install.python-poetry.org | python3 -
|
||||
export PATH="/github/home/.local/bin:$PATH"
|
||||
poetry install --without evaluation
|
||||
poetry install --without evaluation,llama-index
|
||||
poetry run playwright install --with-deps chromium
|
||||
|
||||
|
||||
|
||||
@@ -210,6 +210,7 @@ cache
|
||||
|
||||
# configuration
|
||||
config.toml
|
||||
config.toml_
|
||||
config.toml.bak
|
||||
|
||||
containers/agnostic_sandbox
|
||||
@@ -217,3 +218,9 @@ containers/agnostic_sandbox
|
||||
# swe-bench-eval
|
||||
image_build_logs
|
||||
run_instance_logs
|
||||
|
||||
od_runtime_*.tar
|
||||
|
||||
# docker build
|
||||
containers/runtime/Dockerfile
|
||||
containers/runtime/project.tar.gz
|
||||
|
||||
+3
-4
@@ -1,13 +1,13 @@
|
||||
# Contributing
|
||||
|
||||
Thanks for your interest in contributing to OpenDevin! We welcome and appreciate contributions.
|
||||
Thanks for your interest in contributing to OpenDevin! We welcome and appreciate contributions.
|
||||
|
||||
## How Can I Contribute?
|
||||
|
||||
There are many ways that you can contribute:
|
||||
|
||||
1. **Download and use** OpenDevin, and send [issues](https://github.com/OpenDevin/OpenDevin/issues) when you encounter something that isn't working or a feature that you'd like to see.
|
||||
2. **Send feedback** after each session by [clicking the thumbs-up thumbs-down buttons](https://opendevin.github.io/OpenDevin/modules/usage/feedback), so we can see where things are working and failing, and also build an open dataset for training code agents.
|
||||
2. **Send feedback** after each session by [clicking the thumbs-up thumbs-down buttons](https://docs.all-hands.dev/modules/usage/feedback), so we can see where things are working and failing, and also build an open dataset for training code agents.
|
||||
3. **Improve the Codebase** by sending PRs (see details below). In particular, we have some [good first issue](https://github.com/OpenDevin/OpenDevin/labels/good%20first%20issue) issues that are good ones to start with.
|
||||
|
||||
## Understanding OpenDevin's CodeBase
|
||||
@@ -83,7 +83,7 @@ git push origin my_branch
|
||||
- Set `base repository` to `OpenDevin/OpenDevin`
|
||||
- Set `base` to `main`
|
||||
- Click `Create pull request`
|
||||
|
||||
|
||||
The PR should appear in [OpenDevin PRs](https://github.com/OpenDevin/OpenDevin/pulls).
|
||||
|
||||
Then the OpenDevin team will review your code.
|
||||
@@ -114,4 +114,3 @@ You may also check out previous PRs in the [PR list](https://github.com/OpenDevi
|
||||
### 2. Pull Request description
|
||||
- If your PR is small (such as a typo fix), you can go brief.
|
||||
- If it contains a lot of changes, it's better to write more details.
|
||||
|
||||
|
||||
+6
-5
@@ -39,18 +39,18 @@ make build
|
||||
OpenDevin supports a diverse array of Language Models (LMs) through the powerful [litellm](https://docs.litellm.ai) library. By default, we've chosen the mighty GPT-4 from OpenAI as our go-to model, but the world is your oyster! You can unleash the potential of Anthropic's suave Claude, the enigmatic Llama, or any other LM that piques your interest.
|
||||
|
||||
To configure the LM of your choice, run:
|
||||
|
||||
|
||||
```bash
|
||||
make setup-config
|
||||
```
|
||||
|
||||
|
||||
This command will prompt you to enter the LLM API key, model name, and other variables ensuring that OpenDevin is tailored to your specific needs. Note that the model name will apply only when you run headless. If you use the UI, please set the model in the UI.
|
||||
|
||||
|
||||
Note: If you have previously run OpenDevin using the docker command, you may have already set some environment variables in your terminal. The final configuration is resolved in the following order, from highest to lowest priority:
|
||||
Environment variables > config.toml variables > default variables
|
||||
|
||||
**Note on Alternative Models:**
|
||||
Some alternative models may prove more challenging to tame than others. Fear not, brave adventurer! We shall soon unveil LLM-specific documentation to guide you on your quest.
|
||||
Some alternative models may prove more challenging to tame than others. Fear not, brave adventurer! We shall soon unveil LLM-specific documentation to guide you on your quest.
|
||||
And if you've already mastered the art of wielding a model other than OpenAI's GPT, we encourage you to share your setup instructions with us by writing them up and adding them [to our documentation](https://github.com/OpenDevin/OpenDevin/tree/main/docs/modules/usage/llms).
|
||||
|
||||
For a full list of the LM providers and models available, please consult the [litellm documentation](https://docs.litellm.ai/docs/providers).
|
||||
@@ -84,10 +84,11 @@ make help
|
||||
```
|
||||
|
||||
### 8. Testing
|
||||
To run tests, refer to the following:
|
||||
#### Unit tests
|
||||
|
||||
```bash
|
||||
poetry run pytest ./tests/unit/test_sandbox.py
|
||||
poetry run pytest ./tests/unit/test_*.py
|
||||
```
|
||||
|
||||
#### Integration tests
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
# Issue Triage
|
||||
These are the procedures and guidelines on how issues are triaged in this repo by the maintainers.
|
||||
|
||||
## General
|
||||
* Most issues must be tagged with **enhancement** or **bug**
|
||||
* Issues may be tagged with the area they relate to (**backend**, **frontend**, **agent quality**, etc.)
|
||||
|
||||
## Severity
|
||||
* **Low**: Minor issues, single user report
|
||||
* **Medium**: Affecting multiple users
|
||||
* **Critical**: Affecting all users or potential security issues
|
||||
|
||||
## Effort
|
||||
* Issues may be estimated with effort required (**small effort**, **medium effort**, **large effort**)
|
||||
|
||||
## Difficulty
|
||||
* Issues with low implementation difficulty may be tagged with **good first issue**
|
||||
|
||||
## Not Enough Information
|
||||
* The user is asked to provide more information (logs, steps to reproduce, etc.) when the issue is not clear
|
||||
* If an issue is unclear and the author does not provide more information or respond to a request, the issue may be closed as **not planned** (usually after a week)
|
||||
|
||||
## Multiple Requests/Fixes in One Issue
|
||||
* These issues will be narrowed down to one request/fix so the issue is more easily tracked and fixed
|
||||
* Issues may be broken down into multiple issues if required
|
||||
@@ -141,7 +141,7 @@ install-python-dependencies:
|
||||
export HNSWLIB_NO_NATIVE=1; \
|
||||
poetry run pip install chroma-hnswlib; \
|
||||
fi
|
||||
@poetry install
|
||||
@poetry install --without llama-index
|
||||
@if [ -f "/etc/manjaro-release" ]; then \
|
||||
echo "$(BLUE)Detected Manjaro Linux. Installing Playwright dependencies...$(RESET)"; \
|
||||
poetry run pip install playwright; \
|
||||
@@ -162,11 +162,8 @@ install-frontend-dependencies:
|
||||
@echo "$(YELLOW)Setting up frontend environment...$(RESET)"
|
||||
@echo "$(YELLOW)Detect Node.js version...$(RESET)"
|
||||
@cd frontend && node ./scripts/detect-node-version.js
|
||||
@cd frontend && \
|
||||
echo "$(BLUE)Installing frontend dependencies with npm...$(RESET)" && \
|
||||
npm install && \
|
||||
echo "$(BLUE)Running make-i18n with npm...$(RESET)" && \
|
||||
npm run make-i18n
|
||||
echo "$(BLUE)Installing frontend dependencies with npm...$(RESET)"
|
||||
@cd frontend && npm install
|
||||
@echo "$(GREEN)Frontend dependencies installed successfully.$(RESET)"
|
||||
|
||||
install-pre-commit-hooks:
|
||||
|
||||
@@ -33,8 +33,10 @@
|
||||
<div align="center">
|
||||
<img src="./docs/static/img/logo.png" alt="Logo" width="200" height="200">
|
||||
<h1 align="center">OpenDevin: Code Less, Make More</h1>
|
||||
<a href="https://opendevin.github.io/OpenDevin/modules/usage/intro"><img src="https://img.shields.io/badge/Documentation-OpenDevin-blue?logo=googledocs&logoColor=white&style=for-the-badge" alt="Check out the documentation"></a>
|
||||
<a href="https://huggingface.co/spaces/OpenDevin/evaluation"><img src="https://img.shields.io/badge/Evaluation-Benchmark%20on%20HF%20Space-green?style=for-the-badge" alt="Evaluation Benchmark"></a>
|
||||
<a href="https://docs.all-hands.dev/modules/usage/intro"><img src="https://img.shields.io/badge/Documentation-OpenDevin-blue?logo=googledocs&logoColor=white&style=for-the-badge" alt="Check out the documentation"></a>
|
||||
<a href="https://arxiv.org/abs/2407.16741"><img src="https://img.shields.io/badge/Paper-%20on%20Arxiv-red?logo=arxiv&style=for-the-badge" alt="Paper on Arxiv"></a>
|
||||
<br/>
|
||||
<a href="https://huggingface.co/spaces/OpenDevin/evaluation"><img src="https://img.shields.io/badge/Evaluation-Benchmark%20on%20HF%20Space-green?logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark"></a>
|
||||
</div>
|
||||
<hr>
|
||||
|
||||
@@ -45,7 +47,7 @@ OpenDevin agents collaborate with human developers to write code, fix bugs, and
|
||||

|
||||
|
||||
## ⚡ Getting Started
|
||||
OpenDevin works best with the most recent version of Docker, `26.0.0`.
|
||||
OpenDevin works best with Docker version 26.0.0+ (Docker Desktop 4.31.0+).
|
||||
You must be using Linux, Mac OS, or WSL on Windows.
|
||||
|
||||
To start OpenDevin in a docker container, run the following commands in your terminal:
|
||||
@@ -71,7 +73,7 @@ docker run -it \
|
||||
> By default, this command pulls the `latest` tag, which represents the most recent release of OpenDevin. You have other options as well:
|
||||
> - For a specific release version, use `ghcr.io/opendevin/opendevin:<OpenDevin_version>` (replace <OpenDevin_version> with the desired version number).
|
||||
> - For the most up-to-date development version, use `ghcr.io/opendevin/opendevin:main`. This version may be **unstable** and is recommended for testing or development purposes only.
|
||||
>
|
||||
>
|
||||
> Choose the tag that best suits your needs based on stability requirements and desired features.
|
||||
|
||||
You'll find OpenDevin running at [http://localhost:3000](http://localhost:3000) with access to `./workspace`. To have OpenDevin operate on your code, place it in `./workspace`.
|
||||
@@ -82,12 +84,12 @@ the `Settings` button (gear icon) in the UI. If the required `Model` does not ex
|
||||
|
||||
For the development workflow, see [Development.md](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md).
|
||||
|
||||
Are you having trouble? Check out our [Troubleshooting Guide](https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting).
|
||||
Are you having trouble? Check out our [Troubleshooting Guide](https://docs.all-hands.dev/modules/usage/troubleshooting).
|
||||
|
||||
## 🚀 Documentation
|
||||
|
||||
To learn more about the project, and for tips on using OpenDevin,
|
||||
**check out our [documentation](https://opendevin.github.io/OpenDevin/modules/usage/intro)**.
|
||||
**check out our [documentation](https://docs.all-hands.dev/modules/usage/intro)**.
|
||||
|
||||
There you'll find resources on how to use different LLM providers (like ollama and Anthropic's Claude),
|
||||
troubleshooting resources, and advanced configuration options.
|
||||
@@ -138,12 +140,13 @@ Distributed under the MIT License. See [`LICENSE`](./LICENSE) for more informati
|
||||
## 📚 Cite
|
||||
|
||||
```
|
||||
@misc{opendevin2024,
|
||||
author = {{OpenDevin Team}},
|
||||
title = {{OpenDevin: An Open Platform for AI Software Developers as Generalist Agents}},
|
||||
year = {2024},
|
||||
version = {v1.0},
|
||||
howpublished = {\url{https://github.com/OpenDevin/OpenDevin}},
|
||||
note = {Accessed: ENTER THE DATE YOU ACCESSED THE PROJECT}
|
||||
@misc{opendevin,
|
||||
title={{OpenDevin: An Open Platform for AI Software Developers as Generalist Agents}},
|
||||
author={Xingyao Wang and Boxuan Li and Yufan Song and Frank F. Xu and Xiangru Tang and Mingchen Zhuge and Jiayi Pan and Yueqi Song and Bowen Li and Jaskirat Singh and Hoang H. Tran and Fuqiang Li and Ren Ma and Mingzhang Zheng and Bill Qian and Yanjun Shao and Niklas Muennighoff and Yizhe Zhang and Binyuan Hui and Junyang Lin and Robert Brennan and Hao Peng and Heng Ji and Graham Neubig},
|
||||
year={2024},
|
||||
eprint={2407.16741},
|
||||
archivePrefix={arXiv},
|
||||
primaryClass={cs.SE},
|
||||
url={https://arxiv.org/abs/2407.16741},
|
||||
}
|
||||
```
|
||||
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
# Agent Framework Research
|
||||
# Agent Hub
|
||||
|
||||
In this folder, there may exist multiple implementations of `Agent` that will be used by the framework.
|
||||
|
||||
|
||||
@@ -14,15 +14,11 @@ from . import ( # noqa: E402
|
||||
codeact_swe_agent,
|
||||
delegator_agent,
|
||||
dummy_agent,
|
||||
gptswarm_agent,
|
||||
monologue_agent,
|
||||
planner_agent,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'monologue_agent',
|
||||
'codeact_agent',
|
||||
'gptswarm_agent',
|
||||
'codeact_swe_agent',
|
||||
'planner_agent',
|
||||
'delegator_agent',
|
||||
|
||||
@@ -99,8 +99,7 @@ class BrowsingAgent(Agent):
|
||||
self,
|
||||
llm: LLM,
|
||||
) -> None:
|
||||
"""
|
||||
Initializes a new instance of the BrowsingAgent class.
|
||||
"""Initializes a new instance of the BrowsingAgent class.
|
||||
|
||||
Parameters:
|
||||
- llm (LLM): The llm to be used by this agent
|
||||
@@ -120,16 +119,13 @@ class BrowsingAgent(Agent):
|
||||
self.reset()
|
||||
|
||||
def reset(self) -> None:
|
||||
"""
|
||||
Resets the Browsing Agent.
|
||||
"""
|
||||
"""Resets the Browsing Agent."""
|
||||
super().reset()
|
||||
self.cost_accumulator = 0
|
||||
self.error_accumulator = 0
|
||||
|
||||
def step(self, state: State) -> Action:
|
||||
"""
|
||||
Performs one step using the Browsing Agent.
|
||||
"""Performs one step using the Browsing Agent.
|
||||
This includes gathering information on previous steps and prompting the model to make a browsing command to execute.
|
||||
|
||||
Parameters:
|
||||
|
||||
@@ -75,7 +75,8 @@ class PromptElement:
|
||||
Prompt elements are used to build the prompt. Use flags to control which
|
||||
prompt elements are visible. We use class attributes as a convenient way
|
||||
to implement static prompts, but feel free to override them with instance
|
||||
attributes or @property decorator."""
|
||||
attributes or @property decorator.
|
||||
"""
|
||||
|
||||
_prompt = ''
|
||||
_abstract_ex = ''
|
||||
@@ -200,11 +201,10 @@ def fit_tokens(
|
||||
model_name : str, optional
|
||||
The name of the model used when tokenizing.
|
||||
|
||||
Returns
|
||||
Returns:
|
||||
-------
|
||||
str : the prompt after shrinking.
|
||||
"""
|
||||
|
||||
if max_prompt_chars is None:
|
||||
return shrinkable.prompt
|
||||
|
||||
@@ -579,8 +579,8 @@ the form is not visible yet or some fields are disabled. I need to replan.
|
||||
def diff(previous, new):
|
||||
"""Return a string showing the difference between original and new.
|
||||
|
||||
If the difference is above diff_threshold, return the diff string."""
|
||||
|
||||
If the difference is above diff_threshold, return the diff string.
|
||||
"""
|
||||
if previous == new:
|
||||
return 'Identical', []
|
||||
|
||||
|
||||
@@ -37,9 +37,8 @@ class BrowsingResponseParser(ResponseParser):
|
||||
|
||||
|
||||
class BrowsingActionParserMessage(ActionParser):
|
||||
"""
|
||||
Parser action:
|
||||
- BrowseInteractiveAction(browser_actions) - unexpected response format, message back to user
|
||||
"""Parser action:
|
||||
- BrowseInteractiveAction(browser_actions) - unexpected response format, message back to user
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -60,9 +59,8 @@ class BrowsingActionParserMessage(ActionParser):
|
||||
|
||||
|
||||
class BrowsingActionParserBrowseInteractive(ActionParser):
|
||||
"""
|
||||
Parser action:
|
||||
- BrowseInteractiveAction(browser_actions) - handle send message to user function call in BrowserGym
|
||||
"""Parser action:
|
||||
- BrowseInteractiveAction(browser_actions) - handle send message to user function call in BrowserGym
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
||||
@@ -7,7 +7,6 @@ import yaml
|
||||
|
||||
def yaml_parser(message):
|
||||
"""Parse a yaml message for the retry function."""
|
||||
|
||||
# saves gpt-3.5 from some yaml parsing errors
|
||||
message = re.sub(r':\s*\n(?=\S|\n)', ': ', message)
|
||||
|
||||
@@ -47,7 +46,6 @@ def _compress_chunks(text, identifier, skip_list, split_regex='\n\n+'):
|
||||
|
||||
def compress_string(text):
|
||||
"""Compress a string by replacing redundant paragraphs and lines with identifiers."""
|
||||
|
||||
# Perform paragraph-level compression
|
||||
def_dict, compressed_text = _compress_chunks(
|
||||
text, identifier='§', skip_list=[], split_regex='\n\n+'
|
||||
@@ -79,12 +77,12 @@ def extract_html_tags(text, keys):
|
||||
keys : list of str
|
||||
The HTML tags to extract the content from.
|
||||
|
||||
Returns
|
||||
Returns:
|
||||
-------
|
||||
dict
|
||||
A dictionary mapping each key to a list of subset in `text` that match the key.
|
||||
|
||||
Notes
|
||||
Notes:
|
||||
-----
|
||||
All text and keys will be converted to lowercase before matching.
|
||||
|
||||
@@ -126,7 +124,7 @@ def parse_html_tags(text, keys=(), optional_keys=(), merge_multiple=False):
|
||||
optional_keys : list of str
|
||||
The HTML tags to extract the content from, but are optional.
|
||||
|
||||
Returns
|
||||
Returns:
|
||||
-------
|
||||
dict
|
||||
A dictionary mapping each key to subset of `text` that match the key.
|
||||
|
||||
@@ -12,13 +12,12 @@ from opendevin.events.action import (
|
||||
|
||||
|
||||
class CodeActResponseParser(ResponseParser):
|
||||
"""
|
||||
Parser action:
|
||||
- CmdRunAction(command) - bash command to run
|
||||
- IPythonRunCellAction(code) - IPython code to run
|
||||
- AgentDelegateAction(agent, inputs) - delegate action for (sub)task
|
||||
- MessageAction(content) - Message action to run (e.g. ask for clarification)
|
||||
- AgentFinishAction() - end the interaction
|
||||
"""Parser action:
|
||||
- CmdRunAction(command) - bash command to run
|
||||
- IPythonRunCellAction(code) - IPython code to run
|
||||
- AgentDelegateAction(agent, inputs) - delegate action for (sub)task
|
||||
- MessageAction(content) - Message action to run (e.g. ask for clarification)
|
||||
- AgentFinishAction() - end the interaction
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
@@ -53,9 +52,8 @@ class CodeActResponseParser(ResponseParser):
|
||||
|
||||
|
||||
class CodeActActionParserFinish(ActionParser):
|
||||
"""
|
||||
Parser action:
|
||||
- AgentFinishAction() - end the interaction
|
||||
"""Parser action:
|
||||
- AgentFinishAction() - end the interaction
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -76,10 +74,9 @@ class CodeActActionParserFinish(ActionParser):
|
||||
|
||||
|
||||
class CodeActActionParserCmdRun(ActionParser):
|
||||
"""
|
||||
Parser action:
|
||||
- CmdRunAction(command) - bash command to run
|
||||
- AgentFinishAction() - end the interaction
|
||||
"""Parser action:
|
||||
- CmdRunAction(command) - bash command to run
|
||||
- AgentFinishAction() - end the interaction
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -101,14 +98,13 @@ class CodeActActionParserCmdRun(ActionParser):
|
||||
# a command was found
|
||||
command_group = self.bash_command.group(1).strip()
|
||||
if command_group.strip() == 'exit':
|
||||
return AgentFinishAction()
|
||||
return AgentFinishAction(thought=thought)
|
||||
return CmdRunAction(command=command_group, thought=thought)
|
||||
|
||||
|
||||
class CodeActActionParserIPythonRunCell(ActionParser):
|
||||
"""
|
||||
Parser action:
|
||||
- IPythonRunCellAction(code) - IPython code to run
|
||||
"""Parser action:
|
||||
- IPythonRunCellAction(code) - IPython code to run
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -137,9 +133,8 @@ class CodeActActionParserIPythonRunCell(ActionParser):
|
||||
|
||||
|
||||
class CodeActActionParserAgentDelegate(ActionParser):
|
||||
"""
|
||||
Parser action:
|
||||
- AgentDelegateAction(agent, inputs) - delegate action for (sub)task
|
||||
"""Parser action:
|
||||
- AgentDelegateAction(agent, inputs) - delegate action for (sub)task
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -164,9 +159,8 @@ class CodeActActionParserAgentDelegate(ActionParser):
|
||||
|
||||
|
||||
class CodeActActionParserMessage(ActionParser):
|
||||
"""
|
||||
Parser action:
|
||||
- MessageAction(content) - Message action to run (e.g. ask for clarification)
|
||||
"""Parser action:
|
||||
- MessageAction(content) - Message action to run (e.g. ask for clarification)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
||||
@@ -8,7 +8,6 @@ from agenthub.codeact_agent.prompt import (
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config
|
||||
from opendevin.events.action import (
|
||||
Action,
|
||||
AgentDelegateAction,
|
||||
@@ -22,6 +21,7 @@ from opendevin.events.observation import (
|
||||
CmdOutputObservation,
|
||||
IPythonRunCellObservation,
|
||||
)
|
||||
from opendevin.events.observation.observation import Observation
|
||||
from opendevin.events.serialization.event import truncate_content
|
||||
from opendevin.llm.llm import LLM
|
||||
from opendevin.runtime.plugins import (
|
||||
@@ -34,62 +34,6 @@ from opendevin.runtime.tools import RuntimeTool
|
||||
ENABLE_GITHUB = True
|
||||
|
||||
|
||||
def action_to_str(action: Action) -> str:
    """Serialize an action into the text form used in the chat history.

    Bash, IPython and delegate actions are rendered as the action's thought
    followed by the payload wrapped in the matching ``<execute_*>`` tag; a
    message action is rendered as its raw content.

    Parameters:
    - action (Action): The action to serialize

    Returns:
    - str: The rendered text, or an empty string for any other action type
    """
    if isinstance(action, CmdRunAction):
        bash_text = f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
        return bash_text
    if isinstance(action, IPythonRunCellAction):
        ipython_text = f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
        return ipython_text
    if isinstance(action, AgentDelegateAction):
        # only the 'task' entry of the delegate inputs is rendered
        browse_text = f'{action.thought}\n<execute_browse>\n{action.inputs["task"]}\n</execute_browse>'
        return browse_text
    if isinstance(action, MessageAction):
        return action.content
    # every other action type has no textual form here
    return ''
|
||||
|
||||
|
||||
def get_action_message(action: Action) -> dict[str, str] | None:
    """Build a chat message dict for an action, if the action is renderable.

    Parameters:
    - action (Action): The action to convert

    Returns:
    - dict[str, str] | None: A dict with 'role' and 'content' keys for delegate,
      bash, IPython and message actions; None for any other action type. The
      role is 'user' when the action's source is 'user', otherwise 'assistant'.
    """
    renderable_types = (
        AgentDelegateAction,
        CmdRunAction,
        IPythonRunCellAction,
        MessageAction,
    )
    if not isinstance(action, renderable_types):
        return None

    role = 'user' if action.source == 'user' else 'assistant'
    return {'role': role, 'content': action_to_str(action)}
|
||||
|
||||
|
||||
def get_observation_message(obs) -> dict[str, str] | None:
    """Build a user-role chat message for an observation, if it is renderable.

    The character limit is read from the LLM config registered for
    'CodeActAgent' and handed to truncate_content.

    Parameters:
    - obs: The observation to convert

    Returns:
    - dict[str, str] | None: A dict with 'role' ('user') and 'content' keys for
      command-output, IPython and delegate observations; None otherwise.
    """
    max_message_chars = config.get_llm_config_from_agent(
        'CodeActAgent'
    ).max_message_chars
    if isinstance(obs, CmdOutputObservation):
        content = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
        content += (
            f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
        )
        return {'role': 'user', 'content': content}
    elif isinstance(obs, IPythonRunCellObservation):
        content = 'OBSERVATION:\n' + obs.content
        # replace base64 images with a placeholder
        # NOTE(review): the two image literals below were restored from a
        # garbled extraction of this block -- confirm against upstream.
        splitted = content.split('\n')
        for i, line in enumerate(splitted):
            if '![image](data:image/png;base64,' in line:
                splitted[i] = (
                    '![image](data:image/png;base64, ...) already displayed to user'
                )
        content = '\n'.join(splitted)
        # here the limit is applied after the prefix and placeholder substitution
        content = truncate_content(content, max_message_chars)
        return {'role': 'user', 'content': content}
    elif isinstance(obs, AgentDelegateObservation):
        content = 'OBSERVATION:\n' + truncate_content(
            str(obs.outputs), max_message_chars
        )
        return {'role': 'user', 'content': content}
    return None
|
||||
|
||||
|
||||
# FIXME: We can tweak these two settings to create MicroAgents specialized for different areas
|
||||
def get_system_message() -> str:
|
||||
if ENABLE_GITHUB:
|
||||
@@ -110,7 +54,7 @@ class CodeActAgent(Agent):
|
||||
|
||||
### Overview
|
||||
|
||||
This agent implements the CodeAct idea ([paper](https://arxiv.org/abs/2402.13463), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a unified **code** action space for both *simplicity* and *performance* (see paper for more details).
|
||||
This agent implements the CodeAct idea ([paper](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a unified **code** action space for both *simplicity* and *performance* (see paper for more details).
|
||||
|
||||
The conceptual idea is illustrated below. At each turn, the agent can:
|
||||
|
||||
@@ -158,8 +102,7 @@ class CodeActAgent(Agent):
|
||||
self,
|
||||
llm: LLM,
|
||||
) -> None:
|
||||
"""
|
||||
Initializes a new instance of the CodeActAgent class.
|
||||
"""Initializes a new instance of the CodeActAgent class.
|
||||
|
||||
Parameters:
|
||||
- llm (LLM): The llm to be used by this agent
|
||||
@@ -167,15 +110,70 @@ class CodeActAgent(Agent):
|
||||
super().__init__(llm)
|
||||
self.reset()
|
||||
|
||||
def action_to_str(self, action: Action) -> str:
    """Serialize an action into the text form used in the chat history.

    Bash, IPython and delegate actions are rendered as the action's thought
    followed by the payload wrapped in the matching ``<execute_*>`` tag. A
    message action is rendered as its content, and a finish action whose
    source is 'agent' is rendered as its thought.

    Parameters:
    - action (Action): The action to serialize

    Returns:
    - str: The rendered text, or an empty string for any other action
    """
    if isinstance(action, CmdRunAction):
        bash_text = (
            f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
        )
        return bash_text
    if isinstance(action, IPythonRunCellAction):
        ipython_text = f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
        return ipython_text
    if isinstance(action, AgentDelegateAction):
        # only the 'task' entry of the delegate inputs is rendered
        browse_text = f'{action.thought}\n<execute_browse>\n{action.inputs["task"]}\n</execute_browse>'
        return browse_text
    if isinstance(action, MessageAction):
        return action.content
    if isinstance(action, AgentFinishAction) and action.source == 'agent':
        return action.thought
    # every other action has no textual form here
    return ''
|
||||
|
||||
def get_action_message(self, action: Action) -> dict[str, str] | None:
    """Build a chat message dict for an action, if the action is renderable.

    Parameters:
    - action (Action): The action to convert

    Returns:
    - dict[str, str] | None: A dict with 'role' and 'content' keys for delegate,
      bash, IPython and message actions, and for finish actions whose source is
      'agent'; None otherwise. The role is 'user' when the action's source is
      'user', otherwise 'assistant'.
    """
    always_rendered = (
        AgentDelegateAction,
        CmdRunAction,
        IPythonRunCellAction,
        MessageAction,
    )
    renderable = isinstance(action, always_rendered) or (
        isinstance(action, AgentFinishAction) and action.source == 'agent'
    )
    if not renderable:
        return None

    role = 'user' if action.source == 'user' else 'assistant'
    return {'role': role, 'content': self.action_to_str(action)}
|
||||
|
||||
def get_observation_message(self, obs: Observation) -> dict[str, str] | None:
    """Build a user-role chat message for an observation, if it is renderable.

    The character limit is read from self.llm.config.max_message_chars and
    handed to truncate_content.

    Parameters:
    - obs (Observation): The observation to convert

    Returns:
    - dict[str, str] | None: A dict with 'role' ('user') and 'content' keys for
      command-output, IPython and delegate observations; None otherwise.
    """
    max_message_chars = self.llm.config.max_message_chars
    if isinstance(obs, CmdOutputObservation):
        content = 'OBSERVATION:\n' + truncate_content(
            obs.content, max_message_chars
        )
        content += (
            f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
        )
        return {'role': 'user', 'content': content}
    elif isinstance(obs, IPythonRunCellObservation):
        content = 'OBSERVATION:\n' + obs.content
        # replace base64 images with a placeholder
        # NOTE(review): the two image literals below were restored from a
        # garbled extraction of this block -- confirm against upstream.
        splitted = content.split('\n')
        for i, line in enumerate(splitted):
            if '![image](data:image/png;base64,' in line:
                splitted[i] = (
                    '![image](data:image/png;base64, ...) already displayed to user'
                )
        content = '\n'.join(splitted)
        # here the limit is applied after the prefix and placeholder substitution
        content = truncate_content(content, max_message_chars)
        return {'role': 'user', 'content': content}
    elif isinstance(obs, AgentDelegateObservation):
        content = 'OBSERVATION:\n' + truncate_content(
            str(obs.outputs), max_message_chars
        )
        return {'role': 'user', 'content': content}
    return None
|
||||
|
||||
def reset(self) -> None:
    """Resets the CodeAct Agent.

    This override adds no state of its own; it only delegates to the parent
    class's reset().
    """
    super().reset()
|
||||
|
||||
def step(self, state: State) -> Action:
|
||||
"""
|
||||
Performs one step using the CodeAct Agent.
|
||||
"""Performs one step using the CodeAct Agent.
|
||||
This includes gathering info on previous steps and prompting the model to make a command to execute.
|
||||
|
||||
Parameters:
|
||||
@@ -188,7 +186,6 @@ class CodeActAgent(Agent):
|
||||
- MessageAction(content) - Message action to run (e.g. ask for clarification)
|
||||
- AgentFinishAction() - end the interaction
|
||||
"""
|
||||
|
||||
# if we're done, go back
|
||||
latest_user_message = state.history.get_last_user_message()
|
||||
if latest_user_message and latest_user_message.strip() == '/exit':
|
||||
@@ -216,15 +213,22 @@ class CodeActAgent(Agent):
|
||||
|
||||
for event in state.history.get_events():
|
||||
# create a regular message from an event
|
||||
message = (
|
||||
get_action_message(event)
|
||||
if isinstance(event, Action)
|
||||
else get_observation_message(event)
|
||||
)
|
||||
if isinstance(event, Action):
|
||||
message = self.get_action_message(event)
|
||||
elif isinstance(event, Observation):
|
||||
message = self.get_observation_message(event)
|
||||
else:
|
||||
raise ValueError(f'Unknown event type: {type(event)}')
|
||||
|
||||
# add regular message
|
||||
if message:
|
||||
messages.append(message)
|
||||
# handle error if the message is the SAME role as the previous message
|
||||
# litellm.exceptions.BadRequestError: litellm.BadRequestError: OpenAIException - Error code: 400 - {'detail': 'Only supports u/a/u/a/u...'}
|
||||
# there should not be two consecutive messages from the same role
|
||||
if messages and messages[-1]['role'] == message['role']:
|
||||
messages[-1]['content'] += '\n\n' + message['content']
|
||||
else:
|
||||
messages.append(message)
|
||||
|
||||
# the latest user message is important:
|
||||
# we want to remind the agent of the environment constraints
|
||||
|
||||
@@ -11,9 +11,8 @@ from opendevin.events.action import (
|
||||
|
||||
|
||||
class CodeActSWEActionParserFinish(ActionParser):
|
||||
"""
|
||||
Parser action:
|
||||
- AgentFinishAction() - end the interaction
|
||||
"""Parser action:
|
||||
- AgentFinishAction() - end the interaction
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -34,10 +33,9 @@ class CodeActSWEActionParserFinish(ActionParser):
|
||||
|
||||
|
||||
class CodeActSWEActionParserCmdRun(ActionParser):
|
||||
"""
|
||||
Parser action:
|
||||
- CmdRunAction(command) - bash command to run
|
||||
- AgentFinishAction() - end the interaction
|
||||
"""Parser action:
|
||||
- CmdRunAction(command) - bash command to run
|
||||
- AgentFinishAction() - end the interaction
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -64,9 +62,8 @@ class CodeActSWEActionParserCmdRun(ActionParser):
|
||||
|
||||
|
||||
class CodeActSWEActionParserIPythonRunCell(ActionParser):
|
||||
"""
|
||||
Parser action:
|
||||
- IPythonRunCellAction(code) - IPython code to run
|
||||
"""Parser action:
|
||||
- IPythonRunCellAction(code) - IPython code to run
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -95,9 +92,8 @@ class CodeActSWEActionParserIPythonRunCell(ActionParser):
|
||||
|
||||
|
||||
class CodeActSWEActionParserMessage(ActionParser):
|
||||
"""
|
||||
Parser action:
|
||||
- MessageAction(content) - Message action to run (e.g. ask for clarification)
|
||||
"""Parser action:
|
||||
- MessageAction(content) - Message action to run (e.g. ask for clarification)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
||||
@@ -7,7 +7,6 @@ from agenthub.codeact_swe_agent.prompt import (
|
||||
from agenthub.codeact_swe_agent.response_parser import CodeActSWEResponseParser
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config
|
||||
from opendevin.events.action import (
|
||||
Action,
|
||||
AgentFinishAction,
|
||||
@@ -19,6 +18,7 @@ from opendevin.events.observation import (
|
||||
CmdOutputObservation,
|
||||
IPythonRunCellObservation,
|
||||
)
|
||||
from opendevin.events.observation.observation import Observation
|
||||
from opendevin.events.serialization.event import truncate_content
|
||||
from opendevin.llm.llm import LLM
|
||||
from opendevin.runtime.plugins import (
|
||||
@@ -29,54 +29,6 @@ from opendevin.runtime.plugins import (
|
||||
from opendevin.runtime.tools import RuntimeTool
|
||||
|
||||
|
||||
def action_to_str(action: Action) -> str:
    """Serialize an action into the text form used in the chat history.

    Bash and IPython actions are rendered as the action's thought followed by
    the payload wrapped in the matching ``<execute_*>`` tag; a message action
    is rendered as its raw content.

    Parameters:
    - action (Action): The action to serialize

    Returns:
    - str: The rendered text, or an empty string for any other action type
    """
    if isinstance(action, CmdRunAction):
        bash_text = f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
        return bash_text
    if isinstance(action, IPythonRunCellAction):
        ipython_text = f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
        return ipython_text
    if isinstance(action, MessageAction):
        return action.content
    # every other action type has no textual form here
    return ''
|
||||
|
||||
|
||||
def get_action_message(action: Action) -> dict[str, str] | None:
    """Build a chat message dict for an action, if the action is renderable.

    Parameters:
    - action (Action): The action to convert

    Returns:
    - dict[str, str] | None: A dict with 'role' and 'content' keys for bash,
      IPython and message actions; None for any other action type. The role is
      'user' when the action's source is 'user', otherwise 'assistant'.
    """
    renderable_types = (CmdRunAction, IPythonRunCellAction, MessageAction)
    if not isinstance(action, renderable_types):
        return None

    role = 'user' if action.source == 'user' else 'assistant'
    return {'role': role, 'content': action_to_str(action)}
|
||||
|
||||
|
||||
def get_observation_message(obs) -> dict[str, str] | None:
    """Build a user-role chat message for an observation, if it is renderable.

    The character limit is read from the LLM config registered for
    'CodeActSWEAgent' and handed to truncate_content.

    Parameters:
    - obs: The observation to convert

    Returns:
    - dict[str, str] | None: A dict with 'role' ('user') and 'content' keys for
      command-output and IPython observations; None otherwise.
    """
    max_message_chars = config.get_llm_config_from_agent(
        'CodeActSWEAgent'
    ).max_message_chars
    if isinstance(obs, CmdOutputObservation):
        content = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
        content += (
            f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
        )
        return {'role': 'user', 'content': content}
    elif isinstance(obs, IPythonRunCellObservation):
        content = 'OBSERVATION:\n' + obs.content
        # replace base64 images with a placeholder
        # NOTE(review): the two image literals below were restored from a
        # garbled extraction of this block -- confirm against upstream.
        splitted = content.split('\n')
        for i, line in enumerate(splitted):
            if '![image](data:image/png;base64,' in line:
                splitted[i] = (
                    '![image](data:image/png;base64, ...) already displayed to user'
                )
        content = '\n'.join(splitted)
        # here the limit is applied after the prefix and placeholder substitution
        content = truncate_content(content, max_message_chars)
        return {'role': 'user', 'content': content}
    return None
|
||||
|
||||
|
||||
def get_system_message() -> str:
    """Assemble the system prompt.

    Returns:
    - str: SYSTEM_PREFIX, COMMAND_DOCS and SYSTEM_SUFFIX, in that order,
      separated by blank lines
    """
    system_message = f'{SYSTEM_PREFIX}\n\n{COMMAND_DOCS}\n\n{SYSTEM_SUFFIX}'
    return system_message
|
||||
|
||||
@@ -113,8 +65,7 @@ class CodeActSWEAgent(Agent):
|
||||
self,
|
||||
llm: LLM,
|
||||
) -> None:
|
||||
"""
|
||||
Initializes a new instance of the CodeActAgent class.
|
||||
"""Initializes a new instance of the CodeActAgent class.
|
||||
|
||||
Parameters:
|
||||
- llm (LLM): The llm to be used by this agent
|
||||
@@ -122,15 +73,59 @@ class CodeActSWEAgent(Agent):
|
||||
super().__init__(llm)
|
||||
self.reset()
|
||||
|
||||
def action_to_str(self, action: Action) -> str:
    """Serialize an action into the text form used in the chat history.

    Bash and IPython actions are rendered as the action's thought followed by
    the payload wrapped in the matching ``<execute_*>`` tag; a message action
    is rendered as its raw content.

    Parameters:
    - action (Action): The action to serialize

    Returns:
    - str: The rendered text, or an empty string for any other action type
    """
    if isinstance(action, CmdRunAction):
        bash_text = (
            f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
        )
        return bash_text
    if isinstance(action, IPythonRunCellAction):
        ipython_text = f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
        return ipython_text
    if isinstance(action, MessageAction):
        return action.content
    # every other action type has no textual form here
    return ''
|
||||
|
||||
def get_action_message(self, action: Action) -> dict[str, str] | None:
    """Build a chat message dict for an action, if the action is renderable.

    Parameters:
    - action (Action): The action to convert

    Returns:
    - dict[str, str] | None: A dict with 'role' and 'content' keys for bash,
      IPython and message actions; None for any other action type. The role is
      'user' when the action's source is 'user', otherwise 'assistant'.
    """
    renderable_types = (CmdRunAction, IPythonRunCellAction, MessageAction)
    if not isinstance(action, renderable_types):
        return None

    role = 'user' if action.source == 'user' else 'assistant'
    return {'role': role, 'content': self.action_to_str(action)}
|
||||
|
||||
def get_observation_message(self, obs: Observation) -> dict[str, str] | None:
    """Build a user-role chat message for an observation, if it is renderable.

    The character limit is read from self.llm.config.max_message_chars and
    handed to truncate_content.

    Parameters:
    - obs (Observation): The observation to convert

    Returns:
    - dict[str, str] | None: A dict with 'role' ('user') and 'content' keys for
      command-output and IPython observations; None otherwise.
    """
    max_message_chars = self.llm.config.max_message_chars
    if isinstance(obs, CmdOutputObservation):
        content = 'OBSERVATION:\n' + truncate_content(
            obs.content, max_message_chars
        )
        content += (
            f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
        )
        return {'role': 'user', 'content': content}
    elif isinstance(obs, IPythonRunCellObservation):
        content = 'OBSERVATION:\n' + obs.content
        # replace base64 images with a placeholder
        # NOTE(review): the two image literals below were restored from a
        # garbled extraction of this block -- confirm against upstream.
        splitted = content.split('\n')
        for i, line in enumerate(splitted):
            if '![image](data:image/png;base64,' in line:
                splitted[i] = (
                    '![image](data:image/png;base64, ...) already displayed to user'
                )
        content = '\n'.join(splitted)
        # here the limit is applied after the prefix and placeholder substitution
        content = truncate_content(content, max_message_chars)
        return {'role': 'user', 'content': content}
    return None
|
||||
|
||||
def reset(self) -> None:
    """Resets the CodeAct Agent.

    This override adds no state of its own; it only delegates to the parent
    class's reset().
    """
    super().reset()
|
||||
|
||||
def step(self, state: State) -> Action:
|
||||
"""
|
||||
Performs one step using the CodeAct Agent.
|
||||
"""Performs one step using the CodeAct Agent.
|
||||
This includes gathering info on previous steps and prompting the model to make a command to execute.
|
||||
|
||||
Parameters:
|
||||
@@ -142,7 +137,6 @@ class CodeActSWEAgent(Agent):
|
||||
- MessageAction(content) - Message action to run (e.g. ask for clarification)
|
||||
- AgentFinishAction() - end the interaction
|
||||
"""
|
||||
|
||||
# if we're done, go back
|
||||
latest_user_message = state.history.get_last_user_message()
|
||||
if latest_user_message and latest_user_message.strip() == '/exit':
|
||||
@@ -170,15 +164,22 @@ class CodeActSWEAgent(Agent):
|
||||
|
||||
for event in state.history.get_events():
|
||||
# create a regular message from an event
|
||||
message = (
|
||||
get_action_message(event)
|
||||
if isinstance(event, Action)
|
||||
else get_observation_message(event)
|
||||
)
|
||||
if isinstance(event, Action):
|
||||
message = self.get_action_message(event)
|
||||
elif isinstance(event, Observation):
|
||||
message = self.get_observation_message(event)
|
||||
else:
|
||||
raise ValueError(f'Unknown event type: {type(event)}')
|
||||
|
||||
# add regular message
|
||||
if message:
|
||||
messages.append(message)
|
||||
# handle error if the message is the SAME role as the previous message
|
||||
# litellm.exceptions.BadRequestError: litellm.BadRequestError: OpenAIException - Error code: 400 - {'detail': 'Only supports u/a/u/a/u...'}
|
||||
# there should not be two consecutive messages from the same role
|
||||
if messages and messages[-1]['role'] == message['role']:
|
||||
messages[-1]['content'] += '\n\n' + message['content']
|
||||
else:
|
||||
messages.append(message)
|
||||
|
||||
# the latest user message is important:
|
||||
# we want to remind the agent of the environment constraints
|
||||
|
||||
@@ -9,12 +9,11 @@ from opendevin.events.action import Action
|
||||
|
||||
|
||||
class CodeActSWEResponseParser(ResponseParser):
|
||||
"""
|
||||
Parser action:
|
||||
- CmdRunAction(command) - bash command to run
|
||||
- IPythonRunCellAction(code) - IPython code to run
|
||||
- MessageAction(content) - Message action to run (e.g. ask for clarification)
|
||||
- AgentFinishAction() - end the interaction
|
||||
"""Parser action:
|
||||
- CmdRunAction(command) - bash command to run
|
||||
- IPythonRunCellAction(code) - IPython code to run
|
||||
- MessageAction(content) - Message action to run (e.g. ask for clarification)
|
||||
- AgentFinishAction() - end the interaction
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
|
||||
@@ -14,8 +14,7 @@ class DelegatorAgent(Agent):
|
||||
current_delegate: str = ''
|
||||
|
||||
def __init__(self, llm: LLM):
|
||||
"""
|
||||
Initialize the Delegator Agent with an LLM
|
||||
"""Initialize the Delegator Agent with an LLM
|
||||
|
||||
Parameters:
|
||||
- llm (LLM): The llm to be used by this agent
|
||||
@@ -23,8 +22,7 @@ class DelegatorAgent(Agent):
|
||||
super().__init__(llm)
|
||||
|
||||
def step(self, state: State) -> Action:
|
||||
"""
|
||||
Checks to see if current step is completed, returns AgentFinishAction if True.
|
||||
"""Checks to see if current step is completed, returns AgentFinishAction if True.
|
||||
Otherwise, delegates the task to the next agent in the pipeline.
|
||||
|
||||
Parameters:
|
||||
|
||||
+111
-41
@@ -1,8 +1,8 @@
|
||||
import time
|
||||
from typing import TypedDict
|
||||
from typing import TypedDict, Union
|
||||
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.schema import AgentState
|
||||
from opendevin.events.action import (
|
||||
Action,
|
||||
AddTaskAction,
|
||||
@@ -17,6 +17,7 @@ from opendevin.events.action import (
|
||||
ModifyTaskAction,
|
||||
)
|
||||
from opendevin.events.observation import (
|
||||
AgentStateChangedObservation,
|
||||
CmdOutputObservation,
|
||||
FileReadObservation,
|
||||
FileWriteObservation,
|
||||
@@ -48,32 +49,40 @@ class DummyAgent(Agent):
|
||||
super().__init__(llm)
|
||||
self.steps: list[ActionObs] = [
|
||||
{
|
||||
'action': AddTaskAction(parent='0', goal='check the current directory'),
|
||||
'observations': [NullObservation('')],
|
||||
'action': AddTaskAction(
|
||||
parent='None', goal='check the current directory'
|
||||
),
|
||||
'observations': [],
|
||||
},
|
||||
{
|
||||
'action': AddTaskAction(parent='0.0', goal='run ls'),
|
||||
'observations': [NullObservation('')],
|
||||
'action': AddTaskAction(parent='0', goal='run ls'),
|
||||
'observations': [],
|
||||
},
|
||||
{
|
||||
'action': ModifyTaskAction(task_id='0.0', state='in_progress'),
|
||||
'observations': [NullObservation('')],
|
||||
'action': ModifyTaskAction(task_id='0', state='in_progress'),
|
||||
'observations': [],
|
||||
},
|
||||
{
|
||||
'action': MessageAction('Time to get started!'),
|
||||
'observations': [NullObservation('')],
|
||||
'observations': [],
|
||||
},
|
||||
{
|
||||
'action': CmdRunAction(command='echo "foo"'),
|
||||
'observations': [
|
||||
CmdOutputObservation('foo', command_id=-1, command='echo "foo"')
|
||||
CmdOutputObservation(
|
||||
'foo', command_id=-1, command='echo "foo"', exit_code=0
|
||||
)
|
||||
],
|
||||
},
|
||||
{
|
||||
'action': FileWriteAction(
|
||||
content='echo "Hello, World!"', path='hello.sh'
|
||||
),
|
||||
'observations': [FileWriteObservation('', path='hello.sh')],
|
||||
'observations': [
|
||||
FileWriteObservation(
|
||||
content='echo "Hello, World!"', path='hello.sh'
|
||||
)
|
||||
],
|
||||
},
|
||||
{
|
||||
'action': FileReadAction(path='hello.sh'),
|
||||
@@ -85,14 +94,17 @@ class DummyAgent(Agent):
|
||||
'action': CmdRunAction(command='bash hello.sh'),
|
||||
'observations': [
|
||||
CmdOutputObservation(
|
||||
'Hello, World!', command_id=-1, command='bash hello.sh'
|
||||
'bash: hello.sh: No such file or directory',
|
||||
command_id=-1,
|
||||
command='bash workspace/hello.sh',
|
||||
exit_code=127,
|
||||
)
|
||||
],
|
||||
},
|
||||
{
|
||||
'action': BrowseURLAction(url='https://google.com'),
|
||||
'observations': [
|
||||
# BrowserOutputObservation('<html></html>', url='https://google.com', screenshot=""),
|
||||
# BrowserOutputObservation('<html><body>Simulated Google page</body></html>',url='https://google.com',screenshot=''),
|
||||
],
|
||||
},
|
||||
{
|
||||
@@ -100,47 +112,105 @@ class DummyAgent(Agent):
|
||||
browser_actions='goto("https://google.com")'
|
||||
),
|
||||
'observations': [
|
||||
# BrowserOutputObservation('<html></html>', url='https://google.com', screenshot=""),
|
||||
# BrowserOutputObservation('<html><body>Simulated Google page after interaction</body></html>',url='https://google.com',screenshot=''),
|
||||
],
|
||||
},
|
||||
{
|
||||
'action': AgentFinishAction(),
|
||||
'observations': [],
|
||||
'action': AgentRejectAction(),
|
||||
'observations': [NullObservation('')],
|
||||
},
|
||||
{
|
||||
'action': AgentRejectAction(),
|
||||
'observations': [],
|
||||
'action': AgentFinishAction(
|
||||
outputs={}, thought='Task completed', action='finish'
|
||||
),
|
||||
'observations': [AgentStateChangedObservation('', AgentState.FINISHED)],
|
||||
},
|
||||
]
|
||||
|
||||
def step(self, state: State) -> Action:
|
||||
time.sleep(0.1)
|
||||
if state.iteration >= len(self.steps):
|
||||
return AgentFinishAction()
|
||||
|
||||
current_step = self.steps[state.iteration]
|
||||
action = current_step['action']
|
||||
|
||||
# If the action is AddTaskAction or ModifyTaskAction, update the parent ID or task_id
|
||||
if isinstance(action, AddTaskAction):
|
||||
if action.parent == 'None':
|
||||
action.parent = '' # Root task has no parent
|
||||
elif action.parent == '0':
|
||||
action.parent = state.root_task.id
|
||||
elif action.parent.startswith('0.'):
|
||||
action.parent = f'{state.root_task.id}{action.parent[1:]}'
|
||||
elif isinstance(action, ModifyTaskAction):
|
||||
if action.task_id == '0':
|
||||
action.task_id = state.root_task.id
|
||||
elif action.task_id.startswith('0.'):
|
||||
action.task_id = f'{state.root_task.id}{action.task_id[1:]}'
|
||||
# Ensure the task_id doesn't start with a dot
|
||||
if action.task_id.startswith('.'):
|
||||
action.task_id = action.task_id[1:]
|
||||
elif isinstance(action, (BrowseURLAction, BrowseInteractiveAction)):
|
||||
try:
|
||||
return self.simulate_browser_action(action)
|
||||
except (
|
||||
Exception
|
||||
): # This could be a specific exception for browser unavailability
|
||||
return self.handle_browser_unavailable(action)
|
||||
|
||||
if state.iteration > 0:
|
||||
prev_step = self.steps[state.iteration - 1]
|
||||
|
||||
# a step is (action, observations list)
|
||||
if 'observations' in prev_step:
|
||||
# one obs, at most
|
||||
if 'observations' in prev_step and prev_step['observations']:
|
||||
expected_observations = prev_step['observations']
|
||||
|
||||
# check if the history matches the expected observations
|
||||
hist_events = state.history.get_last_events(len(expected_observations))
|
||||
for i in range(len(expected_observations)):
|
||||
|
||||
if len(hist_events) < len(expected_observations):
|
||||
print(
|
||||
f'Warning: Expected {len(expected_observations)} observations, but got {len(hist_events)}'
|
||||
)
|
||||
|
||||
for i in range(min(len(expected_observations), len(hist_events))):
|
||||
hist_obs = event_to_dict(hist_events[i])
|
||||
expected_obs = event_to_dict(expected_observations[i])
|
||||
if (
|
||||
'command_id' in hist_obs['extras']
|
||||
and hist_obs['extras']['command_id'] != -1
|
||||
):
|
||||
del hist_obs['extras']['command_id']
|
||||
hist_obs['content'] = ''
|
||||
if (
|
||||
'command_id' in expected_obs['extras']
|
||||
and expected_obs['extras']['command_id'] != -1
|
||||
):
|
||||
del expected_obs['extras']['command_id']
|
||||
expected_obs['content'] = ''
|
||||
assert (
|
||||
hist_obs == expected_obs
|
||||
), f'Expected observation {expected_obs}, got {hist_obs}'
|
||||
return self.steps[state.iteration]['action']
|
||||
|
||||
# Remove dynamic fields for comparison
|
||||
for obs in [hist_obs, expected_obs]:
|
||||
obs.pop('id', None)
|
||||
obs.pop('timestamp', None)
|
||||
obs.pop('cause', None)
|
||||
obs.pop('source', None)
|
||||
if 'extras' in obs:
|
||||
obs['extras'].pop('command_id', None)
|
||||
|
||||
if hist_obs != expected_obs:
|
||||
print(
|
||||
f'Warning: Observation mismatch. Expected {expected_obs}, got {hist_obs}'
|
||||
)
|
||||
|
||||
return action
|
||||
|
||||
def simulate_browser_action(
    self, action: Union[BrowseURLAction, BrowseInteractiveAction]
) -> Action:
    """Stand in for executing a browser action.

    Nothing is simulated: the action is handed straight to
    ``handle_browser_unavailable`` and that method's result is returned.
    """
    fallback = self.handle_browser_unavailable(action)
    return fallback
|
||||
|
||||
def handle_browser_unavailable(
    self, action: Union[BrowseURLAction, BrowseInteractiveAction]
) -> Action:
    """Build the message that is returned in place of a browser action.

    The text always begins with a fixed notice. A detail naming the URL
    (for ``BrowseURLAction``) or the interaction script (for
    ``BrowseInteractiveAction``) is appended; any other action gets the
    notice alone.
    """
    if isinstance(action, BrowseURLAction):
        detail = f' Unable to browse URL: {action.url}'
    elif isinstance(action, BrowseInteractiveAction):
        detail = f' Unable to perform interactive browsing: {action.browser_actions}'
    else:
        detail = ''

    notice = 'Browser actions are not available in the DummyAgent environment.'
    return MessageAction(content=notice + detail)
|
||||
|
||||
async def get_working_directory(self, state: State) -> str:
    """Return the agent's working directory.

    This is currently a fixed placeholder; ``state`` is accepted but not
    consulted.
    """
    placeholder_dir = './workspace'
    return placeholder_dir
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
# GPTSwarm Framework
|
||||
|
||||
## Introduction
|
||||
|
||||
This folder implements GPTSwarm ([paper](https://arxiv.org/abs/2402.16823), [original repo](https://github.com/metauto-ai/GPTSwarm)). For more details, please see the paper.
|
||||
|
||||
|
||||
## Reference
|
||||
```
|
||||
@article{zhuge2024language,
|
||||
title={Language Agents as Optimizable Graphs},
|
||||
author={Zhuge, Mingchen and Wang, Wenyi and Kirsch, Louis and Faccio, Francesco and Khizbullin, Dmitrii and Schmidhuber, Jurgen},
|
||||
journal={arXiv preprint arXiv:2402.16823},
|
||||
year={2024}
|
||||
}
|
||||
```
|
||||
@@ -1,5 +0,0 @@
|
||||
from opendevin.controller.agent import Agent
|
||||
|
||||
from .gptswarm_agent import GPTSwarm
|
||||
|
||||
# Register the GPTSwarm class with Agent under the name 'GPTSwarmAgent'.
Agent.register('GPTSwarmAgent', GPTSwarm)
|
||||
@@ -1,196 +0,0 @@
|
||||
import asyncio
|
||||
import dataclasses
|
||||
from copy import deepcopy
|
||||
from typing import Any, Dict, List, Literal
|
||||
|
||||
from agenthub.gptswarm_agent.gptswarm_graph import AssistantGraph
|
||||
from agenthub.gptswarm_agent.prompt import GPTSwarmPromptSet
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.events.action import Action
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
# Flag kept from the original module; nothing in the visible part of this
# file reads it.
ENABLE_GITHUB = True
OPENAI_API_KEY = 'sk-proj-****'  # TODO: get from environment or config

# Roles a chat message may carry.
MessageRole = Literal['system', 'user', 'assistant']


@dataclasses.dataclass
class Message:
    """One chat message: the speaker's role and the message text."""

    # Who is speaking.
    role: MessageRole
    # What is said.
    content: str
|
||||
|
||||
|
||||
class GPTSwarm(Agent):
    """
    A simple revision of GPTSwarm which serves as an assistant agent.

    GPTSwarm Paper: https://arxiv.org/abs/2402.16823 (ICML 2024, Oral Presentation)
    GPTSwarm Code: https://github.com/metauto-ai/GPTSwarm
    """

    VERSION = '1.0'

    def __init__(
        self,
        llm: LLM,
        model_name: str,
    ) -> None:
        """
        Initializes a new instance of the GPTSwarm class.

        Parameters:
        - llm (LLM): The llm passed on to the base Agent constructor
        - model_name (str): Model name used for this agent's own LLM client
          and for the nodes of its AssistantGraph
        """
        super().__init__(llm)
        self.api_key = OPENAI_API_KEY
        # NOTE: self.llm is assigned a client built from model_name here,
        # not the `llm` argument.
        self.llm = LLM(model=model_name, api_key=self.api_key)
        self.graph = AssistantGraph(domain='gaia', model_name=model_name)
        self.prompt_set = GPTSwarmPromptSet()

    def reset(self) -> None:
        """
        Resets the GPTSwarm Agent.
        """
        super().reset()

    def step(self, state: State) -> Action:
        """
        # TODO: It is stateless now. Find a way to make it stateful.
        # NOTE: For the AI assistant, state-based design may introduce more uncertainties.
        """
        raise NotImplementedError

    async def swarm_run(self, inputs: List[Dict[str, Any]], num_agents=3) -> List[str]:
        """
        Run the `run` method of this agent concurrently for `num_agents` times.
        # NOTE: This is just a simple self-consistency.
        # TODO: should follow original GPTSwarm's graph design to revise.

        Parameters:
        - inputs: Task dictionaries; the first one must carry a 'task' key
        - num_agents: Number of concurrent `run` invocations

        Returns:
        - A one-element list holding the LLM's self-consistency answer
        """

        async def run_single_agent(index):
            # Each run is capped at 200 seconds; a timeout or any other
            # error yields None so the remaining agents can still contribute.
            try:
                result = await asyncio.wait_for(self.run(inputs=inputs), timeout=200)
                print('-----------------------------------')
                print(f'No. {index} Agent complete task..')
                logger.info(result[0])
                print('-----------------------------------')
                return result[0]
            except asyncio.TimeoutError:
                print(f'No. {index} Agent timed out.')
                return None
            except Exception as e:
                print(f'No. {index} Agent resulted in an error: {e}')
                return None

        # Create a list of tasks to run concurrently
        tasks = [run_single_agent(i) for i in range(num_agents)]

        # Run all tasks concurrently and gather the results
        agent_answers = await asyncio.gather(*tasks)

        # Filter out None results (from timeouts or errors)
        agent_answers = [answer for answer in agent_answers if answer is not None]

        task = inputs[0]['task']
        prompt = self.prompt_set.get_self_consistency(
            question=task,
            answers=agent_answers,
            constraint=self.prompt_set.get_constraint(),
        )
        messages = [
            Message(role='system', content=f'You are a {self.prompt_set.get_role()}.'),
            Message(role='user', content=prompt),
        ]

        swarm_ans = self.llm.completion(
            messages=[{'role': msg.role, 'content': msg.content} for msg in messages]
        )
        swarm_ans = swarm_ans.choices[0].message.content
        return [swarm_ans]

    async def run(
        self,
        inputs: List[Dict[str, Any]],
        max_tries: int = 3,
        max_time: int = 600,
        return_all_outputs: bool = False,
    ) -> List[Any]:
        """
        Execute the graph's useful nodes in dependency order.

        Parameters:
        - inputs: Task dictionaries; a deep copy is given to every input node
        - max_tries: Maximum attempts per node when execution times out
        - max_time: Timeout in seconds for a single node execution
        - return_all_outputs: Collect every output of each output node
          instead of only the last one

        Returns:
        - The collected answers, or a single placeholder string when the
          output nodes produced nothing
        """

        def is_node_useful(node):
            # A node is useful when it is an output node or leads to one.
            if node in self.graph.output_nodes:
                return True

            for successor in node.successors:
                if is_node_useful(successor):
                    return True
            return False

        # The list keeps the graph's node order (it drives scheduling order);
        # the set is only for membership tests.
        useful_node_ids = [
            node_id
            for node_id, node in self.graph.nodes.items()
            if is_node_useful(node)
        ]
        useful_id_set = set(useful_node_ids)
        in_degree = {
            node_id: len(self.graph.nodes[node_id].predecessors)
            for node_id in useful_node_ids
        }
        zero_in_degree_queue = [
            node_id for node_id, deg in in_degree.items() if deg == 0
        ]

        for input_node in self.graph.input_nodes:
            input_node.inputs = [deepcopy(inputs)]

        while zero_in_degree_queue:
            current_node_id = zero_in_degree_queue.pop(0)
            current_node = self.graph.nodes[current_node_id]
            tries = 0
            while tries < max_tries:
                try:
                    await asyncio.wait_for(current_node.execute(), timeout=max_time)
                    # TODO: make GPTSwarm stateful in OpenDevin.
                    # State.inputs = self.graph.nodes[current_node_id].inputs
                    # State.outputs = self.graph.nodes[current_node_id].outputs
                    # self.step(State)
                except asyncio.TimeoutError:
                    print(
                        f'Node {current_node_id} execution timed out, retrying {tries + 1} out of {max_tries}...'
                    )
                except Exception as e:
                    print(f'Error during execution of node {current_node_id}: {e}')
                    break
                else:
                    # Success: stop retrying so the node is executed only once.
                    break
                tries += 1

            for successor in current_node.successors:
                if successor.id in useful_id_set:
                    in_degree[successor.id] -= 1
                    if in_degree[successor.id] == 0:
                        zero_in_degree_queue.append(successor.id)

        final_answers = []

        for output_node in self.graph.output_nodes:
            output_messages = output_node.outputs

            if len(output_messages) > 0 and not return_all_outputs:
                final_answer = output_messages[-1].get('output', output_messages[-1])
                final_answers.append(final_answer)
            else:
                for output_message in output_messages:
                    final_answer = output_message.get('output', output_message)
                    final_answers.append(final_answer)

        if len(final_answers) == 0:
            final_answers.append('No answer since there are no inputs provided')
        return final_answers

    def search_memory(self, query: str) -> list[str]:
        raise NotImplementedError('Implement this abstract method')
|
||||
@@ -1,520 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import ast
|
||||
import asyncio
|
||||
import dataclasses
|
||||
import os
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Literal, Optional
|
||||
|
||||
import requests
|
||||
from pytube import YouTube
|
||||
from swarm.graph import Graph, Node
|
||||
|
||||
from agenthub.gptswarm_agent.prompt import GPTSwarmPromptSet
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.llm.llm import LLM
|
||||
from opendevin.runtime.plugins.agent_skills.agentskills import (
|
||||
parse_audio,
|
||||
parse_docx,
|
||||
parse_image,
|
||||
parse_latex,
|
||||
parse_pdf,
|
||||
parse_pptx,
|
||||
parse_txt,
|
||||
parse_video,
|
||||
)
|
||||
|
||||
# API credentials are read from the environment. The masked literals remain
# as fallbacks, so nothing changes when the variables are not set.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', 'sk-proj-****')
SEARCHAPI_API_KEY = os.getenv('SEARCHAPI_API_KEY', '****')

# Roles a chat message may carry.
MessageRole = Literal['system', 'user', 'assistant']


@dataclasses.dataclass()
class Message:
    """One chat message: the speaker's role and the message text."""

    # Who is speaking.
    role: MessageRole
    # What is said.
    content: str
|
||||
|
||||
|
||||
# Suffix groups that share one parser. Group order and the order inside each
# group fix the insertion order of READER_MAP.
_IMAGE_SUFFIXES = (
    '.png',
    '.jpg',
    '.jpeg',
    '.gif',
    '.bmp',
    '.tiff',
    '.tif',
    '.webp',
)
_AUDIO_SUFFIXES = ('.mp3', '.m4a', '.wav')
_VIDEO_SUFFIXES = (
    '.MOV',
    '.mp4',
    '.mov',
    '.avi',
    '.mpg',
    '.mpeg',
    '.wmv',
    '.flv',
    '.webm',
)

# Maps a file suffix (leading dot included) to the parser used to read it.
READER_MAP = {
    **dict.fromkeys(_IMAGE_SUFFIXES, parse_image),
    **dict.fromkeys(_AUDIO_SUFFIXES, parse_audio),
    **dict.fromkeys(_VIDEO_SUFFIXES, parse_video),
    '.pptx': parse_pptx,
    '.pdf': parse_pdf,
    '.docx': parse_docx,
    '.tex': parse_latex,
    '.txt': parse_txt,
}
|
||||
|
||||
|
||||
class FileReader:
    """Picks a parser from READER_MAP by file suffix and applies it to a file."""

    def __init__(self):
        # Parser chosen by the most recent set_reader call; None until then.
        self.reader = None

    def set_reader(self, suffix: str):
        """Select the parser registered for ``suffix``.

        The exact suffix is tried first; if it is not registered, the
        lower-cased suffix is tried so that e.g. '.PDF' or '.JPG' resolve to
        the same parser as '.pdf' / '.jpg'. ``self.reader`` is set to None
        when no parser is found.
        """
        reader = READER_MAP.get(suffix)
        if reader is None:
            reader = READER_MAP.get(suffix.lower())

        if reader is not None:
            self.reader = reader
            logger.info(f'Setting Reader to {self.reader.__name__}')
        else:
            logger.error(f'No reader found for suffix {suffix}')
            self.reader = None

    def read_file(self, file_path: Path, task: str = 'describe the file') -> str:
        """Parse ``file_path`` with the parser matching its suffix.

        Parameters:
        - file_path: File to read; its suffix selects the parser
        - task: Passed to the image and video parsers only

        Returns:
        - Whatever the selected parser returns

        Raises:
        - ValueError: If no parser is registered for the suffix
        """
        suffix = file_path.suffix
        self.set_reader(suffix)
        if not self.reader:
            raise ValueError(f'No reader set for suffix {suffix}')
        # Only the image and video parsers take the task as a second argument.
        if self.reader in [parse_image, parse_video]:
            file_content = self.reader(file_path, task)
        else:
            file_content = self.reader(file_path)
        logger.info(f'Reading file {file_path} using {self.reader.__name__}')
        return file_content
|
||||
|
||||
|
||||
class GenerateQuery(Node):
    """Graph node that asks the LLM what information a task needs.

    YouTube videos linked in the task text are downloaded and their local
    paths are added to the task's file list.
    """

    # Shared by is_youtube_url and _youtube_download; the sixth group is the
    # 11-character video id.
    _YOUTUBE_REGEX = (
        r'(https?://)?(www\.)?'
        r'(youtube|youtu|youtube-nocookie)\.(com|be)/'
        r'(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})'
    )

    def __init__(
        self,
        domain: str = 'gaia',
        model_name: Optional[str] = 'gpt-4o-2024-05-13',
        operation_description: str = 'Given a question, return what information is needed to answer the question.',
        id=None,
    ):
        super().__init__(operation_description, id, True)
        self.domain = domain
        self.api_key = OPENAI_API_KEY
        self.llm = LLM(model=model_name, api_key=self.api_key)
        self.prompt_set = GPTSwarmPromptSet()

    @property
    def node_name(self) -> str:
        return self.__class__.__name__

    def extract_urls(self, text: str) -> List[str]:
        """Return every http(s) URL found in ``text`` (up to the next whitespace)."""
        url_pattern = r'https?://[^\s]+'
        urls = re.findall(url_pattern, text)
        return urls

    def is_youtube_url(self, url: str) -> bool:
        """Tell whether ``url`` matches the YouTube URL pattern."""
        return bool(re.match(self._YOUTUBE_REGEX, url))

    def _youtube_download(self, url: str) -> str:
        """Download the video behind ``url`` into 'workspace/tmp'.

        Returns the path of the downloaded mp4, or '' when anything fails
        (the error is logged).
        """
        try:
            # Take the id from the URL pattern so that short links
            # (youtu.be/<id>) work too; fall back to the 'v=' query split.
            match = re.match(self._YOUTUBE_REGEX, url)
            if match:
                video_id = match.group(6)
            else:
                video_id = url.split('v=')[-1].split('&')[0]
            video_id = video_id.strip()
            # The id becomes a file name: replace anything that is not a
            # valid id character so it can never contain path separators.
            video_id = re.sub(r'[^A-Za-z0-9_-]', '_', video_id)

            youtube = YouTube(url)
            video_stream = (
                youtube.streams.filter(progressive=True, file_extension='mp4')
                .order_by('resolution')
                .desc()
                .first()
            )
            if not video_stream:
                raise ValueError('No suitable video stream found.')

            output_dir = 'workspace/tmp'
            os.makedirs(output_dir, exist_ok=True)
            output_path = f'{output_dir}/{video_id}.mp4'
            video_stream.download(output_path=output_dir, filename=f'{video_id}.mp4')
            return output_path

        except Exception as e:
            logger.error(
                f'Error downloading video from {url}: {e}'
            )  # Use logger for error messages
            return ''

    async def _execute(
        self, inputs: Optional[List[dict]] = None, **kwargs
    ) -> List[dict]:
        """Produce one query record per input task.

        Parameters:
        - inputs: Task dictionaries with a 'task' key and optional 'files' list

        Returns:
        - One dictionary per input carrying the operation name, task, files
          (including downloaded videos), prompt and the LLM's response
        """
        if inputs is None:
            inputs = []
        outputs = []

        for item in inputs:
            urls = self.extract_urls(item['task'])

            download_paths = []
            for url in urls:
                if self.is_youtube_url(url):
                    download_path = self._youtube_download(url)
                    if download_path:
                        download_paths.append(download_path)

            if urls:
                logger.info(urls)
            if download_paths:
                logger.info(download_paths)

            # Work on a copy so the caller's 'files' list is not modified.
            files = item.get('files', [])
            files = list(files) if isinstance(files, list) else []
            files.extend(download_paths)

            role = self.prompt_set.get_role()
            prompt = self.prompt_set.get_query_prompt(question=item['task'])

            messages = [
                Message(role='system', content=f'You are a {role}.'),
                Message(role='user', content=prompt),
            ]

            response = self.llm.completion(
                messages=[
                    {'role': msg.role, 'content': msg.content} for msg in messages
                ]
            )
            response = response.choices[0].message.content

            executions = {
                'operation': self.node_name,
                'task': item['task'],
                'files': files,
                'input': item.get('task', None),
                'subtask': prompt,
                'output': response,
                'format': 'natural language',
            }
            outputs.append(executions)

        return outputs
|
||||
|
||||
|
||||
class FileAnalyse(Node):
    """Graph node that reads the task's files and asks the LLM to extract information from them."""

    def __init__(
        self,
        domain: str = 'gaia',
        model_name: Optional[str] = 'gpt-4o-2024-05-13',
        operation_description: str = 'Given a question, extract information from a file.',
        id=None,
    ):
        super().__init__(operation_description, id, True)
        self.domain = domain
        self.api_key = OPENAI_API_KEY
        self.llm = LLM(model=model_name, api_key=self.api_key)
        self.prompt_set = GPTSwarmPromptSet()
        self.reader = FileReader()

    @property
    def node_name(self) -> str:
        return self.__class__.__name__

    async def _execute(
        self, inputs: Optional[List[dict]] = None, **kwargs
    ) -> List[dict]:
        """Analyse the files of every input record.

        Parameters:
        - inputs: Records with a 'task' key and optional 'output' (used as
          the query) and 'files' keys

        Returns:
        - One dictionary per input carrying the operation name, task, files,
          query and the combined LLM answer
        """
        if inputs is None:
            inputs = []
        outputs = []
        for item in inputs:
            query = item.get('output', 'Please organize the information of this file.')
            files = item.get('files', [])
            response = await self.file_analyse(query, files, self.llm)

            executions = {
                'operation': self.node_name,
                'task': item['task'],
                'files': files,
                'input': query,
                'subtask': f'Read the content of ###{files}, use query ###{query}',
                'output': response,
                'format': 'natural language',
            }

            outputs.append(executions)

        return outputs

    async def file_analyse(self, query: str, files: List[str], llm: LLM) -> str:
        """Read each file and ask ``llm`` to answer ``query`` from its content.

        Returns the per-file answers, each followed by a newline, joined
        into one string ('' when ``files`` is empty).
        """
        answers = []
        for file in files:
            # self.reader is a FileReader; it selects the parser from the
            # file suffix itself, so every file goes through read_file.
            file_content = self.reader.read_file(Path(file))
            prompt = self.prompt_set.get_file_analysis_prompt(
                query=query, file=file_content
            )
            messages = [
                Message(
                    role='system',
                    content=f'You are a {self.prompt_set.get_role()}.',
                ),
                Message(role='user', content=prompt),
            ]
            response = llm.completion(
                messages=[
                    {'role': msg.role, 'content': msg.content} for msg in messages
                ]
            )
            answers.append(response.choices[0].message.content + '\n')
        return ''.join(answers)
|
||||
|
||||
|
||||
class WebSearch(Node):
    """Graph node that turns a task into search queries, runs them and distils the results."""

    def __init__(
        self,
        domain: str = 'gaia',
        model_name: Optional[str] = 'gpt-4o-2024-05-13',
        operation_description: str = 'Given a question, search the web for infomation.',
        id=None,
    ):
        super().__init__(operation_description, id, True)
        self.domain = domain
        self.api_key = OPENAI_API_KEY
        self.llm = LLM(model=model_name, api_key=self.api_key)
        self.prompt_set = GPTSwarmPromptSet()

    @property
    def node_name(self) -> str:
        return self.__class__.__name__

    def _chat(self, user_prompt: str) -> str:
        """Send a system+user prompt pair to the LLM and return the reply text."""
        messages = [
            Message(role='system', content=f'You are a {self.prompt_set.get_role()}.'),
            Message(role='user', content=user_prompt),
        ]
        completion = self.llm.completion(
            messages=[{'role': msg.role, 'content': msg.content} for msg in messages]
        )
        return completion.choices[0].message.content

    async def _execute(
        self, inputs: Optional[List[dict]] = None, max_keywords: int = 4, **kwargs
    ) -> List[dict]:
        """Search the web for every input record and summarise the findings.

        Parameters:
        - inputs: Records with 'task' and 'output' (the query) keys and an
          optional 'files' key
        - max_keywords: Upper bound on the number of search queries used

        Returns:
        - One dictionary per input carrying the operation name, task, files,
          query, distillation prompt and the LLM's summary
        """
        if inputs is None:
            inputs = []
        outputs = []
        for item in inputs:
            task = item['task']
            query = item['output']
            prompt = self.prompt_set.get_websearch_prompt(question=task, query=query)

            # The LLM answers with a comma-separated list; strip the
            # whitespace around each query and drop empty entries.
            raw_queries = self._chat(prompt)
            generated_queries = [
                part.strip() for part in raw_queries.split(',') if part.strip()
            ][:max_keywords]
            logger.info(f'The search keywords include: {generated_queries}')
            search_results = [self.web_search(keyword) for keyword in generated_queries]
            logger.info(f'The search results: {str(search_results)[:100]}...')

            distill_prompt = self.prompt_set.get_distill_websearch_prompt(
                question=task, query=query, results='.\n'.join(search_results)
            )
            response = self._chat(distill_prompt)

            executions = {
                'operation': self.node_name,
                'task': task,
                'files': item.get('files', []),
                'input': query,
                'subtask': distill_prompt,
                'output': response,
                'format': 'natural language',
            }
            outputs.append(executions)

        return outputs

    def web_search(self, query: str, item_num: int = 3, timeout: float = 30.0) -> str:
        """Query searchapi.io (Google engine) and return a short text result.

        Parameters:
        - query: Search string
        - item_num: Maximum number of organic results whose snippets are used
        - timeout: Seconds to wait for the HTTP response

        Returns:
        - The knowledge-graph description when present, otherwise the
          newline-joined snippets of the first organic results, otherwise ' '
        """
        url = 'https://www.searchapi.io/api/v1/search'
        params = {
            'engine': 'google',
            'q': query,
            'api_key': SEARCHAPI_API_KEY,  # os.getenv("SEARCHAPI_API_KEY")
        }

        # The body is JSON: decode it as JSON (a Python-literal parser cannot
        # read true/false/null).
        response = requests.get(url, params=params, timeout=timeout).json()

        knowledge_graph = response.get('knowledge_graph') or {}
        if 'description' in knowledge_graph:
            return knowledge_graph['description']

        organic_results = response.get('organic_results') or []
        if organic_results:
            snippets = [
                res['snippet'] for res in organic_results[:item_num] if 'snippet' in res
            ]
            return '\n'.join(snippets)

        return ' '
|
||||
|
||||
|
||||
class CombineAnswer(Node):
    """Graph node that merges the outputs of earlier nodes into one final answer."""

    def __init__(
        self,
        domain: str = 'gaia',
        model_name: Optional[str] = 'gpt-4o-2024-05-13',
        operation_description: str = 'Combine multiple inputs into one.',
        max_token: int = 500,
        id=None,
    ):
        super().__init__(operation_description, id, True)
        self.domain = domain
        self.max_token = max_token
        self.api_key = OPENAI_API_KEY
        self.llm = LLM(model=model_name, api_key=self.api_key)
        self.prompt_set = GPTSwarmPromptSet()
        # Outputs grouped by the operation that produced them, plus the task.
        self.materials: defaultdict[str, str] = defaultdict(str)

    @property
    def node_name(self) -> str:
        return self.__class__.__name__

    async def _execute(
        self, inputs: Optional[List[Any]] = None, **kwargs
    ) -> List[dict]:
        """Combine the upstream records and ask the LLM for the final answer.

        Parameters:
        - inputs: Records produced by earlier nodes; each may carry
          'operation', 'output', 'task' and 'files' keys

        Returns:
        - A one-element list with the combined record; its 'files' entry is
          a comma-separated string of the files named by the inputs
        """
        if inputs is None:
            inputs = []
        node_inputs = inputs

        role = self.prompt_set.get_role()
        constraint = self.prompt_set.get_constraint()

        self.materials = defaultdict(str)
        # Files are gathered separately from self.materials so that the
        # prompt built from the materials is not affected.
        collected_files: List[str] = []
        for item in node_inputs:
            operation = item.get('operation')
            if operation:
                self.materials[operation] += f'{item.get("output", "")}\n'
            self.materials['task'] = item.get('task')

            files = item.get('files') or []
            if isinstance(files, str):
                files = [files]
            for file in files:
                file_name = str(file)
                if file_name not in collected_files:
                    collected_files.append(file_name)

        question = self.prompt_set.get_combine_materials(self.materials)
        prompt = self.prompt_set.get_answer_prompt(question=question)

        messages = [
            Message(role='system', content=f'You are a {role}. {constraint}'),
            Message(role='user', content=prompt),
        ]

        response = self.llm.completion(
            messages=[{'role': msg.role, 'content': msg.content} for msg in messages]
        )

        response = response.choices[0].message.content

        executions = {
            'operation': self.node_name,
            'task': self.materials['task'],
            'files': ', '.join(collected_files),
            'input': node_inputs,
            'subtask': prompt,
            'output': response,
            'format': 'natural language',
        }

        return [executions]
|
||||
|
||||
|
||||
class AssistantGraph(Graph):
    """Assistant pipeline: query -> (file analysis, web search) -> combine."""

    def build_graph(self):
        """Create the four nodes, wire them together and add them to the graph.

        The query node is the only input node and the combine node the only
        output node.
        """
        domain, model = self.domain, self.model_name

        query = GenerateQuery(domain, model)
        file_analysis = FileAnalyse(domain, model)
        web_search = WebSearch(domain, model)
        for branch in (file_analysis, web_search):
            query.add_successor(branch)

        combine = CombineAnswer(domain, model)
        for branch in (file_analysis, web_search):
            branch.add_successor(combine)

        self.input_nodes = [query]
        self.output_nodes = [combine]

        for node in (query, file_analysis, web_search, combine):
            self.add_node(node)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test of the whole graph.
    #
    # A single node can be exercised the same way: build an `inputs` list of
    # task dictionaries and call, for example,
    #   asyncio.run(GenerateQuery()._execute(inputs))
    #   asyncio.run(FileAnalyse()._execute(inputs))
    #   asyncio.run(WebSearch()._execute(inputs))
    demo_task = 'How can researchers ensure AGI development is both safe and ethical while avoiding societal biases and inequalities?'

    assistant_graph = AssistantGraph(domain='gaia', model_name='gpt-4o-2024-05-13')

    # test graph
    assistant_graph.build_graph()
    inputs = [{'task': demo_task, 'files': ['agi.txt']}]
    outputs = asyncio.run(assistant_graph.run(inputs))
    print(outputs)
|
||||
@@ -1,129 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from typing import Any, Dict
|
||||
|
||||
|
||||
class GPTSwarmPromptSet:
    """Collection of static prompt builders for a general AI assistant.

    Each method returns a plain string prompt. The prompts cover answering
    questions, planning information gathering, analysing files, generating
    and distilling web searches, combining gathered materials, and picking
    the most consistent answer among several candidates.
    """

    @staticmethod
    def get_role():
        """Return the role description used for the assistant."""
        return 'a general AI assistant'

    @staticmethod
    def get_constraint():
        """Return the answer-format constraint (the FINAL ANSWER template rules)."""
        return (
            'I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. '
            'YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. '
            "If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. "
            "If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. "
            'If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. '
        )

    @staticmethod
    def get_format():
        """Return the expected output format name."""
        return 'natural language'

    @staticmethod
    def get_answer_prompt(question):
        """Return the answer prompt, which is the question rendered as a string."""
        return f'{question}'

    @staticmethod
    def get_query_prompt(question):
        """Return a prompt asking which clues must be investigated for `question`."""
        return (
            '# Information Gathering for Question Resolution\n\n'
            'Evaluate if additional information is needed to answer the question. '
            'If a web search or file analysis is necessary, outline specific clues or details to be searched for.\n\n'
            f'## ❓ Target Question:\n{question}\n\n'
            '## 🔍 Clues for Investigation:\n'
            'Identify critical clues and concepts within the question that are essential for finding the answer.\n'
        )

    @staticmethod
    def get_file_analysis_prompt(query, file):
        """Return a prompt asking to extract the information `query` from `file`."""
        return (
            '# File Analysis Task\n\n'
            f'## 🔍 Information Extraction Objective:\n---\n{query}\n---\n\n'
            f'## 📄 File Under Analysis:\n---\n{file}\n---\n\n'
            '## 📝 Instructions:\n'
            '1. Identify the key sections in the file relevant to the query.\n'
            '2. Extract and summarize the necessary information from these sections.\n'
            '3. Ensure the response is focused and directly addresses the query.\n'
            "Example: 'Identify the main theme in the text.'"
        )

    @staticmethod
    def get_websearch_prompt(question, query):
        """Return a prompt asking for three comma-separated web search queries."""
        return (
            '# Web Search Task\n\n'
            f'## Original Question: \n---\n{question}\n---\n\n'
            f'## 🔍 Targeted Search Objective:\n---\n{query}\n---\n\n'
            '## 🌐 Simplified Search Instructions:\n'
            'Generate three specific search queries directly related to the original question. Each query should focus on key terms from the question. Format the output as a comma-separated list.\n'
            "For example, if the question is 'Who will be the next US president?', your queries could be: 'US presidential candidates, current US president, next US president'.\n"
            "Remember to format the queries as 'query1, query2, query3'."
        )

    @staticmethod
    def get_distill_websearch_prompt(question, query, results):
        """Return a prompt asking to summarise `results` with respect to the question and query."""
        return (
            '# Summarization of Search Results\n\n'
            f'## Original question: \n---\n{question}\n---\n\n'
            f'## 🔍 Required Information for Summary:\n---\n{query}\n---\n\n'
            f'## 🌐 Analyzed Search Results:\n---\n{results}\n---\n\n'
            '## 📝 Instructions for Summarization:\n'
            '1. Review the provided search results and identify the most relevant information related to the question and query.\n'
            '2. Extract and highlight the key findings, facts, or data points from these results.\n'
            '3. Organize the summarized information in a coherent and logical manner.\n'
            '4. Ensure the summary is concise and directly addresses the query, avoiding extraneous details.\n'
            '5. If the information from web search is useless, directly answer: "No useful information from WebSearch".\n'
        )

    @staticmethod
    def get_combine_materials(materials: Dict[str, Any], avoid_vague=True) -> str:
        """Build one prompt from the task and every piece of reference material.

        Parameters:
        - materials: mapping of material name to content. The 'task' entry is
          the question itself; every other entry is appended as a delimited
          "Reference information" section. Non-string values are rendered
          with `str()`; `None` values, blank values, and values containing
          the "No useful information from WebSearch" marker are skipped.
        - avoid_vague: when true, append instructions demanding a specific,
          non-evasive answer.

        Returns:
        - str: the combined prompt.
        """
        # str() keeps the concatenations below valid even for a non-string task.
        question = str(materials.get('task', 'No problem provided'))

        for key, value in materials.items():
            if key == 'task' or value is None:
                continue
            # The mapping is typed Dict[str, Any]; normalise to text before
            # using string-only operations such as `in` and `strip`.
            text = value if isinstance(value, str) else str(value)
            if 'No useful information from WebSearch' in text:
                continue
            text = text.strip()
            if not text:
                continue
            question += (
                f'\n\nReference information for {key}:'
                + '\n----------------------------------------------\n'
                + f'{text}'
                + '\n----------------------------------------------\n\n'
            )

        if avoid_vague:
            question += (
                '\nProvide a specific answer. For questions with known answers, ensure to provide accurate and factual responses. '
                + "Avoid vague responses or statements like 'unable to...' that don't contribute to a definitive answer. "
                + "For example: if a question asks 'who will be the president of America', and the answer is currently unknown, you could suggest possibilities like 'Donald Trump', or 'Biden'. However, if the answer is known, provide the correct information."
            )

        return question

    @staticmethod
    def get_self_consistency(question: str, answers: list, constraint: str) -> str:
        """Return a prompt asking to select the most consistent of `answers`.

        Parameters:
        - question: the question the candidate answers respond to
        - answers: candidate answers, listed as "Answer 1", "Answer 2", ...
        - constraint: formatting constraint the selected answer must follow

        Returns:
        - str: the self-consistency evaluation prompt.
        """
        formatted_answers = '\n'.join(
            f'Answer {index + 1}: {answer}' for index, answer in enumerate(answers)
        )
        return (
            '# Self-Consistency Evaluation Task\n\n'
            f'## 🤔 Question for Review:\n---\n{question}\n---\n\n'
            f'## 💡 Reviewable Answers:\n---\n{formatted_answers}\n---\n\n'
            '## 📋 Instructions for Selection:\n'
            '1. Read each answer and assess how it addresses the question.\n'
            "2. Compare the answers for their adherence to the given question's criteria and logical coherence.\n"
            "3. Identify the answer that best aligns with the question's requirements and is the most logically consistent.\n"
            "4. Ignore the candidate answers if they do not give a direct answer, for example, using 'unable to ...', 'as an AI ...'.\n"
            '5. Copy the most suitable answer as it is, without modification, to maintain its original form.\n'
            f'6. Adhere to the constraints: {constraint}.\n'
            'Note: If no answer fully meets the criteria, choose and copy the one that is closest to the requirements.'
        )
|
||||
+24
-28
@@ -2,7 +2,6 @@ from jinja2 import BaseLoader, Environment
|
||||
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config
|
||||
from opendevin.core.utils import json
|
||||
from opendevin.events.action import Action
|
||||
from opendevin.events.serialization.action import action_from_dict
|
||||
@@ -23,40 +22,37 @@ def parse_response(orig_response: str) -> Action:
|
||||
|
||||
|
||||
def to_json(obj, **kwargs):
|
||||
"""
|
||||
Serialize an object to str format
|
||||
"""
|
||||
"""Serialize an object to str format"""
|
||||
return json.dumps(obj, **kwargs)
|
||||
|
||||
|
||||
def history_to_json(history: ShortTermHistory, max_events=20, **kwargs):
|
||||
"""
|
||||
Serialize and simplify history to str format
|
||||
"""
|
||||
# TODO: get agent specific llm config
|
||||
llm_config = config.get_llm_config()
|
||||
max_message_chars = llm_config.max_message_chars
|
||||
|
||||
processed_history = []
|
||||
event_count = 0
|
||||
|
||||
for event in history.get_events(reverse=True):
|
||||
if event_count >= max_events:
|
||||
break
|
||||
processed_history.append(event_to_memory(event, max_message_chars))
|
||||
event_count += 1
|
||||
|
||||
# history is in reverse order, let's fix it
|
||||
processed_history.reverse()
|
||||
|
||||
return json.dumps(processed_history, **kwargs)
|
||||
|
||||
|
||||
class MicroAgent(Agent):
|
||||
VERSION = '1.0'
|
||||
prompt = ''
|
||||
agent_definition: dict = {}
|
||||
|
||||
def history_to_json(
|
||||
self, history: ShortTermHistory, max_events: int = 20, **kwargs
|
||||
):
|
||||
"""
|
||||
Serialize and simplify history to str format
|
||||
"""
|
||||
processed_history = []
|
||||
event_count = 0
|
||||
|
||||
for event in history.get_events(reverse=True):
|
||||
if event_count >= max_events:
|
||||
break
|
||||
processed_history.append(
|
||||
event_to_memory(event, self.llm.config.max_message_chars)
|
||||
)
|
||||
event_count += 1
|
||||
|
||||
# history is in reverse order, let's fix it
|
||||
processed_history.reverse()
|
||||
|
||||
return json.dumps(processed_history, **kwargs)
|
||||
|
||||
def __init__(self, llm: LLM):
|
||||
super().__init__(llm)
|
||||
if 'name' not in self.agent_definition:
|
||||
@@ -70,7 +66,7 @@ class MicroAgent(Agent):
|
||||
state=state,
|
||||
instructions=instructions,
|
||||
to_json=to_json,
|
||||
history_to_json=history_to_json,
|
||||
history_to_json=self.history_to_json,
|
||||
delegates=self.delegates,
|
||||
latest_user_message=state.get_current_user_intent(),
|
||||
)
|
||||
|
||||
@@ -1,2 +0,0 @@
|
||||
.envrc
|
||||
workspace
|
||||
@@ -1,8 +0,0 @@
|
||||
# LLM control loop
|
||||
This is currently a standalone utility. It will need to be integrated into OpenDevin's backend.
|
||||
|
||||
## Usage
|
||||
```bash
|
||||
# Run this in project root
|
||||
./agenthub/monologue_agent/build-and-run.sh "write a bash script that prints 'hello world'"
|
||||
```
|
||||
@@ -1,8 +0,0 @@
|
||||
# TODO
|
||||
There's a lot of low-hanging fruit for this agent:
|
||||
|
||||
* Strip `<script>`, `<style>`, and other non-text tags from the HTML before sending it to the LLM
|
||||
* Keep track of the working directory when the agent uses `cd`
|
||||
* Improve memory condensing--condense earlier memories more aggressively
|
||||
* Limit the time that `run` can wait (in case the agent runs an interactive command that hangs)
|
||||
* Figure out how to run background processes, e.g. `node server.js` to start a server
|
||||
@@ -1,5 +0,0 @@
|
||||
from opendevin.controller.agent import Agent
|
||||
|
||||
from .agent import MonologueAgent
|
||||
|
||||
Agent.register('MonologueAgent', MonologueAgent)
|
||||
@@ -1,191 +0,0 @@
|
||||
import agenthub.monologue_agent.utils.prompts as prompts
|
||||
from agenthub.monologue_agent.response_parser import MonologueResponseParser
|
||||
from agenthub.monologue_agent.utils.prompts import INITIAL_THOUGHTS
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config
|
||||
from opendevin.core.exceptions import AgentNoInstructionError
|
||||
from opendevin.core.schema import ActionType
|
||||
from opendevin.events.action import (
|
||||
Action,
|
||||
BrowseURLAction,
|
||||
CmdRunAction,
|
||||
FileReadAction,
|
||||
FileWriteAction,
|
||||
MessageAction,
|
||||
NullAction,
|
||||
)
|
||||
from opendevin.events.observation import (
|
||||
BrowserOutputObservation,
|
||||
CmdOutputObservation,
|
||||
FileReadObservation,
|
||||
NullObservation,
|
||||
Observation,
|
||||
)
|
||||
from opendevin.events.serialization.event import event_to_memory
|
||||
from opendevin.llm.llm import LLM
|
||||
from opendevin.memory.condenser import MemoryCondenser
|
||||
from opendevin.runtime.tools import RuntimeTool
|
||||
|
||||
if config.get_agent_config('MonologueAgent').memory_enabled:
|
||||
from opendevin.memory.memory import LongTermMemory
|
||||
|
||||
|
||||
class MonologueAgent(Agent):
    """
    The Monologue Agent utilizes long and short term memory to complete tasks.
    Long term memory is stored as a LongTermMemory object and the model uses it to search for examples from the past.
    Short term memory is stored as a Monologue object and the model can condense it as necessary.
    """

    # NOTE: the docstring must be the first statement of the class body to be
    # bound to `__doc__`; it previously followed VERSION and was discarded.
    VERSION = '1.0'

    _initialized = False
    initial_thoughts: list[dict[str, str]]
    memory: 'LongTermMemory | None'
    memory_condenser: MemoryCondenser
    runtime_tools: list[RuntimeTool] = [RuntimeTool.BROWSER]
    response_parser = MonologueResponseParser()

    def __init__(self, llm: LLM):
        """
        Initializes the Monologue Agent with an llm.

        Parameters:
        - llm (LLM): The llm to be used by this agent
        """
        super().__init__(llm)

    def _initialize(self, task: str):
        """
        Utilizes the INITIAL_THOUGHTS list to give the agent a context for its capabilities
        and how to navigate the WORKSPACE_MOUNT_PATH_IN_SANDBOX in `config` (e.g., /workspace by default).
        Short circuited to return when already initialized.
        Will execute again when called after reset.

        Parameters:
        - task: The initial goal statement provided by the user

        Raises:
        - AgentNoInstructionError: If task is not provided
        """
        if self._initialized:
            return

        if not task:
            raise AgentNoInstructionError()

        self.initial_thoughts = []
        # LongTermMemory is only imported at module level when memory is
        # enabled, so it must only be referenced under the same condition.
        if config.get_agent_config('MonologueAgent').memory_enabled:
            self.memory = LongTermMemory()
        else:
            self.memory = None

        self.memory_condenser = MemoryCondenser()

        self._add_initial_thoughts(task)
        self._initialized = True

    def _add_initial_thoughts(self, task):
        """
        Converts the scripted INITIAL_THOUGHTS into memory entries stored in `self.initial_thoughts`.

        A thought starting with RUN, WRITE, READ or BROWSE becomes the matching action; any other
        thought becomes a message action. The thought that directly follows a RUN, READ or BROWSE
        entry is recorded as the observation produced by that action.

        Parameters:
        - task: The goal statement substituted for every `$TASK` placeholder
        """
        max_message_chars = config.get_llm_config_from_agent(
            'MonologueAgent'
        ).max_message_chars
        previous_action = ''
        for thought in INITIAL_THOUGHTS:
            thought = thought.replace('$TASK', task)
            if previous_action != '':
                # This thought is the scripted output of the previous action.
                observation: Observation = NullObservation(content='')
                if previous_action in {ActionType.RUN, ActionType.PUSH}:
                    observation = CmdOutputObservation(
                        content=thought, command_id=0, command=''
                    )
                elif previous_action == ActionType.READ:
                    observation = FileReadObservation(content=thought, path='')
                elif previous_action == ActionType.BROWSE:
                    observation = BrowserOutputObservation(
                        content=thought, url='', screenshot=''
                    )
                self.initial_thoughts.append(
                    event_to_memory(observation, max_message_chars)
                )
                previous_action = ''
            else:
                action: Action = NullAction()
                if thought.startswith('RUN'):
                    # maxsplit=1: keep the command intact even if it contains 'RUN ' itself.
                    command = thought.split('RUN ', 1)[1]
                    action = CmdRunAction(command)
                    previous_action = ActionType.RUN
                elif thought.startswith('WRITE'):
                    # Split on the last ' > ' so content containing ' > ' is not truncated.
                    parts = thought.split('WRITE ', 1)[1].rsplit(' > ', 1)
                    path = parts[1]
                    content = parts[0]
                    action = FileWriteAction(path=path, content=content)
                    # NOTE(review): previous_action is not set here, so the thought after a
                    # WRITE is recorded as a message, not an observation - confirm intended.
                elif thought.startswith('READ'):
                    path = thought.split('READ ', 1)[1]
                    action = FileReadAction(path=path)
                    previous_action = ActionType.READ
                elif thought.startswith('BROWSE'):
                    url = thought.split('BROWSE ', 1)[1]
                    action = BrowseURLAction(url=url)
                    previous_action = ActionType.BROWSE
                else:
                    action = MessageAction(thought)
                self.initial_thoughts.append(event_to_memory(action, max_message_chars))

    def step(self, state: State) -> Action:
        """
        Modifies the current state by adding the most recent actions and observations, then prompts the model to think about its next action to take using monologue, memory, and hint.

        Parameters:
        - state (State): The current state based on previous steps taken

        Returns:
        - Action: The next action to take based on LLM response
        """
        max_message_chars = config.get_llm_config_from_agent(
            'MonologueAgent'
        ).max_message_chars
        goal = state.get_current_user_intent()
        self._initialize(goal)

        # add the events from state.history
        recent_events: list[dict[str, str]] = [
            event_to_memory(event, max_message_chars)
            for event in state.history.get_events()
        ]

        # add the last messages to long term memory
        if self.memory is not None:
            last_action = state.history.get_last_action()
            last_observation = state.history.get_last_observation()

            # this should still work
            # we will need to do this differently: find out if there really is an action or an observation in this step
            if last_action:
                self.memory.add_event(event_to_memory(last_action, max_message_chars))
            if last_observation:
                self.memory.add_event(
                    event_to_memory(last_observation, max_message_chars)
                )

        # the action prompt with initial thoughts and recent events
        prompt = prompts.get_request_action_prompt(
            goal, self.initial_thoughts, recent_events
        )

        messages: list[dict[str, str]] = [
            {'role': 'user', 'content': prompt},
        ]

        # format all as a single message, a monologue
        resp = self.llm.completion(messages=messages)

        action = self.response_parser.parse(resp)
        self.latest_action = action
        return action

    def reset(self) -> None:
        """Resets the agent so the next step rebuilds the initial monologue and memory."""
        super().reset()

        # Reset the initial monologue and memory
        self._initialized = False
|
||||
@@ -1,212 +0,0 @@
|
||||
from opendevin.core.config import config
|
||||
from opendevin.core.utils import json
|
||||
from opendevin.events.action import (
|
||||
Action,
|
||||
)
|
||||
from opendevin.events.serialization.action import action_from_dict
|
||||
|
||||
ACTION_PROMPT = """
|
||||
You're a thoughtful robot. Your main task is this:
|
||||
%(task)s
|
||||
|
||||
Don't expand the scope of your task--just complete it as written.
|
||||
|
||||
This is your internal monologue, in JSON format:
|
||||
|
||||
%(monologue)s
|
||||
|
||||
Your most recent thought is at the bottom of that monologue. Continue your train of thought.
|
||||
What is your next single thought or action? Your response must be in JSON format.
|
||||
It must be a single object, and it must contain two fields:
|
||||
* `action`, which is one of the actions below
|
||||
* `args`, which is a map of key-value pairs, specifying the arguments for that action
|
||||
|
||||
Here are the possible actions:
|
||||
* `read` - reads the content of a file. Arguments:
|
||||
* `path` - the path of the file to read
|
||||
* `write` - writes the content to a file. Arguments:
|
||||
* `path` - the path of the file to write
|
||||
* `content` - the content to write to the file
|
||||
* `run` - runs a command. Arguments:
|
||||
* `command` - the command to run
|
||||
* `browse` - opens a web page. Arguments:
|
||||
* `url` - the URL to open
|
||||
* `push` - Push a branch from the current repo to github:
|
||||
* `owner` - the owner of the repo to push to
|
||||
* `repo` - the name of the repo to push to
|
||||
* `branch` - the name of the branch to push
|
||||
* `message` - make a plan, set a goal, record your thoughts, or ask for more input from the user. Arguments:
|
||||
* `content` - the message to record
|
||||
* `wait_for_response` - set to `true` to wait for the user to respond before proceeding
|
||||
* `finish` - if you're absolutely certain that you've completed your task and have tested your work, use the finish action to stop working.
|
||||
|
||||
You MUST take time to think in between read, write, run, browse, and push actions--do this with the `message` action.
|
||||
You should never act twice in a row without thinking. But if your last several
|
||||
actions are all `message` actions, you should consider taking a different action.
|
||||
|
||||
Notes:
|
||||
* you are logged in as %(user)s, but sudo will always work without a password.
|
||||
* all non-background commands will be forcibly stopped if they remain running for over %(timeout)s seconds.
|
||||
* your environment is Debian Linux. You can install software with `sudo apt-get`, but remember to use -y.
|
||||
* don't run interactive commands, or commands that don't return (e.g. `node server.js`). You may run commands in the background (e.g. `node server.js &`)
|
||||
* don't run interactive text editors (e.g. `nano` or 'vim'), instead use the 'write' or 'read' action.
|
||||
* don't run gui applications (e.g. software IDEs (like vs code or codium), web browsers (like firefox or chromium), or other complex software packages). Use non-interactive cli applications, or special actions instead.
|
||||
* whenever an action fails, always send a `message` about why it may have happened before acting again.
|
||||
|
||||
What is your next single thought or action? Again, you must reply with JSON, and only with JSON. You must respond with exactly one 'action' object.
|
||||
|
||||
%(hint)s
|
||||
"""
|
||||
|
||||
MONOLOGUE_SUMMARY_PROMPT = """
|
||||
Below is the internal monologue of an automated LLM agent. Each
|
||||
thought is an item in a JSON array. The thoughts may be memories,
|
||||
actions taken by the agent, or outputs from those actions.
|
||||
Please return a new, smaller JSON array, which summarizes the
|
||||
internal monologue. You can summarize individual thoughts, and
|
||||
you can condense related thoughts together with a description
|
||||
of their content.
|
||||
|
||||
%(monologue)s
|
||||
|
||||
Make the summaries as pithy and informative as possible.
|
||||
Be specific about what happened and what was learned. The summary
|
||||
will be used as keywords for searching for the original memory.
|
||||
Be sure to preserve any key words or important information.
|
||||
|
||||
Your response must be in JSON format. It must be an object with the
|
||||
key `new_monologue`, which is a JSON array containing the summarized monologue.
|
||||
Each entry in the array must have an `action` key, and an `args` key.
|
||||
The action key may be `summarize`, and `args.summary` should contain the summary.
|
||||
You can also use the same action and args from the source monologue.
|
||||
"""
|
||||
|
||||
INITIAL_THOUGHTS = [
|
||||
'I exist!',
|
||||
'Hmm...looks like I can type in a command line prompt',
|
||||
'Looks like I have a web browser too!',
|
||||
"Here's what I want to do: $TASK",
|
||||
'How am I going to get there though?',
|
||||
'It seems like I have some kind of short term memory.',
|
||||
'Each of my thoughts seems to be stored in a JSON array.',
|
||||
'It seems whatever I say next will be added as an object to the list.',
|
||||
"It looks like it's easy for me to use the command line too! I just have to perform a run action and include the command I want to run in the command argument. The command output just jumps into my head!",
|
||||
'RUN echo "hello world"',
|
||||
'hello world',
|
||||
'Cool! I bet I can write files too using the write action.',
|
||||
'WRITE echo "console.log(\'hello world\')" > test.js',
|
||||
'',
|
||||
"I just created test.js. I'll try and run it now.",
|
||||
'RUN node test.js',
|
||||
'hello world',
|
||||
'It works!',
|
||||
"I'm going to try reading it now using the read action.",
|
||||
'READ test.js',
|
||||
"console.log('hello world')",
|
||||
'Nice! I can read files too!',
|
||||
'And if I want to use the browser, I just need to use the browse action and include the url I want to visit in the url argument',
|
||||
"Let's try that...",
|
||||
'BROWSE google.com',
|
||||
'<form><input type="text"></input><button type="submit"></button></form>',
|
||||
'I can browse the web too!',
|
||||
'And once I have completed my task, I can use the finish action to stop working.',
|
||||
"But I should only use the finish action when I'm absolutely certain that I've completed my task and have tested my work.",
|
||||
'Very cool. Now to accomplish my task.',
|
||||
"I'll need a strategy. And as I make progress, I'll need to keep refining that strategy. I'll need to set goals, and break them into sub-goals.",
|
||||
'In between actions, I must always take some time to think, strategize, and set new goals. I should never take two actions in a row.',
|
||||
"OK so my task is to $TASK. I haven't made any progress yet. Where should I start?",
|
||||
'It seems like there might be an existing project here. I should probably start by running `pwd` and `ls` to orient myself.',
|
||||
]
|
||||
|
||||
|
||||
def get_summarize_monologue_prompt(thoughts: list[dict]):
    """
    Builds the prompt that asks the model to summarize the monologue

    Parameters:
    - thoughts (list[dict]): The monologue entries to summarize

    Returns:
    - str: MONOLOGUE_SUMMARY_PROMPT with the monologue embedded as indented JSON under the `old_monologue` key
    """
    serialized = json.dumps({'old_monologue': thoughts}, indent=2)
    return MONOLOGUE_SUMMARY_PROMPT % {'monologue': serialized}
|
||||
|
||||
|
||||
def get_request_action_prompt(
    task: str,
    thoughts: list[dict],
    recent_events: list[dict],
):
    """
    Fills ACTION_PROMPT with the task, the monologue, and a situational hint.

    Parameters:
    - task (str): The current task the agent is trying to accomplish
    - thoughts (list[dict]): The agent's initial thoughts, placed first in the monologue
    - recent_events (list[dict]): Events appended after the thoughts; the last one selects the hint

    Returns:
    - str: Formatted prompt string with task, monologue, hint, user, and timeout filled in
    """
    if not recent_events:
        hint = "You're just getting started! What should you do first?"
    else:
        hint = ''
        newest = recent_events[-1]
        if 'action' in newest:
            kind = newest['action']
            from_agent = 'source' in newest and newest['source'] == 'agent'
            if kind == 'message' and from_agent:
                hint = (
                    "You've been thinking a lot lately. Maybe it's time to take action?"
                )
            elif kind == 'error':
                hint = 'Looks like that last command failed. Maybe you need to fix it, or try something else.'

    if config.run_as_devin:
        user = 'opendevin'
    else:
        user = 'root'

    # The monologue is the initial thoughts followed by the recent events.
    rendered_monologue = json.dumps(thoughts + recent_events, indent=2)

    substitutions = {
        'task': task,
        'monologue': rendered_monologue,
        'hint': hint,
        'user': user,
        'timeout': config.sandbox.timeout,
        'WORKSPACE_MOUNT_PATH_IN_SANDBOX': config.workspace_mount_path_in_sandbox,
    }
    return ACTION_PROMPT % substitutions
|
||||
|
||||
|
||||
def parse_action_response(orig_response: str) -> Action:
    """
    Decodes a JSON response and converts it into an action

    Parameters:
    - orig_response (str): The JSON string to be parsed

    Returns:
    - Action: The action built from the decoded dictionary
    """
    # decode the response into a dict describing the action
    decoded = json.loads(orig_response)

    # The LLM gets confused here, so a `content` key is renamed to `contents`.
    if 'content' in decoded:
        misplaced = decoded.pop('content')
        decoded['contents'] = misplaced

    return action_from_dict(decoded)
|
||||
|
||||
|
||||
def parse_summary_response(response: str) -> list[dict]:
    """
    Extracts the summarized monologue from a model response

    Parameters:
    - response (str): The JSON response string; it must decode to an object with a `new_monologue` key

    Returns:
    - list[dict]: The value stored under `new_monologue`
    """
    return json.loads(response)['new_monologue']
|
||||
@@ -1,4 +1,4 @@
|
||||
from agenthub.monologue_agent.response_parser import MonologueResponseParser
|
||||
from agenthub.planner_agent.response_parser import PlannerResponseParser
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.events.action import Action, AgentFinishAction
|
||||
@@ -15,11 +15,10 @@ class PlannerAgent(Agent):
|
||||
The agent is given its previous action-observation pairs, current task, and hint based on last action taken at every step.
|
||||
"""
|
||||
runtime_tools: list[RuntimeTool] = [RuntimeTool.BROWSER]
|
||||
response_parser = MonologueResponseParser()
|
||||
response_parser = PlannerResponseParser()
|
||||
|
||||
def __init__(self, llm: LLM):
|
||||
"""
|
||||
Initialize the Planner Agent with an LLM
|
||||
"""Initialize the Planner Agent with an LLM
|
||||
|
||||
Parameters:
|
||||
- llm (LLM): The llm to be used by this agent
|
||||
@@ -27,8 +26,7 @@ class PlannerAgent(Agent):
|
||||
super().__init__(llm)
|
||||
|
||||
def step(self, state: State) -> Action:
|
||||
"""
|
||||
Checks to see if current step is completed, returns AgentFinishAction if True.
|
||||
"""Checks to see if current step is completed, returns AgentFinishAction if True.
|
||||
Otherwise, creates a plan prompt and sends to model for inference, returning the result as the next action.
|
||||
|
||||
Parameters:
|
||||
@@ -38,14 +36,13 @@ class PlannerAgent(Agent):
|
||||
- AgentFinishAction: If the last state was 'completed', 'verified', or 'abandoned'
|
||||
- Action: The next action to take based on llm response
|
||||
"""
|
||||
|
||||
if state.root_task.state in [
|
||||
'completed',
|
||||
'verified',
|
||||
'abandoned',
|
||||
]:
|
||||
return AgentFinishAction()
|
||||
prompt = get_prompt(state)
|
||||
prompt = get_prompt(state, self.llm.config.max_message_chars)
|
||||
messages = [{'content': prompt, 'role': 'user'}]
|
||||
resp = self.llm.completion(messages=messages)
|
||||
return self.response_parser.parse(resp)
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.schema import ActionType
|
||||
from opendevin.core.utils import json
|
||||
@@ -101,7 +100,6 @@ What is your next thought or action? Again, you must reply with JSON, and only w
|
||||
|
||||
def get_hint(latest_action_id: str) -> str:
|
||||
"""Returns action type hint based on given action_id"""
|
||||
|
||||
hints = {
|
||||
'': "You haven't taken any actions yet. Start by using `ls` to check out what files you're working with.",
|
||||
ActionType.RUN: 'You should think about the command you just ran, what output it gave, and how that affects your plan.',
|
||||
@@ -117,9 +115,9 @@ def get_hint(latest_action_id: str) -> str:
|
||||
return hints.get(latest_action_id, '')
|
||||
|
||||
|
||||
def get_prompt(state: State) -> str:
|
||||
"""
|
||||
Gets the prompt for the planner agent.
|
||||
def get_prompt(state: State, max_message_chars: int) -> str:
|
||||
"""Gets the prompt for the planner agent.
|
||||
|
||||
Formatted with the most recent action-observation pairs, current task, and hint based on last action
|
||||
|
||||
Parameters:
|
||||
@@ -128,10 +126,6 @@ def get_prompt(state: State) -> str:
|
||||
Returns:
|
||||
- str: The formatted string prompt with historical values
|
||||
"""
|
||||
max_message_chars = config.get_llm_config_from_agent(
|
||||
'PlannerAgent'
|
||||
).max_message_chars
|
||||
|
||||
# the plan
|
||||
plan_str = json.dumps(state.root_task.to_dict(), indent=2)
|
||||
|
||||
@@ -180,10 +174,10 @@ def get_prompt(state: State) -> str:
|
||||
|
||||
|
||||
def parse_response(response: str) -> Action:
|
||||
"""
|
||||
Parses the model output to find a valid action to take
|
||||
"""Parses the model output to find a valid action to take
|
||||
Parameters:
|
||||
- response (str): A response from the model that potentially contains an Action.
|
||||
|
||||
Returns:
|
||||
- Action: A valid next action to perform from model output
|
||||
"""
|
||||
|
||||
+2
-3
@@ -6,7 +6,7 @@ from opendevin.events.action import (
|
||||
from opendevin.events.serialization.action import action_from_dict
|
||||
|
||||
|
||||
class MonologueResponseParser(ResponseParser):
|
||||
class PlannerResponseParser(ResponseParser):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
@@ -19,8 +19,7 @@ class MonologueResponseParser(ResponseParser):
|
||||
return response['choices'][0]['message']['content']
|
||||
|
||||
def parse_action(self, action_str: str) -> Action:
|
||||
"""
|
||||
Parses a string to find an action within it
|
||||
"""Parses a string to find an action within it
|
||||
|
||||
Parameters:
|
||||
- response (str): The string to be parsed
|
||||
@@ -25,9 +25,6 @@ workspace_base = "./workspace"
|
||||
# Disable color in terminal output
|
||||
#disable_color = false
|
||||
|
||||
# Enable auto linting after editing
|
||||
#enable_auto_lint = false
|
||||
|
||||
# Enable saving and restoring the session when run from CLI
|
||||
#enable_cli_session = false
|
||||
|
||||
@@ -76,8 +73,6 @@ persist_sandbox = false
|
||||
# SSH port for the sandbox
|
||||
#ssh_port = 63710
|
||||
|
||||
# Use host network
|
||||
#use_host_network = false
|
||||
|
||||
# Name of the default agent
|
||||
#default_agent = "CodeActAgent"
|
||||
@@ -197,6 +192,12 @@ llm_config = 'gpt3'
|
||||
# Container image to use for the sandbox
|
||||
#container_image = "ghcr.io/opendevin/sandbox:main"
|
||||
|
||||
# Use host network
|
||||
#use_host_network = false
|
||||
|
||||
# Enable auto linting after editing
|
||||
#enable_auto_lint = false
|
||||
|
||||
#################################### Eval ####################################
|
||||
# Configuration for the evaluation, please refer to the specific evaluation
|
||||
# plugin for the available options
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
ARG OPEN_DEVIN_BUILD_VERSION=dev
|
||||
FROM node:21.7.2-bookworm-slim as frontend-builder
|
||||
FROM node:21.7.2-bookworm-slim AS frontend-builder
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -10,10 +10,10 @@ RUN npm ci
|
||||
COPY ./frontend ./
|
||||
RUN npm run make-i18n && npm run build
|
||||
|
||||
FROM python:3.12.3-slim as backend-builder
|
||||
FROM python:3.12.3-slim AS backend-builder
|
||||
|
||||
WORKDIR /app
|
||||
ENV PYTHONPATH '/app'
|
||||
ENV PYTHONPATH='/app'
|
||||
|
||||
ENV POETRY_NO_INTERACTION=1 \
|
||||
POETRY_VIRTUALENVS_IN_PROJECT=1 \
|
||||
@@ -26,9 +26,9 @@ RUN apt-get update -y \
|
||||
|
||||
COPY ./pyproject.toml ./poetry.lock ./
|
||||
RUN touch README.md
|
||||
RUN poetry install --without evaluation --no-root && rm -rf $POETRY_CACHE_DIR
|
||||
RUN export POETRY_CACHE_DIR && poetry install --without evaluation,llama-index --no-root && rm -rf $POETRY_CACHE_DIR
|
||||
|
||||
FROM python:3.12.3-slim as runtime
|
||||
FROM python:3.12.3-slim AS runtime
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
Executable → Regular
+6
-2
@@ -22,7 +22,9 @@ if [[ "$SANDBOX_USER_ID" -eq 0 ]]; then
|
||||
echo "Running OpenDevin as root"
|
||||
export RUN_AS_DEVIN=false
|
||||
mkdir -p /root/.cache/ms-playwright/
|
||||
mv /home/opendevin/.cache/ms-playwright/ /root/.cache/
|
||||
if [ -d "/home/opendevin/.cache/ms-playwright/" ]; then
|
||||
mv /home/opendevin/.cache/ms-playwright/ /root/.cache/
|
||||
fi
|
||||
"$@"
|
||||
else
|
||||
echo "Setting up enduser with id $SANDBOX_USER_ID"
|
||||
@@ -52,7 +54,9 @@ else
|
||||
|
||||
mkdir -p /home/enduser/.cache/huggingface/hub/
|
||||
mkdir -p /home/enduser/.cache/ms-playwright/
|
||||
mv /home/opendevin/.cache/ms-playwright/ /home/enduser/.cache/
|
||||
if [ -d "/home/opendevin/.cache/ms-playwright/" ]; then
|
||||
mv /home/opendevin/.cache/ms-playwright/ /home/enduser/.cache/
|
||||
fi
|
||||
|
||||
usermod -aG $DOCKER_SOCKET_GID enduser
|
||||
echo "Running as enduser"
|
||||
|
||||
+9
-1
@@ -27,11 +27,14 @@ echo "Tags: ${tags[@]}"
|
||||
|
||||
if [[ "$image_name" == "opendevin" ]]; then
|
||||
dir="./containers/app"
|
||||
elif [[ "$image_name" == "od_runtime" ]]; then
|
||||
dir="./containers/runtime"
|
||||
else
|
||||
dir="./containers/$image_name"
|
||||
fi
|
||||
|
||||
if [[ ! -f "$dir/Dockerfile" ]]; then
|
||||
if [[ (! -f "$dir/Dockerfile") && "$image_name" != "od_runtime" ]]; then
|
||||
# Allow runtime to be built without a Dockerfile
|
||||
echo "No Dockerfile found"
|
||||
exit 1
|
||||
fi
|
||||
@@ -46,6 +49,11 @@ if [[ -n "$org_name" ]]; then
|
||||
DOCKER_ORG="$org_name"
|
||||
fi
|
||||
|
||||
# If $DOCKER_IMAGE_TAG is set, add it to the tags
|
||||
if [[ -n "$DOCKER_IMAGE_TAG" ]]; then
|
||||
tags+=("$DOCKER_IMAGE_TAG")
|
||||
fi
|
||||
|
||||
DOCKER_REPOSITORY="$DOCKER_REGISTRY/$DOCKER_ORG/$DOCKER_IMAGE"
|
||||
DOCKER_REPOSITORY=${DOCKER_REPOSITORY,,} # lowercase
|
||||
echo "Repo: $DOCKER_REPOSITORY"
|
||||
|
||||
@@ -0,0 +1,11 @@
|
||||
# Dynamically constructed Dockerfile
|
||||
|
||||
This folder builds the runtime image (sandbox), which uses a `Dockerfile` that is dynamically generated based on the `base_image` AND a [Python source distribution](https://docs.python.org/3.10/distutils/sourcedist.html) that's based on the current commit of `opendevin`.
|
||||
|
||||
The following command will generate Dockerfile for `ubuntu:22.04` and the source distribution `.tar` into `containers/runtime`.
|
||||
|
||||
```bash
|
||||
poetry run python3 opendevin/runtime/utils/runtime_build.py \
|
||||
--base_image ubuntu:22.04 \
|
||||
--build_folder containers/runtime
|
||||
```
|
||||
@@ -0,0 +1,8 @@
|
||||
DOCKER_REGISTRY=ghcr.io
|
||||
DOCKER_ORG=opendevin
|
||||
DOCKER_BASE_DIR="./containers/runtime"
|
||||
# These two variables will be appended by the runtime_build.py script
|
||||
# DOCKER_IMAGE=
|
||||
# DOCKER_IMAGE_TAG=
|
||||
DOCKER_IMAGE=od_runtime
|
||||
DOCKER_IMAGE_TAG=od_v0.8.1_image_ubuntu_tag_22.04
|
||||
@@ -7,5 +7,3 @@ warn_unreachable = True
|
||||
warn_redundant_casts = True
|
||||
no_implicit_optional = True
|
||||
strict_optional = True
|
||||
|
||||
exclude = agenthub/monologue_agent/regression
|
||||
|
||||
@@ -1,7 +1,3 @@
|
||||
exclude = [
|
||||
"agenthub/monologue_agent/regression/",
|
||||
]
|
||||
|
||||
[lint]
|
||||
select = [
|
||||
"E",
|
||||
|
||||
@@ -61,42 +61,6 @@ _Exemple de CodeActAgent avec `gpt-4-turbo-2024-04-09` effectuant une tâche de
|
||||
[] Support de la navigation sur le web
|
||||
[] Compléter le workflow pour l'agent CodeAct afin de soumettre des PRs Github
|
||||
|
||||
## Agent Monologue
|
||||
|
||||
### Description
|
||||
|
||||
L'agent Monologue utilise la mémoire à long terme et à court terme pour accomplir des tâches.
|
||||
La mémoire à long terme est stockée en tant qu'objet LongTermMemory et le modèle l'utilise pour rechercher des exemples du passé.
|
||||
La mémoire à court terme est stockée en tant qu'objet Monologue et le modèle peut la condenser si nécessaire.
|
||||
|
||||
### Actions
|
||||
|
||||
`Action`,
|
||||
`NullAction`,
|
||||
`CmdRunAction`,
|
||||
`FileWriteAction`,
|
||||
`FileReadAction`,
|
||||
`BrowseURLAction`,
|
||||
`GithubPushAction`,
|
||||
`AgentThinkAction`
|
||||
|
||||
### Observations
|
||||
|
||||
`Observation`,
|
||||
`NullObservation`,
|
||||
`CmdOutputObservation`,
|
||||
`FileReadObservation`,
|
||||
`BrowserOutputObservation`
|
||||
|
||||
### Méthodes
|
||||
|
||||
| Méthode | Description |
|
||||
| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `__init__` | Initialise l'agent avec une mémoire à long terme et un monologue interne |
|
||||
| `_add_event` | Ajoute des événements au monologue de l'agent et condense avec un résumé automatiquement si le monologue est trop long |
|
||||
| `_initialize` | Utilise la liste `INITIAL_THOUGHTS` pour donner à l'agent un contexte pour ses capacités et comment naviguer dans le `/workspace` |
|
||||
| `step` | Modifie l'état actuel en ajoutant les actions et observations les plus récentes, puis invite le modèle à réfléchir à la prochaine action à entreprendre. |
|
||||
|
||||
## Agent Planificateur
|
||||
|
||||
### Description
|
||||
|
||||
@@ -93,7 +93,7 @@ Si vous souhaitez utiliser la version **(instable !)** la plus récente, vous po
|
||||
|
||||
Pour le workflow de développement, consultez [Development.md](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md).
|
||||
|
||||
Avez-vous des problèmes ? Consultez notre [Guide de dépannage](https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting).
|
||||
Avez-vous des problèmes ? Consultez notre [Guide de dépannage](https://docs.all-hands.dev/modules/usage/troubleshooting).
|
||||
|
||||
:::warning
|
||||
OpenDevin est actuellement en cours de développement, mais vous pouvez déjà exécuter la version alpha pour voir le système de bout en bout en action.
|
||||
|
||||
+1
-1
@@ -25,7 +25,7 @@ Si vous utilisez Windows et que vous rencontrez des problèmes, consultez notre
|
||||
### Symptômes
|
||||
|
||||
```bash
|
||||
Erreur lors de la création du contrôleur. Veuillez vérifier que Docker est en cours d'exécution et visitez `https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting` pour plus d'informations sur le débogage.
|
||||
Erreur lors de la création du contrôleur. Veuillez vérifier que Docker est en cours d'exécution et visitez `https://docs.all-hands.dev/modules/usage/troubleshooting` pour plus d'informations sur le débogage.
|
||||
```
|
||||
|
||||
```bash
|
||||
|
||||
@@ -61,42 +61,6 @@ _CodeActAgent使用`gpt-4-turbo-2024-04-09`执行数据科学任务(线性回
|
||||
[] 支持Web浏览
|
||||
[] 完成CodeAct agent提交Github PR的工作流程
|
||||
|
||||
## Monologue Agent
|
||||
|
||||
### 描述
|
||||
|
||||
Monologue Agent利用长短期记忆来完成任务。
|
||||
长期记忆存储为LongTermMemory对象,模型使用它来搜索过去的示例。
|
||||
短期记忆存储为Monologue对象,模型可以根据需要进行压缩。
|
||||
|
||||
### 动作
|
||||
|
||||
`Action`,
|
||||
`NullAction`,
|
||||
`CmdRunAction`,
|
||||
`FileWriteAction`,
|
||||
`FileReadAction`,
|
||||
`BrowseURLAction`,
|
||||
`GithubPushAction`,
|
||||
`AgentThinkAction`
|
||||
|
||||
### 观测
|
||||
|
||||
`Observation`,
|
||||
`NullObservation`,
|
||||
`CmdOutputObservation`,
|
||||
`FileReadObservation`,
|
||||
`BrowserOutputObservation`
|
||||
|
||||
### 方法
|
||||
|
||||
| 方法 | 描述 |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `__init__` | 使用长期记忆和内部独白初始化Agent |
|
||||
| `_add_event` | 将事件附加到Agent的独白中,如独白过长自动与摘要一起压缩 |
|
||||
| `_initialize` | 使用`INITIAL_THOUGHTS`列表为agent提供其能力的上下文以及如何导航`/workspace` |
|
||||
| `step` | 通过添加最近的动作和观测修改当前状态,然后提示模型考虑其接下来的动作。 |
|
||||
|
||||
## Planner Agent
|
||||
|
||||
### 描述
|
||||
|
||||
@@ -93,7 +93,7 @@ OpenDevin 只会访问这个工作区文件夹。它在一个安全的 docker
|
||||
|
||||
有关开发工作流程,请参阅 [Development.md](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md)。
|
||||
|
||||
遇到问题了吗?查看我们的 [故障排除指南](https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting)。
|
||||
遇到问题了吗?查看我们的 [故障排除指南](https://docs.all-hands.dev/modules/usage/troubleshooting)。
|
||||
|
||||
:::warning
|
||||
OpenDevin 目前正在开发中,但你已经可以运行 alpha 版本来查看端到端系统的运作情况。
|
||||
|
||||
+1
-1
@@ -23,7 +23,7 @@ sidebar_position: 5
|
||||
### 症状
|
||||
|
||||
```bash
|
||||
创建控制器时出错。请检查 Docker 是否正在运行,并访问 `https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting` 获取更多调试信息。
|
||||
创建控制器时出错。请检查 Docker 是否正在运行,并访问 `https://docs.all-hands.dev/modules/usage/troubleshooting` 获取更多调试信息。
|
||||
```
|
||||
|
||||
```bash
|
||||
|
||||
@@ -56,42 +56,6 @@ _Example of CodeActAgent with `gpt-4-turbo-2024-04-09` performing a data science
|
||||
| `__init__` | Initializes an agent with `llm` and a list of messages `list[Mapping[str, str]]` |
|
||||
| `step` | Performs one step using the CodeAct Agent. This includes gathering info on previous steps and prompting the model to make a command to execute. |
|
||||
|
||||
## Monologue Agent
|
||||
|
||||
### Description
|
||||
|
||||
The Monologue Agent utilizes long and short term memory to complete tasks.
|
||||
Long term memory is stored as a LongTermMemory object and the model uses it to search for examples from the past.
|
||||
Short term memory is stored as a Monologue object and the model can condense it as necessary.
|
||||
|
||||
### Actions
|
||||
|
||||
`Action`,
|
||||
`NullAction`,
|
||||
`CmdRunAction`,
|
||||
`FileWriteAction`,
|
||||
`FileReadAction`,
|
||||
`BrowseURLAction`,
|
||||
`GithubPushAction`,
|
||||
`AgentThinkAction`
|
||||
|
||||
### Observations
|
||||
|
||||
`Observation`,
|
||||
`NullObservation`,
|
||||
`CmdOutputObservation`,
|
||||
`FileReadObservation`,
|
||||
`BrowserOutputObservation`
|
||||
|
||||
### Methods
|
||||
|
||||
| Method | Description |
|
||||
| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `__init__` | Initializes the agent with a long term memory, and an internal monologue |
|
||||
| `_add_event` | Appends events to the monologue of the agent and condenses with summary automatically if the monologue is too long |
|
||||
| `_initialize` | Utilizes the `INITIAL_THOUGHTS` list to give the agent a context for its capabilities and how to navigate the `/workspace` |
|
||||
| `step` | Modifies the current state by adding the most recent actions and observations, then prompts the model to think about its next action to take. |
|
||||
|
||||
## Planner Agent
|
||||
|
||||
### Description
|
||||
|
||||
@@ -4,22 +4,35 @@ sidebar_position: 6
|
||||
|
||||
# 💿 How to Create and Use a Custom Docker Sandbox
|
||||
|
||||
The default OpenDevin sandbox comes with a [minimal ubuntu configuration](https://github.com/OpenDevin/OpenDevin/blob/main/containers/sandbox/Dockerfile).
|
||||
The default OpenDevin sandbox comes with a [minimal ubuntu configuration](https://github.com/OpenDevin/OpenDevin/blob/main/containers/sandbox/Dockerfile).
|
||||
|
||||
Your use case may need additional software installed by default.
|
||||
|
||||
There are two ways you can do so:
|
||||
|
||||
1. Use an existing image from docker hub. For instance, if you want to have `nodejs` installed, you can do so by using the `node:20` image
|
||||
2. Creating your own custom docker image and using it
|
||||
|
||||
If you want to take the first approach, you can skip the `Create Your Docker Image` section.
|
||||
|
||||
For a more feature-rich environment, you might consider using pre-built images like **[nikolaik/python-nodejs](https://hub.docker.com/r/nikolaik/python-nodejs)**, which comes with both Python and Node.js pre-installed, along with many other useful tools and libraries, like:
|
||||
|
||||
- Node.js: 22.x
|
||||
- npm: 10.x
|
||||
- yarn: stable
|
||||
- Python: latest
|
||||
- pip: latest
|
||||
- pipenv: latest
|
||||
- poetry: latest
|
||||
- uv: latest
|
||||
|
||||
## Setup
|
||||
|
||||
Make sure you are able to run OpenDevin using the [Development.md](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) first.
|
||||
|
||||
## Create Your Docker Image
|
||||
To create a custom docker image, it must be debian/ubuntu based.
|
||||
|
||||
To create a custom docker image, it must be debian/ubuntu based.
|
||||
|
||||
For example, if we want OpenDevin to have access to the `node` binary, we would use the following Dockerfile:
|
||||
|
||||
@@ -34,7 +47,7 @@ RUN apt-get update && apt-get install -y
|
||||
RUN apt-get install -y nodejs
|
||||
```
|
||||
|
||||
Next build your docker image with the name of your choice, for example `custom_image`.
|
||||
Next build your docker image with the name of your choice, for example `custom_image`.
|
||||
|
||||
To do this you can create a directory and put your file inside it with the name `Dockerfile`, and inside the directory run the following command:
|
||||
|
||||
@@ -50,7 +63,7 @@ This will produce a new image called ```custom_image``` that will be available i
|
||||
|
||||
## Specify your sandbox image in config.toml file
|
||||
|
||||
OpenDevin configuration occurs via the top-level `config.toml` file.
|
||||
OpenDevin configuration occurs via the top-level `config.toml` file.
|
||||
|
||||
Create a `config.toml` file in the OpenDevin directory and enter these contents:
|
||||
|
||||
@@ -63,6 +76,7 @@ sandbox_container_image="custom_image"
|
||||
```
|
||||
|
||||
For `sandbox_container_image`, you can specify either:
|
||||
|
||||
1. The name of your custom image that you built in the previous step (e.g., `”custom_image”`)
|
||||
2. A pre-existing image from Docker Hub (e.g., `”node:20”` if you want a sandbox with Node.js pre-installed)
|
||||
|
||||
@@ -79,7 +93,7 @@ Congratulations!
|
||||
|
||||
The relevant code is defined in [ssh_box.py](https://github.com/OpenDevin/OpenDevin/blob/main/opendevin/runtime/docker/ssh_box.py) and [image_agnostic_util.py](https://github.com/OpenDevin/OpenDevin/blob/main/opendevin/runtime/docker/image_agnostic_util.py).
|
||||
|
||||
In particular, ssh_box.py checks the config object for ```config.sandbox_container_image``` and then attempts to retrieve the image using [get_od_sandbox_image](https://github.com/OpenDevin/OpenDevin/blob/main/opendevin/runtime/docker/image_agnostic_util.py#L72) which is defined in image_agnostic_util.py.
|
||||
In particular, `ssh_box.py` checks the config object for ```config.sandbox_container_image``` and then attempts to retrieve the image using [get_od_sandbox_image](https://github.com/OpenDevin/OpenDevin/blob/main/opendevin/runtime/docker/image_agnostic_util.py#L72) which is defined in image_agnostic_util.py.
|
||||
|
||||
When first using a custom image, it will not be found and thus it will be built (on subsequent runs the built image will be found and returned).
|
||||
|
||||
@@ -109,6 +123,7 @@ dockerfile_content = (
|
||||
## Troubleshooting / Errors
|
||||
|
||||
### Error: ```useradd: UID 1000 is not unique```
|
||||
|
||||
If you see this error in the console output it is because OpenDevin is trying to create the opendevin user in the sandbox with a UID of 1000, however this UID is already being used in the image (for some reason). To fix this change the sandbox_user_id field in the config.toml file to a different value:
|
||||
|
||||
```toml
|
||||
@@ -122,7 +137,7 @@ sandbox_user_id="1001"
|
||||
|
||||
### Port use errors
|
||||
|
||||
If you see an error about a port being in use or unavailable, try deleting all running Docker Containers (run `docker ps` and `docker rm` relevant containers) and then re-running ```make run```
|
||||
If you see an error about a port being in use or unavailable, try deleting all running Docker Containers (run `docker ps` and `docker rm` relevant containers) and then re-running ```make run``` .
|
||||
|
||||
## Discuss
|
||||
|
||||
|
||||
@@ -15,13 +15,14 @@ OpenDevin will issue many prompts to the LLM you configure. Most of these LLMs c
|
||||
The `LLM_MODEL` environment variable controls which model is used in programmatic interactions.
|
||||
But when using the OpenDevin UI, you'll need to choose your model in the settings window.
|
||||
|
||||
The following environment variables might be necessary for some LLMs:
|
||||
The following environment variables might be necessary for some LLMs/providers:
|
||||
|
||||
- `LLM_API_KEY`
|
||||
- `LLM_BASE_URL`
|
||||
- `LLM_EMBEDDING_MODEL`
|
||||
- `LLM_EMBEDDING_DEPLOYMENT_NAME`
|
||||
- `LLM_API_VERSION`
|
||||
- `LLM_DROP_PARAMS`
|
||||
|
||||
We have a few guides for running OpenDevin with specific model providers:
|
||||
|
||||
|
||||
@@ -172,9 +172,9 @@ docker run \
|
||||
-it \
|
||||
--pull=always \
|
||||
-e SANDBOX_USER_ID=$(id -u) \
|
||||
-e LLM_MODEL="openai/lmstudio"
|
||||
-e LLM_MODEL="openai/lmstudio" \
|
||||
-e LLM_BASE_URL="http://host.docker.internal:1234/v1" \
|
||||
-e CUSTOM_LLM_PROVIDER="openai"
|
||||
-e CUSTOM_LLM_PROVIDER="openai" \
|
||||
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
|
||||
-v $WORKSPACE_BASE:/opt/workspace_base \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
|
||||
@@ -0,0 +1,302 @@
|
||||
---
|
||||
sidebar_position: 6
|
||||
---
|
||||
|
||||
# 💿 How to use OpenDevin in OpenShift/K8S
|
||||
|
||||
There are different ways and scenarios for doing this; we're just describing one example here:
|
||||
1. Create a PV "as a cluster admin" to map workspace_base data and docker directory to the pod through the worker node.
|
||||
2. Create a PVC to be able to mount those PVs to the POD
|
||||
3. Create a POD which contains two containers; the OpenDevin and Sandbox containers.
|
||||
|
||||
## Steps to follow the above example.
|
||||
|
||||
> Note: Make sure you are logged in to the cluster first with the proper account for each step. PV creation requires cluster administrator!
|
||||
|
||||
> Make sure you have read/write permissions on the hostPath used below (i.e. /tmp/workspace)
|
||||
|
||||
1. Create the PV:
|
||||
Sample yaml file below can be used by a cluster admin to create the PV.
|
||||
- workspace-pv.yaml
|
||||
|
||||
```yamlfile
|
||||
apiVersion: v1
|
||||
kind: PersistentVolume
|
||||
metadata:
|
||||
name: workspace-pv
|
||||
spec:
|
||||
capacity:
|
||||
storage: 2Gi
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
persistentVolumeReclaimPolicy: Retain
|
||||
hostPath:
|
||||
path: /tmp/workspace
|
||||
```
|
||||
|
||||
```bash
|
||||
# apply yaml file
|
||||
$ oc create -f workspace-pv.yaml
|
||||
persistentvolume/workspace-pv created
|
||||
|
||||
# review:
|
||||
$ oc get pv
|
||||
NAME CAPACITY ACCESS MODES RECLAIM POLICY STATUS CLAIM STORAGECLASS REASON AGE
|
||||
workspace-pv 2Gi RWO Retain Available 7m23s
|
||||
```
|
||||
|
||||
- docker-pv.yaml
|
||||
|
||||
```yamlfile
|
||||
apiVersion: v1
|
||||
kind: PersistentVolume
|
||||
metadata:
|
||||
name: docker-pv
|
||||
spec:
|
||||
capacity:
|
||||
storage: 2Gi
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
persistentVolumeReclaimPolicy: Retain
|
||||
hostPath:
|
||||
path: /var/run/docker.sock
|
||||
```
|
||||
|
||||
```bash
|
||||
# apply yaml file
|
||||
$ oc create -f docker-pv.yaml
|
||||
persistentvolume/docker-pv created
|
||||
|
||||
# review:
|
||||
oc get pv
|
||||
NAME CAPACITY ACCESS MODES RECLAIM POLICY STATUS CLAIM STORAGECLASS REASON AGE
|
||||
docker-pv 2Gi RWO Retain Available 6m55s
|
||||
workspace-pv 2Gi RWO Retain Available 7m23s
|
||||
```
|
||||
|
||||
2. Create the PVC:
|
||||
Sample PVC yaml file below:
|
||||
|
||||
- workspace-pvc.yaml
|
||||
|
||||
```yamlfile
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: workspace-pvc
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 1Gi
|
||||
```
|
||||
|
||||
```bash
|
||||
# create the pvc
|
||||
$ oc create -f workspace-pvc.yaml
|
||||
persistentvolumeclaim/workspace-pvc created
|
||||
|
||||
# review
|
||||
$ oc get pvc
|
||||
NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE
|
||||
workspace-pvc Pending hcloud-volumes 4s
|
||||
|
||||
$ oc get events
|
||||
LAST SEEN TYPE REASON OBJECT MESSAGE
|
||||
8s Normal WaitForFirstConsumer persistentvolumeclaim/workspace-pvc waiting for first consumer to be created before binding
|
||||
```
|
||||
|
||||
- docker-pvc.yaml
|
||||
|
||||
```yamlfile
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: docker-pvc
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 1Gi
|
||||
```
|
||||
|
||||
```bash
|
||||
# create pvc
|
||||
$ oc create -f docker-pvc.yaml
|
||||
persistentvolumeclaim/docker-pvc created
|
||||
|
||||
# review
|
||||
$ oc get pvc
|
||||
NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE
|
||||
docker-pvc Pending hcloud-volumes 4s
|
||||
workspace-pvc Pending hcloud-volumes 2m53s
|
||||
|
||||
$ oc get events
|
||||
LAST SEEN TYPE REASON OBJECT MESSAGE
|
||||
10s Normal WaitForFirstConsumer persistentvolumeclaim/docker-pvc waiting for first consumer to be created before binding
|
||||
10s Normal WaitForFirstConsumer persistentvolumeclaim/workspace-pvc waiting for first consumer to be created before binding
|
||||
```
|
||||
|
||||
3. Create the POD yaml file:
|
||||
Sample POD yaml file below:
|
||||
|
||||
- pod.yaml
|
||||
|
||||
```yamlfile
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: opendevin-app-2024
|
||||
labels:
|
||||
app: opendevin-app-2024
|
||||
spec:
|
||||
containers:
|
||||
- name: opendevin-app-2024
|
||||
image: ghcr.io/opendevin/opendevin:0.7.1
|
||||
env:
|
||||
- name: SANDBOX_USER_ID
|
||||
value: "1000"
|
||||
- name: SANDBOX_BOX_TYPE
|
||||
value: 'local'
|
||||
- name: WORKSPACE_MOUNT_PATH
|
||||
value: "/opt/workspace_base"
|
||||
volumeMounts:
|
||||
- name: workspace-volume
|
||||
mountPath: /opt/workspace_base
|
||||
- name: docker-sock
|
||||
mountPath: /var/run/docker.sock
|
||||
ports:
|
||||
- containerPort: 3000
|
||||
- name: opendevin-sandbox-2024
|
||||
image: ghcr.io/opendevin/sandbox:main
|
||||
ports:
|
||||
- containerPort: 51963
|
||||
command: ["/usr/sbin/sshd", "-D", "-p 51963", "-o", "PermitRootLogin=yes"]
|
||||
volumes:
|
||||
- name: workspace-volume
|
||||
persistentVolumeClaim:
|
||||
claimName: workspace-pvc
|
||||
- name: docker-sock
|
||||
persistentVolumeClaim:
|
||||
claimName: docker-pvc
|
||||
```
|
||||
|
||||
```bash
|
||||
# create the pod
|
||||
$ oc create -f pod.yaml
|
||||
W0716 11:22:07.776271 107626 warnings.go:70] would violate PodSecurity "restricted:v1.24": allowPrivilegeEscalation != false (containers "opendevin-app-2024", "opendevin-sandbox-2024" must set securityContext.allowPrivilegeEscalation=false), unrestricted capabilities (containers "opendevin-app-2024", "opendevin-sandbox-2024" must set securityContext.capabilities.drop=["ALL"]), runAsNonRoot != true (pod or containers "opendevin-app-2024", "opendevin-sandbox-2024" must set securityContext.runAsNonRoot=true), seccompProfile (pod or containers "opendevin-app-2024", "opendevin-sandbox-2024" must set securityContext.seccompProfile.type to "RuntimeDefault" or "Localhost")
|
||||
pod/opendevin-app-2024 created
|
||||
|
||||
# Above warning can be ignored for now as we will not modify SCC restrictions.
|
||||
|
||||
# review
|
||||
$ oc get pods
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
opendevin-app-2024 0/2 Pending 0 5s
|
||||
|
||||
$ oc get pods
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
opendevin-app-2024 0/2 ContainerCreating 0 15s
|
||||
|
||||
$ oc get events
|
||||
LAST SEEN TYPE REASON OBJECT MESSAGE
|
||||
38s Normal WaitForFirstConsumer persistentvolumeclaim/docker-pvc waiting for first consumer to be created before binding
|
||||
23s Normal ExternalProvisioning persistentvolumeclaim/docker-pvc waiting for a volume to be created, either by external provisioner "csi.hetzner.cloud" or manually created by system administrator
|
||||
27s Normal Provisioning persistentvolumeclaim/docker-pvc External provisioner is provisioning volume for claim "opendevin/docker-pvc"
|
||||
17s Normal ProvisioningSucceeded persistentvolumeclaim/docker-pvc Successfully provisioned volume pvc-2b1d223a-1c8f-4990-8e3d-68061a9ae252
|
||||
16s Normal Scheduled pod/opendevin-app-2024 Successfully assigned opendevin/opendevin-app-2024 to worker1.hub.internal.blakane.com
|
||||
9s Normal SuccessfulAttachVolume pod/opendevin-app-2024 AttachVolume.Attach succeeded for volume "pvc-2b1d223a-1c8f-4990-8e3d-68061a9ae252"
|
||||
9s Normal SuccessfulAttachVolume pod/opendevin-app-2024 AttachVolume.Attach succeeded for volume "pvc-31f15b25-faad-4665-a25f-201a530379af"
|
||||
6s Normal AddedInterface pod/opendevin-app-2024 Add eth0 [10.128.2.48/23] from openshift-sdn
|
||||
6s Normal Pulled pod/opendevin-app-2024 Container image "ghcr.io/opendevin/opendevin:0.7.1" already present on machine
|
||||
6s Normal Created pod/opendevin-app-2024 Created container opendevin-app-2024
|
||||
6s Normal Started pod/opendevin-app-2024 Started container opendevin-app-2024
|
||||
6s Normal Pulled pod/opendevin-app-2024 Container image "ghcr.io/opendevin/sandbox:main" already present on machine
|
||||
5s Normal Created pod/opendevin-app-2024 Created container opendevin-sandbox-2024
|
||||
5s Normal Started pod/opendevin-app-2024 Started container opendevin-sandbox-2024
|
||||
83s Normal WaitForFirstConsumer persistentvolumeclaim/workspace-pvc waiting for first consumer to be created before binding
|
||||
27s Normal Provisioning persistentvolumeclaim/workspace-pvc External provisioner is provisioning volume for claim "opendevin/workspace-pvc"
|
||||
17s Normal ProvisioningSucceeded persistentvolumeclaim/workspace-pvc Successfully provisioned volume pvc-31f15b25-faad-4665-a25f-201a530379af
|
||||
|
||||
$ oc get pods
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
opendevin-app-2024 2/2 Running 0 23s
|
||||
|
||||
$ oc get pvc
|
||||
NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE
|
||||
docker-pvc Bound pvc-2b1d223a-1c8f-4990-8e3d-68061a9ae252 10Gi RWO hcloud-volumes 10m
|
||||
workspace-pvc Bound pvc-31f15b25-faad-4665-a25f-201a530379af 10Gi RWO hcloud-volumes 13m
|
||||
|
||||
```
|
||||
|
||||
4. Create a NodePort service.
|
||||
Sample service creation command below:
|
||||
|
||||
```bash
|
||||
# create the service of type NodePort
|
||||
$ oc create svc nodeport opendevin-app-2024 --tcp=3000:3000
|
||||
service/opendevin-app-2024 created
|
||||
|
||||
# review
|
||||
|
||||
$ oc get svc
|
||||
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
|
||||
opendevin-app-2024 NodePort 172.30.225.42 <none> 3000:30495/TCP 4s
|
||||
|
||||
$ oc describe svc opendevin-app-2024
|
||||
Name: opendevin-app-2024
|
||||
Namespace: opendevin
|
||||
Labels: app=opendevin-app-2024
|
||||
Annotations: <none>
|
||||
Selector: app=opendevin-app-2024
|
||||
Type: NodePort
|
||||
IP Family Policy: SingleStack
|
||||
IP Families: IPv4
|
||||
IP: 172.30.225.42
|
||||
IPs: 172.30.225.42
|
||||
Port: 3000-3000 3000/TCP
|
||||
TargetPort: 3000/TCP
|
||||
NodePort: 3000-3000 30495/TCP
|
||||
Endpoints: 10.128.2.48:3000
|
||||
Session Affinity: None
|
||||
External Traffic Policy: Cluster
|
||||
Events: <none>
|
||||
```
|
||||
|
||||
5. Connect to the OpenDevin UI, configure the Agent, then test:
|
||||
|
||||

|
||||
|
||||
|
||||
## Challenges
|
||||
Some of the challenges that still need to be addressed:
|
||||
|
||||
1. Install GIT into the container:
|
||||
This can be resolved by building a custom image that includes Git and using that image during pod deployment.
|
||||
|
||||
Example below: "to be tested!"
|
||||
|
||||
```dockerfile
|
||||
FROM ghcr.io/opendevin/opendevin:0.7.1
|
||||
|
||||
# Install Git
|
||||
RUN apt-get update && apt-get install -y git
|
||||
|
||||
# Ensure /opt/workspace_base is writable
|
||||
RUN mkdir -p /opt/workspace_base && chown -R 1000:1000 /opt/workspace_base
|
||||
|
||||
# Verify Git installation
|
||||
RUN git --version
|
||||
```
|
||||
|
||||
2. Mount a shared development directory "i.e. one hosted in EC2 instance" to the POD:
|
||||
This can also be done by sharing the development directory with the worker node through file-sharing software (NFS), then creating a PV and PVC as described above to access that directory.
|
||||
|
||||
3. Not all Agents are working! Only CoderAgent has been tested so far (with an OpenAI API key), and it produced results.
|
||||
|
||||
|
||||
## Discuss
|
||||
|
||||
For other issues or questions join the [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA) or [Discord](https://discord.gg/ESHStjSjD4) and ask!
|
||||
@@ -34,7 +34,7 @@ If you're running on Windows and having trouble, check out our [guide for Window
|
||||
**Symptoms**
|
||||
|
||||
```bash
|
||||
Error creating controller. Please check Docker is running and visit `https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting` for more debugging information.
|
||||
Error creating controller. Please check Docker is running and visit `https://docs.all-hands.dev/modules/usage/troubleshooting` for more debugging information.
|
||||
```
|
||||
|
||||
```bash
|
||||
|
||||
Generated
+20
-26
@@ -17,13 +17,13 @@
|
||||
"react": "^18.3.1",
|
||||
"react-dom": "^18.3.1",
|
||||
"react-icons": "^5.2.1",
|
||||
"react-use": "^17.5.0"
|
||||
"react-use": "^17.5.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@docusaurus/module-type-aliases": "^3.4.0",
|
||||
"@docusaurus/tsconfig": "^3.4.0",
|
||||
"@docusaurus/types": "^3.4.0",
|
||||
"typescript": "~5.5.3"
|
||||
"typescript": "~5.5.4"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18.0"
|
||||
@@ -6150,11 +6150,6 @@
|
||||
"resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz",
|
||||
"integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw=="
|
||||
},
|
||||
"node_modules/fast-loops": {
|
||||
"version": "1.1.3",
|
||||
"resolved": "https://registry.npmjs.org/fast-loops/-/fast-loops-1.1.3.tgz",
|
||||
"integrity": "sha512-8EZzEP0eKkEEVX+drtd9mtuQ+/QrlfW/5MlwcwK5Nds6EkZ/tRzEexkzUY2mIssnAyVLT+TKHuRXmFNNXYUd6g=="
|
||||
},
|
||||
"node_modules/fast-shallow-equal": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/fast-shallow-equal/-/fast-shallow-equal-1.0.0.tgz",
|
||||
@@ -7408,9 +7403,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/hyphenate-style-name": {
|
||||
"version": "1.0.4",
|
||||
"resolved": "https://registry.npmjs.org/hyphenate-style-name/-/hyphenate-style-name-1.0.4.tgz",
|
||||
"integrity": "sha512-ygGZLjmXfPHj+ZWh6LwbC37l43MhfztxetbFCoYTM2VjkIUpeHgSNn7QIyVFj7YQ1Wl9Cbw5sholVJPzWvC2MQ=="
|
||||
"version": "1.1.0",
|
||||
"resolved": "https://registry.npmjs.org/hyphenate-style-name/-/hyphenate-style-name-1.1.0.tgz",
|
||||
"integrity": "sha512-WDC/ui2VVRrz3jOVi+XtjqkDjiVjTtFaAGiW37k6b+ohyQ5wYDOGkvCZa8+H0nx3gyvv0+BST9xuOgIyGQ00gw=="
|
||||
},
|
||||
"node_modules/iconv-lite": {
|
||||
"version": "0.4.24",
|
||||
@@ -7537,12 +7532,11 @@
|
||||
"integrity": "sha512-7NXolsK4CAS5+xvdj5OMMbI962hU/wvwoxk+LWR9Ek9bVtyuuYScDN6eS0rUm6TxApFpw7CX1o4uJzcd4AyD3Q=="
|
||||
},
|
||||
"node_modules/inline-style-prefixer": {
|
||||
"version": "7.0.0",
|
||||
"resolved": "https://registry.npmjs.org/inline-style-prefixer/-/inline-style-prefixer-7.0.0.tgz",
|
||||
"integrity": "sha512-I7GEdScunP1dQ6IM2mQWh6v0mOYdYmH3Bp31UecKdrcUgcURTcctSe1IECdUznSHKSmsHtjrT3CwCPI1pyxfUQ==",
|
||||
"version": "7.0.1",
|
||||
"resolved": "https://registry.npmjs.org/inline-style-prefixer/-/inline-style-prefixer-7.0.1.tgz",
|
||||
"integrity": "sha512-lhYo5qNTQp3EvSSp3sRvXMbVQTLrvGV6DycRMJ5dm2BLMiJ30wpXKdDdgX+GmJZ5uQMucwRKHamXSst3Sj/Giw==",
|
||||
"dependencies": {
|
||||
"css-in-js-utils": "^3.1.0",
|
||||
"fast-loops": "^1.1.3"
|
||||
"css-in-js-utils": "^3.1.0"
|
||||
}
|
||||
},
|
||||
"node_modules/interpret": {
|
||||
@@ -10408,15 +10402,15 @@
|
||||
}
|
||||
},
|
||||
"node_modules/nano-css": {
|
||||
"version": "5.6.1",
|
||||
"resolved": "https://registry.npmjs.org/nano-css/-/nano-css-5.6.1.tgz",
|
||||
"integrity": "sha512-T2Mhc//CepkTa3X4pUhKgbEheJHYAxD0VptuqFhDbGMUWVV2m+lkNiW/Ieuj35wrfC8Zm0l7HvssQh7zcEttSw==",
|
||||
"version": "5.6.2",
|
||||
"resolved": "https://registry.npmjs.org/nano-css/-/nano-css-5.6.2.tgz",
|
||||
"integrity": "sha512-+6bHaC8dSDGALM1HJjOHVXpuastdu2xFoZlC77Jh4cg+33Zcgm+Gxd+1xsnpZK14eyHObSp82+ll5y3SX75liw==",
|
||||
"dependencies": {
|
||||
"@jridgewell/sourcemap-codec": "^1.4.15",
|
||||
"css-tree": "^1.1.2",
|
||||
"csstype": "^3.1.2",
|
||||
"fastest-stable-stringify": "^2.0.2",
|
||||
"inline-style-prefixer": "^7.0.0",
|
||||
"inline-style-prefixer": "^7.0.1",
|
||||
"rtl-css-js": "^1.16.1",
|
||||
"stacktrace-js": "^2.0.2",
|
||||
"stylis": "^4.3.0"
|
||||
@@ -12069,9 +12063,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/react-use": {
|
||||
"version": "17.5.0",
|
||||
"resolved": "https://registry.npmjs.org/react-use/-/react-use-17.5.0.tgz",
|
||||
"integrity": "sha512-PbfwSPMwp/hoL847rLnm/qkjg3sTRCvn6YhUZiHaUa3FA6/aNoFX79ul5Xt70O1rK+9GxSVqkY0eTwMdsR/bWg==",
|
||||
"version": "17.5.1",
|
||||
"resolved": "https://registry.npmjs.org/react-use/-/react-use-17.5.1.tgz",
|
||||
"integrity": "sha512-LG/uPEVRflLWMwi3j/sZqR00nF6JGqTTDblkXK2nzXsIvij06hXl1V/MZIlwj1OKIQUtlh1l9jK8gLsRyCQxMg==",
|
||||
"dependencies": {
|
||||
"@types/js-cookie": "^2.2.6",
|
||||
"@xobotyi/scrollbar-width": "^1.9.5",
|
||||
@@ -12079,7 +12073,7 @@
|
||||
"fast-deep-equal": "^3.1.3",
|
||||
"fast-shallow-equal": "^1.0.0",
|
||||
"js-cookie": "^2.2.1",
|
||||
"nano-css": "^5.6.1",
|
||||
"nano-css": "^5.6.2",
|
||||
"react-universal-interface": "^0.6.2",
|
||||
"resize-observer-polyfill": "^1.5.1",
|
||||
"screenfull": "^5.1.0",
|
||||
@@ -13766,9 +13760,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/typescript": {
|
||||
"version": "5.5.3",
|
||||
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.3.tgz",
|
||||
"integrity": "sha512-/hreyEujaB0w76zKo6717l3L0o/qEUtRgdvUBvlkhoWeOVMjMuHNHk0BRBzikzuGDqNmPQbg5ifMEqsHLiIUcQ==",
|
||||
"version": "5.5.4",
|
||||
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.4.tgz",
|
||||
"integrity": "sha512-Mtq29sKDAEYP7aljRgtPOpTvOfbwRWlS6dPRzwjdE+C0R4brX/GUyhHSecbHMFLNBLcJIPt9nl9yG5TZ1weH+Q==",
|
||||
"bin": {
|
||||
"tsc": "bin/tsc",
|
||||
"tsserver": "bin/tsserver"
|
||||
|
||||
+2
-2
@@ -24,13 +24,13 @@
|
||||
"react": "^18.3.1",
|
||||
"react-dom": "^18.3.1",
|
||||
"react-icons": "^5.2.1",
|
||||
"react-use": "^17.5.0"
|
||||
"react-use": "^17.5.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@docusaurus/module-type-aliases": "^3.4.0",
|
||||
"@docusaurus/tsconfig": "^3.4.0",
|
||||
"@docusaurus/types": "^3.4.0",
|
||||
"typescript": "~5.5.3"
|
||||
"typescript": "~5.5.4"
|
||||
},
|
||||
"browserslist": {
|
||||
"production": [
|
||||
|
||||
@@ -11,7 +11,6 @@ from evaluation.EDA.game import Q20Game, Q20GameCelebrity
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
make_metadata,
|
||||
monologue_user_response,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
@@ -19,12 +18,14 @@ from opendevin.controller.agent import Agent
|
||||
|
||||
# from evaluation.EDA.scorer import question_scorer
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
config = load_app_config()
|
||||
|
||||
game = None
|
||||
|
||||
|
||||
@@ -48,7 +49,6 @@ def codeact_user_response_eda(state: State) -> str:
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response_eda,
|
||||
'MonologueAgent': monologue_user_response,
|
||||
}
|
||||
|
||||
AGENT_CLS_TO_INST_SUFFIX = {
|
||||
@@ -62,7 +62,7 @@ def process_instance(
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
|
||||
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
|
||||
eval_output_dir = metadata.eval_output_dir
|
||||
if reset_logger:
|
||||
@@ -124,6 +124,7 @@ def process_instance(
|
||||
agent,
|
||||
instruction,
|
||||
max_iterations=metadata.max_iterations,
|
||||
max_budget_per_task=config.max_budget_per_task,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
|
||||
agent.__class__.__name__
|
||||
],
|
||||
|
||||
@@ -33,13 +33,15 @@ workspace_mount_path = "/path/to/your/workspace"
|
||||
|
||||
ssh_hostname = "localhost"
|
||||
|
||||
run_as_devin = false
|
||||
|
||||
[sandbox]
|
||||
# SWEBench eval specific - but you can tweak it to your needs
|
||||
use_host_network = false
|
||||
run_as_devin = false
|
||||
# linting python after editing helps LLM fix indentations
|
||||
enable_auto_lint = true
|
||||
|
||||
[sandbox]
|
||||
|
||||
box_type = "ssh"
|
||||
timeout = 120
|
||||
|
||||
|
||||
@@ -20,12 +20,12 @@ workspace_mount_path = "/path/to/workspace"
|
||||
|
||||
ssh_hostname = "localhost"
|
||||
|
||||
use_host_network = false
|
||||
# AgentBench specific
|
||||
run_as_devin = true
|
||||
enable_auto_lint = true
|
||||
|
||||
[sandbox]
|
||||
use_host_network = false
|
||||
enable_auto_lint = true
|
||||
box_type = "ssh"
|
||||
timeout = 120
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ from evaluation.utils.shared import (
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.config import get_llm_config_arg, load_app_config, parse_arguments
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
@@ -30,6 +30,8 @@ from opendevin.events.action import CmdRunAction, MessageAction
|
||||
from opendevin.llm.llm import LLM
|
||||
from opendevin.runtime.docker.ssh_box import DockerSSHBox
|
||||
|
||||
config = load_app_config()
|
||||
|
||||
|
||||
def process_instance(
|
||||
instance: pd.Series,
|
||||
@@ -37,7 +39,7 @@ def process_instance(
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
|
||||
|
||||
inst_id = instance.instance_id
|
||||
question = instance.description
|
||||
@@ -97,7 +99,14 @@ def process_instance(
|
||||
# create sandbox and run the agent
|
||||
# =============================================
|
||||
|
||||
sandbox = DockerSSHBox()
|
||||
sandbox = DockerSSHBox(
|
||||
config=config.sandbox,
|
||||
persist_sandbox=False,
|
||||
workspace_mount_path=config.workspace_mount_path,
|
||||
sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
|
||||
cache_dir=config.cache_dir,
|
||||
run_as_devin=config.run_as_devin,
|
||||
)
|
||||
sandbox.execute(f'cd {inst_id}')
|
||||
|
||||
init_cmd = instance.init
|
||||
@@ -116,6 +125,7 @@ def process_instance(
|
||||
agent,
|
||||
instruction,
|
||||
max_iterations=metadata.max_iterations,
|
||||
max_budget_per_task=config.max_budget_per_task,
|
||||
fake_user_response_fn=FAKE_RESPONSES[agent.__class__.__name__],
|
||||
sandbox=sandbox,
|
||||
sid=inst_id,
|
||||
|
||||
@@ -7,7 +7,7 @@ from dataclasses import dataclass
|
||||
|
||||
from datasets import load_dataset
|
||||
|
||||
from opendevin.core.config import config
|
||||
from opendevin.core.config import load_app_config
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.runtime.docker.ssh_box import DockerSSHBox
|
||||
from opendevin.runtime.plugins import (
|
||||
@@ -16,6 +16,8 @@ from opendevin.runtime.plugins import (
|
||||
SWEAgentCommandsRequirement,
|
||||
)
|
||||
|
||||
config = load_app_config()
|
||||
|
||||
BIOCODER_BENCH_CONTAINER_IMAGE = 'public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0'
|
||||
|
||||
|
||||
@@ -217,7 +219,7 @@ class BiocoderSSHBox(DockerSSHBox):
|
||||
config.workspace_mount_path = workspace_base
|
||||
|
||||
# linting python after editing helps LLM fix indentations
|
||||
config.enable_auto_lint = True
|
||||
config.sandbox.enable_auto_lint = True
|
||||
|
||||
# create folder for transferring files back/forth
|
||||
biocoder_cache_folder = 'biocoder_cache'
|
||||
@@ -268,7 +270,7 @@ class BiocoderSSHBox(DockerSSHBox):
|
||||
f.write(json.dumps(testcase_json, indent=4))
|
||||
|
||||
# linting python after editing helps LLM fix indentations
|
||||
config.enable_auto_lint = True
|
||||
config.sandbox.enable_auto_lint = True
|
||||
|
||||
sandbox = cls(
|
||||
container_image=BIOCODER_BENCH_CONTAINER_IMAGE,
|
||||
|
||||
@@ -13,23 +13,23 @@ from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
codeact_user_response,
|
||||
make_metadata,
|
||||
monologue_user_response,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.config import get_llm_config_arg, load_app_config, parse_arguments
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
config = load_app_config()
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': partial(
|
||||
codeact_user_response, encapsulate_solution=True, try_parse=None
|
||||
),
|
||||
'MonologueAgent': monologue_user_response,
|
||||
}
|
||||
|
||||
AGENT_CLS_TO_INST_SUFFIX = {
|
||||
@@ -87,7 +87,7 @@ def process_instance(
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
|
||||
instance = BiocoderData(**instance)
|
||||
print(instance)
|
||||
workspace_dir_name = (
|
||||
@@ -171,6 +171,7 @@ def process_instance(
|
||||
agent,
|
||||
instruction,
|
||||
max_iterations=metadata.max_iterations,
|
||||
max_budget_per_task=config.max_budget_per_task,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
|
||||
agent.__class__.__name__
|
||||
],
|
||||
|
||||
@@ -18,6 +18,8 @@ Add the following configurations:
|
||||
max_iterations = 100
|
||||
cache_dir = "/tmp/cache"
|
||||
ssh_hostname = "localhost"
|
||||
|
||||
[sandbox]
|
||||
enable_auto_lint = true
|
||||
|
||||
# TODO: Change these to the model you want to evaluate
|
||||
|
||||
@@ -21,13 +21,15 @@ from evaluation.utils.shared import (
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.config import get_llm_config_arg, load_app_config, parse_arguments
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.action import MessageAction
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
config = load_app_config()
|
||||
|
||||
|
||||
def codeact_user_response(state: State) -> str:
|
||||
msg = (
|
||||
@@ -51,13 +53,8 @@ def codeact_user_response(state: State) -> str:
|
||||
return msg
|
||||
|
||||
|
||||
def monologue_user_response(state: State) -> str:
|
||||
raise NotImplementedError('MonologueAgent should never ask for user responses.')
|
||||
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
'MonologueAgent': monologue_user_response,
|
||||
}
|
||||
|
||||
AGENT_CLS_TO_INST_SUFFIX = {
|
||||
@@ -66,9 +63,7 @@ AGENT_CLS_TO_INST_SUFFIX = {
|
||||
|
||||
|
||||
def execute_sql(db_path, gen_sql, gold_sql):
|
||||
"""
|
||||
Execute the generated SQL and the ground truth SQL and compare the results.
|
||||
"""
|
||||
"""Execute the generated SQL and the ground truth SQL and compare the results."""
|
||||
with sqlite3.connect(db_path) as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(gen_sql)
|
||||
@@ -128,7 +123,7 @@ def process_instance(
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
|
||||
workspace_mount_path = os.path.join(
|
||||
config.workspace_mount_path, 'bird_eval_workspace'
|
||||
)
|
||||
@@ -220,6 +215,7 @@ def process_instance(
|
||||
agent,
|
||||
instruction,
|
||||
max_iterations=metadata.max_iterations,
|
||||
max_budget_per_task=config.max_budget_per_task,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
|
||||
agent.__class__.__name__
|
||||
],
|
||||
@@ -255,18 +251,14 @@ def process_instance(
|
||||
|
||||
|
||||
def load_bird():
|
||||
"""
|
||||
Main function to handle the flow of downloading, processing, and loading the bird dataset.
|
||||
"""
|
||||
"""Main function to handle the flow of downloading, processing, and loading the bird dataset."""
|
||||
raw_dataset_path = download_bird()
|
||||
bird_dataset = process_bird(raw_dataset_path)
|
||||
return bird_dataset
|
||||
|
||||
|
||||
def download_bird():
|
||||
"""
|
||||
Downloads and extracts the bird dataset from a specified URL into a local directory.
|
||||
"""
|
||||
"""Downloads and extracts the bird dataset from a specified URL into a local directory."""
|
||||
dataset_path = os.path.join(config.workspace_base, 'evaluation_bird')
|
||||
devset_path = os.path.join(dataset_path, 'dev')
|
||||
if not os.path.exists(dataset_path):
|
||||
@@ -292,9 +284,7 @@ def download_bird():
|
||||
|
||||
|
||||
def process_bird(dataset_path):
|
||||
"""
|
||||
Processes the raw bird dataset into a structured format and saves it as JSON.
|
||||
"""
|
||||
"""Processes the raw bird dataset into a structured format and saves it as JSON."""
|
||||
processed_path = os.path.join(dataset_path, 'processed_dev.json')
|
||||
if not os.path.exists(processed_path):
|
||||
logger.info(f'{processed_path} folder does not exist, starting processing...')
|
||||
@@ -325,9 +315,7 @@ def process_bird(dataset_path):
|
||||
|
||||
|
||||
def extract_create_table_prompt(db_path, limit_value=0):
|
||||
"""
|
||||
Generates a SQL prompt with CREATE TABLE statements and sample data from the database.
|
||||
"""
|
||||
"""Generates a SQL prompt with CREATE TABLE statements and sample data from the database."""
|
||||
table_query = "SELECT * FROM sqlite_master WHERE type='table';"
|
||||
tables = sqlite3.connect(db_path).cursor().execute(table_query).fetchall()
|
||||
prompt = ''
|
||||
@@ -367,9 +355,7 @@ def extract_create_table_prompt(db_path, limit_value=0):
|
||||
|
||||
|
||||
def create_prompt(e, database_path):
|
||||
"""
|
||||
Create a prompt for the given example
|
||||
"""
|
||||
"""Create a prompt for the given example"""
|
||||
db_id = e['db_id']
|
||||
db_path = pathlib.Path(database_path) / db_id / f'{db_id}.sqlite'
|
||||
|
||||
|
||||
@@ -0,0 +1,51 @@
|
||||
# Browsing Delegation Evaluation
|
||||
|
||||
Some of OpenDevin's agents support the agent delegation action; for example, CodeActAgent can delegate browsing tasks to BrowsingAgent.
|
||||
|
||||
This evaluation tests whether CodeActAgent can correctly delegate instructions from the WebArena and MiniWob benchmarks to the BrowsingAgent.
|
||||
If so, the browsing performance upper-bound of CodeActAgent will be the performance of BrowsingAgent.
|
||||
|
||||
|
||||
## Setup Environment
|
||||
|
||||
Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
|
||||
|
||||
## Configure OpenDevin and your LLM
|
||||
|
||||
Create a `config.toml` file if it does not exist at the root of the workspace.
|
||||
|
||||
Add the following configurations:
|
||||
|
||||
```toml
|
||||
# TODO: Change these to the model you want to evaluate
|
||||
[llm.eval_gpt4_1106_preview_llm]
|
||||
model = "gpt-4-1106-preview"
|
||||
api_key = "XXX"
|
||||
temperature = 0.0
|
||||
|
||||
[llm.eval_some_openai_compatible_model_llm]
|
||||
model = "openai/MODEL_NAME"
|
||||
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
|
||||
api_key = "XXX"
|
||||
temperature = 0.0
|
||||
```
|
||||
|
||||
## Run Inference
|
||||
|
||||
```bash
|
||||
./evaluation/browsing_delegation/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
|
||||
# e.g., ./evaluation/browsing_delegation/scripts/run_infer.sh llm.eval_gpt4_1106_preview_llm HEAD CodeActAgent 300
|
||||
```
|
||||
|
||||
where `model_config` is mandatory, while `agent` and `eval_limit` are optional.
|
||||
|
||||
`model_config`, e.g. `llm.eval_gpt4_1106_preview_llm`, is the config group name for your
|
||||
LLM settings, as defined in your `config.toml`.
|
||||
|
||||
`git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
|
||||
like to evaluate. It could also be a release tag like `0.6.2`.
|
||||
|
||||
`agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
|
||||
to `CodeActAgent`.
|
||||
|
||||
`eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances.
|
||||
@@ -0,0 +1,167 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
|
||||
import nltk
|
||||
import pandas as pd
|
||||
from datasets import load_dataset
|
||||
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
make_metadata,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import get_llm_config_arg, load_app_config, parse_arguments
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
config = load_app_config()
|
||||
|
||||
# Only CodeActAgent can delegate to BrowsingAgent
|
||||
SUPPORTED_AGENT_CLS = {'CodeActAgent'}
|
||||
|
||||
|
||||
def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
):
    """Run one browsing-delegation instance and score the delegated query.

    The agent is asked to forward ``instance.instruction`` verbatim to a
    browser agent. After the run, the last ``delegate`` action found in the
    history is compared against the original instruction.

    Args:
        instance: Dataset row; ``instance_id`` and ``instruction`` are read.
        metadata: Evaluation metadata (agent class, LLM config, output dir,
            iteration limit).
        reset_logger: When true, route this instance's logs to a dedicated
            file under ``<eval_output_dir>/logs`` so parallel workers do not
            interleave their output.

    Returns:
        A dict with the instance id, instruction, metadata dump, history
        pairs, metrics, last error, and a ``test_result`` entry holding the
        query, the last delegate action, and the edit-distance / exact-match
        result (empty when no delegate action occurred).

    Raises:
        ValueError: If the controller returns no state.
    """
    # Create the agent
    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
    env_id = instance.instance_id

    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        log_file = os.path.join(
            metadata.eval_output_dir, 'logs', f'instance_{env_id}.log'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        # add back the console handler to print ONE line
        logger.addHandler(get_console_handler())
        logger.info(
            f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
        )
        # Remove all existing handlers again so everything after the hint
        # goes only to the per-instance log file.
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(file_handler)
    else:
        logger.info(f'Starting evaluation for instance {env_id}.')

    instruction = (
        'You can delegate browsing tasks to a browser agent. '
        "For example, for query 'Who is the president of the United States?', you can delegate the task to a browser agent via <execute_browse> Who is the president of the United States? </execute_browse>.\n"
        f'Now, solve the following query: "{instance.instruction}"\n'
        'NOTE: You should copy the "query" as is into the <execute_browse> tag. DO NOT change ANYTHING in the query.'
    )

    state: State | None = asyncio.run(
        run_agent_controller(
            agent,
            instruction,
            max_iterations=metadata.max_iterations,
            max_budget_per_task=config.max_budget_per_task,
            sid=env_id,
        )
    )

    # ======= Attempt to evaluate the agent's environment impact =======

    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.

    if state is None:
        raise ValueError('State should not be None.')

    metrics = state.metrics.get() if state.metrics else None
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
    histories = state.history.compatibility_for_eval_history_pairs()

    # find the last delegate action; later delegate actions overwrite earlier ones
    last_delegate_action = None
    result = {}
    for action, _ in histories:
        if action['action'] != 'delegate':
            continue
        last_delegate_action = action
        delegate_task = action['args']['inputs']['task']
        # parse `browse_actions` from the delegate task, which is built as
        # task = f'{thought}. I should start with: {browse_actions}'
        marker_match = re.search(r'I should start with: (.*)', delegate_task)
        # If the marker is missing, score against the raw task text instead
        # of crashing on `None.group(1)`.
        instruction_for_delegate = (
            marker_match.group(1) if marker_match else delegate_task
        )

        # calculate the edit distance between the instance.instruction and the instruction_for_delegate
        result['edit_distance'] = nltk.edit_distance(
            instance.instruction, instruction_for_delegate
        )
        result['is_exact_match'] = (
            instance.instruction.strip() == instruction_for_delegate.strip()
        )

    # Save the output
    output = {
        'instance_id': env_id,
        'instruction': instruction,
        'metadata': metadata.model_dump(),
        'history': histories,
        'metrics': metrics,
        'error': state.last_error if state.last_error else None,
        'test_result': {
            'query': instance.instruction,
            'action': last_delegate_action,
            'result': result,
        },
    }

    return output
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: load the delegation dataset, build the evaluation
    # metadata and run `process_instance` over the selected instances.
    args = parse_arguments()

    dataset = load_dataset('OpenDevin/eval-browsing-instructions')
    dataset = dataset['train'].to_pandas()

    # Validate the schema explicitly: a bare `assert` is stripped when Python
    # runs with -O, which would let a changed dataset layout slip through.
    expected_columns = ['instance_id', 'instruction']
    actual_columns = dataset.columns.tolist()
    if actual_columns != expected_columns:
        raise ValueError(
            f'Unexpected dataset columns {actual_columns}; expected {expected_columns}.'
        )

    id_column = 'instance_id'
    # Fall back to the default LLM config when no --llm-config is given.
    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
    logger.info(f'Config for evaluation: {config}')

    metadata = make_metadata(
        llm_config,
        'browsing_delegation',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )
    # Only agents listed in SUPPORTED_AGENT_CLS can be evaluated here.
    if metadata.agent_class not in SUPPORTED_AGENT_CLS:
        raise ValueError(
            f'Agent class {metadata.agent_class} not supported with AgentDelegation.'
        )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
    run_evaluation(
        instances,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
        id_column,
    )
|
||||
+45
@@ -0,0 +1,45 @@
|
||||
#!/bin/bash
# Run the browsing-delegation evaluation.
#
# Usage: run_infer.sh [model_config] [git-version] [agent] [eval_limit] [num_workers]
#   model_config  mandatory; LLM config group name from config.toml
#   git-version   commit hash or tag to evaluate (read by checkout_eval_branch)
#   agent         defaults to CodeActAgent
#   eval_limit    optional; only evaluate the first N instances
#   num_workers   defaults to 1
set -eo pipefail

source "evaluation/utils/version_control.sh"

MODEL_CONFIG=$1
COMMIT_HASH=$2
AGENT=$3
EVAL_LIMIT=$4
NUM_WORKERS=$5

# Fail early, before any branch checkout, when the mandatory argument is missing.
if [ -z "$MODEL_CONFIG" ]; then
  echo "Usage: $0 [model_config] [git-version] [agent] [eval_limit] [num_workers]" >&2
  exit 1
fi

if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1
  echo "Number of workers not specified, use default $NUM_WORKERS"
fi
checkout_eval_branch

if [ -z "$AGENT" ]; then
  echo "Agent not specified, use default CodeActAgent"
  AGENT="CodeActAgent"
fi

get_agent_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

EVAL_NOTE="$AGENT_VERSION"

# Build the command as an array so every argument is passed through verbatim;
# a flat string run through `eval` is re-split and re-parsed by the shell.
COMMAND=(
  poetry run python evaluation/browsing_delegation/run_infer.py
  --agent-cls "$AGENT"
  --llm-config "$MODEL_CONFIG"
  --max-iterations 1
  --max-chars 10000000
  --eval-num-workers "$NUM_WORKERS"
  --eval-note "$EVAL_NOTE"
)

if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
  COMMAND+=(--eval-n-limit "$EVAL_LIMIT")
fi

# Run the command
"${COMMAND[@]}"
|
||||
@@ -10,45 +10,31 @@ import huggingface_hub
|
||||
import pandas as pd
|
||||
from datasets import load_dataset
|
||||
|
||||
from agenthub.gptswarm_agent.gptswarm_agent import GPTSwarm
|
||||
from evaluation.gaia.scorer import question_scorer
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
codeact_user_response,
|
||||
make_metadata,
|
||||
monologue_user_response,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.action import CmdRunAction, MessageAction
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
config = load_app_config()
|
||||
|
||||
DATASET_CACHE_DIR = '~/.cache/open-devin/evals/gaia'
|
||||
DATASET_CACHE_DIR = os.path.expanduser(DATASET_CACHE_DIR)
|
||||
|
||||
HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN') or (
|
||||
open(os.path.expanduser('~/.huggingface/token')).read().strip()
|
||||
if os.path.exists(os.path.expanduser('~/.huggingface/token'))
|
||||
else input('Please enter your Hugging Face token: ').strip()
|
||||
)
|
||||
|
||||
|
||||
def gptswarm_user_response(state: State) -> str:
|
||||
# NOTE: For the AI assistant, state-based design may introduce more uncertainties.
|
||||
# TODO: It is stateless now. Find a way to make it stateful.
|
||||
print('Not implemented.')
|
||||
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': partial(codeact_user_response, encapsulate_solution=True),
|
||||
'MonologueAgent': monologue_user_response,
|
||||
'GPTSwarmAgent': gptswarm_user_response,
|
||||
}
|
||||
|
||||
AGENT_CLS_TO_INST_SUFFIX = {
|
||||
@@ -60,10 +46,9 @@ def process_instance(
|
||||
instance: pd.Series,
|
||||
metadata: EvalMetadata,
|
||||
reset_logger: bool = True,
|
||||
single_agent: bool = False,
|
||||
):
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
|
||||
# create process-specific workspace dir
|
||||
# we will create a workspace directory for EACH process
|
||||
# so that different agent don't interfere with each other.
|
||||
@@ -136,6 +121,7 @@ def process_instance(
|
||||
agent,
|
||||
instruction,
|
||||
max_iterations=metadata.max_iterations,
|
||||
max_budget_per_task=config.max_budget_per_task,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
|
||||
agent.__class__.__name__
|
||||
],
|
||||
@@ -185,102 +171,17 @@ def process_instance(
|
||||
# remove when it becomes unnecessary
|
||||
histories = state.history.compatibility_for_eval_history_pairs()
|
||||
|
||||
# Prepare instruction
|
||||
instruction = f"{instance['Question']}\n"
|
||||
logger.info(f'Instruction: {instruction}')
|
||||
if dest_file:
|
||||
instruction += f"\n\nThe mentioned file is provided in the workspace at: {dest_file.split('/')[-1]}"
|
||||
|
||||
# TODO: Improve this further for the new V1.1 version and drop the if-else.
|
||||
if agent.__class__.__name__ == 'GPTSwarmAgent':
|
||||
if dest_file:
|
||||
inputs = [{'task': instruction, 'files': [dest_file]}]
|
||||
else:
|
||||
inputs = [{'task': instruction}]
|
||||
|
||||
model_name = metadata['model_name']
|
||||
gptswarm_agent = GPTSwarm(llm=LLM(), model_name=model_name)
|
||||
if single_agent:
|
||||
model_answer_raw = asyncio.run(gptswarm_agent.run(inputs))
|
||||
else:
|
||||
model_answer_raw = asyncio.run(gptswarm_agent.swarm_run(inputs))
|
||||
|
||||
model_answer = model_answer_raw[-1].split('FINAL ANSWER: ')[-1]
|
||||
|
||||
else:
|
||||
instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
|
||||
instruction += 'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
|
||||
instruction += 'For example: The answer to the question is <solution> 42 </solution>.\n'
|
||||
# NOTE: You can actually set slightly different instruction for different agents
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent.__class__.__name__, '')
|
||||
logger.info(
|
||||
f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'}
|
||||
)
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
state: State | None = asyncio.run(
|
||||
run_agent_controller(
|
||||
agent,
|
||||
instruction,
|
||||
max_iterations=metadata.max_iterations,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
|
||||
agent.__class__.__name__
|
||||
],
|
||||
sid=instance['text'].strip(),
|
||||
)
|
||||
)
|
||||
# ======= Attempt to evaluate the agent's edits =======
|
||||
# If you are working on a simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
|
||||
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
|
||||
|
||||
if state is None:
|
||||
raise ValueError('State should not be None.')
|
||||
|
||||
model_answer_raw = ''
|
||||
for act, _ in reversed(state.history):
|
||||
if isinstance(act, CmdRunAction) and act.source == 'agent':
|
||||
model_answer_raw = act.thought
|
||||
break
|
||||
elif isinstance(act, MessageAction) and act.source == 'agent':
|
||||
model_answer_raw = act.content
|
||||
break
|
||||
|
||||
# attempt to parse model_answer
|
||||
model_answer = re.findall(r'<solution>(.*?)</solution>', model_answer_raw)
|
||||
if len(model_answer) == 0:
|
||||
logger.warning(f'Failed to parse model answer: {model_answer_raw}')
|
||||
model_answer = model_answer_raw
|
||||
else:
|
||||
model_answer = model_answer[0]
|
||||
|
||||
logger.info(
|
||||
f'Final message: {model_answer} | Ground truth: {instance["Final answer"]}'
|
||||
)
|
||||
score = question_scorer(
|
||||
model_answer=model_answer, ground_truth=instance['Final answer']
|
||||
)
|
||||
test_result = {
|
||||
'score': score,
|
||||
'model_answer_raw': model_answer_raw,
|
||||
'model_answer': model_answer,
|
||||
'ground_truth': instance['Final answer'],
|
||||
}
|
||||
|
||||
# Save the output
|
||||
output = {
|
||||
'instance_id': instance['task_id'],
|
||||
'instance': instance,
|
||||
'instruction': instance['Question'],
|
||||
'metadata': metadata,
|
||||
'metadata': metadata.model_dump(),
|
||||
'history': histories,
|
||||
# [
|
||||
# (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
|
||||
# ],
|
||||
'error': state.error if state and state.error else None,
|
||||
'metrics': metrics,
|
||||
'error': state.last_error if state and state.last_error else None,
|
||||
'test_result': test_result,
|
||||
}
|
||||
|
||||
except Exception:
|
||||
logger.error('Process instance failed')
|
||||
raise
|
||||
|
||||
@@ -80,14 +80,14 @@ def question_scorer(
|
||||
|
||||
|
||||
def normalize_str(input_str, remove_punct=True) -> str:
|
||||
"""
|
||||
Normalize a string by:
|
||||
"""Normalize a string by:
|
||||
- Removing all white spaces
|
||||
- Optionally removing punctuation (if remove_punct is True)
|
||||
- Converting to lowercase
|
||||
Parameters:
|
||||
- input_str: str, the string to normalize
|
||||
- remove_punct: bool, whether to remove punctuation (default: True)
|
||||
|
||||
Returns:
|
||||
- str, the normalized string
|
||||
"""
|
||||
|
||||
@@ -17,8 +17,8 @@ fi
|
||||
checkout_eval_branch
|
||||
|
||||
if [ -z "$AGENT" ]; then
|
||||
echo "Agent not specified, use default GPTSwarmAgent"
|
||||
AGENT="GPTSwarmAgent"
|
||||
echo "Agent not specified, use default CodeActAgent"
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
@@ -38,8 +38,12 @@ echo "LEVELS: $LEVELS"
|
||||
COMMAND="poetry run python ./evaluation/gaia/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 30 \
|
||||
--level $LEVELS \
|
||||
--data-split validation"
|
||||
--data-split validation \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note ${AGENT_VERSION}_${LEVELS}"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -1,50 +0,0 @@
|
||||
#!/bin/bash
# Run the GAIA inference script several times in a row, writing each run to
# its own timestamped output directory.
#
# Usage: <script> MODEL_CONFIG [AGENT] [EVAL_LIMIT] [LEVELS]
#
# Environment overrides (both optional):
#   NUM_RUNS         number of repetitions (default: 5)
#   OUTPUT_BASE_DIR  directory under which per-run outputs are created

MODEL_CONFIG=$1
AGENT=$2
EVAL_LIMIT=$3
LEVELS=$4
NUM_RUNS="${NUM_RUNS:-5}"
# The default is kept for backward compatibility; it is an absolute path on one
# specific machine, so set OUTPUT_BASE_DIR when running anywhere else.
OUTPUT_BASE_DIR="${OUTPUT_BASE_DIR:-/Users/zhugem/Desktop/OpenDevin/evaluation/evaluation_outputs/outputs/gaia}"

if [ -z "$AGENT" ]; then
  echo "Agent not specified, use default GPTSwarmAgent"
  AGENT="GPTSwarmAgent"
fi

if [ -z "$LEVELS" ]; then
  LEVELS="2023_level1"
  echo "Levels not specified, use default $LEVELS"
fi

# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
# We need to track the version of Agent in the evaluation to make sure results are comparable
AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "LEVELS: $LEVELS"

for i in $(seq 1 "$NUM_RUNS")
do
  # Nanosecond timestamp keeps the output directories of successive runs apart.
  RANDOM_SUFFIX=$(date +%s%N)
  OUTPUT_DIR="${OUTPUT_BASE_DIR}/${AGENT}/${MODEL_CONFIG}-${RANDOM_SUFFIX}"
  echo "Running iteration $i, output will be stored in $OUTPUT_DIR"

  # Build the command as an array so each argument is passed verbatim; the
  # previous string + `eval` form re-parsed the values through the shell.
  COMMAND=(
    poetry run python ./evaluation/gaia/run_infer.py
    --agent-cls "$AGENT"
    --llm-config "$MODEL_CONFIG"
    --level "$LEVELS"
    --data-split validation
    --eval-output-dir "$OUTPUT_DIR"
  )

  if [ -n "$EVAL_LIMIT" ]; then
    echo "EVAL_LIMIT: $EVAL_LIMIT"
    COMMAND+=(--eval-n-limit "$EVAL_LIMIT")
  fi

  # Run the command
  "${COMMAND[@]}"
done
|
||||
@@ -12,7 +12,7 @@ from tqdm import tqdm
|
||||
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
@@ -21,6 +21,8 @@ from opendevin.llm.llm import LLM
|
||||
|
||||
from .utils import encode_question, get_data
|
||||
|
||||
config = load_app_config()
|
||||
|
||||
|
||||
def cleanup():
|
||||
print('Cleaning up child processes...')
|
||||
@@ -53,13 +55,8 @@ def codeact_user_response(state: State) -> str:
|
||||
return msg
|
||||
|
||||
|
||||
def monologue_user_response(state: State) -> str:
    """Fake-user response hook for ``MonologueAgent``; never returns.

    ``state`` is not read. Every call raises ``NotImplementedError`` with the
    message that this agent should never ask for user responses.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError('MonologueAgent should never ask for user responses.')
|
||||
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
'MonologueAgent': monologue_user_response,
|
||||
}
|
||||
|
||||
AGENT_CLS_TO_INST_SUFFIX = {
|
||||
@@ -118,6 +115,7 @@ def process_instance(agent, question_id, question, metadata, reset_logger: bool
|
||||
agent,
|
||||
instruction,
|
||||
max_iterations=metadata.max_iterations,
|
||||
max_budget_per_task=config.max_budget_per_task,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
|
||||
agent.__class__.__name__
|
||||
),
|
||||
|
||||
@@ -10,7 +10,6 @@ from ast_eval_th import ast_eval_th
|
||||
# This function is modified from Gorilla's APIBench implementations (https://github.com/ShishirPatil/gorilla/blob/main/eval/get_llm_responses.py).
|
||||
def encode_question(question, api_name):
|
||||
"""Encode multiple prompt instructions into a single string."""
|
||||
|
||||
prompts = []
|
||||
if api_name == 'torch':
|
||||
api_name = 'torchhub'
|
||||
|
||||
@@ -15,10 +15,6 @@ Further references:
|
||||
- https://paperswithcode.com/dataset/gpqa
|
||||
- https://github.com/idavidrein/gpqa
|
||||
|
||||
## TODOs
|
||||
- [ ] Add support for other agents (currently only tested on `CodeActAgent`)
|
||||
- [ ] Complete full benchmark evaluation
|
||||
- [ ] Fix intermittent `BrowserException: Failed to start browser environment` error
|
||||
|
||||
## Setup Environment
|
||||
|
||||
@@ -27,17 +23,11 @@ Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/D
|
||||
|
||||
## Configure OpenDevin and your LLM
|
||||
|
||||
Create a `config.toml` file if it does not exist at the root of the workspace.
|
||||
Create a `config.toml` file (you can copy from `config.template.toml`) if it does not exist at the root of the workspace.
|
||||
|
||||
Add the following configurations:
|
||||
|
||||
```toml
|
||||
[core]
|
||||
max_iterations = 100
|
||||
cache_dir = "/tmp/cache"
|
||||
ssh_hostname = "localhost"
|
||||
enable_auto_lint = true
|
||||
|
||||
# TODO: Change these to the model you want to evaluate
|
||||
[llm.eval_gpt4_1106_preview]
|
||||
model = "gpt-4-1106-preview"
|
||||
|
||||
+153
-65
@@ -1,5 +1,4 @@
|
||||
"""
|
||||
Overview:
|
||||
"""Overview:
|
||||
This code implements the evaluation of agents on the GPQA Benchmark with Open Book setting.
|
||||
- The benchmark consists of 448 high-quality and extremely difficult multiple-choice questions in the domains of biology, physics, and chemistry. The questions are intentionally designed to be "Google-proof," meaning that even highly skilled non-expert validators achieve only 34% accuracy despite unrestricted access to the web.
|
||||
- Even experts in the corresponding domains achieve only 65% accuracy.
|
||||
@@ -13,7 +12,7 @@ Further references:
|
||||
- https://github.com/idavidrein/gpqa
|
||||
|
||||
TODOs:
|
||||
- Add evaluation on other Agent classes (e.g., MonologueAgent)
|
||||
- Add evaluation on other Agent classes
|
||||
- Batch inference and evaluation of agents on the GPQA Benchmark.
|
||||
"""
|
||||
|
||||
@@ -23,6 +22,7 @@ import os
|
||||
import pathlib
|
||||
import random
|
||||
import re
|
||||
from typing import Callable
|
||||
|
||||
import pandas as pd
|
||||
from datasets import load_dataset
|
||||
@@ -31,69 +31,95 @@ from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
codeact_user_response,
|
||||
make_metadata,
|
||||
monologue_user_response,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.action import Action, AgentFinishAction, MessageAction
|
||||
from opendevin.events.observation import Observation
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
'MonologueAgent': monologue_user_response,
|
||||
}
|
||||
config = load_app_config()
|
||||
|
||||
ACTION_FORMAT = """
|
||||
<<FINAL_ANSWER||
|
||||
<insert correct answer here, must be one of A, B, C, D> (Please dont use any additional characters. Just the letter of the correct answer (A/B/C/D).)
|
||||
||FINAL_ANSWER>>
|
||||
""".strip()
|
||||
|
||||
|
||||
def gpqa_codeact_user_response(
    state: State,
    encapsulate_solution: bool = False,
    try_parse: Callable[[Action], str] | None = None,
) -> str:
    """Build the canned reply sent to the agent in place of a real user.

    The text is the same on every call: ``state``, ``encapsulate_solution``
    and ``try_parse`` are accepted but not read here. The reply tells the
    agent to keep working, restates the required answer layout
    (``ACTION_FORMAT``) and the exit command, and forbids asking for help.

    Returns:
        The assembled multi-line reply, ending with a newline.
    """
    reply_parts = (
        'Please continue working on the task on whatever approach you think is suitable.\n',
        'Feel free to use all tools for calculations and solving the problem, and web-search for finding relevant facts during the process if needed\n',
        'If you have finished reporting the answer in the expected format, (and only once that is done), please run the following command to submit: <execute_bash> exit </execute_bash>.\n',
        'Again you are being told a million times to first report the answer in the requested format (see again below for reference) before exiting. DO NOT EXIT WITHOUT REPORTING THE ANSWER FIRST.\n',
        'That is, when you have decided on the answer report in the following format:\n',
        f'{ACTION_FORMAT}\n',
        '<execute_bash> exit </execute_bash>\n',
        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP TO SOLVE THIS TASK.\n',
    )
    return ''.join(reply_parts)
|
||||
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {'CodeActAgent': codeact_user_response}
|
||||
|
||||
AGENT_CLS_TO_INST_SUFFIX = {
|
||||
'CodeActAgent': '\n\n SUPER IMPORTANT: When you think you have solved the question, first report it back to the user in the requested format. Only once that is done, in the next turn, please run the following command: <execute_bash> exit </execute_bash>.\n'
|
||||
}
|
||||
|
||||
|
||||
def parse_final_answer(final_answer: str) -> str:
|
||||
"""
|
||||
Parse the final answer from the final message generated by the agent
|
||||
def parse_final_answer(final_answer: str | None) -> str | None:
|
||||
"""Parse the final answer from the final message generated by the agent
|
||||
to extract the final answer. The final answer is usually enclosed in the format:
|
||||
<<FINAL_ANSWER||
|
||||
<insert correct answer here>
|
||||
||FINAL_ANSWER>>
|
||||
"""
|
||||
# to do this first extract the part enclosed in the format <<FINAL_ANSWER|| ... ||FINAL_ANSWER>>
|
||||
pattern = re.compile(r'<<FINAL_ANSWER\|\|(.*?)\|\|FINAL_ANSWER>>', re.DOTALL)
|
||||
match = pattern.search(final_answer)
|
||||
|
||||
if match:
|
||||
return match.group(1).strip()
|
||||
else:
|
||||
return 'No final answer found in the provided string.'
|
||||
# and then strip it, remove any leading/trailing spaces line breaks etc.
|
||||
answer = match.group(1).strip()
|
||||
# finally capitalize it
|
||||
answer = answer.upper()
|
||||
# and then return A, B, C, D depending on whether the answer A, B, C, D is found in the final answer
|
||||
for letter in ['A', 'B', 'C', 'D']:
|
||||
if letter in answer:
|
||||
return letter
|
||||
|
||||
|
||||
def compare_answers(predicted_answer, ground_truth):
|
||||
"""
|
||||
Compare the predicted answer with the ground truth answer
|
||||
"""
|
||||
def compare_answers(model_output: str | None, ground_truth: str):
|
||||
"""Compare the predicted answer with the ground truth answer"""
|
||||
try:
|
||||
# parse the final answer from model output
|
||||
predicted_answer = parse_final_answer(model_output)
|
||||
except Exception as e:
|
||||
# Log the exception
|
||||
logger.error(f'An error occurred: {e}\n defaulting to random guess ...')
|
||||
# choose a random answer if the model output is not in the correct format
|
||||
predicted_answer = random.choice(['A', 'B', 'C', 'D'])
|
||||
|
||||
logger.info('#############################################')
|
||||
logger.info(f'Predicted answer: {predicted_answer}')
|
||||
logger.info(f'Ground truth answer: {ground_truth}')
|
||||
logger.info('#############################################')
|
||||
return predicted_answer == ground_truth
|
||||
|
||||
|
||||
def get_test_result(model_output, ground_truth):
    """Evaluate one GPQA instance against its ground truth.

    The agent's output is first reduced to an answer by
    ``parse_final_answer``; that answer and ``ground_truth`` are then handed
    to ``compare_answers``, whose result is returned unchanged.
    """
    return compare_answers(parse_final_answer(model_output), ground_truth)
|
||||
|
||||
|
||||
def convert_instance_dict(instance):
|
||||
"""
|
||||
Used for preprocessing the hf dataset into a format that can be used by the agent.
|
||||
"""Used for preprocessing the hf dataset into a format that can be used by the agent.
|
||||
Reads and extracts relevant information from the dataset instance.
|
||||
"""
|
||||
out_instance_dict = {}
|
||||
@@ -126,7 +152,7 @@ def process_instance(
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
|
||||
old_workspace_mount_path = config.workspace_mount_path
|
||||
old_workspace_base = config.workspace_base
|
||||
try:
|
||||
@@ -171,27 +197,33 @@ def process_instance(
|
||||
# ======= Run the agent on the instance =======
|
||||
# Prepare instruction for the agent using suggested format in gpqa codebase
|
||||
instruction = f"""
|
||||
What is the correct answer to this question:\n
|
||||
{instance['question']}\n
|
||||
What is the correct answer to this question:\n
|
||||
{instance['question']}\n
|
||||
|
||||
Choices:\n
|
||||
(A) {instance['choices'][0]}\n
|
||||
(B) {instance['choices'][1]}\n
|
||||
(C) {instance['choices'][2]}\n
|
||||
(D) {instance['choices'][3]}\n
|
||||
\n\n
|
||||
Choices:\n
|
||||
(A) {instance['choices'][0]}\n
|
||||
(B) {instance['choices'][1]}\n
|
||||
(C) {instance['choices'][2]}\n
|
||||
(D) {instance['choices'][3]}\n
|
||||
\n\n
|
||||
|
||||
MOST IMPORTANT: Format your response as follows:
|
||||
<<FINAL_ANSWER||
|
||||
<insert correct answer here, must be one of A, B, C, D> (Please dont use any additional characters. Just the letter of the correct answer (A/B/C/D).)
|
||||
||FINAL_ANSWER>>
|
||||
MOST IMPORTANT: Format your response as follows:
|
||||
{ACTION_FORMAT}
|
||||
|
||||
Additional Instructions:
|
||||
- You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.
|
||||
"""
|
||||
Additional Instructions:
|
||||
- Do not try to solve the question in a single step. Break it down into smaller steps.
|
||||
- You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.
|
||||
|
||||
# NOTE: You can actually set slightly different instruction for different agents
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
|
||||
- SUPER IMPORTANT: When you have reported the answer to the user in the requested format, (and only once that is done) in the next turn, please run the following command: <execute_bash> exit </execute_bash>.
|
||||
- Again you are being told a million times to first report the answer in the requested format (see again below for reference) before exiting. DO NOT EXIT WITHOUT REPORTING THE ANSWER FIRST.
|
||||
That is, when you have decided on the answer report in the following format:
|
||||
|
||||
{ACTION_FORMAT}
|
||||
<execute_bash> exit </execute_bash>
|
||||
|
||||
Again do not quit without reporting the answer first.
|
||||
Ok now its time to start solving the question. Good luck!
|
||||
"""
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
state: State | None = asyncio.run(
|
||||
@@ -199,21 +231,73 @@ def process_instance(
|
||||
agent,
|
||||
instruction,
|
||||
max_iterations=metadata.max_iterations,
|
||||
max_budget_per_task=config.max_budget_per_task,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
|
||||
agent.__class__.__name__
|
||||
),
|
||||
sid=instance.instance_id,
|
||||
sid=f'gptq_{str(instance.instance_id)}',
|
||||
)
|
||||
)
|
||||
assert state is not None, 'State should not be None.'
|
||||
|
||||
# ======= Attempt to evaluate the agent's edits =======
|
||||
|
||||
question_choices = {
|
||||
'A': instance['choices'][0],
|
||||
'B': instance['choices'][1],
|
||||
'C': instance['choices'][2],
|
||||
'D': instance['choices'][3],
|
||||
}
|
||||
# get the final message from the state history (default to empty if not found)
|
||||
final_message = state.history.get_last_agent_message()
|
||||
found_answers = {
|
||||
'A': False,
|
||||
'B': False,
|
||||
'C': False,
|
||||
'D': False,
|
||||
}
|
||||
for event in state.history.get_events(reverse=True):
|
||||
if (
|
||||
isinstance(event, AgentFinishAction)
|
||||
and event.source != 'user'
|
||||
and '<<FINAL_ANSWER||' in event.thought
|
||||
):
|
||||
final_message = event.thought
|
||||
break
|
||||
elif (
|
||||
isinstance(event, MessageAction)
|
||||
and event.source != 'user'
|
||||
and '<<FINAL_ANSWER||' in event.content
|
||||
):
|
||||
final_message = event.content
|
||||
break
|
||||
elif isinstance(event, Observation):
|
||||
for option, option_text in question_choices.items():
|
||||
if option_text in event.content:
|
||||
found_answers[option] = True
|
||||
else:
|
||||
final_message = None
|
||||
|
||||
found_options = [option for option, found in found_answers.items() if found]
|
||||
logger.info('#############################################')
|
||||
logger.info(f'Final message generated by the agent: {final_message}')
|
||||
logger.info('#############################################')
|
||||
|
||||
test_result = get_test_result(final_message, instance.correct_solution)
|
||||
# check if the model output matches the ground truth
|
||||
test_result = compare_answers(final_message, instance.correct_solution)
|
||||
if final_message is None and len(found_options) > 0:
|
||||
_selected = random.choice(found_options)
|
||||
# if the final message is None, then the agent did not report the answer in the correct format
|
||||
# so we randomly select one of the found options and compare it with the correct solution
|
||||
test_result = _selected == instance.correct_solution
|
||||
logger.info('#############################################')
|
||||
logger.info('Agent did not report the answer in the correct format.')
|
||||
logger.info(f'Found options: {found_options}')
|
||||
logger.info(f'Selected option: {_selected}')
|
||||
logger.info('#############################################')
|
||||
|
||||
logger.info('#############################################')
|
||||
logger.info(f'Test result: {test_result}')
|
||||
logger.info('#############################################')
|
||||
|
||||
# If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
|
||||
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
|
||||
@@ -222,21 +306,20 @@ def process_instance(
|
||||
|
||||
metrics = state.metrics.get() if state.metrics else None
|
||||
|
||||
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
|
||||
# for compatibility with the existing output format, we can remake the pairs here
|
||||
# remove when it becomes unnecessary
|
||||
histories = state.history.compatibility_for_eval_history_pairs()
|
||||
|
||||
# Save the output
|
||||
output = {
|
||||
'task_id': instance.task_id,
|
||||
'instance_id': instance.instance_id,
|
||||
'instruction': instruction,
|
||||
'metadata': metadata.model_dump(),
|
||||
'history': histories,
|
||||
'history': state.history.compatibility_for_eval_history_pairs(),
|
||||
'metrics': metrics,
|
||||
'error': state.last_error if state and state.last_error else None,
|
||||
'test_result': test_result,
|
||||
'test_result': {
|
||||
'result': test_result,
|
||||
'found_answers': found_answers,
|
||||
'last_message': final_message,
|
||||
},
|
||||
}
|
||||
|
||||
except Exception:
|
||||
@@ -275,9 +358,14 @@ if __name__ == '__main__':
|
||||
gpqa_dataset['task_id'] = gpqa_dataset.index
|
||||
# gpqa_dataset = dataset['train'].to_pandas().sort_values(by='id').reset_index(drop=True)
|
||||
|
||||
if args.agent_cls != 'CodeActAgent':
|
||||
raise ValueError(
|
||||
f'Agent class {args.agent_cls} not supported for GPQA evaluation.'
|
||||
)
|
||||
|
||||
metadata = make_metadata(
|
||||
llm_config=llm_config,
|
||||
dataset_name='gpqa',
|
||||
dataset_name=args.data_split,
|
||||
agent_class=args.agent_cls,
|
||||
max_iterations=args.max_iterations,
|
||||
eval_note=args.eval_note,
|
||||
|
||||
Regular → Executable
@@ -18,6 +18,8 @@ Add the following configurations:
|
||||
max_iterations = 100
|
||||
cache_dir = "/tmp/cache"
|
||||
ssh_hostname = "localhost"
|
||||
|
||||
[sandbox]
|
||||
enable_auto_lint = true
|
||||
|
||||
# TODO: Change these to the model you want to evaluate
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
"""
|
||||
Implements evaluation of agents on HumanEvalFix from the HumanEvalPack benchmark introduced in
|
||||
"""Implements evaluation of agents on HumanEvalFix from the HumanEvalPack benchmark introduced in
|
||||
"OctoPack: Instruction Tuning Code Large Language Models" (https://arxiv.org/abs/2308.07124).
|
||||
Please see https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/tasks/humanevalpack.py
|
||||
for the reference implementation used in the paper.
|
||||
@@ -22,18 +21,19 @@ from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
codeact_user_response,
|
||||
make_metadata,
|
||||
monologue_user_response,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.config import get_llm_config_arg, load_app_config, parse_arguments
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
config = load_app_config()
|
||||
|
||||
IMPORT_HELPER = {
|
||||
'python': [
|
||||
'import math',
|
||||
@@ -65,7 +65,6 @@ LANGUAGE_TO_NUM_WORKERS = {
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
'MonologueAgent': monologue_user_response,
|
||||
}
|
||||
|
||||
AGENT_CLS_TO_INST_SUFFIX = {
|
||||
@@ -109,7 +108,7 @@ def process_instance(
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
|
||||
old_workspace_mount_path = config.workspace_mount_path
|
||||
old_workspace_base = config.workspace_base
|
||||
|
||||
@@ -185,6 +184,7 @@ def process_instance(
|
||||
agent,
|
||||
instruction,
|
||||
max_iterations=metadata.max_iterations,
|
||||
max_budget_per_task=config.max_budget_per_task,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
|
||||
agent.__class__.__name__
|
||||
),
|
||||
|
||||
@@ -13,6 +13,8 @@ Add the following configurations:
|
||||
max_iterations = 100
|
||||
cache_dir = "/tmp/cache"
|
||||
ssh_hostname = "localhost"
|
||||
|
||||
[sandbox]
|
||||
enable_auto_lint = true
|
||||
|
||||
# TODO: Change these to the model you want to evaluate
|
||||
|
||||
@@ -12,21 +12,21 @@ from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
codeact_user_response,
|
||||
make_metadata,
|
||||
monologue_user_response,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
config = load_app_config()
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
'MonologueAgent': monologue_user_response,
|
||||
}
|
||||
|
||||
AGENT_CLS_TO_INST_SUFFIX = {
|
||||
@@ -103,7 +103,7 @@ def process_instance(
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
|
||||
old_workspace_mount_path = config.workspace_mount_path
|
||||
old_workspace_base = config.workspace_base
|
||||
|
||||
@@ -173,7 +173,15 @@ def process_instance(
|
||||
|
||||
# use a session id for concurrent evaluation
|
||||
sid = instance['id'] + '_' + str(os.getpid())
|
||||
sandbox = DockerSSHBox(sid=sid)
|
||||
sandbox = DockerSSHBox(
|
||||
config=config.sandbox,
|
||||
persist_sandbox=False,
|
||||
workspace_mount_path=config.workspace_mount_path,
|
||||
sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
|
||||
cache_dir=config.cache_dir,
|
||||
run_as_devin=config.run_as_devin,
|
||||
sid=sid,
|
||||
)
|
||||
exit_code, command_output = sandbox.execute('pip install scitools-pyke')
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
@@ -182,6 +190,7 @@ def process_instance(
|
||||
agent,
|
||||
instruction,
|
||||
max_iterations=metadata.max_iterations,
|
||||
max_budget_per_task=config.max_budget_per_task,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
|
||||
agent.__class__.__name__
|
||||
),
|
||||
|
||||
@@ -15,7 +15,7 @@ from evaluation.utils.shared import (
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.config import get_llm_config_arg, load_app_config, parse_arguments
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
@@ -23,6 +23,8 @@ from opendevin.llm.llm import LLM
|
||||
from opendevin.runtime.docker.ssh_box import DockerSSHBox
|
||||
from opendevin.runtime.tools import RuntimeTool
|
||||
|
||||
config = load_app_config()
|
||||
|
||||
SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
|
||||
|
||||
docker_ssh_box: DockerSSHBox | None = None
|
||||
@@ -31,7 +33,14 @@ docker_ssh_box: DockerSSHBox | None = None
|
||||
def get_sandbox():
|
||||
global docker_ssh_box
|
||||
if docker_ssh_box is None:
|
||||
docker_ssh_box = DockerSSHBox()
|
||||
docker_ssh_box = DockerSSHBox(
|
||||
config=config.sandbox,
|
||||
persist_sandbox=False,
|
||||
workspace_mount_path=config.workspace_mount_path,
|
||||
sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
|
||||
cache_dir=config.cache_dir,
|
||||
run_as_devin=config.run_as_devin,
|
||||
)
|
||||
return docker_ssh_box
|
||||
|
||||
|
||||
@@ -41,7 +50,7 @@ def process_instance(
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
|
||||
env_id = instance.id
|
||||
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
|
||||
if reset_logger:
|
||||
@@ -81,6 +90,7 @@ def process_instance(
|
||||
agent,
|
||||
'PLACEHOLDER_GOAL',
|
||||
max_iterations=metadata.max_iterations,
|
||||
max_budget_per_task=config.max_budget_per_task,
|
||||
runtime_tools_config=runtime_tools_config,
|
||||
sandbox=get_sandbox(),
|
||||
sid=env_id,
|
||||
|
||||
@@ -11,13 +11,12 @@ from evaluation.swe_bench.swe_env_box import DockerSSHBox
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
make_metadata,
|
||||
monologue_user_response,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
@@ -28,6 +27,8 @@ from .env import SimplifiedEnv
|
||||
from .prompts import ToolPromptTemplate
|
||||
from .tasks import Task
|
||||
|
||||
config = load_app_config()
|
||||
|
||||
|
||||
def codeact_user_response_mint(state: State, task: Task, task_config: Dict[str, int]):
|
||||
logger.info(f'Gold reference: {task.reference}')
|
||||
@@ -55,7 +56,6 @@ def codeact_user_response_mint(state: State, task: Task, task_config: Dict[str,
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response_mint,
|
||||
'MonologueAgent': monologue_user_response,
|
||||
}
|
||||
|
||||
AGENT_CLS_TO_INST_SUFFIX = {
|
||||
@@ -101,7 +101,15 @@ def process_instance(
|
||||
|
||||
# use a session id for concurrent processing
|
||||
sid = instance.task_id + '_' + str(os.getpid())
|
||||
sandbox = DockerSSHBox(sid=sid)
|
||||
sandbox = DockerSSHBox(
|
||||
config=config.sandbox,
|
||||
persist_sandbox=False,
|
||||
workspace_mount_path=config.workspace_mount_path,
|
||||
sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
|
||||
cache_dir=config.cache_dir,
|
||||
run_as_devin=config.run_as_devin,
|
||||
sid=sid,
|
||||
)
|
||||
|
||||
requirements_host_src = 'evaluation/mint/requirements.txt'
|
||||
requirements_sandbox_dest = '/opendevin/plugins/mint/requirements.txt'
|
||||
@@ -145,6 +153,7 @@ def process_instance(
|
||||
agent,
|
||||
instruction,
|
||||
max_iterations=metadata.max_iterations,
|
||||
max_budget_per_task=config.max_budget_per_task,
|
||||
fake_user_response_fn=fake_user_response_fn,
|
||||
sandbox=sandbox,
|
||||
sid=sid,
|
||||
|
||||
@@ -74,7 +74,6 @@ class HumanEvalTask(CodeGenTask):
|
||||
Modified from:
|
||||
https://github.com/bigcode-project/bigcode-evaluation-harness/blob/d61afde130005ecc65cf800ad8eca790a9bc2115/lm_eval/tasks/humaneval.py#L56
|
||||
"""
|
||||
|
||||
# STOP_WORDS = ["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif"]
|
||||
# # Remove the last block of the code containing stop_words for HumanEval
|
||||
# string_list = re.split("(%s)" % "|".join(STOP_WORDS), solution)
|
||||
|
||||
@@ -79,14 +79,12 @@ def check_correctness(
|
||||
timeout: float = 10,
|
||||
completion_id: Optional[int] = None,
|
||||
) -> Dict:
|
||||
"""
|
||||
Evaluates the functional correctness of a completion by running the test
|
||||
"""Evaluates the functional correctness of a completion by running the test
|
||||
suite provided in the problem.
|
||||
|
||||
:param completion_id: an optional completion ID so we can match
|
||||
the results later even if execution finishes asynchronously.
|
||||
"""
|
||||
|
||||
manager = multiprocessing.Manager()
|
||||
result = manager.list()
|
||||
|
||||
@@ -181,18 +179,16 @@ def chdir(root):
|
||||
|
||||
|
||||
def reliability_guard(maximum_memory_bytes: Optional[int] = None):
|
||||
"""
|
||||
This disables various destructive functions and prevents the generated code
|
||||
"""This disables various destructive functions and prevents the generated code
|
||||
from interfering with the test (e.g. fork bomb, killing other processes,
|
||||
removing filesystem files, etc.)
|
||||
|
||||
WARNING
|
||||
Warning:
|
||||
This function is NOT a security sandbox. Untrusted code, including, model-
|
||||
generated code, should not be blindly executed outside of one. See the
|
||||
Codex paper for more information about OpenAI's code sandbox, and proceed
|
||||
with caution.
|
||||
"""
|
||||
|
||||
if maximum_memory_bytes is not None:
|
||||
import resource
|
||||
|
||||
|
||||
@@ -25,10 +25,13 @@ Add the following configurations:
|
||||
max_iterations = 100
|
||||
cache_dir = "/tmp/cache"
|
||||
ssh_hostname = "localhost"
|
||||
enable_auto_lint = true
|
||||
run_as_devin = false
|
||||
sandbox_container_image = "public.ecr.aws/i5g0m1f6/ml-bench" # Use the latest image from the ML-Bench repository
|
||||
|
||||
[sandbox]
|
||||
enable_auto_lint = true
|
||||
|
||||
|
||||
# TODO: Change these to the model you want to evaluate
|
||||
[llm.eval_gpt4_1106_preview]
|
||||
model = "gpt-4-1106-preview"
|
||||
|
||||
@@ -4,10 +4,12 @@ import pprint
|
||||
|
||||
import tqdm
|
||||
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
config = load_app_config()
|
||||
|
||||
|
||||
def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
|
||||
passed = []
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
"""
|
||||
Implements evaluation of agents on ML-Bench, a benchmark for assessing the effectiveness of
|
||||
"""Implements evaluation of agents on ML-Bench, a benchmark for assessing the effectiveness of
|
||||
Large Language Models (LLMs) in leveraging existing functions in open-source libraries for
|
||||
machine learning tasks. The benchmark is introduced in the paper "ML-Bench: Evaluating Large
|
||||
Language Models for Code Generation in Repository-Level Machine Learning Tasks"
|
||||
@@ -26,22 +25,22 @@ from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
codeact_user_response,
|
||||
make_metadata,
|
||||
monologue_user_response,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.llm.llm import LLM
|
||||
from opendevin.runtime.docker.ssh_box import DockerSSHBox
|
||||
|
||||
config = load_app_config()
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
'MonologueAgent': monologue_user_response,
|
||||
}
|
||||
|
||||
AGENT_CLS_TO_INST_SUFFIX = {
|
||||
@@ -68,7 +67,7 @@ ID2CONDA = {
|
||||
|
||||
|
||||
def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = True):
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
|
||||
old_workspace_mount_path = config.workspace_mount_path
|
||||
old_workspace_base = config.workspace_base
|
||||
try:
|
||||
@@ -113,7 +112,15 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
|
||||
|
||||
# Create a sandbox, using the instance ID and PID as the session ID to avoid conflicts
|
||||
sid = str(instance['id']) + '_' + str(os.getpid())
|
||||
sandbox = DockerSSHBox(sid=sid)
|
||||
sandbox = DockerSSHBox(
|
||||
config=config.sandbox,
|
||||
persist_sandbox=False,
|
||||
workspace_mount_path=config.workspace_mount_path,
|
||||
sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
|
||||
cache_dir=config.cache_dir,
|
||||
run_as_devin=config.run_as_devin,
|
||||
sid=sid,
|
||||
)
|
||||
|
||||
# Set up the task environment
|
||||
sandbox.execute(f'conda activate {ID2CONDA[instance["github_id"]]}')
|
||||
@@ -153,6 +160,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
|
||||
agent,
|
||||
instruction,
|
||||
max_iterations=metadata.max_iterations,
|
||||
max_budget_per_task=config.max_budget_per_task,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
|
||||
agent.__class__.__name__
|
||||
),
|
||||
|
||||
@@ -29,9 +29,6 @@ cases/
|
||||
├── hello-world/
|
||||
│ ├── task.txt
|
||||
│ ├── outputs/
|
||||
│ │ ├── monologue_agent/
|
||||
│ │ │ └── workspace/
|
||||
│ │ │ ├── hello_world.sh
|
||||
│ │ └── codeact_agent/
|
||||
│ │ └── workspace/
|
||||
│ │ ├── hello_world.sh
|
||||
@@ -39,12 +36,6 @@ cases/
|
||||
├── create_web_app/
|
||||
│ ├── task.txt
|
||||
│ ├── outputs/
|
||||
│ │ ├── monologue_agent/
|
||||
│ │ │ └── workspace/
|
||||
│ │ │ ├── app.py
|
||||
│ │ │ ├── requirements.txt
|
||||
│ │ │ ├── static/
|
||||
│ │ │ └── templates/
|
||||
│ │ └── codeact_agent/
|
||||
│ │ └── workspace/
|
||||
│ │ ├── app.py
|
||||
|
||||
@@ -6,9 +6,7 @@ from conftest import agents
|
||||
|
||||
@pytest.mark.parametrize('agent', agents())
|
||||
def test_hello_world(task_file, run_test_case, agent):
|
||||
"""
|
||||
Test case for the "Hello, World!" Bash script using different agents.
|
||||
"""
|
||||
"""Test case for the "Hello, World!" Bash script using different agents."""
|
||||
# Run the test case for the specified agent
|
||||
workspace_dir = run_test_case(agent, 'hello-world')
|
||||
|
||||
@@ -16,7 +14,7 @@ def test_hello_world(task_file, run_test_case, agent):
|
||||
assert os.path.exists(workspace_dir)
|
||||
assert os.path.isfile(os.path.join(workspace_dir, 'hello_world.sh'))
|
||||
|
||||
# Execute the hello_world.sh script
|
||||
# Execute the hello_world.sh script
|
||||
os.chdir(workspace_dir)
|
||||
output = os.popen('bash hello_world.sh').read()
|
||||
assert output == 'Hello, World!\n'
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user