Minor SWE-Bench inference config tweak (#2381)

* save infer logs to infer_logs * set max budget for swebench eval
avoid repeat logging of unneeded messages (#2380)
2026-04-29 03:00:45 -04:00 · 2024-06-10 20:14:22 +00:00 · 2024-06-10 20:08:09 +00:00 · 2024-06-10 19:30:40 +00:00 · 2024-06-10 17:18:40 +00:00 · 2024-06-11 00:32:10 +08:00
337 changed files with 20506 additions and 4314 deletions
--- a/.github/ISSUE_TEMPLATE/question.md
+++ b/.github/ISSUE_TEMPLATE/question.md
@@ -1,16 +0,0 @@
---
-name: Question
-about: Use this template to ask a question regarding the project.
-title: ''
-labels: question
-assignees: ''
-
---
-
-## Describe your question
-
-<!--A clear and concise description of what you want to know.-->
-
-## Additional context
-
-<!--Add any other context about the question here, like what you've tried so far.-->
--- a/.github/workflows/dummy-agent-test.yml
+++ b/.github/workflows/dummy-agent-test.yml
@@ -10,6 +10,9 @@ on:
    - main
  pull_request:

+env:
+  PERSIST_SANDBOX: "false"  # presumably maps to `persist_sandbox`: use a clean sandbox per task
+
 jobs:
  test:
    runs-on: ubuntu-latest
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -47,11 +47,4 @@ jobs:
      - name: Install pre-commit
        run: pip install pre-commit==3.7.0
      - name: Run pre-commit hooks
-        if: github.ref != 'refs/heads/main'
-        run: |
-          git fetch https://github.com/OpenDevin/OpenDevin.git main:main && \
-          pre-commit run \
-            --files \
-            $(git diff --name-only $(git merge-base main $(git branch --show-current)) $(git branch --show-current) | tr '\n' ' ') \
-            --show-diff-on-failure \
-            --config ./dev_config/python/.pre-commit-config.yaml
+        run: pre-commit run --files $(git ls-files opendevin agenthub evaluation) --show-diff-on-failure --config ./dev_config/python/.pre-commit-config.yaml
--- a/.github/workflows/review-pr.yml
+++ b/.github/workflows/review-pr.yml
@@ -44,12 +44,24 @@ jobs:
        echo "" >> task.txt
        echo "Diff file is: ${{ github.event.pull_request.number }}.diff" >> task.txt

+    - name: Set up environment
+      run: |
+        curl -sSL https://install.python-poetry.org | python3 -
+        export PATH="/github/home/.local/bin:$PATH"
+        poetry install --without evaluation
+        poetry run playwright install --with-deps chromium
+
    - name: Run OpenDevin
      env:
        LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        SANDBOX_TYPE: exec
      run: |
-        WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE python ./opendevin/core/main.py -i 50 -f task.txt -d $GITHUB_WORKSPACE
+        # Append path to launch poetry
+        export PATH="/github/home/.local/bin:$PATH"
+        # Prepend the current directory to PYTHONPATH so the package imports correctly (must be run from the repo root)
+        export PYTHONPATH=$(pwd):$PYTHONPATH
+        WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE poetry run python ./opendevin/core/main.py -i 50 -f task.txt -d $GITHUB_WORKSPACE
        rm task.txt

    - name: Check if review file is non-empty
--- a/.github/workflows/run-integration-tests.yml
+++ b/.github/workflows/run-integration-tests.yml
@@ -15,6 +15,9 @@ on:
      - 'evaluation/**'
  pull_request:

+env:
+  PERSIST_SANDBOX: "false"  # presumably maps to `persist_sandbox`: use a clean sandbox per task
+
 jobs:
  integration-tests-on-linux:
    name: Integration Tests on Linux
--- a/.github/workflows/run-unit-tests.yml
+++ b/.github/workflows/run-unit-tests.yml
@@ -15,12 +15,15 @@ on:
      - 'evaluation/**'
  pull_request:

+env:
+  PERSIST_SANDBOX: "false"  # presumably maps to `persist_sandbox`: use a clean sandbox per task
+
 jobs:
  test-on-macos:
    name: Test on macOS
    runs-on: macos-13
    env:
-      INSTALL_DOCKER: "0" # Set to '0' to skip Docker installation
+      INSTALL_DOCKER: "1" # Set to '0' to skip Docker installation
    strategy:
      matrix:
        python-version: ["3.11"]
--- a/.github/workflows/solve-issue.yml
+++ b/.github/workflows/solve-issue.yml
@@ -35,15 +35,28 @@ jobs:
        echo "" >> task.txt
        echo "BODY:" >> task.txt
        echo "${ISSUE_BODY}" >> task.txt
+    
+    - name: Set up environment
+      run: |
+        curl -sSL https://install.python-poetry.org | python3 -
+        export PATH="/github/home/.local/bin:$PATH"
+        poetry install --without evaluation
+        poetry run playwright install --with-deps chromium
+

    - name: Run OpenDevin
      env:
        ISSUE_TITLE: ${{ github.event.issue.title }}
        ISSUE_BODY: ${{ github.event.issue.body }}
        LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        SANDBOX_TYPE: exec
      run: |
-        WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE python ./opendevin/core/main.py -i 50 -f task.txt -d $GITHUB_WORKSPACE
+        # Append path to launch poetry
+        export PATH="/github/home/.local/bin:$PATH"
+        # Prepend the current directory to PYTHONPATH so the package imports correctly (must be run from the repo root)
+        export PYTHONPATH=$(pwd):$PYTHONPATH
+        WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE poetry run python ./opendevin/core/main.py -i 50 -f task.txt -d $GITHUB_WORKSPACE
        rm task.txt

    - name: Setup Git, Create Branch, and Commit Changes
--- a/.gitignore
+++ b/.gitignore
@@ -161,9 +161,14 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
 .vscode/
+.cursorignore

 # evaluation
+evaluation/evaluation_outputs
+evaluation/outputs
+evaluation/swe_bench/eval_workspace*
 evaluation/SWE-bench/data
+evaluation/webarena/scripts/webarena_env.sh

 # frontend

@@ -176,6 +181,8 @@ frontend/yarn.lock

 # testing
 frontend/coverage
+test_results*
+/_test_files_tmp/

 # production
 frontend/build
@@ -204,8 +211,3 @@ cache
 # configuration
 config.toml
 config.toml.bak
-evaluation/swe_bench/eval_workspace*
-evaluation/outputs
-evaluation/evaluation_outputs
-test_results*
-/_test_files_tmp/
--- a/Development.md
+++ b/Development.md
@@ -5,8 +5,8 @@ This guide is for people working on OpenDevin and editing the source code.

 ### 1. Requirements
 * Linux, Mac OS, or [WSL on Windows](https://learn.microsoft.com/en-us/windows/wsl/install)
-* [Docker](https://docs.docker.com/engine/install/)(For those on MacOS, make sure to allow the default Docker socket to be used from advanced settings!)
-* [Python](https://www.python.org/downloads/) >= 3.11
+* [Docker](https://docs.docker.com/engine/install/) (For those on MacOS, make sure to allow the default Docker socket to be used from advanced settings!)
+* [Python](https://www.python.org/downloads/) = 3.11
 * [NodeJS](https://nodejs.org/en/download/package-manager) >= 18.17.1
 * [Poetry](https://python-poetry.org/docs/#installing-with-the-official-installer) >= 1.8

@@ -45,6 +45,7 @@ To configure the LM of your choice, follow these steps:
   make setup-config
   ```
   This command will prompt you to enter the LLM API key, model name, and other variables ensuring that OpenDevin is tailored to your specific needs. Note that the model name will apply only when you run headless. If you use the UI, please set the model in the UI.
+   Set `persist_sandbox` to false if you want to use a clean sandbox for each task. If `persist_sandbox` is set to true, you will need to set the `ssh_password` as well.

 **Note on Alternative Models:**
 Some alternative models may prove more challenging to tame than others. Fear not, brave adventurer! We shall soon unveil LLM-specific documentation to guide you on your quest. And if you've already mastered the art of wielding a model other than OpenAI's GPT, we encourage you to [share your setup instructions with us](https://github.com/OpenDevin/OpenDevin/issues/417).
@@ -97,5 +98,5 @@ Please refer to [this README](./tests/integration/README.md) for details.

 ### 9. Add or update dependency

-1. Add your dependency in `pyproject.toml` or use `peotry add xxx`
-2. Update the poetry.lock file via `poetry lock --no-update`
+1. Add your dependency in `pyproject.toml` or use `poetry add xxx`
+2. Update the poetry.lock file via `poetry lock --no-update`
--- a/38
+++ b/38
@@ -7,9 +7,10 @@ BACKEND_PORT = 3000
 BACKEND_HOST = "127.0.0.1:$(BACKEND_PORT)"
 FRONTEND_PORT = 3001
 DEFAULT_WORKSPACE_DIR = "./workspace"
-DEFAULT_MODEL = "gpt-3.5-turbo"
+DEFAULT_MODEL = "gpt-4o"
 CONFIG_FILE = config.toml
 PRECOMMIT_CONFIG_PATH = "./dev_config/python/.pre-commit-config.yaml"
+PYTHON_VERSION = 3.11

 # ANSI color codes
 GREEN=$(shell tput -Txterm setaf 2)
@@ -62,10 +63,10 @@ check-system:

 check-python:
 	@echo "$(YELLOW)Checking Python installation...$(RESET)"
-	@if command -v python3.11 > /dev/null; then \
-		echo "$(BLUE)$(shell python3.11 --version) is already installed.$(RESET)"; \
+	@if command -v python$(PYTHON_VERSION) > /dev/null; then \
+		echo "$(BLUE)$(shell python$(PYTHON_VERSION) --version) is already installed.$(RESET)"; \
 	else \
-		echo "$(RED)Python 3.11 is not installed. Please install Python 3.11 to continue.$(RESET)"; \
+		echo "$(RED)Python $(PYTHON_VERSION) is not installed. Please install Python $(PYTHON_VERSION) to continue.$(RESET)"; \
 		exit 1; \
 	fi

@@ -112,13 +113,13 @@ check-poetry:
 			echo "$(BLUE)$(shell poetry --version) is already installed.$(RESET)"; \
 		else \
 			echo "$(RED)Poetry 1.8 or later is required. You can install poetry by running the following command, then adding Poetry to your PATH:"; \
-			echo "$(RED) curl -sSL https://install.python-poetry.org | python3 -$(RESET)"; \
+			echo "$(RED) curl -sSL https://install.python-poetry.org | python$(PYTHON_VERSION) -$(RESET)"; \
 			echo "$(RED)More detail here: https://python-poetry.org/docs/#installing-with-the-official-installer$(RESET)"; \
 			exit 1; \
 		fi; \
 	else \
 		echo "$(RED)Poetry is not installed. You can install poetry by running the following command, then adding Poetry to your PATH:"; \
-		echo "$(RED) curl -sSL https://install.python-poetry.org | python3.11 -$(RESET)"; \
+		echo "$(RED) curl -sSL https://install.python-poetry.org | python$(PYTHON_VERSION) -$(RESET)"; \
 		echo "$(RED)More detail here: https://python-poetry.org/docs/#installing-with-the-official-installer$(RESET)"; \
 		exit 1; \
 	fi
@@ -130,7 +131,7 @@ pull-docker-image:

 install-python-dependencies:
 	@echo "$(GREEN)Installing Python dependencies...$(RESET)"
-	poetry env use python3.11
+	poetry env use python$(PYTHON_VERSION)
 	@if [ "$(shell uname)" = "Darwin" ]; then \
 		echo "$(BLUE)Installing chroma-hnswlib...$(RESET)"; \
 		export HNSWLIB_NO_NATIVE=1; \
@@ -142,7 +143,14 @@ install-python-dependencies:
 		poetry run pip install playwright; \
 		poetry run playwright install chromium; \
 	else \
-		poetry run playwright install --with-deps chromium; \
+		if [ ! -f cache/playwright_chromium_is_installed.txt ]; then \
+			echo "Running playwright install --with-deps chromium..."; \
+			poetry run playwright install --with-deps chromium; \
+			mkdir -p cache; \
+			touch cache/playwright_chromium_is_installed.txt; \
+		else \
+			echo "Setup already done. Skipping playwright installation."; \
+		fi \
 	fi
 	@echo "$(GREEN)Python dependencies installed successfully.$(RESET)"

@@ -165,7 +173,7 @@ install-precommit-hooks:

 lint-backend:
 	@echo "$(YELLOW)Running linters...$(RESET)"
-	@poetry run pre-commit run --files $$(git diff --name-only $$(git merge-base main $$(git branch --show-current)) $$(git branch --show-current) | tr '\n' ' ') --show-diff-on-failure --config $(PRECOMMIT_CONFIG_PATH)
+	@poetry run pre-commit run --files $$(git ls-files opendevin agenthub evaluation) --show-diff-on-failure --config $(PRECOMMIT_CONFIG_PATH)

 lint-frontend:
 	@echo "$(YELLOW)Running linters for frontend...$(RESET)"
@@ -222,10 +230,20 @@ setup-config:
 setup-config-prompts:
 	@echo "[core]" > $(CONFIG_FILE).tmp

-	@read -p "Enter your workspace directory [default: $(DEFAULT_WORKSPACE_DIR)]: " workspace_dir; \
+	@read -p "Enter your workspace directory (as absolute path) [default: $(DEFAULT_WORKSPACE_DIR)]: " workspace_dir; \
 	 workspace_dir=$${workspace_dir:-$(DEFAULT_WORKSPACE_DIR)}; \
 	 echo "workspace_base=\"$$workspace_dir\"" >> $(CONFIG_FILE).tmp

+	@read -p "Do you want to persist the sandbox container? [true/false] [default: true]: " persist_sandbox; \
+	 persist_sandbox=$${persist_sandbox:-true}; \
+	 if [ "$$persist_sandbox" = "true" ]; then \
+		 read -p "Enter a password for the sandbox container: " ssh_password; \
+		 echo "ssh_password=\"$$ssh_password\"" >> $(CONFIG_FILE).tmp; \
+		 echo "persist_sandbox=$$persist_sandbox" >> $(CONFIG_FILE).tmp; \
+	 else \
+		echo "persist_sandbox=$$persist_sandbox" >> $(CONFIG_FILE).tmp; \
+	 fi
+
 	@echo "" >> $(CONFIG_FILE).tmp

 	@echo "[llm]" >> $(CONFIG_FILE).tmp
--- a/README.md
+++ b/README.md
@@ -49,31 +49,30 @@ OpenDevin agents collaborate with human developers to write code, fix bugs, and
 The easiest way to run OpenDevin is inside a Docker container. It works best with the most recent version of Docker, `26.0.0`.
 You must be using Linux, Mac OS, or WSL on Windows.

-To start the app, run these commands, replacing `$(pwd)/workspace` with the directory you want OpenDevin to work with.
+To start OpenDevin in a docker container, run the following commands in your terminal:
+
+> [!WARNING]
+> When you run the following command, files in `./workspace` may be modified or deleted.

 ```bash
-# The directory you want OpenDevin to work with. MUST be an absolute path!
-export WORKSPACE_BASE=$(pwd)/workspace;
-```
-
-> [!WARNING]  
-> OpenDevin runs bash commands within a Docker sandbox, so it should not affect your machine. 
-> But your workspace directory will be attached to that sandbox, and files in the directory may be modified or deleted.
-
-```bash
-docker run \
-    -it \
+OPENDEVIN_WORKSPACE=$(pwd)/workspace
+docker run -it \
    --pull=always \
    -e SANDBOX_USER_ID=$(id -u) \
-    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-    -v $WORKSPACE_BASE:/opt/workspace_base \
+    -e PERSIST_SANDBOX="true" \
+    -e SSH_PASSWORD="make something up here" \
+    -e WORKSPACE_MOUNT_PATH=$OPENDEVIN_WORKSPACE \
+    -v $OPENDEVIN_WORKSPACE:/opt/workspace_base \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
-    ghcr.io/opendevin/opendevin:0.5
+    --name opendevin-app-$(date +%Y%m%d%H%M%S) \
+    ghcr.io/opendevin/opendevin:0.6
 ```

-You'll find OpenDevin running at [http://localhost:3000](http://localhost:3000).
+You'll find OpenDevin running at [http://localhost:3000](http://localhost:3000) with access to `./workspace`. To have OpenDevin operate on your code, place it in `./workspace`.
+
+OpenDevin will only have access to this workspace folder. The rest of your system will not be affected as it runs in a secured docker sandbox.

 ## 🚀 Documentation

@@ -100,7 +99,7 @@ For details, please check [CONTRIBUTING.md](./CONTRIBUTING.md).
 Whether you're a developer, a researcher, or simply enthusiastic about OpenDevin, we'd love to have you in our community.
 Let's make software engineering better together!

- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2ggtwn3k5-PvAA2LUmqGHVZ~XzGq~ILw) - Here we talk about research, architecture, and future development.
+- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA) - Here we talk about research, architecture, and future development.
 - [Discord server](https://discord.gg/ESHStjSjD4) - This is a community-run server for general discussion, questions, and feedback.

 ## 📈 Progress
@@ -130,3 +129,16 @@ Distributed under the MIT License. See [`LICENSE`](./LICENSE) for more informati
 [issues-url]: https://github.com/OpenDevin/OpenDevin/issues
 [license-shield]: https://img.shields.io/github/license/opendevin/opendevin?style=for-the-badge
 [license-url]: https://github.com/OpenDevin/OpenDevin/blob/main/LICENSE
+
+## 📚 Cite
+
+```
+@misc{opendevin2024,
+  author       = {{OpenDevin Team}},
+  title        = {{OpenDevin: An Open Platform for AI Software Developers as Generalist Agents}},
+  year         = {2024},
+  version      = {v1.0},
+  howpublished = {\url{https://github.com/OpenDevin/OpenDevin}},
+  note         = {Accessed: ENTER THE DATE YOU ACCESSED THE PROJECT}
+}
+```
--- a/agenthub/README.md
+++ b/agenthub/README.md
@@ -2,15 +2,15 @@

 In this folder, there may exist multiple implementations of `Agent` that will be used by the framework.

-For example, `agenthub/monologue_agent`, `agenthub/metagpt_agent`, `agenthub/codeact_agent`, etc.
+For example, `agenthub/codeact_agent`, etc.
 Contributors from different backgrounds and interests can choose to contribute to any (or all!) of these directions.

 ## Constructing an Agent

-The abstraction for an agent can be found [here](../opendevin/agent.py).
+The abstraction for an agent can be found [here](../opendevin/controller/agent.py).

 Agents are run inside of a loop. At each iteration, `agent.step()` is called with a
-[State](../opendevin/state.py) input, and the agent must output an [Action](../opendevin/action).
+[State](../opendevin/controller/state/state.py) input, and the agent must output an [Action](../opendevin/events/action).

 Every agent also has a `self.llm` which it can use to interact with the LLM configured by the user.
 See the [LiteLLM docs for `self.llm.completion`](https://docs.litellm.ai/docs/completion).
@@ -28,21 +28,19 @@ The `state` contains:

 Here is a list of available Actions, which can be returned by `agent.step()`:

- [`CmdRunAction`](../opendevin/action/bash.py) - Runs a command inside a sandboxed terminal
- [`CmdKillAction`](../opendevin/action/bash.py) - Kills a background command
- [`IPythonRunCellAction`](../opendevin/action/bash.py) - Execute a block of Python code interactively (in Jupyter notebook) and receives `CmdOutputObservation`. Requires setting up `jupyter` [plugin](../opendevin/sandbox/plugins) as a requirement.
- [`FileReadAction`](../opendevin/action/fileop.py) - Reads the content of a file
- [`FileWriteAction`](../opendevin/action/fileop.py) - Writes new content to a file
- [`BrowseURLAction`](../opendevin/action/browse.py) - Gets the content of a URL
- [`AgentRecallAction`](../opendevin/action/agent.py) - Searches memory (e.g. a vector database)
- [`AddTaskAction`](../opendevin/action/tasks.py) - Adds a subtask to the plan
- [`ModifyTaskAction`](../opendevin/action/tasks.py) - Changes the state of a subtask
- [`AgentThinkAction`](../opendevin/action/agent.py) - A no-op that allows the agent to add plaintext to the history (as well as the chat log)
- [`AgentTalkAction`](../opendevin/action/agent.py) - A no-op that allows the agent to add plaintext to the history and talk to the user.
- [`AgentFinishAction`](../opendevin/action/agent.py) - Stops the control loop, allowing the user/delegator agent to enter a new task
- [`AgentRejectAction`](../opendevin/action/agent.py) - Stops the control loop, allowing the user/delegator agent to enter a new task
- [`AgentFinishAction`](../opendevin/action/agent.py) - Stops the control loop, allowing the user to enter a new task
- [`MessageAction`](../opendevin/action/message.py) - Represents a message from an agent or the user
+- [`CmdRunAction`](../opendevin/events/action/commands.py) - Runs a command inside a sandboxed terminal
+- [`CmdKillAction`](../opendevin/events/action/commands.py) - Kills a background command
+- [`IPythonRunCellAction`](../opendevin/events/action/commands.py) - Execute a block of Python code interactively (in Jupyter notebook) and receives `CmdOutputObservation`. Requires setting up `jupyter` [plugin](../opendevin/runtime/plugins) as a requirement.
+- [`FileReadAction`](../opendevin/events/action/files.py) - Reads the content of a file
+- [`FileWriteAction`](../opendevin/events/action/files.py) - Writes new content to a file
+- [`BrowseURLAction`](../opendevin/events/action/browse.py) - Gets the content of a URL
+- [`AgentRecallAction`](../opendevin/events/action/agent.py) - Searches memory (e.g. a vector database)
+- [`AddTaskAction`](../opendevin/events/action/tasks.py) - Adds a subtask to the plan
+- [`ModifyTaskAction`](../opendevin/events/action/tasks.py) - Changes the state of a subtask
+- [`AgentFinishAction`](../opendevin/events/action/agent.py) - Stops the control loop, allowing the user/delegator agent to enter a new task
+- [`AgentRejectAction`](../opendevin/events/action/agent.py) - Stops the control loop, allowing the user/delegator agent to enter a new task
+- [`AgentFinishAction`](../opendevin/events/action/agent.py) - Stops the control loop, allowing the user to enter a new task
+- [`MessageAction`](../opendevin/events/action/message.py) - Represents a message from an agent or the user

 You can use `action.to_dict()` and `action_from_dict` to serialize and deserialize actions.

@@ -54,13 +52,13 @@ in the background).

 Here is a list of available Observations:

- [`CmdOutputObservation`](../opendevin/observation/run.py)
- [`BrowserOutputObservation`](../opendevin/observation/browse.py)
- [`FileReadObservation`](../opendevin/observation/files.py)
- [`FileWriteObservation`](../opendevin/observation/files.py)
- [`AgentRecallObservation`](../opendevin/observation/recall.py)
- [`ErrorObservation`](../opendevin/observation/error.py)
- [`SuccessObservation`](../opendevin/observation/success.py)
+- [`CmdOutputObservation`](../opendevin/events/observation/commands.py)
+- [`BrowserOutputObservation`](../opendevin/events/observation/browse.py)
+- [`FileReadObservation`](../opendevin/events/observation/files.py)
+- [`FileWriteObservation`](../opendevin/events/observation/files.py)
+- [`AgentRecallObservation`](../opendevin/events/observation/recall.py)
+- [`ErrorObservation`](../opendevin/events/observation/error.py)
+- [`SuccessObservation`](../opendevin/events/observation/success.py)

 You can use `observation.to_dict()` and `observation_from_dict` to serialize and deserialize observations.

--- a/agenthub/SWE_agent/agent.py
+++ b/agenthub/SWE_agent/agent.py
@@ -8,6 +8,7 @@ from opendevin.events.action import (
 )
 from opendevin.events.serialization.event import event_to_memory
 from opendevin.llm.llm import LLM
+from opendevin.runtime.tools import RuntimeTool

 from .parser import parse_command
 from .prompts import (
@@ -27,6 +28,7 @@ class SWEAgent(Agent):

    SWE-agent includes ACI functions like 'goto', 'search_for', 'edit', 'scroll', 'run'
    """
+    runtime_tools: list[RuntimeTool] = [RuntimeTool.BROWSER]

    def __init__(self, llm: LLM):
        super().__init__(llm)
--- a/agenthub/browsing_agent/browsing_agent.py
+++ b/agenthub/browsing_agent/browsing_agent.py
@@ -1,4 +1,5 @@
 import ast
+import os

 from browsergym.core.action.highlevel import HighLevelActionSet
 from browsergym.utils.obs import flatten_axtree_to_str
@@ -12,27 +13,25 @@ from opendevin.events.action import (
    BrowseInteractiveAction,
    MessageAction,
 )
+from opendevin.events.event import EventSource
 from opendevin.events.observation import BrowserOutputObservation
 from opendevin.llm.llm import LLM
 from opendevin.runtime.plugins import (
    PluginRequirement,
 )
+from opendevin.runtime.tools import RuntimeTool

+USE_NAV = (
+    os.environ.get('USE_NAV', 'true') == 'true'
+)  # only disable NAV actions when running webarena and miniwob benchmarks
+USE_CONCISE_ANSWER = (
+    os.environ.get('USE_CONCISE_ANSWER', 'false') == 'true'
+)  # only return concise answer when running webarena and miniwob benchmarks

-def parse_response(response: str) -> Action:
-    if '```' not in response:
-        # unexpected response format, message back to user
-        return MessageAction(response)
-    thought = response.split('```')[0].strip()
-    action_str = response.split('```')[1].strip()
-    # handle send message to user function call in BrowserGym
-    for sub_action in action_str.split('\n'):
-        if 'send_msg_to_user(' in sub_action:
-            tree = ast.parse(sub_action)
-            args = tree.body[0].value.args  # type: ignore
-            return MessageAction(args[0].value)
-
-    return BrowseInteractiveAction(browser_actions=action_str, thought=thought)
+if not USE_NAV and USE_CONCISE_ANSWER:
+    EVAL_MODE = True  # disabled NAV actions and only return concise answer, for webarena and miniwob benchmarks
+else:
+    EVAL_MODE = False


 class BrowsingAgent(Agent):
@@ -42,6 +41,7 @@ class BrowsingAgent(Agent):
    """

    sandbox_plugins: list[PluginRequirement] = []
+    runtime_tools: list[RuntimeTool] = [RuntimeTool.BROWSER]

    def __init__(
        self,
@@ -54,13 +54,13 @@ class BrowsingAgent(Agent):
        - llm (LLM): The llm to be used by this agent
        """
        super().__init__(llm)
+        # define a configurable action space, with chat functionality, web navigation, and webpage grounding using accessibility tree and HTML.
+        # see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/highlevel.py for more details
+        action_subsets = ['chat', 'bid']
+        if USE_NAV:
+            action_subsets.append('nav')
        self.action_space = HighLevelActionSet(
-            # see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/highlevel.py for more details
-            subsets=[
-                'chat',
-                'bid',
-                'nav',
-            ],  # define a configurable action space, with chat functionality, web navigation, and webpage grounding using accessibility tree and HTML.
+            subsets=action_subsets,
            strict=False,  # less strict on the parsing of the actions
            multiaction=True,  # enable to agent to take multiple actions at once
        )
@@ -73,6 +73,32 @@ class BrowsingAgent(Agent):
        """
        super().reset()
        self.cost_accumulator = 0
+        self.error_accumulator = 0
+
+    def parse_response(self, response: str) -> Action:
+        if '```' not in response:
+            # unexpected response format, message back to user
+            action_str = f'send_msg_to_user("""{response}""")'
+            return BrowseInteractiveAction(
+                browser_actions=action_str,
+                thought=response,
+                browsergym_send_msg_to_user=response,
+            )
+        thought = response.split('```')[0].strip()
+        action_str = response.split('```')[1].strip()
+        # handle send message to user function call in BrowserGym
+        msg_content = ''
+        for sub_action in action_str.split('\n'):
+            if 'send_msg_to_user(' in sub_action:
+                tree = ast.parse(sub_action)
+                args = tree.body[0].value.args  # type: ignore
+                msg_content = args[0].value
+
+        return BrowseInteractiveAction(
+            browser_actions=action_str,
+            thought=thought,
+            browsergym_send_msg_to_user=msg_content,
+        )

    def step(self, state: State) -> Action:
        """
@@ -88,27 +114,66 @@ class BrowsingAgent(Agent):
        - AgentFinishAction() - end the interaction
        """
        goal = state.get_current_user_intent()
+        if goal is None:
+            goal = state.inputs['task']
        messages = []
-        prev_actions = ''
+        prev_actions = []
        cur_axtree_txt = ''
        error_prefix = ''
        last_obs = None
+        last_action = None
+
+        if EVAL_MODE and len(state.history) == 1:
+            # for webarena and miniwob++ eval, we need to retrieve the initial observation already in browser env
+            # initialize and retrieve the first observation by issuing a noop action
+            # For non-benchmark browsing, the browser env starts with a blank page, and the agent is expected to first navigate to desired websites
+            return BrowseInteractiveAction(browser_actions='noop()')
+
        for prev_action, obs in state.history:
            if isinstance(prev_action, BrowseInteractiveAction):
-                prev_actions += f'{prev_action.browser_actions}\n'
+                prev_actions.append(prev_action.browser_actions)
                last_obs = obs
+                last_action = prev_action
            elif (
-                isinstance(prev_action, MessageAction) and prev_action.source != 'user'
+                isinstance(prev_action, MessageAction)
+                and prev_action.source == EventSource.AGENT
            ):
                # agent has responded, task finish.
-                return AgentFinishAction()
+                return AgentFinishAction(outputs={'content': prev_action.content})
+
+        if EVAL_MODE:
+            prev_actions = prev_actions[1:]  # remove the first noop action
+
+        prev_action_str = '\n'.join(prev_actions)
+        # if the final BrowseInteractiveAction executed BrowserGym's send_msg_to_user,
+        # we should also send a message back to the user in OpenDevin and call it a day
+        if (
+            isinstance(last_action, BrowseInteractiveAction)
+            and last_action.browsergym_send_msg_to_user
+        ):
+            return MessageAction(last_action.browsergym_send_msg_to_user)

        if isinstance(last_obs, BrowserOutputObservation):
            if last_obs.error:
                # add error recovery prompt prefix
                error_prefix = f'IMPORTANT! Last action is incorrect:\n{last_obs.last_browser_action}\nThink again with the current observation of the page.\n'
-            cur_axtree_txt = flatten_axtree_to_str(last_obs.axtree_object)
+            try:
+                cur_axtree_txt = flatten_axtree_to_str(
+                    last_obs.axtree_object,
+                    extra_properties=last_obs.extra_element_properties,
+                    with_clickable=True,
+                    filter_visible_only=True,
+                )
+            except Exception as e:
+                logger.error(
+                    'Error when trying to process the accessibility tree: %s', e
+                )
+                return MessageAction('Error encountered when browsing.')

+        if error_prefix:
+            self.error_accumulator += 1
+            if self.error_accumulator > 5:
+                return MessageAction('Too many errors encountered. Task failed.')
        system_msg = f"""\
 # Instructions
 Review the current state of the page and all other information to find the best
@@ -131,7 +196,7 @@ and executed by a program, make sure to follow the formatting instructions.
 {cur_axtree_txt}

 # Previous Actions
-{prev_actions}
+{prev_action_str}

 Here is an example with chain of thought of a valid action when clicking on a button:
 "
@@ -139,16 +204,31 @@ In order to accomplish my goal I need to click on the button with bid 12
 ```click("12")```
 "
 """.strip()
+
+        if USE_CONCISE_ANSWER:
+            concise_instruction = """\
+
+Here is another example with chain of thought of a valid action when providing a concise answer to user:
+"
+In order to accomplish my goal I need to send the information asked back to the user. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I will send a message back to user with the answer.
+```send_msg_to_user("$279.49")```
+"
+"""
+            prompt += concise_instruction
        messages.append({'role': 'user', 'content': prompt})
        response = self.llm.completion(
            messages=messages,
            temperature=0.0,
+            stop=[')```', ')\n```'],
        )
        self.log_cost(response)
-        action_resp = response['choices'][0]['message']['content']
+        action_resp = response['choices'][0]['message']['content'].strip()
+        if not action_resp.endswith('```'):
+            action_resp = action_resp + ')```'
+
        logger.info(prompt)
        logger.info(action_resp)
-        return parse_response(action_resp)
+        return self.parse_response(action_resp)

    def search_memory(self, query: str) -> list[str]:
        raise NotImplementedError('Implement this abstract method')
--- a/agenthub/browsing_agent/prompt.py
+++ b/agenthub/browsing_agent/prompt.py
@@ -146,7 +146,7 @@ class Shrinkable(PromptElement, abc.ABC):
        """Implement shrinking of this prompt element.

        You need to recursively call all shrinkable elements that are part of
-        this prompt. You can also implement a shriking startegy for this prompt.
+        this prompt. You can also implement a shrinking strategy for this prompt.
        Shrinking is can be called multiple times to progressively shrink the
        prompt until it fits max_tokens. Default max shrink iterations is 20.
        """
@@ -161,7 +161,7 @@ class Truncater(Shrinkable):

    def __init__(self, visible, shrink_speed=0.3, start_truncate_iteration=10):
        super().__init__(visible=visible)
-        self.shrink_speed = shrink_speed  # the percentage shrinked in each iteration
+        self.shrink_speed = shrink_speed  # the percentage shrunk in each iteration
        self.start_truncate_iteration = (
            start_truncate_iteration  # the iteration to start truncating
        )
@@ -494,11 +494,13 @@ def _get_action_space(flags: Flags) -> AbstractActionSet:
            action_space = PythonActionSet(strict=flags.is_strict)
            if flags.multi_actions:
                warn(
-                    f'Flag action_space={repr(flags.action_space)} incompatible with multi_actions={repr(flags.multi_actions)}.'
+                    f'Flag action_space={repr(flags.action_space)} incompatible with multi_actions={repr(flags.multi_actions)}.',
+                    stacklevel=2,
                )
            if flags.demo_mode != 'off':
                warn(
-                    f'Flag action_space={repr(flags.action_space)} incompatible with demo_mode={repr(flags.demo_mode)}.'
+                    f'Flag action_space={repr(flags.action_space)} incompatible with demo_mode={repr(flags.demo_mode)}.',
+                    stacklevel=2,
                )
            return action_space
        case 'bid':
--- a/agenthub/browsing_agent/utils.py
+++ b/agenthub/browsing_agent/utils.py
@@ -16,7 +16,7 @@ def yaml_parser(message):
        valid = True
        retry_message = ''
    except yaml.YAMLError as e:
-        warn(str(e))
+        warn(str(e), stacklevel=2)
        value = {}
        valid = False
        retry_message = "Your response is not a valid yaml. Please try again and be careful to the format. Don't add any apology or comment, just the answer."
--- a/agenthub/codeact_agent/action_parser.py
+++ b/agenthub/codeact_agent/action_parser.py
@@ -0,0 +1,182 @@
+import re
+
+from opendevin.controller.action_parser import ActionParser, ResponseParser
+from opendevin.events.action import (
+    Action,
+    AgentDelegateAction,
+    AgentFinishAction,
+    CmdRunAction,
+    IPythonRunCellAction,
+    MessageAction,
+)
+
+
+class CodeActResponseParser(ResponseParser):
+    """Turns a CodeAct LLM completion into exactly one of these actions:
+        - CmdRunAction(command) - bash command to run
+        - IPythonRunCellAction(code) - IPython code to run
+        - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
+        - MessageAction(content) - Message action to run (e.g. ask for clarification)
+        - AgentFinishAction() - end the interaction
+    """
+
+    def __init__(self):
+        # Order matters: the first parser whose check_condition() matches wins.
+        self.action_parsers = [
+            CodeActActionParserFinish(),
+            CodeActActionParserCmdRun(),
+            CodeActActionParserIPythonRunCell(),
+            CodeActActionParserAgentDelegate(),
+        ]
+        self.default_parser = CodeActActionParserMessage()
+
+    def parse(self, response: str) -> Action:
+        """Extract the action text from the LLM response and convert it to an Action."""
+        return self.parse_action(self.parse_response(response))
+
+    def parse_response(self, response) -> str:
+        """Return the first choice's content, closing any unterminated execute tag."""
+        text = response.choices[0].message.content
+        for tag in ('bash', 'ipython', 'browse'):
+            opening, closing = f'<execute_{tag}>', f'</execute_{tag}>'
+            if opening in text and closing not in text:
+                text += closing
+        return text
+
+    def parse_action(self, action_str: str) -> Action:
+        """Dispatch to the first matching parser, falling back to the default one."""
+        for candidate in self.action_parsers:
+            if candidate.check_condition(action_str):
+                return candidate.parse(action_str)
+        return self.default_parser.parse(action_str)
+
+
+class CodeActActionParserFinish(ActionParser):
+    """Recognizes a <finish>...</finish> span and produces:
+        - AgentFinishAction() - end the interaction
+    """
+
+    def __init__(self):
+        self.finish_command = None  # match cached by check_condition()
+
+    def check_condition(self, action_str: str) -> bool:
+        """Search for a finish tag pair, caching the match for parse()."""
+        found = re.search(r'<finish>.*</finish>', action_str, re.DOTALL)
+        self.finish_command = found
+        return found is not None
+
+    def parse(self, action_str: str) -> Action:
+        """Build the finish action; text outside the matched span is the thought."""
+        match = self.finish_command
+        assert match is not None, (
+            'self.finish_command should not be None when parse is called'
+        )
+        return AgentFinishAction(thought=action_str.replace(match.group(0), '').strip())
+
+
+class CodeActActionParserCmdRun(ActionParser):
+    """Recognizes an <execute_bash>...</execute_bash> block and produces:
+        - CmdRunAction(command) - bash command to run
+        - AgentFinishAction() - end the interaction (when the command is `exit`)
+    """
+
+    def __init__(self):
+        self.bash_command = None  # match cached by check_condition()
+
+    def check_condition(self, action_str: str) -> bool:
+        """Search for the first bash block, caching the match for parse()."""
+        pattern = r'<execute_bash>(.*?)</execute_bash>'
+        self.bash_command = re.search(pattern, action_str, re.DOTALL)
+        return self.bash_command is not None
+
+    def parse(self, action_str: str) -> Action:
+        """Build the action for the cached match; surrounding text is the thought."""
+        match = self.bash_command
+        assert match is not None, (
+            'self.bash_command should not be None when parse is called'
+        )
+        command = match.group(1).strip()
+        # A bare `exit` ends the interaction instead of being executed.
+        if command == 'exit':
+            return AgentFinishAction()
+        # Everything outside the matched block is kept as the thought.
+        thought = action_str.replace(match.group(0), '').strip()
+        return CmdRunAction(command=command, thought=thought)
+
+
+class CodeActActionParserIPythonRunCell(ActionParser):
+    """Recognizes an <execute_ipython>...</execute_ipython> block and produces:
+        - IPythonRunCellAction(code) - IPython code to run
+    """
+
+    def __init__(self):
+        self.python_code = None  # match cached by check_condition()
+        # Passed along with every cell as its kernel_init_code.
+        self.jupyter_kernel_init_code: str = 'from agentskills import *'
+
+    def check_condition(self, action_str: str) -> bool:
+        """Search for the first ipython block, caching the match for parse()."""
+        pattern = r'<execute_ipython>(.*?)</execute_ipython>'
+        self.python_code = re.search(pattern, action_str, re.DOTALL)
+        return self.python_code is not None
+
+    def parse(self, action_str: str) -> Action:
+        """Build the action for the cached match; surrounding text is the thought."""
+        match = self.python_code
+        assert match is not None, (
+            'self.python_code should not be None when parse is called'
+        )
+        # group(0) is the whole tagged block, group(1) the code inside it.
+        whole_block, inner_code = match.group(0), match.group(1)
+        return IPythonRunCellAction(
+            code=inner_code.strip(),
+            thought=action_str.replace(whole_block, '').strip(),
+            kernel_init_code=self.jupyter_kernel_init_code,
+        )
+
+
+class CodeActActionParserAgentDelegate(ActionParser):
+    """Recognizes an <execute_browse>...</execute_browse> block and produces:
+        - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
+    """
+
+    def __init__(self):
+        self.agent_delegate = None  # match cached by check_condition()
+
+    def check_condition(self, action_str: str) -> bool:
+        """Search for a browse block, caching the match for parse()."""
+        # Greedy `(.*)`: spans from the first opening tag to the last closing tag.
+        pattern = r'<execute_browse>(.*)</execute_browse>'
+        self.agent_delegate = re.search(pattern, action_str, re.DOTALL)
+        return self.agent_delegate is not None
+
+    def parse(self, action_str: str) -> Action:
+        """Delegate to BrowsingAgent with a task built from thought and request."""
+        match = self.agent_delegate
+        assert match is not None, (
+            'self.agent_delegate should not be None when parse is called'
+        )
+        thought = action_str.replace(match.group(0), '').strip()
+        browse_actions = match.group(1).strip()
+        task = f'{thought}. I should start with: {browse_actions}'
+        return AgentDelegateAction(agent='BrowsingAgent', inputs={'task': task})
+
+
+class CodeActActionParserMessage(ActionParser):
+    """Fallback parser that accepts any text and produces:
+        - MessageAction(content) - Message action to run (e.g. ask for clarification)
+    """
+
+    def __init__(self):
+        """Nothing to set up: this parser keeps no state."""
+
+    def check_condition(self, action_str: str) -> bool:
+        """Always match, whatever the text is."""
+        # We assume the LLM is GOOD enough that when it returns pure natural language
+        # it wants to talk to the user
+        return True
+
+    def parse(self, action_str: str) -> Action:
+        """Wrap the whole text in a MessageAction with wait_for_response=True."""
+        message = MessageAction(content=action_str, wait_for_response=True)
+        return message
--- a/agenthub/codeact_agent/codeact_agent.py
+++ b/agenthub/codeact_agent/codeact_agent.py
@@ -1,5 +1,4 @@
-import re
-
+from agenthub.codeact_agent.action_parser import CodeActResponseParser
 from agenthub.codeact_agent.prompt import (
    COMMAND_DOCS,
    EXAMPLES,
@@ -18,6 +17,7 @@ from opendevin.events.action import (
    MessageAction,
 )
 from opendevin.events.observation import (
+    AgentDelegateObservation,
    BrowserOutputObservation,
    CmdOutputObservation,
    IPythonRunCellObservation,
@@ -28,18 +28,11 @@ from opendevin.runtime.plugins import (
    JupyterRequirement,
    PluginRequirement,
 )
+from opendevin.runtime.tools import RuntimeTool

 ENABLE_GITHUB = True


-def parse_response(response) -> str:
-    action = response.choices[0].message.content
-    for lang in ['bash', 'ipython', 'browse']:
-        if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
-            action += f'</execute_{lang}>'
-    return action
-
-
 def action_to_str(action: Action) -> str:
    if isinstance(action, CmdRunAction):
        return f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
@@ -88,6 +81,9 @@ def get_observation_message(obs) -> dict[str, str] | None:
    elif isinstance(obs, BrowserOutputObservation):
        content = 'OBSERVATION:\n' + truncate_observation(obs.content)
        return {'role': 'user', 'content': content}
+    elif isinstance(obs, AgentDelegateObservation):
+        content = 'OBSERVATION:\n' + truncate_observation(str(obs.outputs))
+        return {'role': 'user', 'content': content}
    return None


@@ -118,7 +114,7 @@ def get_in_context_example() -> str:


 class CodeActAgent(Agent):
-    VERSION = '1.5'
+    VERSION = '1.6'
    """
    The Code Act Agent is a minimalist agent.
    The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
@@ -162,11 +158,13 @@ class CodeActAgent(Agent):
        AgentSkillsRequirement(),
        JupyterRequirement(),
    ]
-    jupyter_kernel_init_code: str = 'from agentskills import *'
+    runtime_tools: list[RuntimeTool] = [RuntimeTool.BROWSER]

    system_message: str = get_system_message()
    in_context_example: str = f"Here is an example of how you can interact with the environment for task solving:\n{get_in_context_example()}\n\nNOW, LET'S START!"

+    action_parser = CodeActResponseParser()
+
    def __init__(
        self,
        llm: LLM,
@@ -197,7 +195,7 @@ class CodeActAgent(Agent):
        Returns:
        - CmdRunAction(command) - bash command to run
        - IPythonRunCellAction(code) - IPython code to run
-        - BrowseInteractiveAction(browsergym_command) - BrowserGym commands to run
+        - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
        - MessageAction(content) - Message action to run (e.g. ask for clarification)
        - AgentFinishAction() - end the interaction
        """
@@ -232,50 +230,10 @@ class CodeActAgent(Agent):
            ],
            temperature=0.0,
        )
-
-        action_str: str = parse_response(response)
        state.num_of_chars += sum(
            len(message['content']) for message in messages
-        ) + len(action_str)
-
-        if finish_command := re.search(r'<finish>.*</finish>', action_str, re.DOTALL):
-            thought = action_str.replace(finish_command.group(0), '').strip()
-            return AgentFinishAction(thought=thought)
-        if bash_command := re.search(
-            r'<execute_bash>(.*?)</execute_bash>', action_str, re.DOTALL
-        ):
-            # remove the command from the action string to get thought
-            thought = action_str.replace(bash_command.group(0), '').strip()
-            # a command was found
-            command_group = bash_command.group(1).strip()
-
-            if command_group.strip() == 'exit':
-                return AgentFinishAction()
-            return CmdRunAction(command=command_group, thought=thought)
-        elif python_code := re.search(
-            r'<execute_ipython>(.*?)</execute_ipython>', action_str, re.DOTALL
-        ):
-            # a code block was found
-            code_group = python_code.group(1).strip()
-            thought = action_str.replace(python_code.group(0), '').strip()
-            return IPythonRunCellAction(
-                code=code_group,
-                thought=thought,
-                kernel_init_code=self.jupyter_kernel_init_code,
-            )
-        elif browse_command := re.search(
-            r'<execute_browse>(.*)</execute_browse>', action_str, re.DOTALL
-        ):
-            # BrowserGym actions was found
-            browse_actions = browse_command.group(1).strip()
-            thought = action_str.replace(browse_command.group(0), '').strip()
-            return BrowseInteractiveAction(
-                browser_actions=browse_actions, thought=thought
-            )
-        else:
-            # We assume the LLM is GOOD enough that when it returns pure natural language
-            # it want to talk to the user
-            return MessageAction(content=action_str, wait_for_response=True)
+        ) + len(response.choices[0].message.content)
+        return self.action_parser.parse(response)

    def search_memory(self, query: str) -> list[str]:
        raise NotImplementedError('Implement this abstract method')
--- a/agenthub/codeact_agent/prompt.py
+++ b/agenthub/codeact_agent/prompt.py
@@ -5,35 +5,41 @@ _AGENT_SKILLS_DOCS = AgentSkillsRequirement.documentation
 COMMAND_DOCS = (
    '\nApart from the standard Python library, the assistant can also use the following functions (already imported) in <execute_ipython> environment:\n'
    f'{_AGENT_SKILLS_DOCS}'
-    "Please note that THE `edit_file` FUNCTION REQUIRES PROPER INDENTATION. If the assistant would like to add the line '        print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run."
+    "Please note that THE `edit_file` and `append_file` FUNCTIONS REQUIRE PROPER INDENTATION. If the assistant would like to add the line '        print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run."
 )

 # ======= SYSTEM MESSAGE =======
 MINIMAL_SYSTEM_PREFIX = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
-The assistant can interact with an interactive Python (Jupyter Notebook) environment and receive the corresponding output when needed. The code should be enclosed using "<execute_ipython>" tag, for example:
+The assistant can use an interactive Python (Jupyter Notebook) environment, executing code with <execute_ipython>.
 <execute_ipython>
 print("Hello World!")
 </execute_ipython>
 The assistant can execute bash commands on behalf of the user by wrapping them with <execute_bash> and </execute_bash>.
+
 For example, you can list the files in the current directory by <execute_bash> ls </execute_bash>.
+Important, however: do not run interactive commands. You do not have access to stdin.
+Also, you need to handle commands that may run indefinitely and not return a result. For such cases, you should redirect the output to a file and run the command in the background to avoid blocking the execution.
+For example, to run a Python script that might run indefinitely without returning immediately, you can use the following format: <execute_bash> python3 app.py > server.log 2>&1 & </execute_bash>
+Also, if a command execution result saying like: Command: "npm start" timed out. Sending SIGINT to the process, you should also retry with running the command in the background.
 """

-BROWSING_PREFIX = """The assistant can browse the Internet with commands on behalf of the user by wrapping them with <execute_browse> and </execute_browse>.
-For example, you can browse a given URL by <execute_browse> goto("<URL>") </execute_browse>.
-The assistant should attempt fewer things at a time instead of putting too much commands OR code in one "execute" block.
+BROWSING_PREFIX = """The assistant can browse the Internet with <execute_browse> and </execute_browse>.
+For example, <execute_browse> Tell me the usa's president using google search </execute_browse>.
+Or <execute_browse> Tell me what is in http://example.com </execute_browse>.
 """
 PIP_INSTALL_PREFIX = """The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them."""

 SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + BROWSING_PREFIX + PIP_INSTALL_PREFIX

-GITHUB_MESSAGE = """To do any activities on GitHub, the assistant should use the token in the $GITHUB_TOKEN environment variable.
-For instance, to push a local branch `my_branch` to the github repo `owner/repo`, the assistant can use the following four commands:
+GITHUB_MESSAGE = """To interact with GitHub, use the $GITHUB_TOKEN environment variable.
+For example, to push a branch `my_branch` to the GitHub repo `owner/repo`:
 <execute_bash> git push https://$GITHUB_TOKEN@github.com/owner/repo.git my_branch </execute_bash>
-If the assistant require access to GitHub but $GITHUB_TOKEN is not set, ask the user to set it."""
+If $GITHUB_TOKEN is not set, ask the user to set it."""

-SYSTEM_SUFFIX = """The assistant's response should be concise.
-The assistant should include ONLY ONE <execute_ipython> or <execute_bash> or <execute_browse> in every one of the responses, unless the assistant is finished with the task or need more input or action from the user in order to proceed.
-IMPORTANT: Whenever possible, execute the code for the user using <execute_ipython> or <execute_bash> or <execute_browse> instead of providing it.
+SYSTEM_SUFFIX = """Responses should be concise.
+The assistant should attempt fewer things at a time instead of putting too much commands OR code in one "execute" block.
+Include ONLY ONE <execute_ipython>, <execute_bash>, or <execute_browse> per response, unless the assistant is finished with the task or need more input or action from the user in order to proceed.
+IMPORTANT: Execute code using <execute_ipython>, <execute_bash>, or <execute_browse> whenever possible.
 """


@@ -41,10 +47,10 @@ IMPORTANT: Whenever possible, execute the code for the user using <execute_ipyth
 EXAMPLES = """
 --- START OF EXAMPLE ---

-USER: Can you create a list of numbers from 1 to 10, and create a web page to display them at port 5000?
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.

 ASSISTANT:
-Sure! Let me create a file first:
+Sure! Let me create a Python file `app.py`:
 <execute_ipython>
 create_file('app.py')
 </execute_ipython>
@@ -68,7 +74,7 @@ def index():

 if __name__ == '__main__':
    app.run(port=5000)\"\"\"
-edit_file(start=1, end=1, content=EDITED_CODE)
+edit_file('app.py', start=1, end=1, content=EDITED_CODE)
 </execute_ipython>

 USER:
@@ -163,7 +169,7 @@ USER: Now browse the newly started server's homepage and show me the content.
 ASSISTANT:
 Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
 <execute_browse>
-goto("http://127.0.0.1:5000")
+Get the content on "http://127.0.0.1:5000"
 </execute_browse>

 USER:
@@ -207,7 +213,7 @@ USER:
 ASSISTANT:
 I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
 <execute_ipython>
-edit_file(start=7, end=7, content="    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'")
+edit_file('app.py', start=7, end=7, content="    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'")
 </execute_ipython>

 USER:
@@ -226,7 +232,7 @@ Observation:
 [File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]

 ASSISTANT:
-The file has been updated. Let me run the Python file again with the new changes:
+Running the updated file:
 <execute_bash>
 python3 app.py > server.log 2>&1 &
 </execute_bash>
@@ -236,14 +242,14 @@ Observation:
 [1] 126

 ASSISTANT:
-The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Free free to let me know if you have any further requests!
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!

 --- END OF EXAMPLE ---
 """

 INVALID_INPUT_MESSAGE = (
    "I don't understand your input. \n"
-    'If you want to execute a bash command, please use <execute_bash> YOUR_COMMAND_HERE </execute_bash>.\n'
-    'If you want to execute a block of Python code, please use <execute_ipython> YOUR_COMMAND_HERE </execute_ipython>.\n'
-    'If you want to browse the Internet, please use <execute_browse> YOUR_COMMAND_HERE </execute_browse>.\n'
+    'For bash commands, use <execute_bash> YOUR_COMMAND </execute_bash>.\n'
+    'For Python code, use <execute_ipython> YOUR_CODE </execute_ipython>.\n'
+    'For browsing, use <execute_browse> YOUR_COMMAND </execute_browse>.\n'
 )
--- a/agenthub/codeact_swe_agent/README.md
+++ b/agenthub/codeact_swe_agent/README.md
@@ -2,6 +2,6 @@

 This agent is an adaptation of the original [SWE Agent](https://swe-agent.com/) based on CodeAct using the `agentskills` library of OpenDevin.

-It is intended use is **solving Github issues**.
+Its intended use is **solving Github issues**.

 It removes web-browsing and Github capability from the original CodeAct agent to avoid confusion to the agent.
--- a/agenthub/codeact_swe_agent/codeact_swe_agent.py
+++ b/agenthub/codeact_swe_agent/codeact_swe_agent.py
@@ -27,6 +27,7 @@ from opendevin.runtime.plugins import (
    JupyterRequirement,
    PluginRequirement,
 )
+from opendevin.runtime.tools import RuntimeTool


 def parse_response(response) -> str:
@@ -127,6 +128,7 @@ class CodeActSWEAgent(Agent):
        AgentSkillsRequirement(),
        JupyterRequirement(),
    ]
+    runtime_tools: list[RuntimeTool] = [RuntimeTool.BROWSER]
    jupyter_kernel_init_code: str = 'from agentskills import *'

    system_message: str = get_system_message()
--- a/agenthub/micro/README.md
+++ b/agenthub/micro/README.md
@@ -12,3 +12,6 @@ in the following structure:
 Note that `prompt.md` could use jinja2 template syntax. During runtime, `prompt.md`
 is loaded and rendered, and used together with `agent.yaml` to initialize a
 micro-agent.
+
+Micro-agents can be used independently. You can also use `ManagerAgent` which knows
+how to coordinate the agents and collaboratively finish a task.
--- a/agenthub/micro/_instructions/actions/finish.md
+++ b/agenthub/micro/_instructions/actions/finish.md
@@ -1,2 +1,2 @@
-* `finish` - if you're absolutely certain that you've completed your task and have tested your work, use the finish action to stop working. Arguments:
+* `finish` - if you're absolutely certain that you've completed your task, use the finish action to stop working. Arguments:
  * `outputs` - a dictionary representing the outputs of your task, if any
--- a/agenthub/micro/_instructions/actions/reject.md
+++ b/agenthub/micro/_instructions/actions/reject.md
@@ -1,2 +1,2 @@
 * `reject` - reject the task. Arguments:
-  * `outputs` - a dictionary representing the outputs of your task, if any
+  * `outputs` - a dictionary with only a `reason` attribute
--- a/agenthub/micro/agent.py
+++ b/agenthub/micro/agent.py
@@ -55,14 +55,13 @@ class MicroAgent(Agent):
        del self.delegates[self.agent_definition['name']]

    def step(self, state: State) -> Action:
-        latest_user_message = state.get_current_user_intent()
        prompt = self.prompt_template.render(
            state=state,
            instructions=instructions,
            to_json=to_json,
            history_to_json=history_to_json,
            delegates=self.delegates,
-            latest_user_message=latest_user_message,
+            latest_user_message=state.get_current_user_intent(),
        )
        messages = [{'content': prompt, 'role': 'user'}]
        resp = self.llm.do_completion(messages=messages)
--- a/agenthub/micro/coder/agent.yaml
+++ b/agenthub/micro/coder/agent.yaml
@@ -2,5 +2,5 @@ name: CoderAgent
 description: Given a particular task, and a detailed description of the codebase, accomplishes the task
 inputs:
  task: string
-  codebase_summary: string
+  summary: string
 outputs: {}
--- a/agenthub/micro/coder/prompt.md
+++ b/agenthub/micro/coder/prompt.md
@@ -2,7 +2,7 @@
 You are a software engineer. You've inherited an existing codebase, which you
 need to modify to complete this task:

-{{ latest_user_message }}
+{{ state.inputs.task }}

 {% if state.inputs.summary %}
 Here's a summary of the codebase, as it relates to this task:
--- a/agenthub/micro/commit_writer/agent.yaml
+++ b/agenthub/micro/commit_writer/agent.yaml
@@ -3,3 +3,4 @@ description: "Write a git commit message for files in the git staging area"
 inputs: {}
 outputs:
  answer: string
+  reason: string
--- a/agenthub/micro/commit_writer/prompt.md
+++ b/agenthub/micro/commit_writer/prompt.md
@@ -14,7 +14,7 @@ changes. The commit message should include:
 You should find the diff using `git diff --cached`, compile a commit message,
 and call the `finish` action with `outputs.answer` set to the answer. If current
 repo is not a valid git repo, or there is no diff in the staging area, please call
-the `reject` action with `outputs.answer` set to the reason.
+the `reject` action.

 ## History
 {{ instructions.history_truncated }}
--- a/agenthub/micro/manager/agent.yaml
+++ b/agenthub/micro/manager/agent.yaml
@@ -3,4 +3,6 @@ description: Delegates tasks to microagents based on their area of expertise
 generates: Action
 inputs:
  task: string
-outputs: {}
+outputs:
+  summary: string # if finished
+  reason: string # if rejected
--- a/agenthub/micro/manager/prompt.md
+++ b/agenthub/micro/manager/prompt.md
@@ -7,6 +7,15 @@ can do the actual work. A description of each agent is provided below. You MUST
 select one of the delegates below to move towards accomplishing the task, and you MUST
 provide the correct inputs for the delegate you select.

+Note: the delegated agent either returns "finish" or "reject".
+- If the action is "finish", but the full task is not done yet, you should
+continue to delegate to one of the agents below until the full task is finished.
+- If the action is "reject", it means the delegated agent is not capable of the
+task you sent it. You should revisit the input you sent to the delegate, and consider
+whether any other delegate would be able to solve the task. If you cannot find
+a proper delegate agent, or the delegate attempts keep failing, call the `reject`
+action.
+
 ## Agents
 {% for name, details in delegates.items() %}
 ### {{ name }}
@@ -19,9 +28,13 @@ provide the correct inputs for the delegate you select.
 {{ instructions.history_truncated }}
 {{ history_to_json(state.history[-10:]) }}

+If the last item in the history is an error, you should try to fix it. If you
+cannot fix it, call the `reject` action.
+
 ## Available Actions
 {{ instructions.actions.delegate }}
 {{ instructions.actions.finish }}
+{{ instructions.actions.reject }}

 ## Format
 {{ instructions.format.action }}
--- a/agenthub/micro/math_agent/prompt.md
+++ b/agenthub/micro/math_agent/prompt.md
@@ -1,7 +1,7 @@
 # Task
 You are a brilliant mathematician and programmer. You've been given the following problem to solve:

-{{ latest_user_message }}
+`{{ state.inputs.task }}`

 Please write a python script that solves this problem, and prints the answer to stdout.
 ONLY print the answer to stdout, nothing else.
--- a/agenthub/micro/postgres_agent/prompt.md
+++ b/agenthub/micro/postgres_agent/prompt.md
@@ -2,7 +2,7 @@
 You are a database engineer. You are working on an existing Postgres project, and have been given
 the following task:

-{{ latest_user_message }}
+{{ state.inputs.task }}

 You must:
 * Investigate the existing migrations to understand the current schema
--- a/agenthub/micro/registry.py
+++ b/agenthub/micro/registry.py
@@ -4,7 +4,10 @@ import yaml

 all_microagents = {}

-for dir in os.listdir(os.path.dirname(__file__)):
+# Get the list of directories and sort them to preserve determinism
+dirs = sorted(os.listdir(os.path.dirname(__file__)))
+
+for dir in dirs:
    base = os.path.dirname(__file__) + '/' + dir
    if os.path.isfile(base):
        continue
--- a/agenthub/micro/study_repo_for_task/prompt.md
+++ b/agenthub/micro/study_repo_for_task/prompt.md
@@ -1,9 +1,11 @@
 # Task
-You are a software engineer. You've inherited an existing codebase, which you're
-learning about for the first time. You need to study the codebase to find all
-the information needed to complete this task:
+You are a software architect. Your team has inherited an existing codebase, and
+needs to finish a project:

-{{ latest_user_message }}
+{{ state.inputs.task }}
+
+As an architect, you need to study the codebase to find all the information that
+might be helpful for your software engineering team.

 ## Available Actions
 {{ instructions.actions.run }}
@@ -11,11 +13,14 @@ the information needed to complete this task:
 {{ instructions.actions.message }}
 {{ instructions.actions.finish }}

-You must ONLY `run` commands that have no side-effects, like `ls` and `grep`.
+You must ONLY `run` commands that have no side-effects, like `ls` and `grep`. You
+MUST NOT modify or write to any file.

 Do NOT finish until you have a complete understanding of which parts of the
-codebase are relevant to the task, including particular files, functions, and classes.
+codebase are relevant to the project, including particular files, functions, and classes.
 When you're done, put your summary in `outputs.summary` in the `finish` action.
+Remember, your task is to explore and study the current repository, not actually
+implement the solution. If the codebase is empty, you should call the `finish` action.

 ## History
 {{ instructions.history_truncated }}
@@ -23,3 +28,36 @@ When you're done, put your summary in `outputs.summary` in the `finish` action.

 ## Format
 {{ instructions.format.action }}
+
+## Examples
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Can you create a list of numbers from 1 to 10, and create a web page to display them at port 5000?
+
+ASSISTANT:
+{
+  "action": "run",
+  "args": {
+    "command": "ls",
+    "background": false
+  }
+}
+
+USER:
+OBSERVATION:
+[]
+
+ASSISTANT:
+{
+  "action": "finish",
+  "args": {
+    "outputs": {
+      "summary": "The codebase appears to be empty. Engineers should start everything from scratch."
+    }
+  }
+}
+
+--- END OF EXAMPLE ---
--- a/agenthub/micro/typo_fixer_agent/agent.yaml
+++ b/agenthub/micro/typo_fixer_agent/agent.yaml
@@ -1,5 +1,6 @@
 name: TypoFixerAgent
 description: Fixes typos in files in the current working directory
-inputs: {}
+inputs:
+  task: string
 outputs:
  summary: string
--- a/agenthub/micro/typo_fixer_agent/prompt.md
+++ b/agenthub/micro/typo_fixer_agent/prompt.md
@@ -1,5 +1,13 @@
 # Task
-You are a proofreader tasked with fixing typos in the files in your current working directory. Your goal is to:
+You are a proofreader tasked with fixing typos in the files in your current working directory.
+
+{% if state.inputs.task %}
+Specifically, your task is:
+{{ state.inputs.task }}
+{% endif %}
+
+To achieve this goal, you should:
+
 1. Scan the files for typos
 2. Overwrite the files with the typos fixed
 3. Provide a summary of the typos fixed
@@ -13,10 +21,10 @@ You are a proofreader tasked with fixing typos in the files in your current work

 To complete this task:
 1. Use the `read` action to read the contents of the files in your current working directory. Make sure to provide the file path in the format `'./file_name.ext'`.
-2. Use the `think` action to analyze the contents and identify typos.
+2. Use the `message` action to analyze the contents and identify typos.
 3. Use the `write` action to create new versions of the files with the typos fixed.
  - Overwrite the original files with the corrected content. Make sure to provide the file path in the format `'./file_name.ext'`.
-4. Use the `think` action to generate a summary of the typos fixed, including the original and fixed versions of each typo, and the file(s) they were found in.
+4. Use the `message` action to generate a summary of the typos fixed, including the original and fixed versions of each typo, and the file(s) they were found in.
 5. Use the `finish` action to return the summary in the `outputs.summary` field.

 Do NOT finish until you have fixed all the typos and generated a summary.
--- a/agenthub/micro/verifier/prompt.md
+++ b/agenthub/micro/verifier/prompt.md
@@ -2,9 +2,10 @@
 You are a quality assurance engineer. Another engineer has made changes to the
 codebase which are supposed to solve this task:

-{{ latest_user_message }}
+{{ state.inputs.task }}

-Your goal is to verify that the changes are correct and bug-free.
+Note the changes might have already been applied in-line. You should focus on
+validating if the task is solved, nothing else.

 ## Available Actions
 {{ instructions.actions.run }}
--- a/agenthub/monologue_agent/agent.py
+++ b/agenthub/monologue_agent/agent.py
@@ -26,6 +26,7 @@ from opendevin.events.observation import (
 from opendevin.events.serialization.event import event_to_memory
 from opendevin.llm.llm import LLM
 from opendevin.memory.condenser import MemoryCondenser
+from opendevin.runtime.tools import RuntimeTool

 if config.agent.memory_enabled:
    from opendevin.memory.memory import LongTermMemory
@@ -46,6 +47,7 @@ class MonologueAgent(Agent):
    initial_thoughts: list[dict[str, str]]
    memory: 'LongTermMemory | None'
    memory_condenser: MemoryCondenser
+    runtime_tools: list[RuntimeTool] = [RuntimeTool.BROWSER]

    def __init__(self, llm: LLM):
        """
--- a/agenthub/planner_agent/agent.py
+++ b/agenthub/planner_agent/agent.py
@@ -2,6 +2,7 @@ from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
 from opendevin.events.action import Action, AgentFinishAction
 from opendevin.llm.llm import LLM
+from opendevin.runtime.tools import RuntimeTool

 from .prompt import get_prompt, parse_response

@@ -12,6 +13,7 @@ class PlannerAgent(Agent):
    The planner agent utilizes a special prompting strategy to create long term plans for solving problems.
    The agent is given its previous action-observation pairs, current task, and hint based on last action taken at every step.
    """
+    runtime_tools: list[RuntimeTool] = [RuntimeTool.BROWSER]

    def __init__(self, llm: LLM):
        """
--- a/containers/app/Dockerfile
+++ b/containers/app/Dockerfile
@@ -10,7 +10,7 @@ RUN npm ci
 COPY ./frontend ./
 RUN npm run make-i18n && npm run build

-FROM python:3.12-slim as backend-builder
+FROM python:3.12.3-slim as backend-builder

 WORKDIR /app
 ENV PYTHONPATH '/app'
@@ -28,7 +28,7 @@ COPY ./pyproject.toml ./poetry.lock ./
 RUN touch README.md
 RUN poetry install --without evaluation --no-root && rm -rf $POETRY_CACHE_DIR

-FROM python:3.12-slim as runtime
+FROM python:3.12.3-slim as runtime

 WORKDIR /app

--- a/containers/app/entrypoint.sh
+++ b/containers/app/entrypoint.sh
@@ -50,6 +50,7 @@ else
    groupadd -g $DOCKER_SOCKET_GID docker
  fi

+  mkdir -p /home/enduser/.cache/huggingface/hub/
  mkdir -p /home/enduser/.cache/ms-playwright/
  mv /home/opendevin/.cache/ms-playwright/ /home/enduser/.cache/

--- a/docs/docusaurus.config.ts
+++ b/docs/docusaurus.config.ts
@@ -51,7 +51,6 @@ const config: Config = {
      } satisfies Preset.Options,
    ],
  ],
-
  themeConfig: {
    image: "img/docusaurus.png",
    navbar: {
@@ -81,43 +80,6 @@ const config: Config = {
        },
      ],
    },
-    footer: {
-      style: "dark",
-      links: [
-        {
-          title: "OpenDevin",
-          items: [
-            {
-              label: "Docs",
-              to: "/modules/usage/intro",
-            },
-          ],
-        },
-        {
-          title: "Community",
-          items: [
-            {
-              label: "Slack",
-              href: "https://join.slack.com/t/opendevin/shared_invite/zt-2ggtwn3k5-PvAA2LUmqGHVZ~XzGq~ILw"
-            },
-            {
-              label: "Discord",
-              href: "https://discord.gg/ESHStjSjD4",
-            },
-          ],
-        },
-        {
-          title: "More",
-          items: [
-            {
-              label: "GitHub",
-              href: "https://github.com/OpenDevin/OpenDevin",
-            },
-          ],
-        },
-      ],
-      copyright: `Copyright © ${new Date().getFullYear()} OpenDevin`,
-    },
    prism: {
      theme: prismThemes.oneLight,
      darkTheme: prismThemes.oneDark,
--- a/docs/modules/usage/about.md
+++ b/docs/modules/usage/about.md
@@ -1,5 +1,5 @@
 ---
-sidebar_position: 6
+sidebar_position: 7
 ---

 # 📚 Misc
@@ -15,7 +15,7 @@ Achieving full replication of production-grade applications with LLMs is a compl

 ## 🚧 Default Agent

- Our default Agent is currently the MonologueAgent, which has limited capabilities, but is fairly stable. We're working on other Agent implementations, including [SWE Agent](https://swe-agent.com/). You can [read about our current set of agents here](./agents).
+- Our default Agent is currently the CodeActAgent, which is capable of generating code and handling files. We're working on other Agent implementations, including [SWE Agent](https://swe-agent.com/). You can [read about our current set of agents here](./agents).

 ## 🤝 How to Contribute

@@ -31,7 +31,7 @@ For details, please check [this document](https://github.com/OpenDevin/OpenDevin

 Now we have both Slack workspace for the collaboration on building OpenDevin and Discord server for discussion about anything related, e.g., this project, LLM, agent, etc.

- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2ggtwn3k5-PvAA2LUmqGHVZ~XzGq~ILw)
+- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA)
 - [Discord server](https://discord.gg/ESHStjSjD4)

 If you would love to contribute, feel free to join our community. Let's simplify software engineering together!
--- a/docs/modules/usage/agents.md
+++ b/docs/modules/usage/agents.md
@@ -139,4 +139,4 @@ The agent is given its previous action-observation pairs, current task, and hint
 | --------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `__init__`      | Initializes an agent with `llm`                                                                                                                                                           |
 | `step`          | Checks to see if current step is completed, returns `AgentFinishAction` if True. Otherwise, creates a plan prompt and sends to model for inference, adding the result as the next action. |
-| `search_memory` | Not yet implemented                                                                                                                                                                       |
+| `search_memory` | Not yet implemented                                                                                                                                                                       |
--- a/docs/modules/usage/feedback.md
+++ b/docs/modules/usage/feedback.md
@@ -0,0 +1,18 @@
+---
+sidebar_position: 6
+---
+
+# ✅ Providing Feedback
+
+When using OpenDevin, you will undoubtedly encounter cases where things work well, and others where they don't. We encourage you to provide feedback when you use OpenDevin, both to help the development team and, perhaps more importantly, to create an open corpus of coding agent training examples -- Share-OpenDevin!
+
+## 📝 How to Provide Feedback
+
+Providing feedback is easy! When you are using OpenDevin, you can press the thumbs-up or thumbs-down button at any point during your interaction with it. You will be prompted to provide your email address (e.g. so we can contact you if we want to ask any follow-up questions), and you can choose whether you want to provide feedback publicly or privately.
+
+<iframe width="560" height="315" src="https://www.youtube.com/embed/5rFx-StMVV0?si=svo7xzp6LhGK_GXr" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>
+
+## 📜 Data License and Privacy
+
+* **Public** data will be distributed under the MIT License, like OpenDevin itself, and can be used by the community to train and test models. Obviously, feedback that you can make public will be more valuable for the community as a whole, so when you are not dealing with sensitive information, we would encourage you to choose this option!
+* **Private** data will only be shared with the OpenDevin team for the purpose of improving OpenDevin.
--- a/docs/modules/usage/intro.mdx
+++ b/docs/modules/usage/intro.mdx
@@ -42,7 +42,7 @@ Explore the codebase of OpenDevin on [GitHub](https://github.com/OpenDevin/OpenD
  />
 </a>
 <br></br>
-<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2ggtwn3k5-PvAA2LUmqGHVZ~XzGq~ILw">
+<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA">
  <img
    src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge"
    alt="Join our Slack community"
@@ -61,38 +61,37 @@ Explore the codebase of OpenDevin on [GitHub](https://github.com/OpenDevin/OpenD
 The easiest way to run OpenDevin is inside a Docker container. It works best with the most recent version of Docker, `26.0.0`.
 You must be using Linux, Mac OS, or WSL on Windows.

-To start the app, run these commands, replacing `$(pwd)/workspace` with the directory you want OpenDevin to work with.
-
-```
-# The directory you want OpenDevin to work with. It MUST be an absolute path!
-export WORKSPACE_BASE=$(pwd)/workspace
-```
+To start OpenDevin in a docker container, run the following commands in your terminal:

 :::warning
-OpenDevin runs bash commands within a Docker sandbox, so it should not affect your machine. But your workspace directory will be attached to that sandbox, and files in the directory may be modified or deleted.
+When you run the following command, files in `./workspace` may be modified or deleted.
 :::

-```
-docker run \
-    -it \
+```bash
+OPENDEVIN_WORKSPACE=$(pwd)/workspace
+docker run -it \
    --pull=always \
-    -e LLM_API_KEY \
    -e SANDBOX_USER_ID=$(id -u) \
-    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-    -v $WORKSPACE_BASE:/opt/workspace_base \
+    -e PERSIST_SANDBOX="true" \
+    -e SSH_PASSWORD="make something up here" \
+    -e WORKSPACE_MOUNT_PATH=$OPENDEVIN_WORKSPACE \
+    -v $OPENDEVIN_WORKSPACE:/opt/workspace_base \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
-    ghcr.io/opendevin/opendevin:0.5
+    --name opendevin-app-$(date +%Y%m%d%H%M%S) \
+    ghcr.io/opendevin/opendevin:0.6
 ```

-You'll find OpenDevin running at [http://localhost:3000](http://localhost:3000).
+You'll find OpenDevin running at [http://localhost:3000](http://localhost:3000) with access to `./workspace`. To have OpenDevin operate on your code, place it in `./workspace`.
+
+OpenDevin will only have access to this workspace folder. The rest of your system will not be affected as it runs in a secured docker sandbox.

 :::tip
 If you want to use the **(unstable!)** bleeding edge, you can use `ghcr.io/opendevin/opendevin:main` as the image (last line).
 :::

-See [Development.md](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) for instructions on running OpenDevin without Docker.
+For the development workflow, see [Development.md](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md).

 Are you having trouble? Check out our [Troubleshooting Guide](https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting).

--- a/docs/modules/usage/troubleshooting/troubleshooting.md
+++ b/docs/modules/usage/troubleshooting/troubleshooting.md
@@ -4,52 +4,53 @@ sidebar_position: 5

 # 🚧 Troubleshooting

-There are some error messages that get reported over and over by users.
-We'll try to make the install process easier, and to make these error messages
-better in the future. But for now, you can look for your error message below,
-and see if there are any workaround.
+There are some error messages that frequently get reported by users.
+
+We'll try to make the install process easier and these error messages
+better in the future. But for now, you can look for your error message below and see if there are any workarounds.

 For each of these error messages **there is an existing issue**. Please do not
-open an new issue--just comment there.
+open a new issue--just comment there.

 If you find more information or a workaround for one of these issues, please
-open a PR to add details to this file.
+open a *PR* to add details to this file.

 :::tip
-If you're running on Windows and having trouble, check out our [guide for Windows users](troubleshooting/windows)
+If you're running on Windows and having trouble, check out our [guide for Windows (WSL) users](troubleshooting/windows).
 :::

-## Unable to connect to docker
+## Unable to connect to Docker

 [GitHub Issue](https://github.com/OpenDevin/OpenDevin/issues/1226)

 ### Symptoms

-```
+```bash
 Error creating controller. Please check Docker is running and visit `https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting` for more debugging information.
 ```

-```
+```bash
 docker.errors.DockerException: Error while fetching server API version: ('Connection aborted.', FileNotFoundError(2, 'No such file or directory'))
 ```

 ### Details

-OpenDevin uses a docker container to do its work safely, without potentially breaking your machine.
+OpenDevin uses a Docker container to do its work safely, without potentially breaking your machine.

 ### Workarounds

 * Run `docker ps` to ensure that docker is running
 * Make sure you don't need `sudo` to run docker [see here](https://www.baeldung.com/linux/docker-run-without-sudo)
-* If you are on a mac, check the [permissions requirements](https://docs.docker.com/desktop/mac/permission-requirements/) and in particular consider enabling the "Allow the default Docker socket to be used" under "Settings > Advanced" in Docker Desktop.
-* If you are on a mac, Upgrade your Docker to the latest version under "Check for Updates"
+* If you are on a Mac, check the [permissions requirements](https://docs.docker.com/desktop/mac/permission-requirements/) and in particular consider enabling the `Allow the default Docker socket to be used` under `Settings > Advanced` in Docker Desktop.
+* In addition, upgrade your Docker to the latest version under `Check for Updates`

 ## Unable to connect to SSH box
+
 [GitHub Issue](https://github.com/OpenDevin/OpenDevin/issues/1156)

 ### Symptoms

-```
+```python
 self.shell = DockerSSHBox(
 ...
 pexpect.pxssh.ExceptionPxssh: Could not establish connection to host
@@ -62,17 +63,19 @@ especially Windows, this seems to fail.

 ### Workarounds

- Restart your computer (sometimes works?)
- Be sure to have the latest versions of WSL and Docker
- Try [this reinstallation guide](https://github.com/OpenDevin/OpenDevin/issues/1156#issuecomment-2064549427)
- Set `-e SANDBOX_TYPE=exec` to switch to the ExecBox docker container
+* Restart your computer (sometimes it does work)
+* Be sure to have the latest versions of WSL and Docker
+* Check that your distribution in WSL is up to date as well
+* Try [this reinstallation guide](https://github.com/OpenDevin/OpenDevin/issues/1156#issuecomment-2064549427)
+* Set `-e SANDBOX_TYPE=exec` to switch to the ExecBox docker container

 ## Unable to connect to LLM
+
 [GitHub Issue](https://github.com/OpenDevin/OpenDevin/issues/1208)

 ### Symptoms

-```
+```python
  File "/app/.venv/lib/python3.12/site-packages/openai/_exceptions.py", line 81, in __init__
    super().__init__(message, response.request, body=body)
                              ^^^^^^^^^^^^^^^^
@@ -83,18 +86,20 @@ AttributeError: 'NoneType' object has no attribute 'request'

 [GitHub Issues](https://github.com/OpenDevin/OpenDevin/issues?q=is%3Aissue+is%3Aopen+404)

-This usually happens with local LLM setups, when OpenDevin can't connect to the LLM server.
+This usually happens with *local* LLM setups, when OpenDevin can't connect to the LLM server.
 See our guide for [local LLMs](llms/localLLMs) for more information.

 ### Workarounds

- Check your `LLM_BASE_URL`
- Check that ollama is running OK
- Make sure you're using `--add-host host.docker.internal:host-gateway` when running in docker
+* Check your `base_url` in your config.toml (if it exists) under the "llm" section
+* Check that ollama (or whatever LLM you're using) is running OK
+* Make sure you're using `--add-host host.docker.internal:host-gateway` when running in Docker
+
+## `404 Resource not found`

-## 404 Resource not found
 ### Symptoms
-```
+
+```python
 Traceback (most recent call last):
  File "/app/.venv/lib/python3.12/site-packages/litellm/llms/openai.py", line 414, in completion
    raise e
@@ -119,18 +124,86 @@ openai.NotFoundError: Error code: 404 - {'error': {'code': '404', 'message': 'Re
 ```

 ### Details
+
 This happens when LiteLLM (our library for connecting to different LLM providers) can't find
-the API you're trying to connect to. Most often this happens for Azure or ollama users.
+the API endpoint you're trying to connect to. Most often this happens for Azure or ollama users.

 ### Workarounds
- Check that you've set `LLM_BASE_URL` properly
- Check that model is set properly, based on the [LiteLLM docs](https://docs.litellm.ai/docs/providers)
-  - If you're running inside the UI, be sure to set the `model` in the settings modal
-  - If you're running headless (via main.py) be sure to set `LLM_MODEL` in your env/config
- Make sure you've followed any special instructions for your LLM provider
-  - [ollama](/OpenDevin/modules/usage/llms/localLLMs)
-  - [Azure](/OpenDevin/modules/usage/llms/azureLLMs)
-  - [Google](/OpenDevin/modules/usage/llms/googleLLMs)
- Make sure your API key is correct
- See if you can connect to the LLM using `curl`
- Try [connecting via LiteLLM directly](https://github.com/BerriAI/litellm) to test your setup
+
+* Check that you've set `LLM_BASE_URL` properly
+* Check that model is set properly, based on the [LiteLLM docs](https://docs.litellm.ai/docs/providers)
+  * If you're running inside the UI, be sure to set the `model` in the settings modal
+  * If you're running headless (via main.py) be sure to set `LLM_MODEL` in your env/config
+* Make sure you've followed any special instructions for your LLM provider
+  * [ollama](/OpenDevin/modules/usage/llms/localLLMs)
+  * [Azure](/OpenDevin/modules/usage/llms/azureLLMs)
+  * [Google](/OpenDevin/modules/usage/llms/googleLLMs)
+* Make sure your API key is correct
+* See if you can connect to the LLM using `curl`
+* Try [connecting via LiteLLM directly](https://github.com/BerriAI/litellm) to test your setup
+
+## `make build` getting stuck on package installations
+
+### Symptoms
+
+Package installation stuck on `Pending...` without any error message:
+
+```bash
+Package operations: 286 installs, 0 updates, 0 removals
+
+  - Installing certifi (2024.2.2): Pending...
+  - Installing h11 (0.14.0): Pending...
+  - Installing idna (3.7): Pending...
+  - Installing sniffio (1.3.1): Pending...
+  - Installing typing-extensions (4.11.0): Pending...
+```
+
+### Details
+
+In rare cases, `make build` can seemingly get stuck on package installations
+without any error message.
+
+### Workarounds
+
+* The package installer Poetry may be missing a configuration setting for
+where credentials are to be looked up (keyring).
+
+### Workaround
+
+First check with `env` if a value for `PYTHON_KEYRING_BACKEND` exists.
+If not, run the below command to set it to a known value and retry the build:
+
+```bash
+export PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring
+```
+
+## Sessions are not restored
+
+### Symptoms
+
+OpenDevin usually asks whether to resume or start a new session when opening the UI.
+But clicking "Resume" still starts a fresh new chat.
+
+### Details
+
+With a standard installation, session data is currently stored in memory.
+Currently, if OpenDevin's service is restarted, previous sessions become
+invalid (a new secret is generated) and thus not recoverable.
+
+### Workarounds
+
+* Change configuration to make sessions persistent by editing the `config.toml`
+file (in OpenDevin's root folder) by specifying a `file_store` and an
+absolute `file_store_path`:
+
+```toml
+file_store="local"
+file_store_path="/absolute/path/to/opendevin/cache/directory"
+```
+
+* Add a fixed JWT secret in your `.bashrc`, as shown below, so that previous session IDs
+remain valid.
+
+```bash
+export JWT_SECRET=A_CONST_VALUE
+```
--- a/docs/modules/usage/troubleshooting/windows.md
+++ b/docs/modules/usage/troubleshooting/windows.md
@@ -5,32 +5,72 @@ Please be sure to run all commands inside your WSL terminal.

 ## Troubleshooting

+### Error: 'docker' could not be found in this WSL 2 distro.
+
+If you are using Docker Desktop, make sure to start it before calling any docker command from inside WSL.
+Docker also needs to have the WSL integration option activated.
+
+### Recommendation: Do not run as root user
+
+For security reasons, it is highly recommended not to run OpenDevin as the root user, but as a user with a non-zero UID.
+In addition, persistent sandboxes are not supported when running as root, and a corresponding message may appear when OpenDevin starts.
+
+References:
+
+* [Why it is bad to login as root](https://askubuntu.com/questions/16178/why-is-it-bad-to-log-in-as-root)
+* [Set default user in WSL](https://www.tenforums.com/tutorials/128152-set-default-user-windows-subsystem-linux-distro-windows-10-a.html#option2)  
+Hint about the 2nd reference: for Ubuntu users, the command could actually be "ubuntupreview" instead of "ubuntu".
+
 ### Failed to create opendevin user

-If you encounter the following error during setup: `Exception: Failed to create opendevin user in sandbox: b'useradd: UID 0 is not unique\n'`.
+If you encounter the following error during setup:
+
+```sh
+Exception: Failed to create opendevin user in sandbox: 'useradd: UID 0 is not unique'
+```
+
 You can resolve it by running:
-`    export SANDBOX_USER_ID=1000
-   `
+
+```sh
+export SANDBOX_USER_ID=1000
+```

 ### Poetry Installation

-If you face issues running Poetry even after installing it during the build process, you may need to add its binary path to your environment:
-`    export PATH="$HOME/.local/bin:$PATH"
-   `
+* If you face issues running Poetry even after installing it during the build process, you may need to add its binary path to your environment:
+
+```sh
+export PATH="$HOME/.local/bin:$PATH"
+```
+
+* If `make build` stops on an error like this:
+
+```sh
+ModuleNotFoundError: no module named <module-name>
+```
+
+This could be an issue with Poetry's cache.
+Try running these two commands one after the other:
+
+```sh
+rm -r ~/.cache/pypoetry
+make build
+```

 ### NoneType object has no attribute 'request'

 If you are experiencing issues related to networking, such as `NoneType object has no attribute 'request'` when executing `make run`, you may need to configure your WSL2 networking settings. Follow these steps:

- Open or create the `.wslconfig` file located at `C:\Users\%username%\.wslconfig` on your Windows host machine.
- Add the following configuration to the `.wslconfig` file:
+* Open or create the `.wslconfig` file located at `C:\Users\%username%\.wslconfig` on your Windows host machine.
+* Add the following configuration to the `.wslconfig` file:

-```
+```ini
 [wsl2]
 networkingMode=mirrored
 localhostForwarding=true
 ```

- Save the `.wslconfig` file.
- Restart WSL2 completely by exiting any running WSL2 instances and executing the command `wsl --shutdown` in your command prompt or terminal.
- After restarting WSL, attempt to execute `make run` again. The networking issue should be resolved.
+* Save the `.wslconfig` file.
+* Restart WSL2 completely by exiting any running WSL2 instances and executing the command `wsl --shutdown` in your command prompt or terminal.
+* After restarting WSL, attempt to execute `make run` again.  
+The networking issue should be resolved.
--- a/docs/package-lock.json
+++ b/docs/package-lock.json
--- a/docs/package.json
+++ b/docs/package.json
@@ -16,16 +16,15 @@
  },
  "dependencies": {
    "@docusaurus/core": "3.2.1",
+    "@docusaurus/plugin-content-pages": "^3.3.2",
    "@docusaurus/preset-classic": "3.2.1",
    "@mdx-js/react": "^3.0.0",
-    "autoprefixer": "^10.4.19",
    "clsx": "^2.0.0",
-    "postcss": "^8.4.38",
    "prism-react-renderer": "^2.3.0",
    "react": "^18.0.0",
    "react-dom": "^18.0.0",
-    "react-use": "^17.5.0",
-    "tailwindcss": "^3.4.3"
+    "react-icons": "^5.2.1",
+    "react-use": "^17.5.0"
  },
  "devDependencies": {
    "@docusaurus/module-type-aliases": "3.2.1",
--- a/docs/plugins/tailwind-config.cjs
+++ b/docs/plugins/tailwind-config.cjs
@@ -0,0 +1,13 @@
+export default function tailwindPlugin(context, options) {
+  return {
+    name: 'tailwind-plugin',
+    configurePostCss(postcssOptions) {
+      postcssOptions.plugins = [
+        require('postcss-import'),
+        require('tailwindcss'),
+        require('autoprefixer'),
+      ];
+      return postcssOptions;
+    },
+  };
+}
--- a/docs/src/components/CustomFooter.tsx
+++ b/docs/src/components/CustomFooter.tsx
@@ -0,0 +1,42 @@
+import React from "react";
+import Link from "@docusaurus/Link";
+import { FaSlack, FaDiscord, FaGithub } from "react-icons/fa";
+import "../css/footer.css"; // Importing the CSS file
+
+/**
+ * Site footer: project title, a link to the docs, community icon links
+ * (Slack, Discord, GitHub) and a copyright line showing the current year.
+ */
+function CustomFooter() {
+  return (
+    <footer className="custom-footer">
+      <div className="footer-content">
+        <div className="footer-top">
+          <div className="footer-title">OpenDevin</div>
+          <div className="footer-link">
+            {/* Internal route: Link applies the site's baseUrl, which a raw <a href> would skip. */}
+            <Link to="/modules/usage/intro">Docs</Link>
+          </div>
+        </div>
+        <div className="footer-community">Community</div>
+        <div className="footer-icons">
+          {/* Icon-only links carry aria-labels so they have an accessible name. */}
+          <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA" target="_blank" rel="noopener noreferrer" aria-label="Slack">
+            <FaSlack />
+          </a>
+          <a href="https://discord.gg/ESHStjSjD4" target="_blank" rel="noopener noreferrer" aria-label="Discord">
+            <FaDiscord />
+          </a>
+          <a href="https://github.com/OpenDevin/OpenDevin" target="_blank" rel="noopener noreferrer" aria-label="GitHub">
+            <FaGithub />
+          </a>
+        </div>
+        <div className="footer-bottom">
+          <p>Copyright &copy; {new Date().getFullYear()} OpenDevin</p>
+        </div>
+      </div>
+    </footer>
+  );
+}
+
+export default CustomFooter;
--- a/docs/src/components/HomepageHeader/HomepageHeader.tsx
+++ b/docs/src/components/HomepageHeader/HomepageHeader.tsx
@@ -2,18 +2,18 @@ import Link from "@docusaurus/Link";
 import useDocusaurusContext from "@docusaurus/useDocusaurusContext";
 import Heading from "@theme/Heading";
 import { Demo } from "../Demo/Demo";
-import styles from "./index.module.css";
+import "../../css/homepageHeader.css"; // Importing the CSS file

 export function HomepageHeader() {
  const { siteConfig } = useDocusaurusContext();
  return (
-    <div className={styles.headerContainer}>
-      <div className={styles.header}>
-        <Heading as="h1" className="hero__title">
+    <div className="homepage-header">
+      <div className="header-content">
+        <Heading as="h1" className="header-title">
          {siteConfig.title}
        </Heading>
-        <p className="hero__subtitle">{siteConfig.tagline}</p>
-        <div className={styles.buttons}>
+        <p className="header-subtitle">{siteConfig.tagline}</p>
+        <div className="header-buttons">
          <Link
            className="button button--secondary button--lg"
            to="/modules/usage/intro"
@@ -21,8 +21,9 @@ export function HomepageHeader() {
            Get Started
          </Link>
        </div>
-      </div>{" "}
-      <Demo />
+        <Demo />
+      </div>
    </div>
  );
 }
+
--- a/docs/src/components/HomepageHeader/index.module.css
+++ b/docs/src/components/HomepageHeader/index.module.css
@@ -1,37 +0,0 @@
-.headerContainer {
-  background: radial-gradient(circle, var(--secondary), var(--secondary-light));
-  background-size: 200% 200%;
-  animation: gradientAnimation 10s linear infinite;
-  display: flex;
-  justify-content: center;
-}
-
-@media only screen and (max-width: 600px) {
-  .headerContainer {
-    flex-direction: column;
-  }
-}
-
-@keyframes gradientAnimation {
-  0% {
-    background-position: left center;
-  }
-  50% {
-    background-position: right center;
-  }
-  100% {
-    background-position: left center;
-  }
-}
-.header {
-  max-width: 1300px;
-  color: white;
-  display: flex;
-  margin-left: 100px;
-  margin-right: 100px;
-  flex-direction: column;
-  align-items: center;
-  justify-content: center;
-  overflow: hidden;
-  padding: 70px 30px 30px;
-}
--- a/docs/src/components/Welcome/Welcome.tsx
+++ b/docs/src/components/Welcome/Welcome.tsx
@@ -1,11 +1,12 @@
-import styles from "./styles.module.css";
+import React from "react";
+import "../../css/welcome.css";  // Importing the CSS file

 export function Welcome() {
  return (
-    <div className={styles.container}>
-      <div className={styles.innerContainer}>
-        <img src="img/logo.png" className={styles.sidebarImage} />
-        <p className={styles.welcomeText}>
+    <div className="text-white">
+      <div className="welcome-container">
+        <img src="img/logo.png" className="welcome-logo" />
+        <p className="welcome-text">
          Welcome to OpenDevin, an open-source project aiming to replicate
          Devin, an autonomous AI software engineer who is capable of executing
          complex engineering tasks and collaborating actively with users on
--- a/docs/src/components/Welcome/styles.module.css
+++ b/docs/src/components/Welcome/styles.module.css
@@ -1,27 +0,0 @@
-.container {
-  display: flex;
-  flex-direction: column;
-  padding-top: 25px;
-  padding-bottom: 25px;
-  width: 100%;
-}
-
-.innerContainer {
-  padding: 50px;
-  width: 100%;
-  max-width: 1300px;
-  padding-top: 30px;
-  margin: auto;
-  display: flex;
-  align-items: center;
-}
-
-.sidebarImage {
-  max-width: 400px;
-  padding-right: 30px;
-}
-
-.welcomeText {
-  text-align: justify;
-  font-size: larger;
-}
--- a/docs/src/css/custom.css
+++ b/docs/src/css/custom.css
@@ -5,6 +5,7 @@
 */

 /* You can override the default Infima variables here. */
+
 :root {
  --ifm-color-primary: #4465db;
  --ifm-code-font-size: 95%;
@@ -33,4 +34,4 @@

 .a {
  text-decoration: underline;
-}
+}
--- a/docs/src/css/faq.css
+++ b/docs/src/css/faq.css
@@ -0,0 +1,66 @@
+/* faq.css */
+
+.faq-container {
+    margin: auto;
+    padding: 24px;
+    display: flex;
+    flex-direction: column;
+    gap: 8px;
+    margin-bottom: 24px;
+  }
+  
+  .faq-title {
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    font-size: 2rem;
+    padding: 8px;
+    text-transform: uppercase;
+    font-weight: bold;
+  }
+  
+  @media (min-width: 1024px) {
+    .faq-title {
+      font-size: 6rem;
+    }
+  }
+  
+  .faq-section {
+    display: flex;
+    flex-direction: column;
+    gap: 8px;
+    width: 100%;
+    margin-bottom: 24px;
+  }
+  
+  .faq-section-title {
+    text-transform: uppercase;
+    font-weight: bold;
+    font-size: 2rem;
+    letter-spacing: 0.1em;
+  }
+  
+  .highlight {
+    font-weight: 600;
+    color: var(--logo);
+  }
+  
+  .faq-steps ol {
+    padding-left: 24px;
+  }
+  
+  .command-box {
+    display: flex;
+    flex-direction: column;
+    padding: 8px;
+    background-color: #e0e0e0;
+    border-radius: 0.375rem;
+    height: 6vh;
+    text-transform: uppercase;
+    color: #4a5568;
+  }
+  
+  .command-box + .command-box {
+    height: 8vh;
+  }
+  
--- a/docs/src/css/footer.css
+++ b/docs/src/css/footer.css
@@ -0,0 +1,72 @@
+/* footer.css */
+
+.custom-footer {
+    background-color: dark;
+    color: white;
+    height: 25vh;
+    /* background: linear-gradient(to bottom, #1a1a1a, #1a1a1a); */
+    background: linear-gradient(to bottom, #1f2937, #000000);
+
+  }
+  
+  .footer-content {
+    display: flex;
+    flex-direction: column;
+    justify-content: space-between;
+    align-items: center;
+    padding: 8px;
+    height: 100%;
+  }
+  
+  .footer-top {
+    display: flex;
+    gap: 8px;
+    align-items: center;
+  }
+  
+  .footer-title {
+    font-weight: bold;
+    font-size: 1.125rem;
+  }
+  
+  @media (min-width: 768px) {
+    .footer-title {
+      font-size: 1.875rem;
+    }
+  }
+  
+  .footer-link a {
+    font-size: 0.875rem;
+    text-decoration: none;
+    color: gray;
+    transition: color 0.3s ease;
+  }
+  
+  .footer-link a:hover {
+    color: white;
+  }
+  
+  .footer-community {
+    text-transform: uppercase;
+    font-weight: 300;
+  }
+  
+  .footer-icons {
+    display: flex;
+    gap: 24px;
+    font-size: 1.875rem;
+  }
+  
+  .footer-icons a {
+    color:gray;
+    transition: color 0.3s ease;
+  }
+  
+  .footer-icons a:hover {
+    color: white;
+  }
+  
+  .footer-bottom {
+    text-transform: uppercase;
+  }
+  
--- a/docs/src/css/homepageHeader.css
+++ b/docs/src/css/homepageHeader.css
@@ -0,0 +1,36 @@
+/* homepageHeader.css */
+
+.homepage-header {
+    height: 100vh;
+    color: white;
+    background: linear-gradient(to top, #64748b, #000000);
+  }
+  
+  .header-content {
+    display: flex;
+    flex-direction: column;
+    gap: 8px;
+    align-items: center;
+    padding: 24px;
+    font-weight: 300;
+    width: 100%;
+  }
+  
+  .header-title {
+    font-size: 3rem;
+  }
+  
+  @media (min-width: 768px) {
+    .header-title {
+      font-size: 5rem;
+    }
+  }
+  
+  .header-subtitle {
+    font-size: 1.25rem;
+  }
+  
+  .header-buttons {
+    margin-top: 24px;
+  }
+  
--- a/docs/src/css/welcome.css
+++ b/docs/src/css/welcome.css
@@ -0,0 +1,53 @@
+/* welcome.css */
+
+.text-white {
+    color: white;
+  }
+
+  .welcome-container {
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    flex-direction: column;
+    background: linear-gradient(to bottom, #64748b, #1f2937);
+  }
+
+  @media (min-width: 768px) {
+    .welcome-container {
+      flex-direction: row;
+      background: linear-gradient(to bottom, #64748b, #1f2937);
+    }
+  }
+
+  .welcome-logo {
+    height: 45vh;
+    width: 45vw;
+  }
+
+  @media (max-width: 640px) {
+    .welcome-logo {
+      height: 40vw;
+      width: 40vw;
+    }
+  }
+
+  @media (min-width: 768px) {
+    .welcome-logo {
+      height: auto;
+      width: 350px;
+    }
+  }
+
+  .welcome-text {
+    padding: 24px;
+    margin-bottom: 24px;
+    font-weight: 300;
+    font-size: 1.125rem;
+  }
+
+  @media (min-width: 768px) {
+    .welcome-text {
+      padding: 8px;
+      font-size: 1.5rem;
+    }
+  }
--- a/docs/src/pages/_footer.tsx
+++ b/docs/src/pages/_footer.tsx
@@ -0,0 +1,6 @@
+import React from 'react';
+import CustomFooter from '../components/CustomFooter';
+
+export default function Footer() {
+  return <CustomFooter />;
+}
--- a/docs/src/pages/faq.tsx
+++ b/docs/src/pages/faq.tsx
@@ -1,76 +1,78 @@
 import Layout from "@theme/Layout";
+import CustomFooter from "../components/CustomFooter";
+import "../css/faq.css"; 

 export default function FAQ() {
  return (
-    <Layout title="FAQ" description="Frequently Asked Questions">
-      <div
-        id="faq"
-        style={{
-          maxWidth: "900px",
-          margin: "0px auto",
-          padding: "40px",
-          textAlign: "justify",
-        }}
-      >
-        <h1 style={{ fontSize: "3rem" }}>Frequently Asked Questions</h1>
-        <h2 style={{ fontSize: "2rem" }}>Support</h2>
-        <h3>How can I report an issue with OpenDevin?</h3>
-        <p>
-          Please file a bug on{" "}
-          <a href="https://github.com/OpenDevin/OpenDevin/issues">GitHub</a> if
-          you notice a problem that likely affects others.
-          If you're having trouble installing, or have general questions, reach out on{" "}
-          <a href="https://discord.gg/mBuDGRzzES">Discord</a> or{" "}
-          <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2ggtwn3k5-PvAA2LUmqGHVZ~XzGq~ILw">Slack</a>.
-        </p>
-        <h2 style={{ fontSize: "2rem" }}>General</h2>
-        <h3>What is Devin?</h3>
-        <p>
-          <span style={{ fontWeight: "600", color: "var(--logo)" }}>Devin</span>{" "}
-          represents a cutting-edge autonomous agent designed to navigate the
-          complexities of software engineering. It leverages a combination of
-          tools such as a shell, code editor, and web browser, showcasing the
-          untapped potential of LLMs in software development. Our goal is to
-          explore and expand upon Devin's capabilities, identifying both its
-          strengths and areas for improvement, to guide the progress of open
-          code models.
-        </p>
-        <h3>Why OpenDevin?</h3>
-        <p>
-          The{" "}
-          <span style={{ fontWeight: "600", color: "var(--logo)" }}>
-            OpenDevin
-          </span>{" "}
-          project is born out of a desire to replicate, enhance, and innovate
-          beyond the original Devin model. By engaging the{" "}
-          <a href="https://github.com/OpenDevin/OpenDevin">
-            open-source community
-          </a>
-          , we aim to tackle the challenges faced by Code LLMs in practical
-          scenarios, producing works that significantly contribute to the
-          community and pave the way for future advancements.
-        </p>
-        <h3>How to fix an issue on OpenDevin?</h3>
-        <p>
-          To fix an issue on GitHub using OpenDevin, send a prompt to OpenDevin asking it to follow these steps:
-          <ol>
-            <li>Read the issue on <a href="https://github.com/OpenDevin/OpenDevin/issues/1611">GitHub</a></li>
-            <li>Clone the repository and check out a new branch</li>
-            <li>Based on the instructions in the issue description, modify files to fix the issue</li>
-            <li>Push the resulting output to GitHub using the GITHUB_TOKEN environment variable</li>
-            <li>Tell me the link that I need to go to to send a pull request</li>
-          </ol>
-          Before you run OpenDevin, you can do:
-          <pre>
-            export SANDBOX_ENV_GITHUB_TOKEN=XXX
-          </pre>
-          where XXX is a GitHub token that you created that has permissions to push to the OpenDevin repo. If you don’t have write permission to the OpenDevin repo, you might need to change that to:
-          <pre>
-            4. Push the resulting output to my fork at https://github.com/USERNAME/OpenDevin/ using the GITHUB_TOKEN environment variable
-          </pre>
-          where USERNAME is your GitHub username.
-        </p>
-      </div>
-    </Layout>
+    <>
+      <Layout title="FAQ" description="Frequently Asked Questions">
+        <div id="faq" className="faq-container">
+          <div className="faq-title">Frequently Asked Questions</div>
+          <div className="faq-section">
+            <div className="faq-section-title">Support</div>
+            <div>How can I report an issue with OpenDevin?</div>
+            <div>
+              Please file a bug on{" "}
+              <a href="https://github.com/OpenDevin/OpenDevin/issues" target="_blank">GitHub</a> if
+              you notice a problem that likely affects others.
+              If you're having trouble installing, or have general questions, reach out on{" "}
+              <a href="https://discord.gg/mBuDGRzzES" target="_blank">Discord</a> or{" "}
+              <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA" target="_blank">Slack</a>.
+            </div>
+          </div>
+          <div className="faq-section">
+            <div className="faq-section-title">General</div>
+            <div>What is Devin?</div>
+            <div>
+              <span className="highlight">Devin</span>{" "}
+              represents a cutting-edge autonomous agent designed to navigate the
+              complexities of software engineering. It leverages a combination of
+              tools such as a shell, code editor, and web browser, showcasing the
+              untapped potential of LLMs in software development. Our goal is to
+              explore and expand upon Devin's capabilities, identifying both its
+              strengths and areas for improvement, to guide the progress of open
+              code models.
+            </div>
+          </div>
+          <div className="faq-section">
+            <div className="faq-section-title">Why OpenDevin?</div>
+            <p>
+              The{" "}
+              <span className="highlight">OpenDevin</span>{" "}
+              project is born out of a desire to replicate, enhance, and innovate
+              beyond the original Devin model. By engaging the{" "}
+              <a href="https://github.com/OpenDevin/OpenDevin">
+                open-source community
+              </a>
+              , we aim to tackle the challenges faced by Code LLMs in practical
+              scenarios, producing works that significantly contribute to the
+              community and pave the way for future advancements.
+            </p>
+          </div>
+          <div className="faq-section">
+            <div className="faq-section-title">How to fix an issue on OpenDevin?</div>
+            <div className="faq-steps">
+              To fix an issue on GitHub using OpenDevin, send a prompt to OpenDevin asking it to follow these steps:
+              <ol>
+                <li>Read the issue on <a href="https://github.com/OpenDevin/OpenDevin/issues/1611">GitHub</a></li>
+                <li>Clone the repository and check out a new branch</li>
+                <li>Based on the instructions in the issue description, modify files to fix the issue</li>
+                <li>Push the resulting output to GitHub using the GITHUB_TOKEN environment variable</li>
+                <li>Tell me the link that I need to go to to send a pull request</li>
+              </ol>
+              Before you run OpenDevin, you can do:
+              <div className="command-box">
+                export SANDBOX_ENV_GITHUB_TOKEN=XXX
+              </div>
+              where XXX is a GitHub token that you created that has permissions to push to the OpenDevin repo. If you don’t have write permission to the OpenDevin repo, you might need to change that to:
+              <div className="command-box">
+                Push the resulting output to my fork at https://github.com/USERNAME/OpenDevin/ using the GITHUB_TOKEN environment variable
+              </div>
+              where USERNAME is your GitHub username.
+            </div>
+          </div>
+        </div>
+      </Layout>
+    </>
  );
 }
--- a/docs/src/pages/index.module.css
+++ b/docs/src/pages/index.module.css
@@ -1,23 +0,0 @@
-/**
- * CSS files with the .module.css suffix will be treated as CSS modules
- * and scoped locally.
- */
-
-.heroBanner {
-  padding: 4rem 0;
-  text-align: center;
-  position: relative;
-  overflow: hidden;
-}
-
-@media screen and (max-width: 996px) {
-  .heroBanner {
-    padding: 2rem;
-  }
-}
-
-.buttons {
-  display: flex;
-  align-items: center;
-  justify-content: center;
-}
--- a/docs/src/pages/index.tsx
+++ b/docs/src/pages/index.tsx
@@ -1,12 +1,12 @@
 import useDocusaurusContext from "@docusaurus/useDocusaurusContext";
 import Layout from "@theme/Layout";
-
 import { HomepageHeader } from "../components/HomepageHeader/HomepageHeader";
 import { Welcome } from "../components/Welcome/Welcome";

 export function Header({ title, summary, description }): JSX.Element {
  return (
    <div>
+      <h1>{title}</h1>
      <h2 style={{ fontSize: "40px" }}>{summary}</h2>
      <h3 className="headerDescription">{description}</h3>
    </div>
@@ -16,8 +16,9 @@ export function Header({ title, summary, description }): JSX.Element {
 export default function Home(): JSX.Element {
  const { siteConfig } = useDocusaurusContext();
  return (
+    <>
    <Layout
-      title={`Hello from ${siteConfig.title}`}
+      title={`${siteConfig.title}`}
      description="AI-powered code generation for software engineering."
    >
      <div>
@@ -27,5 +28,6 @@ export default function Home(): JSX.Element {
        </div>
      </div>
    </Layout>
+    </>
  );
 }
--- a/docs/src/theme/Layout/index.tsx
+++ b/docs/src/theme/Layout/index.tsx
@@ -0,0 +1,12 @@
+import React from 'react';
+import OriginalLayout from '@theme-original/Layout';
+import Footer from '@site/src/pages/_footer';
+
+export default function Layout(props) {
+  return (
+    <>
+      <OriginalLayout {...props} />
+      <Footer />
+    </>
+  );
+}
--- a/docs/static/img/screenshot.png
+++ b/docs/static/img/screenshot.png
--- a/evaluation/EDA/game.py
+++ b/evaluation/EDA/game.py
@@ -1,31 +1,15 @@
-import json
 import logging
-import os
 import re
 from typing import Optional

 import openai
 import requests.exceptions
-import torch
 from openai import OpenAI
 from retry import retry
-from transformers import AutoModelForCausalLM, AutoTokenizer

 LOGGER = logging.getLogger(__name__)


-def load_model(path):
-    print('Loading model...')
-    tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False)
-    print('Tokenizer loaded.')
-    model = AutoModelForCausalLM.from_pretrained(
-        path, low_cpu_mem_usage=True, torch_dtype=torch.float16
-    ).cuda()
-    print('Model loaded.')
-    # model.half().cuda()
-    return model, tokenizer
-
-
 class Q20Game:
    def __init__(
        self,
@@ -36,8 +20,10 @@ class Q20Game:
        temperature: float = 0.8,
        openai_api: bool = True,
        openai_api_key: Optional[str] = None,
-        guesser_kargs={},
+        guesser_kargs=None,
    ) -> None:
+        if guesser_kargs is None:
+            guesser_kargs = {}
        self.item = item
        self.answerer_model = answerer_model
        self.guesser_model = guesser_model
@@ -70,124 +56,11 @@ class Q20Game:

        self.guesser_messages = []

-    def confusion_matrix(self, path):
-        self.reset()
-        with open(path) as f:
-            raw_messages = json.load(f)
-            self.item = path.split('/')[-1].split('_')[0]
-            roles = ['assistant', 'user']
-            for i, message in enumerate(raw_messages):
-                self.guesser_messages.append(
-                    {'role': roles[i % 2], 'content': message['content']}
-                )
-
-        self.guesser_messages = self.guesser_messages[:-2]
-        self.guesser_messages[-1]['content'] = (
-            self.guesser_messages[-1]['content'] + " You must guess now, what's it?"
-        )
-        guesser_msg = self.guesser(self.guesser_messages)
-        self.guesser_messages.append(guesser_msg)
-        guesser_question = guesser_msg['content'].strip()
-        self.guesser_messages[-1]['content'] = (
-            self.guesser_messages[-1]['content'] + ' Is it right?'
-        )
-        usr_msg = self.answerer(guesser_question)
-        self.guesser_messages.append(
-            {'role': 'user', 'content': f"{usr_msg['content'].strip()}"}
-        )
-
-        if 'bingo' in self.guesser_messages[-1]['content'].lower():
-            self.guesser_win = True
-            return True
-
-        return False
-
-    @retry(
-        (
-            openai.Timeout,
-            requests.exceptions.ReadTimeout,
-            openai.RateLimitError,
-            openai.APIError,
-            requests.exceptions.HTTPError,
-            openai.APIConnectionError,
-        ),
-        tries=5,
-        delay=0.5,
-        backoff=0.5,
-        max_delay=2,
-        logger=LOGGER,
-    )
-    def guesser(self, messages):
-        if not self.guesser_model.startswith('gpt'):  # hf model
-            self.guesser_model, self.guesser_tokenizer = load_model(self.guesser_model)
-
-            # """Wraps hf's `generate` adding some specific method's defaults"""
-            assert not self.openai_api
-            prompt = self.dialog_history() + ' ASSISTANT:'
-            input_ids = torch.tensor(
-                [self.guesser_tokenizer.encode(prompt, add_special_tokens=True)]
-            )  # TODO check if huggingface is using the same format.
-            input_ids = input_ids.to(self.guesser_model.base_model.device)
-            attention_mask = None
-
-            with torch.no_grad():
-                gen = self.guesser_model.generate(
-                    input_ids=input_ids,
-                    attention_mask=attention_mask,
-                    **self.guesser_kargs,
-                )
-                gen_str = (
-                    self.guesser_tokenizer.decode(gen[0][input_ids[0].shape[0] :])
-                    .split('</s>')[0]
-                    .split('USER')[0]
-                    .lstrip()
-                    .strip()
-                )
-
-                return {
-                    'role': 'assistant',
-                    'content': gen_str,
-                }
-        else:
-            openai.api_base = self.guesser_api_base
-            client = OpenAI(api_key=openai.api_key)
-            response = client.chat.completions.create(
-                model=self.guesser_model,
-                messages=messages,
-                max_tokens=64,
-                n=1,
-                stop=None,
-                temperature=self.temperature,
-            )
-            return {
-                'role': 'assistant',
-                'content': response.choices[0].message.to_dict()['content'].strip(),
-            }
-
-    def dialog_history(self):
-        history = self.vicuna_prompt + ' '
-        for item in self.guesser_messages:
-            if item['role'].upper() == 'USER':
-                history += 'USER: ' + item['content']
-            elif item['role'].upper() == 'ASSISTANT':
-                history += ' ' + 'ASSISTANT: ' + item['content'] + '</s>'
-        return history
-
-
-    def preprocess_response(self,response):
-        response = re.sub(
-            r'the entity you are thinking of', 'it', response
-        )
-        response = re.sub(
-            r"the entity you're thinking of", 'it', response
-        )
-        response = re.sub(
-            r" you're thinking of", '', response
-        )
-        response = re.sub(
-            r' you are thinking of', '', response
-        )
-        self.guesser_messages.append(response)
+    def preprocess_response(self, response):
+        response = re.sub(r'the entity you are thinking of', 'it', response)
+        response = re.sub(r"the entity you're thinking of", 'it', response)
+        response = re.sub(r" you're thinking of", '', response)
+        response = re.sub(r' you are thinking of', '', response)
        return response

    def judge_winner(self, response):
@@ -195,101 +68,39 @@ class Q20Game:

        if self.curr_turn == self.num_turns - 1:
            guesser_question += ' Is it right?'
+
+        self.guesser_messages.append({'role': 'assistant', 'content': guesser_question})
        # ask for answer
        usr_msg = self.answerer(guesser_question)

+        self.guesser_messages.append(
+            {'role': 'user', 'content': f"{usr_msg['content'].strip()}"}
+        )
+
        if 'bingo' in usr_msg['content'].lower():
            self.guesser_win = True
-            return True, ""
-        
+            return True, ''
+
        return False, usr_msg['content'].strip()
-    
+
    def generate_user_response(self, response):
        response = self.preprocess_response(response)
        # others
        bingo, anwser_reply = self.judge_winner(response)
        if bingo:
-            return "You are bingo! quit now, run: <execute_bash> exit </execute_bash>.\n"
+            return (
+                'You are bingo! quit now, run: <execute_bash> exit </execute_bash>.\n'
+            )
        if self.curr_turn == self.num_turns - 2:
            anwser_reply += " You must guess now, what's it?"
        return anwser_reply

-    def game_play(self, user_mode=False):
-        self.reset()
-        # print(f"Item: {self.item}")
-        for t in range(self.num_turns):
-            # System asking a question
-            if (not user_mode) or user_mode is None:
-                guesser_msg = self.guesser(self.guesser_messages)
-                guesser_msg['content'] = re.sub(
-                    r'the entity you are thinking of', 'it', guesser_msg['content']
-                )
-                guesser_msg['content'] = re.sub(
-                    r"the entity you're thinking of", 'it', guesser_msg['content']
-                )
-                guesser_msg['content'] = re.sub(
-                    r" you're thinking of", '', guesser_msg['content']
-                )
-                guesser_msg['content'] = re.sub(
-                    r' you are thinking of', '', guesser_msg['content']
-                )
-            else:
-                user_q = input(
-                    f'Type in your questions for turn {t+1}. (e.g. Is it a living thing?)\n'
-                )
-                guesser_msg = {'role': 'assistant', 'content': user_q}
-            self.guesser_messages.append(guesser_msg)
-            guesser_question = guesser_msg['content'].strip()
-
-            if t == self.num_turns - 1:
-                self.guesser_messages[-1]['content'] = (
-                    self.guesser_messages[-1]['content'] + ' Is it right?'
-                )
-
-            usr_msg = self.answerer(guesser_question)
-            self.guesser_messages.append(
-                {'role': 'user', 'content': f"{usr_msg['content'].strip()}"}
-            )
-
-            if 'bingo' in usr_msg['content'].lower():
-                self.guesser_win = True
-                return True
-
-            if t == self.num_turns - 2:
-                self.guesser_messages[-1]['content'] = (
-                    self.guesser_messages[-1]['content']
-                    + " You must guess now, what's it?"
-                )
-
-        return False
-
-    def save_session(self, path):
-        # Print the conversation
-        if not os.path.exists(path):
-            os.makedirs(path)
-        output_file = os.path.join(path, f'{self.item}.txt')
-        with open(output_file, 'w') as out_f:
-            out_f.write(f'item: {self.item}\n')
-            for t, message in enumerate(self.guesser_messages):
-                out_f.write(
-                    f"Turn {(t+1)//2}, {message['role'].capitalize()}: {message['content'].lstrip()}\n"
-                )
-
    def reward(self):
        if self.guesser_win:
            n_turns = (len(self.guesser_messages) + 1) // 2
            return 1 - max(n_turns - 5, 0) * 0.02
        return 0

-    def num_success(self):
-        return 1 if self.guesser_win else 0
-
-    def num_yes(self):
-        n_yes = sum(
-            ['yes' in msg['content'].lower() for msg in self.guesser_messages[2::2]]
-        )
-        return n_yes
-
    @retry(
        (
            openai.Timeout,
@@ -339,16 +150,6 @@ class Q20Game:
            response.choices[0].message.content = 'Bingo!'
        return response.choices[0].message.to_dict()

-    def reset(self):
-        # Initialize the conversation
-        self.curr_turn = 0
-        self.guesser_messages = [
-            {
-                'role': 'user',
-                'content': self.first_user_utterance,
-            }
-        ]
-

 class Q20GameCelebrity(Q20Game):
    def __init__(self, item: str, **kwargs) -> None:
@@ -376,10 +177,11 @@ class Q20GameCelebrity(Q20Game):
    )
    def answerer(self, question):
        openai.api_base = self.user_api_base
+        client = OpenAI(api_key=openai.api_key)
        user_messages = [
            {
                'role': 'system',
-                'content': f'Based on on your knowledge about the celebrity: {self.item}, '
+                'content': f'Based on your knowledge about the celebrity: {self.item}, '
                f'respond to the following question or guess. '
                f"Limit your respond to only 'Yes.', 'No.' or 'Dunno.', with no explanation or other words. "
                f"Never say the name {self.item} in your response. Do not say 'Dunno.' if it can be answered by 'Yes.' or 'No.' "
@@ -391,7 +193,7 @@ class Q20GameCelebrity(Q20Game):
            },
        ]

-        response = openai.ChatCompletion.create(
+        response = client.chat.completions.create(
            model=self.answerer_model,
            messages=user_messages,
            max_tokens=6,
@@ -402,12 +204,3 @@ class Q20GameCelebrity(Q20Game):
        if re.search(rf'(?:^|\W){self.item.lower()}(?:$|\W)', question.lower()):
            response.choices[0].message.content = 'Bingo!'
        return response.choices[0].message.to_dict()
-
-    def reset(self):
-        # Initialize the conversation
-        self.guesser_messages = [
-            {
-                'role': 'user',
-                'content': self.first_user_utterance,
-            }
-        ]
--- a/evaluation/EDA/run_infer.py
+++ b/evaluation/EDA/run_infer.py
@@ -45,7 +45,9 @@ def codeact_user_response(state: State) -> str:
    msg = game.generate_user_response(model_guess)
    game.curr_turn += 1
    logger.info(f'Model guess: {model_guess}')
-    logger.info(f'Anwser response: {msg}')
+    logger.info(f'Answer response: {msg}')
+    if 'bingo!' in msg.lower():
+        return '/exit'
    return msg


@@ -63,8 +65,10 @@ AGENT_CLS_TO_INST_SUFFIX = {
 }


-def process_instance(instance, agent_class, metadata, reset_logger: bool = True):
-    # Setup the logger properly, so you can run multi-processing to parallize the evaluation
+def process_instance(
+    instance, agent_class, metadata, openai_api_key, reset_logger: bool = True
+):
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    eval_output_dir = metadata['eval_output_dir']
    if reset_logger:
        # Set up logger
@@ -105,7 +109,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
        answerer_model=metadata['answerer_model'],
        guesser_model=None,
        num_turns=metadata['max_iterations'],
-        openai_api_key=metadata['openai_api'],
+        openai_api_key=openai_api_key,
        guesser_kargs=guesser_kargs,
    )

@@ -125,7 +129,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
        )
    )
    # ======= Attempt to evaluate the agent's edits =======
-    # If you are working on simplier benchmark that only evaluates the final model output (e.g., in a MessageAction)
+    # If you are working on a simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.

    if state is None:
@@ -139,6 +143,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)

    logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
    test_result = game.reward()
+    metrics = state.metrics.get() if state.metrics else None

    # Save the output
    output = {
@@ -149,6 +154,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
        'history': [
            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
        ],
+        'metrics': metrics,
        'error': state.error if state and state.error else None,
        'test_result': {
            'success': test_result,
@@ -230,12 +236,11 @@ if __name__ == '__main__':
        'data_split': args.data_split,
        'answerer_model': args.answerer_model,
        'agent_class': agent_class,
-        'openai_api': args.OPENAI_API_KEY,
        'model_name': model_name,
        'max_iterations': max_iterations,
        'eval_output_dir': eval_output_dir,
        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
-        # get the commit id of current repo for reproduciblity
+        # get the commit id of current repo for reproducibility
        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
        .decode('utf-8')
        .strip(),
@@ -313,6 +318,7 @@ if __name__ == '__main__':
                    instance,
                    agent_class,
                    metadata,
+                    args.OPENAI_API_KEY,
                    reset_logger=bool(num_workers > 1),
                )
                future.add_done_callback(update_progress)
--- a/evaluation/EDA/scripts/run_infer.sh
+++ b/evaluation/EDA/scripts/run_infer.sh
@@ -46,4 +46,5 @@ if [ -n "$EVAL_LIMIT" ]; then
 fi

 # Run the command
+echo $COMMAND
 eval $COMMAND
--- a/evaluation/README.md
+++ b/evaluation/README.md
@@ -13,9 +13,14 @@ all the preprocessing/evaluation/analysis scripts.
 ## Supported Benchmarks

 - SWE-Bench: [`evaluation/swe_bench`](./swe_bench)
+- ML-Bench: [`evaluation/ml_bench`](./ml_bench)
 - HumanEvalFix: [`evaluation/humanevalfix`](./humanevalfix)
 - GAIA: [`evaluation/gaia`](./gaia)
 - Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA)
+- MINT: [`evaluation/mint`](./mint)
+- AgentBench: [`evaluation/agent_bench`](./agent_bench)
+- BIRD: [`evaluation/bird`](./bird)
+- LogicReasoning: [`evaluation/logic_reasoning`](./logic_reasoning)

 ### Result Visualization

--- a/evaluation/agent_bench/README.md
+++ b/evaluation/agent_bench/README.md
@@ -0,0 +1,60 @@
+# AgentBench Evaluation
+
+This folder contains the evaluation harness for evaluating agents on
+the [AgentBench: Evaluating LLMs as Agents](https://arxiv.org/abs/2308.03688) benchmark.
+
+## Configure OpenDevin and your LLM
+
+Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md)
+for how to set this up.
+
+Here is an example `config.toml` file:
+
+```toml
+[core]
+max_iterations = 100
+cache_dir = "/path/to/cache"
+
+workspace_base = "/path/to/workspace"
+workspace_mount_path = "/path/to/workspace"
+
+sandbox_container_image = "ghcr.io/opendevin/sandbox:latest"
+sandbox_type = "ssh"
+sandbox_timeout = 120
+ssh_hostname = "localhost"
+
+use_host_network = false
+# AgentBench specific
+run_as_devin = true
+enable_auto_lint = true
+
+[eval_gpt35_turbo]
+model = "gpt-3.5-turbo"
+api_key = "sk-123"
+temperature = 0.0
+
+[eval_gpt4o]
+model = "gpt-4o"
+api_key = "sk-123"
+temperature = 0.0
+```
+
+## Start the evaluation
+
+```bash
+./evaluation/agent_bench/scripts/run_infer.sh [model_config] [agent] [eval_limit]
+```
+
+The command above is the basic way to start the evaluation. For now, only the `osbench` subset is evaluated.
+
+You can update the arguments in the script `evaluation/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers` and so on.
+
+- `--agent-cls`: the agent to use. For example, `CodeActAgent`.
+- `--llm-config`: the LLM configuration to use. For example, `eval_gpt4_1106_preview`.
+- `--max-iterations`: the number of iterations to run the evaluation. For example, `30`.
+- `--eval-num-workers`: the number of workers to use for evaluation. For example, `5`.
+- `--eval-n-limit`: the number of examples to evaluate. For example, `100`.
+
+```bash
+./evaluation/agent_bench/scripts/run_infer.sh eval_gpt35_turbo CodeActAgent 1
+```
--- a/evaluation/agent_bench/init.py
+++ b/evaluation/agent_bench/init.py
--- a/evaluation/agent_bench/helper.py
+++ b/evaluation/agent_bench/helper.py
@@ -0,0 +1,61 @@
+import os
+import re
+
+from opendevin.events.action import CmdRunAction, MessageAction
+
+
+def analysis_size(size_str):
+    """Convert a human-readable size such as ``'10KB'`` or ``'3M'`` to bytes.
+
+    Units are binary (``1K == 1024`` bytes). A string without a recognised
+    unit suffix is parsed as a plain integer number of bytes.
+
+    Args:
+        size_str: the size text; surrounding whitespace is ignored.
+
+    Returns:
+        The size in bytes as an ``int``.
+
+    Raises:
+        ValueError: if the numeric part is not a valid integer.
+    """
+    size_str = size_str.strip()
+    avails = {
+        'B': 1,
+        'Byte': 1,
+        'K': 1024,
+        'KB': 1024,
+        'M': 1024 * 1024,
+        'MB': 1024 * 1024,
+        'G': 1024 * 1024 * 1024,
+        'GB': 1024 * 1024 * 1024,
+        'T': 1024 * 1024 * 1024 * 1024,
+        'TB': 1024 * 1024 * 1024 * 1024,
+        'P': 1024 * 1024 * 1024 * 1024 * 1024,
+        'PB': 1024 * 1024 * 1024 * 1024 * 1024,
+    }
+    # Try the longest suffixes first: every two-letter unit also ends in 'B',
+    # so matching 'B' first would turn '10KB' into int('10K') and raise.
+    for size_unit in sorted(avails, key=len, reverse=True):
+        if size_str.endswith(size_unit):
+            return int(size_str[: -len(size_unit)]) * avails[size_unit]
+    return int(size_str)
+
+
+def compare_results(check_method: str, model_answer: str, final_ans: str) -> bool:
+    """Decide whether the agent's answer matches the expected answer.
+
+    Args:
+        check_method: name of the comparison script; integer and size
+            comparisons are special-cased, anything else compares text.
+        model_answer: the answer produced by the agent.
+        final_ans: the expected answer.
+
+    Returns:
+        True on a match. Any exception raised while comparing (for example an
+        answer that cannot be parsed as an integer) counts as a mismatch.
+    """
+
+    def _normalise(text: str) -> str:
+        # Unify line endings and drop surrounding whitespace before comparing.
+        return text.replace('\r\n', '\n').replace('\r', '\n').strip()
+
+    try:
+        if check_method == 'check/integer-match.py':
+            return int(model_answer) == int(final_ans)
+        if check_method == 'check/size-match.py':
+            return analysis_size(model_answer) == analysis_size(final_ans)
+        return _normalise(model_answer) == _normalise(final_ans)
+    except Exception:
+        return False
+
+
+def create_sh_file(filename: str, cmds: str) -> None:
+    """Write `cmds` to `filename` as an executable script.
+
+    Windows line endings in `cmds` are converted to ``\\n`` and the file mode
+    is set to 0o755.
+    """
+    script_text = cmds.replace('\r\n', '\n')
+    with open(filename, mode='w', encoding='utf-8') as script:
+        script.write(script_text)
+    os.chmod(filename, 0o755)
+
+
+def try_parse_answer(act) -> str | None:
+    """Extract the text inside the first ``<solution>`` tag of an agent action.
+
+    Only actions whose source is ``'agent'`` are considered: the content of a
+    `MessageAction`, or the thought of a `CmdRunAction`.
+
+    Returns:
+        The stripped answer text, or None when the action is of another kind,
+        comes from another source, or carries no ``<solution>`` tag.
+    """
+    if isinstance(act, MessageAction) and act.source == 'agent':
+        text = act.content
+    elif isinstance(act, CmdRunAction) and act.source == 'agent':
+        text = act.thought
+    else:
+        return None
+    found = re.search(r'<solution>(.*?)</solution>', text)
+    return found.group(1).strip() if found else None
--- a/evaluation/agent_bench/run_infer.py
+++ b/evaluation/agent_bench/run_infer.py
@@ -0,0 +1,405 @@
+import asyncio
+import json
+import logging
+import multiprocessing as mp
+import os
+import pathlib
+import re
+import shutil
+import subprocess
+import time
+from concurrent.futures import ProcessPoolExecutor
+
+import docker
+from datasets import load_dataset
+from tqdm import tqdm
+
+from evaluation.agent_bench.helper import (
+    compare_results,
+    create_sh_file,
+    try_parse_answer,
+)
+from opendevin.controller.state.state import State
+from opendevin.core.config import args, config, get_llm_config_arg
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import main
+from opendevin.events.action import CmdRunAction, MessageAction
+from opendevin.events.serialization.event import event_to_dict
+from opendevin.runtime.docker.ssh_box import DockerSSHBox
+
+
+def cleanup():
+    """Terminate every child process that is still alive and wait for it to exit."""
+    print('Cleaning up child processes...')
+    children = mp.active_children()
+    for child in children:
+        print(f'Terminating child process: {child.name}')
+        child.terminate()
+        child.join()
+
+
+def codeact_user_response(state: State) -> str:
+    """Produce the simulated user reply for CodeActAgent.
+
+    Returns ``'/exit'`` when the most recent action already carries a parsed
+    ``<solution>`` answer. Otherwise returns the standard "keep going" prompt,
+    extended with a give-up hint once the history holds at least two user
+    messages.
+    """
+    msg = (
+        'Please continue working on the task on whatever approach you think is suitable.\n'
+        'If you think you have solved the task, please first send your answer to user through '
+        'message and then <execute_bash> exit </execute_bash>.\n'
+        'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
+        'For example: The answer to the question is <solution> 42 </solution>.\n'
+        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
+    )
+    if not state.history:
+        return msg
+
+    # an answer in the last action means the agent is done: exit early
+    last_action, _ = state.history[-1]
+    if try_parse_answer(last_action) is not None:
+        return '/exit'
+
+    user_msg_count = sum(
+        1
+        for action, _ in state.history
+        if isinstance(action, MessageAction) and action.source == 'user'
+    )
+    if user_msg_count < 2:
+        return msg
+    # after repeated user prompts, tell the agent it is allowed to give up
+    return msg + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
+
+
+# Maps an agent class name to the function used as the fake user response
+# when running that agent.
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+}
+
+# Maps an agent class name to extra text appended to the task instruction.
+AGENT_CLS_TO_INST_SUFFIX = {
+    'CodeActAgent': 'When you think you have solved the question, '
+    'please first send your answer to user through message and then exit.\n'
+}
+
+
+def process_instance(
+    instance,
+    agent_class,
+    metadata,
+    eval_output_dir,
+    reset_logger: bool = True,
+):
+    """Run the agent on one AgentBench instance and score its answer.
+
+    Args:
+        instance: dataset row; the fields read here are `instance_id`,
+            `description`, `init`, `get_agent_result`, `ground_truth`,
+            `get_ground_truth`, `comparison_method` and `to_dict()`.
+        agent_class: agent class name, used to pick the fake-user response
+            function and the instruction suffix.
+        metadata: run metadata, stored unchanged in the output record.
+        eval_output_dir: directory whose `logs/` sub-folder receives the
+            per-instance log file.
+        reset_logger: when True, redirect the shared logger to a per-instance
+            log file.
+
+    Returns:
+        A dict with the instance, instruction, metadata, event history,
+        metrics, error and a `test_result` entry holding the comparison.
+
+    Raises:
+        ValueError: if the agent run yields no final state.
+    """
+    # =============================================
+    # preparation
+    # =============================================
+
+    inst_id = instance.instance_id
+    question = instance.description
+    # create a directory for the instance's workspace
+    instance_workspace = str(os.path.join(config.workspace_base, inst_id))
+    container_inst_workspace = str(
+        os.path.join(config.workspace_mount_path_in_sandbox, inst_id)
+    )
+    if os.path.exists(instance_workspace):
+        shutil.rmtree(instance_workspace)
+    os.makedirs(instance_workspace, exist_ok=True)
+
+    # Set up the logger properly, so you can run multiprocessing to parallel the evaluation
+    if reset_logger:
+        # Set up logger
+        log_file = os.path.join(eval_output_dir, 'logs', f'instance_{inst_id}.log')
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        # add back the console handler to print ONE line
+        logger.addHandler(get_console_handler())
+        logger.info(
+            f'Starting evaluation for instance {inst_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setFormatter(
+            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        )
+        logger.addHandler(file_handler)
+
+    # =============================================
+    # build instruction
+    # =============================================
+
+    # Prepare instruction
+    instruction = (
+        'Please fix the following issue.\n'
+        'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
+        'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
+        'For example: The answer to the question is <solution> 42 </solution>.\n'
+        '# Problem \n'
+        f'{question}\n\n'
+    )
+    instruction += (
+        'IMPORTANT: You should ONLY interact with the environment provided '
+        'to you AND NEVER ASK FOR HUMAN HELP.\n'
+    )
+    # NOTE: You can actually set slightly different instruction for different agents
+    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+
+    # =============================================
+    # create sandbox and run the agent
+    # =============================================
+
+    sandbox = DockerSSHBox()
+    # Everything that uses the sandbox runs under try/finally so the container
+    # and the host workspace are released even when the run or scoring raises.
+    try:
+        # NOTE(review): this uses the bare instance id rather than
+        # container_inst_workspace - confirm the sandbox's starting directory.
+        sandbox.execute(f'cd {inst_id}')
+
+        init_cmd = instance.init
+        if init_cmd is not None:
+            scpt_name = f'{instance.instance_id}_init.sh'
+            scpt_path = os.path.join(container_inst_workspace, scpt_name)
+            host_scpt_path = os.path.join(instance_workspace, scpt_name)
+            create_sh_file(host_scpt_path, init_cmd)
+            logger.info(f'Running init script: {scpt_path}')
+            _, init_res = sandbox.execute(scpt_path)
+            logger.info(f'Init script result: {init_res}')
+
+        # Here's how you can run the agent (similar to the `main` function) and get the final task state
+        state: State = asyncio.run(
+            main(
+                instruction,
+                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
+                    agent_class
+                ),
+                sandbox=sandbox,
+            )
+        )
+
+        if state is None:
+            raise ValueError('State should not be None.')
+
+        # get the ground truth
+        # OSBenchSSHBox.get_ground_truth(instance, state)
+
+        # =============================================
+        # result evaluation
+        # =============================================
+
+        agent_answer = ''
+        get_agent_result_cmd = instance.get_agent_result
+        if get_agent_result_cmd is not None:
+            scpt_name = f'{instance.instance_id}_get_agent_result.sh'
+            scpt_path = os.path.join(container_inst_workspace, scpt_name)
+            host_scpt_path = os.path.join(instance_workspace, scpt_name)
+            create_sh_file(host_scpt_path, get_agent_result_cmd)
+            logger.info(f'Running get agent result cmd: {scpt_path}')
+            _, agent_answer = sandbox.execute(scpt_path)
+        else:
+            logger.info('Retrieving agent answer from history.')
+            raw_ans = ''
+            # the most recent agent message or command thought holds the answer
+            for act, _ in reversed(state.history):
+                if isinstance(act, MessageAction) and act.source == 'agent':
+                    raw_ans = act.content
+                    break
+                if isinstance(act, CmdRunAction) and act.source == 'agent':
+                    raw_ans = act.thought
+                    break
+            agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans)
+            if len(agent_answer) == 0:
+                logger.warning(f'Failed to parse model answer: {raw_ans}')
+                agent_answer = raw_ans
+            else:
+                agent_answer = agent_answer[0]
+
+        final_ans = ''
+        if instance.ground_truth is not None:
+            final_ans = instance.ground_truth
+        else:
+            get_ground_truth_cmd = instance.get_ground_truth
+            if get_ground_truth_cmd is not None:
+                scpt_name = f'{instance.instance_id}_get_ground_truth.sh'
+                scpt_path = os.path.join(container_inst_workspace, scpt_name)
+                host_scpt_path = os.path.join(instance_workspace, scpt_name)
+                create_sh_file(host_scpt_path, get_ground_truth_cmd)
+                logger.info(f'Running get ground truth cmd: {scpt_path}')
+                sandbox.execute(f'cd {container_inst_workspace}')
+                _, final_ans = sandbox.execute(scpt_path)
+
+        comparison_method = instance.comparison_method
+        logger.info(
+            f'Final message: {agent_answer} | Ground truth: {final_ans} | Comparison method: {comparison_method}'
+        )
+        test_result = compare_results(comparison_method, agent_answer, final_ans)
+
+        histories = [
+            (event_to_dict(action), event_to_dict(obs))
+            for action, obs in state.history
+        ]
+        metrics = state.metrics.get() if state.metrics else None
+
+        # Save the output
+        output = {
+            'instance_id': inst_id,
+            'instance': instance.to_dict(),
+            'instruction': instruction,
+            'metadata': metadata,
+            'history': histories,
+            'metrics': metrics,
+            'error': state.error if state and state.error else None,
+            'test_result': {
+                'agent_answer': agent_answer,
+                'final_answer': final_ans,
+                'check_method': comparison_method,
+                'result': test_result,
+            },
+        }
+    finally:
+        try:
+            # clean up
+            if os.path.exists(instance_workspace):
+                shutil.rmtree(instance_workspace)
+        finally:
+            # Close the sandbox
+            try:
+                sandbox.close()
+            except docker.errors.NotFound as e:
+                logger.error(f'Failed to close sandbox: {e}')
+    return output
+
+
+# Script entry point: load the AgentBench `osbench` split, skip instances that
+# already have a line in output.jsonl, run the remaining ones in a process
+# pool, and append one JSON line per finished instance.
+if __name__ == '__main__':
+    # =============================================
+    # load datasets
+    # =============================================
+
+    dataset = load_dataset('iFurySt/AgentBench')
+    agent_bench_tests = dataset['osbench'].to_pandas()
+    logger.info(f'Loaded {len(agent_bench_tests)} tests.')
+
+    # =============================================
+    # handle arguments and prepare for evaluation
+    # =============================================
+
+    if args.llm_config:
+        specified_llm_config = get_llm_config_arg(args.llm_config)
+        if specified_llm_config:
+            config.llm = specified_llm_config
+    logger.info(f'Config for evaluation: {config}')
+
+    # TEST METADATA
+    agent_cls = args.agent_cls
+    assert (
+        agent_cls in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
+    ), f'Unsupported agent class: {agent_cls}'
+    model_name = config.llm.model.split('/')[-1]
+    max_iterations = args.max_iterations
+    eval_note = ''
+    if args.eval_note is not None:
+        eval_note += '_N_' + args.eval_note
+    eval_op_dir = str(
+        os.path.join(
+            args.eval_output_dir,
+            'agent_bench',
+            agent_cls,
+            model_name + '_maxiter_' + str(max_iterations) + eval_note,
+        )
+    )
+
+    pathlib.Path(eval_op_dir).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(str(os.path.join(eval_op_dir, 'logs'))).mkdir(
+        parents=True, exist_ok=True
+    )
+    logger.info(f'Using evaluation output directory: {eval_op_dir}')
+
+    meta = {
+        'agent_class': agent_cls,
+        'model_name': model_name,
+        'max_iterations': max_iterations,
+        'eval_output_dir': eval_op_dir,
+        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        # get the commit id of current repo for reproducibility
+        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+        .decode('utf-8')
+        .strip(),
+    }
+    logger.info(f'Metadata: {meta}')
+    with open(os.path.join(eval_op_dir, 'metadata.json'), 'w') as f:
+        json.dump(meta, f)
+
+    # LIMIT EVALUATION
+    eval_n_limit = args.eval_n_limit
+    if eval_n_limit:
+        agent_bench_tests = agent_bench_tests[:eval_n_limit]
+        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
+
+    # OUTPUT FILE
+    output_file = os.path.join(eval_op_dir, 'output.jsonl')
+    logger.info(f'Writing evaluation output to {output_file}')
+    finished_instance_ids = set()
+    if os.path.exists(output_file):
+        with open(output_file, 'r') as f:
+            for line in f:
+                data = json.loads(line)
+                finished_instance_ids.add(data['instance_id'])
+        logger.warning(
+            f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
+        )
+    output_fp = open(output_file, 'a')
+
+    logger.info(
+        f'Evaluation started with Agent {agent_cls}, model {model_name}, max iterations {max_iterations}.'
+    )
+
+    # =============================================
+    # filter out finished instances
+    # =============================================
+
+    new_agent_bench_tests = []
+    for idx, inst in agent_bench_tests.iterrows():
+        if inst.instance_id in finished_instance_ids:
+            logger.info(
+                f'Skipping instance {inst.instance_id} as it is already finished.'
+            )
+            continue
+        new_agent_bench_tests.append(inst)
+
+    agent_bench_tests = new_agent_bench_tests
+    logger.info(
+        f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(agent_bench_tests)}'
+    )
+
+    # =============================================
+    # start task
+    # =============================================
+
+    pbar = tqdm(total=len(agent_bench_tests))
+
+    # This function tracks the progress AND write the output to a JSONL file
+    def update_progress(fut):
+        pbar.update(1)
+        output = fut.result()
+        pbar.set_description(f'Instance {output["instance_id"]}')
+        pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
+        logger.info(
+            f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]["result"]}'
+        )
+        output_fp.write(json.dumps(output) + '\n')
+        output_fp.flush()
+
+    # This sets the multiprocessing
+    num_workers = args.eval_num_workers
+    logger.info(f'Using {num_workers} workers for evaluation.')
+
+    try:
+        with ProcessPoolExecutor(num_workers) as executor:
+            futures = []
+            # This is how we perform multiprocessing
+            for inst in agent_bench_tests:
+                future = executor.submit(
+                    process_instance,
+                    inst,
+                    agent_cls,
+                    meta,
+                    eval_op_dir,
+                    reset_logger=bool(num_workers > 1),
+                )
+                future.add_done_callback(update_progress)
+                futures.append(future)
+
+            # Wait for all futures to complete
+            for future in futures:
+                future.result()
+    except KeyboardInterrupt:
+        print('KeyboardInterrupt received. Cleaning up...')
+        cleanup()
+    finally:
+        # Always release the output file, including when a worker raised and
+        # the exception is about to propagate out of the script.
+        output_fp.close()
+
+    logger.info('Evaluation finished.')
--- a/evaluation/agent_bench/scripts/run_infer.sh
+++ b/evaluation/agent_bench/scripts/run_infer.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# Usage: ./evaluation/agent_bench/scripts/run_infer.sh [model_config] [agent] [eval_limit]
+MODEL_CONFIG=$1
+AGENT=$2
+EVAL_LIMIT=$3
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
+# We need to track the version of Agent in the evaluation to make sure results are comparable
+AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+COMMAND="export PYTHONPATH=evaluation/agent_bench:\$PYTHONPATH && poetry run python evaluation/agent_bench/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --max-iterations 30 \
+  --max-chars 10000000 \
+  --eval-num-workers 5 \
+  --eval-note $AGENT_VERSION"
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+# Run the command; print it first so the exact invocation is visible in the
+# output, and quote it so eval receives the string unmodified.
+echo "$COMMAND"
+eval "$COMMAND"
--- a/evaluation/agent_bench/scripts/summarise_results.py
+++ b/evaluation/agent_bench/scripts/summarise_results.py
@@ -0,0 +1,37 @@
+import json
+import sys
+
+
+def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
+    """Split the instances of a results JSONL file into passed and failed.
+
+    Args:
+        res_file_path: path to a file with one JSON object per line; each
+            object must have an `instance_id` key.
+
+    Returns:
+        ``(passed, failed)`` lists of instance ids in file order. An instance
+        passes only when ``test_result.result`` is present and truthy.
+
+    Raises:
+        json.JSONDecodeError: if a non-blank line is not valid JSON.
+        KeyError: if a record has no `instance_id`.
+    """
+    passed = []
+    failed = []
+    with open(res_file_path, 'r') as file:
+        for line in file:
+            line = line.strip()
+            if not line:
+                # tolerate blank lines, e.g. a trailing newline added by hand
+                continue
+            data = json.loads(line)
+            instance_id = data['instance_id']
+            # `or {}` also covers a record whose test_result is null
+            resolved = (data.get('test_result') or {}).get('result', False)
+            if resolved:
+                passed.append(instance_id)
+            else:
+                failed.append(instance_id)
+    return passed, failed
+
+
+# Command-line entry point: print pass/fail counts, the resolve rate and the
+# instance ids in each group for one results JSONL file.
+if __name__ == '__main__':
+    if len(sys.argv) != 2:
+        print(
+            'Usage: poetry run python summarise_results.py <path_to_output_jsonl_file>'
+        )
+        sys.exit(1)
+    json_file_path = sys.argv[1]
+    passed_tests, failed_tests = extract_test_results(json_file_path)
+    total_tests = len(passed_tests) + len(failed_tests)
+    # An empty results file has no tests; report 0.0 instead of dividing by zero.
+    succ_rate = len(passed_tests) / total_tests if total_tests else 0.0
+    print(
+        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {succ_rate}'
+    )
+    print('PASSED TESTS:')
+    print(passed_tests)
+    print('FAILED TESTS:')
+    print(failed_tests)
--- a/evaluation/biocoder/README.md
+++ b/evaluation/biocoder/README.md
@@ -0,0 +1,59 @@
+# BioCoder Evaluation with Opendevin
+
+Implements evaluation of agents on the BioCoder benchmark introduced in [BioCoder: A Benchmark for Bioinformatics Code Generation with Large Language Models](https://arxiv.org/abs/2308.16458). Please see [here](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/tasks/humanevalpack.py) for the reference implementation used in the paper.
+
+## Setup Environment
+
+Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
+
+
+## Configure OpenDevin and your LLM
+Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md) for how to set this up.
+
+## BioCoder Docker Image
+In the opendevin branch of the Biocoder repository, we have slightly modified our original Docker image to work with the OpenDevin environment. In the Docker image are testing scripts (`/testing/start_test_opendevin.py` and aux files in `/testing_files/`) to assist with evaluation. Additionally, we have installed all dependencies, including OpenJDK, mamba (with Python 3.6), and many system libraries. Notably, we have **not** packaged all repositories into the image, so they are downloaded at runtime.
+
+**Before first execution, pull our Docker image with the following command**
+```bash
+docker pull public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0
+```
+
+To reproduce this image, please see the Dockerfile_Opendevin in the `biocoder` repository.
+
+## Start the evaluation
+
+
+```bash
+./evaluation/biocoder/scripts/run_infer.sh [model_config] [agent] [eval_limit]
+```
+
+where `model_config` is mandatory, while `agent` and `eval_limit` are optional.
+
+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
+LLM settings, as defined in your `config.toml`.
+
+- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
+to `CodeActAgent`.
+
+- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default it infers all instances.
+
+Let's say you'd like to run 1 instance using `eval_gpt4o_2024_05_13` and CodeActAgent,
+then your command would be:
+
+## Examples
+
+```bash
+./evaluation/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 CodeActAgent 1
+```
+
+## Reference
+```
+@misc{tang2024biocoder,
+      title={BioCoder: A Benchmark for Bioinformatics Code Generation with Large Language Models},
+      author={Xiangru Tang and Bill Qian and Rick Gao and Jiakang Chen and Xinyun Chen and Mark Gerstein},
+      year={2024},
+      eprint={2308.16458},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG}
+}
+```
--- a/evaluation/biocoder/biocoder_env_box.py
+++ b/evaluation/biocoder/biocoder_env_box.py
@@ -0,0 +1,396 @@
+import json
+import os
+import re
+import sys
+from collections import defaultdict
+from dataclasses import dataclass
+
+from datasets import load_dataset
+
+from opendevin.core.config import config
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.runtime.docker.ssh_box import DockerSSHBox
+from opendevin.runtime.plugins import (
+    JupyterRequirement,
+    PluginRequirement,
+    SWEAgentCommandsRequirement,
+)
+
+BIOCODER_BENCH_CONTAINER_IMAGE = 'public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0'
+
+
+@dataclass
+class BiocoderData:
+    """Plain record holding the fields of one BioCoder instance."""
+
+    filePath: str
+    numLines: int
+    lineStart: int
+    lineEnd: int
+    signature: str
+    comment: str
+    content: str
+    repository: str
+    promptSummaryOnly: str
+    contextCode: str
+    goldenCode: str
+    test_case_id: str
+    language: str
+
+    def to_dict(self):
+        """Return all fields as a dict keyed by field name, in declaration order."""
+        field_names = (
+            'filePath',
+            'numLines',
+            'lineStart',
+            'lineEnd',
+            'signature',
+            'comment',
+            'content',
+            'repository',
+            'promptSummaryOnly',
+            'contextCode',
+            'goldenCode',
+            'test_case_id',
+            'language',
+        )
+        return {name: getattr(self, name) for name in field_names}
+
+
+def get_likely_indent_size(array_of_tabs) -> int:
+    """Guess the indent step from a sequence of per-line indent widths.
+
+    Counts every positive increase between consecutive entries and returns the
+    most frequent one (the first seen wins a tie). Returns 4 when there is no
+    increase at all.
+    """
+    jump_counts = defaultdict(int)
+    for idx in range(1, len(array_of_tabs)):
+        step = array_of_tabs[idx] - array_of_tabs[idx - 1]
+        if step > 0:
+            jump_counts[step] += 1
+    if not jump_counts:
+        return 4
+    return int(max(jump_counts, key=jump_counts.get))
+
+
+class BiocoderSSHBox(DockerSSHBox):
+    def __init__(
+        self,
+        container_image: str,
+        timeout: int = 120,
+        sid: str | None = None,
+        biocoder_instance_id: str | None = None,
+        biocoder_instance: BiocoderData | None = None,
+        skip_workspace_mount: bool = True,
+        sandbox_plugins: list[PluginRequirement] = [],  # noqa: B006
+        biocoder_cache_folder: str = 'biocoder_cache',
+        workspace_dir_name: str | None = None,
+    ):
+        """Store the BioCoder settings, start the underlying SSH box and load plugins.
+
+        `container_image`, `timeout` and `sid` are forwarded to the parent
+        constructor; `sandbox_plugins` is passed to `init_plugins` afterwards.
+
+        Raises:
+            ValueError: if `biocoder_instance_id` is None.
+            AssertionError: if `container_image` is None.
+        """
+        if biocoder_instance_id is None:
+            raise ValueError('biocoder_instance_id must be provided')
+        self.biocoder_instance_id = biocoder_instance_id
+        self.biocoder_instance = biocoder_instance
+        self.skip_workspace_mount = skip_workspace_mount
+        self.biocoder_cache_folder = biocoder_cache_folder
+        # set by remove_code(); read by get_changed_code() as its stop marker
+        self.first_line_after_removed = None
+        self.workspace_dir_name = workspace_dir_name
+        # snapshot of the global config values at construction time
+        self.workspace_base = config.workspace_base
+        self.workspace_mount_path = config.workspace_mount_path
+        # self.workspace_dir_name_host = os.path.join(config.workspace_base, workspace_dir_name)
+
+        # not assigned in this constructor beyond None; generated_path is read
+        # by copy_changed_code()
+        self.context_path = None
+        self.generated_path = None
+        self.golden_path = None
+
+        assert (
+            container_image is not None
+        ), 'container_image is required for BiocoderBenchSSHBox!'
+        # NOTE(review): the attributes above are set before the parent
+        # constructor runs - presumably because it may read `volumes`; confirm.
+        super().__init__(container_image, timeout, sid)
+        self.init_plugins(sandbox_plugins)
+
+    @property
+    def volumes(self):
+        """Parent volume mapping, minus the workspace mount when it is skipped.
+
+        With `skip_workspace_mount` set, entries whose ``'bind'`` value equals
+        `self.sandbox_workspace_dir` are filtered out.
+        """
+        inherited = super().volumes
+        if not self.skip_workspace_mount:
+            return inherited
+        return {
+            host_path: spec
+            for host_path, spec in inherited.items()
+            if spec['bind'] != self.sandbox_workspace_dir
+        }
+
+    def get_target_filepath(self):
+        """Return the path of the instance's source file under the workspace mount.
+
+        The path is built from `workspace_mount_path`, the part of
+        `repository` after the first ``/``, and the instance's `filePath`.
+        """
+        instance = self.biocoder_instance
+        repo_dir = instance.repository.split('/')[1]
+        return os.path.join(self.workspace_mount_path, repo_dir, instance.filePath)
+
+    def get_changed_code(self, include_signature=False):
+        """Return the edited code region of the target file as one string.
+
+        Lines are collected starting at index `lineStart` (one line earlier
+        when `include_signature` is True) and stop just before the first line
+        whose stripped text equals the stripped `first_line_after_removed`
+        marker. When no marker has been recorded a warning is logged and the
+        rest of the file is returned instead of failing on the missing marker.
+
+        Args:
+            include_signature: also include the line before `lineStart`.
+
+        Returns:
+            The selected lines joined with ``\\n``.
+        """
+        target_filepath = self.get_target_filepath()
+        selected_lines = []
+        offset = 1 if include_signature else 0
+        if self.first_line_after_removed is None:
+            logger.warning('First line after removed is None')
+            stop_marker = None
+        else:
+            # strip once up front instead of on every loop iteration
+            stop_marker = self.first_line_after_removed.strip()
+        with open(target_filepath, 'r') as f:
+            lines = f.read().split('\n')
+            for i in range(self.biocoder_instance.lineStart - offset, len(lines)):
+                if stop_marker is not None and lines[i].strip() == stop_marker:
+                    break
+                selected_lines.append(lines[i])
+        text = '\n'.join(selected_lines)
+        return text
+
+    def copy_changed_code(self):
+        """Write the edited code (with its signature line) to `generated_path`,
+        then copy the cache folder's contents to ``/testing_files`` in the sandbox.
+        """
+        snippet = self.get_changed_code(include_signature=True)
+        with open(self.generated_path, 'w') as out:
+            out.write(snippet)
+        self.execute_and_check(
+            f'cp -r /workspace/{self.biocoder_cache_folder}/* /testing_files',
+            'Failed to copy the files',
+        )
+
+    def remove_code(self):
+        """Replace the target function's body with a TODO placeholder, in place.
+
+        Reads the file from `get_target_filepath()`, keeps its first
+        `lineStart` lines (the last of which is treated as the signature
+        line), inserts an indented TODO comment followed by two empty lines,
+        then keeps everything from index `lineEnd` on and writes the result
+        back. Also records `self.first_line_after_removed`.
+
+        Raises:
+            KeyError: if the instance language is neither python nor java.
+        """
+        comment_prefix = {'python': '#', 'java': '//'}
+
+        target_filepath = self.get_target_filepath()
+        line_start = self.biocoder_instance.lineStart
+        line_end = self.biocoder_instance.lineEnd
+        with open(target_filepath, 'r') as f:
+            lines = f.read().split('\n')
+            # print("="*10+"ORIGINAL"+"="*10)
+            # print("\n".join(lines))
+            signature_line = lines[line_start - 1]
+
+            # get the number of tabs
+            def get_indent_size(s: str):
+                return len(re.match(r'\s*', s).group())
+
+            indent_sizes = list(map(get_indent_size, lines))
+            indent_size = get_likely_indent_size(indent_sizes)
+            # the placeholder sits one indent level deeper than the signature
+            comment_indent_size = get_indent_size(signature_line) + indent_size
+            lines = (
+                lines[:line_start]
+                + [
+                    f"{' '*comment_indent_size+comment_prefix[self.biocoder_instance.language.lower()]}TODO: replace with your code here"
+                ]
+                + ([''] * 2)
+                + lines[line_end:]
+            )
+        # NOTE(review): the scan starts at index line_start, which is the TODO
+        # line inserted above, so the marker looks like it is always that TODO
+        # line - confirm whether the scan should start after the placeholder.
+        first_line_after_removed_index = line_start
+        # Test the bound before indexing so running past the last line cannot
+        # raise IndexError.
+        while (
+            first_line_after_removed_index < len(lines)
+            and len(lines[first_line_after_removed_index].strip()) == 0
+        ):
+            first_line_after_removed_index += 1
+        if first_line_after_removed_index < len(lines):
+            self.first_line_after_removed = lines[first_line_after_removed_index]
+        # print("FIRST LINE AFTER REMOVED: ", self.first_line_after_removed)
+
+        with open(target_filepath, 'w') as f:
+            f.write('\n'.join(lines))
+
+        # with open(target_filepath, 'r') as f:
+        #     print("="*10+"MODIFIED"+"="*10)
+        #     print(f.read())
+
+    def execute_and_check(self, cmd: str, error_msg: str) -> tuple[int, str]:
+        """Run `cmd` via `self.execute` and insist that it succeeds.
+
+        On a non-zero exit code, `error_msg` is logged and the whole process
+        is terminated with exit status 1.
+
+        Returns:
+            The ``(exit_code, output)`` pair of the successful command.
+        """
+        exit_code, output = self.execute(cmd)
+        if exit_code == 0:
+            return exit_code, output
+        logger.error(error_msg)
+        sys.exit(1)
+
+    @classmethod
+    def get_box_for_instance(
+        cls,
+        instance,
+        workspace_dir_name=None,
+        skip_workspace_mount: bool = False,
+        workspace_mount_path: str | None = None,
+        sandbox_plugins: list[PluginRequirement] = [],  # noqa: B006
+    ) -> 'BiocoderSSHBox':
+        """Create a sandbox for one BioCoder instance and prepare its workspace.
+
+        Writes the instance's context/golden code and a test-case descriptor
+        into a cache folder under the per-instance workspace, constructs the
+        sandbox, downloads and unpacks the repository archive into /workspace
+        and copies the cache folder into /testing_files.
+
+        Args:
+            instance: Instance record; this method reads `repository`,
+                `test_case_id`, `language`, `contextCode` and `goldenCode`.
+            workspace_dir_name: Directory name under `config.workspace_base`;
+                derived from the instance and the current PID when None.
+            skip_workspace_mount: Forwarded to the sandbox constructor.
+            workspace_mount_path: Accepted for call-site compatibility; it is
+                not read by this method.
+            sandbox_plugins: Forwarded to the sandbox constructor. It is never
+                mutated here, which is why the shared default list is safe.
+
+        Returns:
+            The sandbox, with `context_path`, `generated_path` and
+            `golden_path` pointing at the host-side cache files.
+
+        Raises:
+            KeyError: If `instance.language` is not a supported language.
+            SystemExit: If any in-container setup command fails (raised by
+                `execute_and_check`).
+        """
+        if workspace_dir_name is None:
+            workspace_dir_name = f'{instance.repository}__{instance.test_case_id[:10]}__{os.getpid()}'.replace(
+                '/', '__'
+            )
+
+        workspace_base = str(os.path.join(config.workspace_base, workspace_dir_name))
+        old_workspace_base = config.workspace_base
+        old_workspace_mount_path = config.workspace_mount_path
+
+        try:
+            # Point the global config at the per-instance workspace only while
+            # the sandbox is being constructed; it is restored in `finally`.
+            config.workspace_base = workspace_base
+            config.workspace_mount_path = workspace_base
+
+            # linting python after editing helps LLM fix indentations
+            config.enable_auto_lint = True
+
+            # create folder for transferring files back/forth
+            biocoder_cache_folder = 'biocoder_cache'
+            os.makedirs(
+                os.path.join(workspace_base, biocoder_cache_folder), exist_ok=True
+            )
+
+            file_ext = {
+                'python': 'py',
+                'java': 'java',
+                'c': 'c',
+                'cpp': 'cpp',
+                'javascript': 'js',
+                'typescript': 'ts',
+            }[instance.language.lower()]
+
+            context_path = os.path.join(
+                workspace_base, biocoder_cache_folder, 'context.' + file_ext
+            )
+            generated_path = os.path.join(
+                workspace_base, biocoder_cache_folder, 'generated.' + file_ext
+            )
+            golden_path = os.path.join(
+                workspace_base, biocoder_cache_folder, 'golden.' + file_ext
+            )
+
+            with open(context_path, 'w') as f:
+                f.write(instance.contextCode)
+            # NOTE(review): the "generated" file is seeded with the golden
+            # code, so the tests pass until it is overwritten with the agent's
+            # code - confirm callers always overwrite it before scoring.
+            with open(generated_path, 'w') as f:
+                f.write(instance.goldenCode)
+            with open(golden_path, 'w') as f:
+                f.write(instance.goldenCode)
+
+            testcase_json = {
+                'test_case_id': instance.test_case_id,
+                'num_cases': 1000,
+                'language': instance.language.lower(),
+            }
+
+            with open(
+                os.path.join(
+                    workspace_base, biocoder_cache_folder, 'testcase_biocoder.json'
+                ),
+                'w',
+            ) as f:
+                f.write(json.dumps(testcase_json, indent=4))
+
+            sandbox = cls(
+                container_image=BIOCODER_BENCH_CONTAINER_IMAGE,
+                biocoder_instance_id=instance.test_case_id,
+                biocoder_instance=instance,
+                skip_workspace_mount=skip_workspace_mount,
+                sandbox_plugins=sandbox_plugins,
+                biocoder_cache_folder=biocoder_cache_folder,
+                workspace_dir_name=workspace_dir_name,
+            )
+        finally:
+            config.workspace_base = old_workspace_base
+            config.workspace_mount_path = old_workspace_mount_path
+
+        sandbox.context_path = context_path
+        sandbox.generated_path = generated_path
+        sandbox.golden_path = golden_path
+
+        logger.info(f'SSH box started for instance {instance.test_case_id}.')
+        # cd to the workspace
+        exit_code, output = sandbox.execute_and_check(
+            'cd /workspace', 'Failed to cd to workspace'
+        )
+        logger.info(f'cd to workspace: {output}')
+
+        # download repository archive
+        repository_url = f"https://biocoder.lilbillbiscuit.com/repos/{instance.repository.split('/')[1]}.zip"
+        exit_code, output = sandbox.execute_and_check(
+            'wget -O repo.zip ' + repository_url, 'Failed to download the repository'
+        )
+        logger.info(f'Downloaded the repository: {output}')
+        exit_code, output = sandbox.execute_and_check(
+            'unzip -o -q repo.zip', 'Failed to unzip the repository'
+        )
+        logger.info(f'Unzipped the repository: {output}')
+
+        # copy the context, generated and golden files to the /testing_files folder
+        exit_code, output = sandbox.execute_and_check(
+            f'cp -r /workspace/{biocoder_cache_folder}/* /testing_files',
+            'Failed to copy the files',
+        )
+
+        # chmod 777
+        exit_code, output = sandbox.execute_and_check(
+            'chmod -R 777 /workspace',
+            'Failed to chmod the files',
+        )
+
+        return sandbox
+
+
+if __name__ == '__main__':
+    # Manual smoke test: build a sandbox for the first test instance, run the
+    # test harness once, then drop into a tiny interactive shell.
+    biocoder_dataset = load_dataset('Lilbillbiscuit/biocoder_public')
+    EXAMPLE_INSTANCE = biocoder_dataset['test'][0]
+    EXAMPLE_INSTANCE = BiocoderData(**EXAMPLE_INSTANCE)
+
+    sandbox = BiocoderSSHBox.get_box_for_instance(
+        instance=EXAMPLE_INSTANCE,
+        workspace_mount_path='/home/ubuntu/OpenDevinBioCoder/workspace',
+        skip_workspace_mount=False,
+        sandbox_plugins=[JupyterRequirement(), SWEAgentCommandsRequirement()],
+    )
+
+    # Everything below runs under try/finally so the sandbox is closed even
+    # when an assertion or a sandbox command raises.
+    try:
+        # PRE TEST
+        exit_code, output = sandbox.execute_and_check(
+            'cd /testing',
+            'Failed to cd /testing',
+        )
+        logger.info(f'cd $REPO_PATH: {output}')
+
+        exit_code, output = sandbox.execute_and_check(
+            'whoami',
+            'Failed to run whoami',
+        )
+        logger.info(f'whoami: {output}')
+
+        # TEST
+        exit_code, output = sandbox.execute(
+            '/home/devin/mambaforge/bin/mamba run -n test python3 /testing/start_test_opendevin.py'
+        )
+        assert exit_code == 0, 'Expected exit code 0 (this should have passed)'
+        logger.info(f'$TEST_CMD:\n{output}')
+
+        exit_code, output = sandbox.execute_and_check(
+            'cat /testing_files/results_biocoder.json', 'Failed to read the result file'
+        )
+
+        print(output)
+        json_obj = json.loads(output)
+        if json_obj['result'] == 'pass':
+            print('PASS')
+        else:
+            print('FAIL')
+
+        bg_cmd = sandbox.execute_in_background(
+            "while true; do echo 'dot ' && sleep 10; done"
+        )
+
+        sys.stdout.flush()
+        # Interactive loop: 'exit' (or EOF) quits, 'kill' stops the background
+        # command, anything else is executed inside the sandbox.
+        while True:
+            try:
+                user_input = input('>>> ')
+            except EOFError:
+                logger.info('Exiting...')
+                break
+            if user_input.lower() == 'exit':
+                logger.info('Exiting...')
+                break
+            if user_input.lower() == 'kill':
+                sandbox.kill_background(bg_cmd.pid)
+                logger.info('Background process killed')
+                continue
+            exit_code, output = sandbox.execute(user_input)
+            logger.info('exit code: %d', exit_code)
+            logger.info(output)
+            if bg_cmd.pid in sandbox.background_commands:
+                logs = sandbox.read_logs(bg_cmd.pid)
+                logger.info('background logs: %s', logs)
+            sys.stdout.flush()
+    except KeyboardInterrupt:
+        logger.info('Exiting...')
+    finally:
+        sandbox.close()
--- a/evaluation/biocoder/run_infer.py
+++ b/evaluation/biocoder/run_infer.py
@@ -0,0 +1,393 @@
+import asyncio
+import json
+import logging
+import multiprocessing as mp
+import os
+import pathlib
+import subprocess
+import time
+from concurrent.futures import ProcessPoolExecutor
+
+import pandas as pd
+from datasets import load_dataset
+from tqdm import tqdm
+
+import agenthub
+from evaluation.biocoder.biocoder_env_box import BiocoderData, BiocoderSSHBox
+from opendevin.controller.state.state import State
+from opendevin.core.config import args, config, get_llm_config_arg
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import main
+from opendevin.events.action import MessageAction
+from opendevin.events.serialization.event import event_to_dict
+
+
+def cleanup():
+    """Terminate every live child process of this process and wait for it."""
+    print('Cleaning up child processes...')
+    children = mp.active_children()
+    for child in children:
+        print(f'Terminating child process: {child.name}')
+        child.terminate()
+        # join() reaps the child after it has been asked to terminate.
+        child.join()
+
+
+def codeact_user_response(state: State) -> str:
+    """Return the canned reply sent to CodeActAgent in place of a human user.
+
+    The reply always tells the agent to keep working and how to finish. Once
+    `state.history` holds at least two user-sourced `MessageAction`s, a hint
+    on how to give up is appended.
+    """
+    base_reply = (
+        'Please continue working on the task on whatever approach you think is suitable.\n'
+        'If you think you have modified the code in a way that fixes the issue, please run the following command: <execute_bash> exit </execute_bash>.\n'
+        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
+    )
+    if not state.history:
+        return base_reply
+
+    user_turns = sum(
+        1
+        for action, _ in state.history
+        if isinstance(action, MessageAction) and action.source == 'user'
+    )
+    if user_turns < 2:
+        return base_reply
+    # The agent has already been nudged repeatedly; offer a way out.
+    return (
+        base_reply
+        + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
+    )
+
+
+def monologue_user_response(state: State) -> str:
+    """Placeholder reply function for MonologueAgent; always raises.
+
+    Raises:
+        NotImplementedError: Unconditionally.
+    """
+    reason = 'MonologueAgent should never ask for user responses.'
+    raise NotImplementedError(reason)
+
+
+# Agent class name -> function that produces the simulated user reply.
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = dict(
+    CodeActAgent=codeact_user_response,
+    MonologueAgent=monologue_user_response,
+)
+
+# Agent class name -> extra text appended to the task instruction.
+AGENT_CLS_TO_INST_SUFFIX = dict(
+    CodeActAgent='When you think you have fixed the issue through code changes, please run the following command: <execute_bash> exit </execute_bash>.\n'
+)
+
+
+def get_test_result(instance, sandbox, workspace_dir_name):
+    """Run the BioCoder test harness in `sandbox` and collect the outcome.
+
+    Args:
+        instance: The instance being evaluated (not read here).
+        sandbox: Sandbox exposing `get_changed_code`, `copy_changed_code`,
+            `execute` and `execute_and_check`.
+        workspace_dir_name: Unused; kept for call-site compatibility.
+
+    Returns:
+        A dict with `result` (the `result` field of the harness' JSON output,
+        or 'fail' when that output cannot be read or parsed) and `metadata`
+        (`1_copy_change_success`, `1_copy_change_code`, `2_run_test_success`,
+        `2_run_test_result`).
+    """
+    test_result = {'result': {}, 'metadata': {}}
+    try:
+        code = sandbox.get_changed_code(include_signature=True)
+        sandbox.copy_changed_code()
+        test_result['metadata']['1_copy_change_success'] = True
+        test_result['metadata']['1_copy_change_code'] = code
+    except Exception as e:
+        # Best effort: the failure is recorded and the tests still run.
+        # NOTE(review): confirm the harness cannot then score a stale
+        # "generated" file left over from sandbox setup.
+        logger.error(f'Error fetching changed code for this instance: {e}')
+        test_result['metadata']['1_copy_change_success'] = False
+        test_result['metadata']['1_copy_change_code'] = None
+
+    exit_code, output = sandbox.execute_and_check(
+        'cd /testing',
+        'Failed to cd /testing',
+    )
+    logger.info(f'cd $REPO_PATH: {output}')
+
+    exit_code, output = sandbox.execute_and_check(
+        'whoami',
+        'Failed to run whoami',
+    )
+    logger.info(f'whoami: {output}')
+
+    exit_code, output = sandbox.execute(
+        '/home/devin/mambaforge/bin/mamba run -n test python3 /testing/start_test_opendevin.py'
+    )
+    logger.info(f'$TEST_CMD:\n{output}')
+
+    # Use plain `execute` here: `execute_and_check` exits the whole process on
+    # failure, which would make the failure branch below unreachable.
+    exit_code, output = sandbox.execute('cat /testing_files/results_biocoder.json')
+    test_result['metadata']['2_run_test_success'] = exit_code == 0
+    test_result['metadata']['2_run_test_result'] = str(output)
+    if exit_code != 0:
+        logger.error('Failed to read the result file')
+        test_result['result'] = 'fail'
+        return test_result
+
+    try:
+        test_result['result'] = json.loads(output)['result']
+    except (json.JSONDecodeError, KeyError, TypeError) as e:
+        # A malformed or incomplete result file counts as a failed run.
+        logger.error(f'Failed to parse the result file: {e}')
+        test_result['metadata']['2_run_test_success'] = False
+        test_result['result'] = 'fail'
+
+    return test_result
+
+
+def process_instance(
+    instance,
+    agent_class,
+    metadata,
+    skip_workspace_mount,
+    eval_output_dir,
+    reset_logger: bool = True,
+):
+    """Evaluate one BioCoder instance end to end and return its output record.
+
+    Builds a sandbox for the instance, blanks out the target function, runs
+    the agent with a task instruction, then runs the test harness.
+
+    Args:
+        instance: Mapping of fields used to construct a `BiocoderData`.
+        agent_class: Agent class name; selects sandbox plugins, the fake user
+            response function and the instruction suffix.
+        metadata: Run-level metadata copied verbatim into the output.
+        skip_workspace_mount: When False, a per-process directory is created
+            under the workspace.
+        eval_output_dir: Directory whose `logs/` subfolder receives the
+            per-instance log file.
+        reset_logger: When True, route this process' logging to a
+            per-instance log file.
+
+    Returns:
+        A JSON-serialisable dict with the instruction, history, metrics,
+        error and test result for the instance.
+
+    Raises:
+        ValueError: If the agent run returns no state.
+    """
+    instance = BiocoderData(**instance)
+    print(instance)
+    workspace_dir_name = (
+        f'{instance.repository}__{instance.test_case_id[:10]}__{os.getpid()}'.replace(
+            '/', '__'
+        )
+    )
+    workspace_mount_path = os.path.join(config.workspace_base, workspace_dir_name)
+    # create process-specific workspace dir
+    # if `not skip_workspace_mount` - we will create a workspace directory for EACH process
+    # so that different agent don't interfere with each other.
+    if not skip_workspace_mount:
+        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
+        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        # Set up logger
+        log_file = os.path.join(
+            eval_output_dir, 'logs', f'instance_{instance.test_case_id}.log'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        # add back the console handler to print ONE line
+        logger.addHandler(get_console_handler())
+        logger.info(
+            f'Starting evaluation for instance {instance.test_case_id}.\nHint: run "tail -f {log_file}" to see live logs in a seperate shell'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setFormatter(
+            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        )
+        logger.addHandler(file_handler)
+
+    if not skip_workspace_mount:
+        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+
+    # The sandbox gets a workspace directory name without the PID suffix.
+    workspace_dir_name = f'{instance.repository}__{instance.test_case_id[:10]}'.replace(
+        '/', '__'
+    )
+    sandbox = BiocoderSSHBox.get_box_for_instance(
+        instance,
+        workspace_dir_name,
+        skip_workspace_mount=False,
+        workspace_mount_path=workspace_mount_path,
+        sandbox_plugins=agenthub.Agent.get_cls(agent_class).sandbox_plugins,
+    )
+
+    # From here on the sandbox must be closed on every exit path.
+    try:
+        sandbox.remove_code()
+
+        # Prepare instruction
+        instruction = (
+            f'Please complete the function "{instance.signature}" in the file /workspace/{instance.repository.split("/")[1]}/{instance.filePath}.\n'
+            f'The environment has been set up for you to start working. You may assume all necessary tools are installed.\n'
+            f'To complete the task, you must directly modify the file and fill in the function, keeping in mind that the function signature is on line {instance.lineStart-1}\n\n'
+            f'The function should do the following:\n'
+            f'{instance.promptSummaryOnly}\n\n'
+        )
+
+        instruction += (
+            'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
+            'You should NOT modify any other files other than the file intended. This means that you should NOT write any test cases.\n'
+            'You may need context from other files in the repository to complete this task.\n'
+            'Do NOT add any import statements or change anything else other than the writing the function body.\n'
+            'You do not need to run the code to check if it works. \n'
+            'Make sure to include proper formatting in Java and Python, including correct braces and/or indentation.\n'
+        )
+
+        # instruction = (
+        #     f'In the file {instance.filePath}, there is a function with a signature and without a body. Your job is to complete the function, according to the given instructions. When you complete the function, respond with the function body, and nothing else.'
+        #     'The repository has cloned for you to start working. You are not allowed to run any bash commands, just modify the files. \n\n'
+        #     '# Problem Statement\n'
+        #     'Complete the following function signature:\n\n'
+        #     f'{instance.signature}'
+        #     'The function should do the following:\n\n'
+        #     f'{instance.promptSummaryOnly}\n\n'
+        # )
+        #
+        # instruction += (
+        #     'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
+        #     'You should NOT modify any other files other than the file intended. This means that you should NOT write any test cases.\n'
+        #     'Do NOT add any import statements or change anything else other than the writing the function body.\n'
+        #     'You do not need to run the code to check if it works. The system will automatically check the correctness of your code.\n'
+        #     'Make sure to include proper formatting in Java and Python, including correct braces and/or indentation.\n'
+        # )
+
+        # NOTE: You can actually set slightly different instruction for different agents
+        instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+
+        # Here's how you can run the agent (similar to the `main` function) and get the final task state
+        state: State = asyncio.run(
+            main(
+                instruction,
+                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
+                    agent_class
+                ),
+                sandbox=sandbox,
+            )
+        )
+
+        test_result = get_test_result(instance, sandbox, workspace_dir_name)
+
+        if state is None:
+            raise ValueError('State should not be None.')
+        metrics = state.metrics.get() if state.metrics else None
+
+        # Save the output
+        output = {
+            'test_case_id': instance.test_case_id,
+            'biocoder_instance': instance.to_dict(),
+            'instruction': instruction,
+            'generated': test_result['metadata']['1_copy_change_code'],
+            'metadata': metadata,
+            'history': [
+                (event_to_dict(action), event_to_dict(obs))
+                for action, obs in state.history
+            ],
+            'metrics': metrics,
+            'error': state.error if state and state.error else None,
+            'test_result': test_result,
+        }
+    finally:
+        # Close the sandbox
+        sandbox.close()
+    return output
+
+
+if __name__ == '__main__':
+    # Entry point: load the BioCoder test split, skip instances already in
+    # output.jsonl, evaluate the rest in a process pool and append one JSON
+    # line per finished instance.
+
+    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
+    # so we don't need to manage file uploading to OpenDevin's repo
+    dataset = load_dataset('lilbillbiscuit/biocoder_public')
+    biocoder_tests = dataset['test'].to_pandas()
+
+    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
+    # for details of how to set `llm_config`
+    if args.llm_config:
+        specified_llm_config = get_llm_config_arg(args.llm_config)
+        if specified_llm_config:
+            config.llm = specified_llm_config
+    logger.info(f'Config for evaluation: {config}')
+
+    # TEST METADATA
+    agent_class = args.agent_cls
+    assert (
+        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
+    ), f'Unsupported agent class: {agent_class}'
+    model_name = config.llm.model.split('/')[-1]
+    max_iterations = args.max_iterations
+    eval_note = ''
+    if args.eval_note is not None:
+        eval_note += '_N_' + args.eval_note
+    eval_output_dir = os.path.join(
+        args.eval_output_dir,
+        'biocoder',
+        agent_class,
+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
+    )
+
+    eval_output_dir = str(eval_output_dir)
+
+    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
+        parents=True, exist_ok=True
+    )
+    logger.info(f'Using evaluation output directory: {eval_output_dir}')
+
+    metadata = {
+        'agent_class': agent_class,
+        'model_name': model_name,
+        'max_iterations': max_iterations,
+        'eval_output_dir': eval_output_dir,
+        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        # get the commit id of current repo for reproducibility
+        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+        .decode('utf-8')
+        .strip(),
+    }
+    logger.info(f'Metadata: {metadata}')
+    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
+        json.dump(metadata, f)
+
+    # LIMIT EVALUATION
+    eval_n_limit = args.eval_n_limit
+    if eval_n_limit:
+        biocoder_tests = biocoder_tests.head(eval_n_limit)
+        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
+
+    # OUTPUT FILE
+    output_file = os.path.join(eval_output_dir, 'output.jsonl')
+    logger.info(f'Writing evaluation output to {output_file}')
+    finished_test_case_ids = set()
+    if os.path.exists(output_file):
+        with open(output_file, 'r') as f:
+            for line in f:
+                data = json.loads(line)
+                finished_test_case_ids.add(data['test_case_id'])
+        logger.warning(
+            f'Output file {output_file} already exists. Loaded {len(finished_test_case_ids)} finished instances.'
+        )
+    # Opened in append mode so earlier results are kept; closed in the
+    # `finally` at the bottom of this block.
+    output_fp = open(output_file, 'a')
+
+    logger.info(
+        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
+    )
+
+    # =============================================
+    # filter out finished instances
+    new_biocoder_tests = []
+    for idx, instance in biocoder_tests.iterrows():
+        if instance.test_case_id in finished_test_case_ids:
+            logger.info(
+                f'Skipping instance {instance.test_case_id} as it is already finished.'
+            )
+            continue
+        new_biocoder_tests.append(instance)
+
+    biocoder_tests = pd.DataFrame(new_biocoder_tests)
+    logger.info(
+        f'Finished instances: {len(finished_test_case_ids)}, Remaining instances: {len(biocoder_tests)}'
+    )
+    # =============================================
+
+    pbar = tqdm(total=len(biocoder_tests))
+
+    # This function tracks the progress AND write the output to a JSONL file
+    def update_progress(future):
+        pbar.update(1)
+        output = future.result()
+        pbar.set_description(f'Instance {output["test_case_id"]}')
+        pbar.set_postfix_str(f'Test Result: {output["test_result"]}')
+        logger.info(
+            f'Finished evaluation for instance {output["test_case_id"]}: {output["test_result"]}'
+        )
+        output_fp.write(json.dumps(output) + '\n')
+        output_fp.flush()
+
+    # This sets the multi-processing
+    num_workers = args.eval_num_workers
+    logger.info(f'Using {num_workers} workers for evaluation.')
+
+    # This is SWE-Bench specific - CodeActAgent doesn't require mounted workspace to work
+    skip_workspace_mount = agent_class == 'CodeActAgent'
+    logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
+
+    try:
+        with ProcessPoolExecutor(num_workers) as executor:
+            futures = []
+            # This is how we perform multi-processing
+            for row_idx, instance in biocoder_tests.iterrows():
+                future = executor.submit(
+                    process_instance,
+                    instance,
+                    agent_class,
+                    metadata,
+                    skip_workspace_mount,
+                    eval_output_dir,
+                    reset_logger=bool(num_workers > 1),
+                )
+                future.add_done_callback(update_progress)
+                futures.append(future)
+
+            # Wait for all futures to complete
+            for future in futures:
+                future.result()
+    except KeyboardInterrupt:
+        print('KeyboardInterrupt received. Cleaning up...')
+        cleanup()
+    finally:
+        # Close the results file even when a worker exception propagates.
+        output_fp.close()
+
+    logger.info('Evaluation finished.')
--- a/evaluation/biocoder/scripts/run_infer.sh
+++ b/evaluation/biocoder/scripts/run_infer.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+# Run BioCoder inference.
+# Usage: run_infer.sh MODEL_CONFIG [AGENT] [EVAL_LIMIT]
+#   MODEL_CONFIG  value passed to --llm-config (required)
+#   AGENT         agent class name (default: CodeActAgent)
+#   EVAL_LIMIT    if set, passed to --eval-n-limit
+MODEL_CONFIG=$1
+AGENT=$2
+EVAL_LIMIT=$3
+DATASET="biocoder"
+
+if [ -z "$MODEL_CONFIG" ]; then
+  echo "Usage: $0 MODEL_CONFIG [AGENT] [EVAL_LIMIT]" >&2
+  exit 1
+fi
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
+# We need to track the version of Agent in the evaluation to make sure results are comparable
+AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+echo "DATASET: $DATASET"
+
+# Build the command as an array so each argument stays one word and nothing
+# is re-parsed by the shell (no eval).
+COMMAND=(
+  poetry run python evaluation/biocoder/run_infer.py
+  --agent-cls "$AGENT"
+  --llm-config "$MODEL_CONFIG"
+  --max-iterations 10
+  --max-chars 10000000
+  --eval-num-workers 1
+  --eval-note "${AGENT_VERSION}_${DATASET}"
+)
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND+=(--eval-n-limit "$EVAL_LIMIT")
+fi
+
+# Run the command
+echo "${COMMAND[*]}"
+"${COMMAND[@]}"
--- a/evaluation/bird/README.md
+++ b/evaluation/bird/README.md
--- a/evaluation/bird/init.py
+++ b/evaluation/bird/init.py
--- a/evaluation/bird/run_infer.py
+++ b/evaluation/bird/run_infer.py
@@ -0,0 +1,517 @@
+import asyncio
+import json
+import logging
+import multiprocessing as mp
+import os
+import pathlib
+import re
+import shutil
+import sqlite3
+import subprocess
+import time
+from concurrent.futures import ProcessPoolExecutor
+
+import pandas as pd
+from datasets import load_dataset
+from func_timeout import FunctionTimedOut, func_timeout
+from tqdm import tqdm
+
+from opendevin.controller.state.state import State
+from opendevin.core.config import args, config, get_llm_config_arg
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import main
+from opendevin.events.action import MessageAction
+from opendevin.events.serialization.event import event_to_dict
+
+
+def cleanup():
+    """Terminate every live child process of this process and wait for it."""
+    logger.info('Cleaning up child processes...')
+    children = mp.active_children()
+    for child in children:
+        logger.info(f'Terminating child process: {child.name}')
+        child.terminate()
+        # join() reaps the child after it has been asked to terminate.
+        child.join()
+
+
+def codeact_user_response(state: State) -> str:
+    """Return the canned reply sent to CodeActAgent in place of a human user.
+
+    The reply always tells the agent to keep working and how to finish. Once
+    `state.history` holds at least two user-sourced `MessageAction`s, a hint
+    on how to give up is appended.
+    """
+    base_reply = (
+        'Please continue working on the task on whatever approach you think is suitable.\n'
+        'If you think you have completed the SQL, please run the following command: <execute_bash> exit </execute_bash>.\n'
+        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
+    )
+    if not state.history:
+        return base_reply
+
+    user_turns = sum(
+        1
+        for action, _ in state.history
+        if isinstance(action, MessageAction) and action.source == 'user'
+    )
+    if user_turns < 2:
+        return base_reply
+    # The agent has already been nudged repeatedly; offer a way out.
+    return (
+        base_reply
+        + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
+    )
+
+
+def monologue_user_response(state: State) -> str:
+    """Placeholder reply function for MonologueAgent; always raises.
+
+    Raises:
+        NotImplementedError: Unconditionally.
+    """
+    reason = 'MonologueAgent should never ask for user responses.'
+    raise NotImplementedError(reason)
+
+
+# Agent class name -> function that produces the simulated user reply.
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = dict(
+    CodeActAgent=codeact_user_response,
+    MonologueAgent=monologue_user_response,
+)
+
+# Agent class name -> extra text appended to the task instruction.
+AGENT_CLS_TO_INST_SUFFIX = dict(
+    CodeActAgent='When you think you have fixed the issue through code changes, please run the following command: <execute_bash> exit </execute_bash>.\n'
+)
+
+
+def execute_sql(db_path, gen_sql, gold_sql):
+    """
+    Execute the generated SQL and the ground truth SQL and compare the results.
+
+    Args:
+        db_path: Path of the SQLite database file.
+        gen_sql: SQL produced by the agent.
+        gold_sql: Reference SQL.
+
+    Returns:
+        1 if both queries return the same set of rows (row order and
+        duplicate rows are ignored), otherwise 0.
+
+    Raises:
+        sqlite3.Error: If either statement fails to execute.
+    """
+    from contextlib import closing
+
+    # Using a sqlite3 connection as a context manager only commits or rolls
+    # back the transaction; it does not close the connection. `closing`
+    # releases the handle so repeated calls do not leak connections.
+    with closing(sqlite3.connect(db_path)) as conn:
+        with conn:
+            cursor = conn.cursor()
+            cursor.execute(gen_sql)
+            predicted_res = cursor.fetchall()
+            cursor.execute(gold_sql)
+            ground_truth_res = cursor.fetchall()
+    return 1 if set(predicted_res) == set(ground_truth_res) else 0
+
+
+def get_test_result(instance, path, timeout=30):
+    """Score the agent's SQL against the gold SQL for one instance.
+
+    The agent's Python file at `path` is searched for the first
+    `sql = "..."` assignment. That SQL and `instance.SQL` are executed
+    against `instance.db_path` and their result sets compared.
+
+    Args:
+        instance: Instance providing `SQL` (gold query) and `db_path`.
+        path: Path of the Python file the agent was asked to write.
+        timeout: Seconds allowed for executing both queries.
+
+    Returns:
+        A dict with `result` (`passed`: 0 or 1, `status`: 'success',
+        'timeout' or 'error') and `metadata` (`timeout`, `gen_sql`,
+        `gold_sql`).
+    """
+    test_result = {'result': {}, 'metadata': {}}
+
+    # Read the generated python file; a missing or unreadable file is scored
+    # as an error instead of crashing the worker.
+    try:
+        with open(path, 'r') as f:
+            gen_file = f.read()
+    except OSError as e:
+        logger.error(f'Error: {e}')
+        gen_file = ''
+
+    # Extract the SQL from the python file
+    gen_sql = ''
+    pattern = r'sql\s*=\s*"([^"]+)"'
+    match = re.search(pattern, gen_file)
+    if match:
+        gen_sql = match.group(1)
+    else:
+        print('No match found.')
+
+    gold_sql = instance.SQL
+    if not gen_sql:
+        # An empty statement executes as a no-op and yields no rows, which
+        # would count as a pass whenever the gold query returns nothing.
+        res = 0
+        status = 'error'
+    else:
+        # Execute the SQL
+        try:
+            res = func_timeout(
+                timeout, execute_sql, args=(instance.db_path, gen_sql, gold_sql)
+            )
+            status = 'success'
+        except FunctionTimedOut:
+            res = 0
+            status = 'timeout'
+        except Exception as e:
+            res = 0
+            status = 'error'
+            logger.error(f'Error: {e}')
+
+    # Save the test result
+    test_result['result'] = {'passed': res, 'status': status}
+    test_result['metadata'] = {
+        'timeout': timeout,
+        'gen_sql': gen_sql,
+        'gold_sql': gold_sql,
+    }
+    return test_result
+
+
+def process_instance(
+    instance, agent_class, metadata, skip_workspace_mount, reset_logger: bool = True
+):
+    """Evaluate one BIRD text-to-SQL instance and return its output record.
+
+    Copies the instance's database into the workspace, runs the agent with a
+    task instruction containing a Python template, then scores the SQL found
+    in the file the agent was asked to write.
+
+    Args:
+        instance: Instance providing `db_id`, `task_id`, `instruction` and
+            the fields read by `get_test_result`.
+        agent_class: Agent class name; selects the fake user response
+            function and the instruction suffix.
+        metadata: Run-level metadata copied verbatim into the output.
+        skip_workspace_mount: When False, a per-process directory is created
+            under the workspace.
+        reset_logger: When True, route this process' logging to a
+            per-instance log file under `eval_output_dir` (a module-level
+            name, not a parameter; presumably set by the script entry point).
+
+    Returns:
+        A JSON-serialisable dict with the instruction, history, metrics,
+        error and test result for the instance.
+
+    Raises:
+        ValueError: If the agent run returns no state.
+    """
+    # Remember the configured mount path: it is overridden below and must be
+    # restored, otherwise every later instance handled by this process would
+    # nest one more 'bird_eval_workspace' level inside the previous one.
+    original_workspace_mount_path = config.workspace_mount_path
+    workspace_mount_path = os.path.join(
+        original_workspace_mount_path, 'bird_eval_workspace'
+    )
+    # create process-specific workspace dir
+    # if `not skip_workspace_mount` - we will create a workspace directory for EACH process
+    # so that different agent don't interfere with each other.
+    if not skip_workspace_mount:
+        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
+        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+
+    # Copy the database to the workspace
+    db_root = os.path.join(
+        config.workspace_base, 'evaluation_bird/dev/dev_databases', instance.db_id
+    )
+    target_path = os.path.join(workspace_mount_path, f'{instance.db_id}')
+    if not os.path.exists(target_path):
+        logger.info(f'Copying database from {db_root} to {target_path}...')
+        shutil.copytree(db_root, target_path)
+
+    # Set up the database path
+    database_path = os.path.join(instance.db_id, f'{instance.db_id}.sqlite')
+
+    # Set up the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        # Set up logger
+        log_file = os.path.join(
+            eval_output_dir,
+            'logs',
+            f'instance_{instance.task_id.replace("/", "__")}.log',
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        # add back the console handler to print ONE line
+        logger.addHandler(get_console_handler())
+        logger.info(
+            f'Starting evaluation for instance {instance.task_id}.\nLOG:   tail -f {log_file}'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setFormatter(
+            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        )
+        logger.addHandler(file_handler)
+
+    if not skip_workspace_mount:
+        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+
+    # Create file with BIRD instance
+    statements = f"""
+    import sqlite3
+    def execute_sql(db_path, sql):
+        with sqlite3.connect(db_path) as conn:
+            cursor = conn.cursor()
+            cursor.execute(sql)
+            result = cursor.fetchall()
+            return result
+
+    if __name__ == '__main__':
+        sql = "" # fill in your SQL here
+        db_path = "{database_path}"
+        print(db_path)
+        result = execute_sql(db_path, sql)
+        print(result)
+    """
+    path = os.path.join(
+        workspace_mount_path, f'{instance.task_id.replace("/", "__")}.py'
+    )
+    instruction = (
+        f'You are a SQL expert and need to complete the following text-to-SQL tasks.'
+        f'\n\n{instance.instruction}\n\n'
+        'Please write the SQL in one line without line breaks.'
+        f'And write a new python file named {instance.task_id.replace("/", "__")}.py to call the SQL you wrote.'
+        'You need to follow the code template below:'
+        f'\n\n{statements}\n\n'
+        'Environment has been set up for you to start working.'
+        'You may assume all necessary tools are installed.\n\n'
+    )
+    instruction += (
+        'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
+        'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
+    )
+    # NOTE: You can actually set slightly different instruction for different agents
+    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+
+    # Point the global config at this instance's workspace only while the
+    # agent runs and its output is scored.
+    config.workspace_mount_path = workspace_mount_path
+    try:
+        # Here's how you can run the agent (similar to the `main` function) and get the final task state
+        state: State = asyncio.run(
+            main(
+                instruction,
+                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
+                    agent_class
+                ),
+            )
+        )
+
+        # ======= Attempt to evaluate the agent's edits =======
+        test_result = get_test_result(instance, path)
+    finally:
+        config.workspace_mount_path = original_workspace_mount_path
+
+    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+    if state is None:
+        raise ValueError('State should not be None.')
+    metrics = state.metrics.get() if state.metrics else None
+
+    # Save the output
+    output = {
+        'task_id': instance.task_id,
+        'instruction': instruction,
+        'metadata': metadata,
+        'history': [
+            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
+        ],
+        'metrics': metrics,
+        'error': state.error if state and state.error else None,
+        'test_result': test_result,
+    }
+    return output
+
+
+def load_bird():
+    """
+    Fetch the BIRD dev set if needed, preprocess it, and return the loaded dataset.
+    """
+    # download_bird() returns the directory that process_bird() takes as input
+    return process_bird(download_bird())
+
+
+def _unzip(archive_path, target_dir):
+    """Extract `archive_path` into `target_dir` with `unzip`, raising on real errors."""
+    result = subprocess.run(['unzip', archive_path, '-d', target_dir])
+    # unzip exits with 1 for warnings after a successful extraction; only >1 is fatal
+    if result.returncode > 1:
+        raise subprocess.CalledProcessError(result.returncode, result.args)
+
+
+def download_bird():
+    """
+    Download and extract the BIRD dev set into the workspace if it is not there yet.
+
+    The archive is fetched with `wget` into `<config.workspace_base>/evaluation_bird`
+    and unpacked with `unzip`; the nested `dev_databases.zip` is then unpacked
+    inside the `dev` directory. Nothing is downloaded when the `evaluation_bird`
+    directory already exists.
+
+    Returns:
+        Path of the `dev` directory inside the dataset folder.
+
+    Raises:
+        subprocess.CalledProcessError: if `wget` fails or `unzip` reports an error.
+    """
+    dataset_path = os.path.join(config.workspace_base, 'evaluation_bird')
+    devset_path = os.path.join(dataset_path, 'dev')
+    if not os.path.exists(dataset_path):
+        logger.info(
+            f'{dataset_path} folder does not exist, starting download and extraction...'
+        )
+        os.makedirs(dataset_path, exist_ok=True)
+        download_url = 'https://bird-bench.oss-cn-beijing.aliyuncs.com/dev.zip'
+        download_path = os.path.join(dataset_path, 'dev.zip')
+        logger.info('Start Downloading...')
+        # check=True so a failed download is not reported as "completed"
+        subprocess.run(['wget', download_url, '-O', download_path], check=True)
+        logger.info('Download completed.')
+        logger.info('Start Extracting...')
+        _unzip(download_path, dataset_path)
+        # extract databases
+        database_path = os.path.join(devset_path, 'dev_databases.zip')
+        _unzip(database_path, devset_path)
+        logger.info('Extraction completed.')
+    else:
+        logger.info(f'{dataset_path} folder already exists.')
+    return devset_path
+
+
+def process_bird(dataset_path):
+    """
+    Convert the raw BIRD dev set into `processed_dev.json` (once) and load it.
+
+    Each processed record carries a sequential `task_id`, the SQLite file path,
+    the `db_id`, the generated instruction prompt, and the gold `SQL`.
+    The JSON file is loaded as the `test` split of the returned dataset.
+    """
+    processed_path = os.path.join(dataset_path, 'processed_dev.json')
+    if os.path.exists(processed_path):
+        logger.info(f'{processed_path} folder already exists.')
+    else:
+        logger.info(f'{processed_path} folder does not exist, starting processing...')
+        database_path = os.path.join(dataset_path, 'dev_databases')
+        with pathlib.Path(os.path.join(dataset_path, 'dev.json')).open('r') as src:
+            examples = json.load(src)
+
+        processed_data = []
+        # the running position doubles as the task id
+        for position, e in enumerate(tqdm(examples)):
+            sqlite_file = os.path.join(
+                database_path, e['db_id'], f"{e['db_id']}.sqlite"
+            )
+            processed_data.append(
+                {
+                    'task_id': f'{position}',
+                    'db_path': sqlite_file,
+                    'db_id': e['db_id'],
+                    'instruction': create_prompt(e, database_path),
+                    'SQL': e['SQL'],
+                }
+            )
+
+        with pathlib.Path(processed_path).open('w') as dst:
+            json.dump(processed_data, dst, indent=2)
+            logger.info(f'Processed data saved to {processed_path}')
+    return load_dataset('json', data_files={'test': processed_path})
+
+
+def extract_create_table_prompt(db_path, limit_value=0):
+    """
+    Generates a SQL prompt with CREATE TABLE statements and sample data from the database.
+
+    Args:
+        db_path: Path of the SQLite database file.
+        limit_value: Number of sample rows to include per table. With the
+            default of 0 only the CREATE TABLE statements are emitted.
+
+    Returns:
+        A string with one `CREATE TABLE ...;` statement per table, each
+        optionally followed by a `/* ... */` block of column headers and rows.
+
+    Raises:
+        SystemExit: with status 1 if a table's column info cannot be read.
+    """
+    table_query = "SELECT * FROM sqlite_master WHERE type='table';"
+    # One connection for the whole call, always closed (sqlite3's context
+    # manager only commits/rolls back, it does not close).
+    conn = sqlite3.connect(db_path)
+    try:
+        cursor = conn.cursor()
+        tables = cursor.execute(table_query).fetchall()
+        prompt = ''
+        for table in tables:
+            # sqlite_master row: (type, name, tbl_name, rootpage, sql)
+            table_name = table[1]
+            create_table_statement = table[-1]
+
+            table_info_query = f'PRAGMA table_info(`{table_name}`);'
+            # quote the table name here too, so names with spaces/keywords work
+            top_k_row_query = f'SELECT * FROM `{table_name}` LIMIT {limit_value};'
+            try:
+                headers = [x[1] for x in cursor.execute(table_info_query).fetchall()]
+            except Exception:
+                logger.error(
+                    f'Error Connection: {table_info_query}, {top_k_row_query}'
+                )
+                # exit with a failure status; the previous exit(0) signalled success
+                raise SystemExit(1)
+
+            prompt += create_table_statement + ';\n'
+            if limit_value > 0:
+                top_k_rows = cursor.execute(top_k_row_query).fetchall()
+                prompt += f"/*\n{limit_value} example rows:\n{top_k_row_query}\n{'    '.join(headers)}\n"
+                for row in top_k_rows:
+                    # render NULL as an empty cell (must be tested before str())
+                    cells = ['' if x is None else str(x) for x in row]
+                    prompt += '    '.join(cells) + '\n'
+                prompt += '*/\n'
+            prompt += '\n'
+        return prompt
+    finally:
+        conn.close()
+
+
+def create_prompt(e, database_path):
+    """
+    Build the text-to-SQL prompt for one BIRD example.
+
+    The prompt is the database schema followed by the example's external
+    knowledge (`evidence`), two fixed instruction lines, and the question.
+    """
+    db_id = e['db_id']
+    sqlite_file = pathlib.Path(database_path) / db_id / f'{db_id}.sqlite'
+
+    sections = [
+        # CREATE TABLE statements (and sample data, if enabled) from the database
+        extract_create_table_prompt(sqlite_file),
+        f"-- External Knowledge: {e['evidence']}\n\n",
+        '-- Using valid SQLite and understanding External Knowledge, answer the following questions for the tables provided above.\n\n',
+        '-- Using valid SQLite, answer the following questions for the tables provided above.\n',
+        f"Question: {e['question']}\n",
+    ]
+    return ''.join(sections)
+
+
+# Script entry point: load BIRD, prepare the output directory and metadata,
+# skip instances already recorded in output.jsonl, then evaluate the remaining
+# instances in a process pool, appending one JSON line per finished instance.
+if __name__ == '__main__':
+    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
+    # so we don't need to manage file uploading to OpenDevin's repo
+    # Due to the large size of the BIRD database, it cannot be hosted on huggingface datasets, so it needs to be downloaded
+    bird_dataset = load_bird()
+    bird_tests = bird_dataset['test'].to_pandas()
+
+    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/humanevalfix/README.md#configure-opendevin-and-your-llm
+    # for details of how to set `llm_config`
+    # NOTE(review): `args` is not defined in this block; presumably parsed elsewhere in the module - confirm
+    if args.llm_config:
+        specified_llm_config = get_llm_config_arg(args.llm_config)
+        if specified_llm_config:
+            config.llm = specified_llm_config
+    logger.info(f'Config for evaluation: {config}')
+
+    # TEST METADATA
+    agent_class = args.agent_cls
+    assert (
+        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
+    ), f'Unsupported agent class: {agent_class}'
+    # keep only the last path component of the model identifier for directory naming
+    model_name = config.llm.model.split('/')[-1]
+    max_iterations = args.max_iterations
+    eval_note = ''
+    if args.eval_note is not None:
+        eval_note += '_N_' + args.eval_note
+    # <eval_output_dir>/bird/<agent>/<model>_maxiter_<n>[_N_<note>]
+    eval_output_dir = os.path.join(
+        args.eval_output_dir,
+        'bird',
+        agent_class,
+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
+    )
+
+    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
+        parents=True, exist_ok=True
+    )
+    logger.info(f'Using evaluation output directory: {eval_output_dir}')
+
+    metadata = {
+        'agent_class': agent_class,
+        'model_name': model_name,
+        'max_iterations': max_iterations,
+        'eval_output_dir': eval_output_dir,
+        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        # get the commit id of current repo for reproducibility
+        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+        .decode('utf-8')
+        .strip(),
+    }
+    logger.info(f'Metadata: {metadata}')
+    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
+        json.dump(metadata, f)
+
+    # LIMIT EVALUATION
+    eval_n_limit = args.eval_n_limit
+    if eval_n_limit:
+        bird_tests = bird_tests.head(eval_n_limit)
+        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
+
+    # OUTPUT FILE
+    output_file = os.path.join(eval_output_dir, 'output.jsonl')
+    logger.info(f'Writing evaluation output to {output_file}')
+    # Resume support: task_ids already present in output.jsonl are skipped below
+    finished_instance_ids = set()
+    if os.path.exists(output_file):
+        with open(output_file, 'r') as f:
+            for line in f:
+                data = json.loads(line)
+                finished_instance_ids.add(data['task_id'])
+        logger.warning(
+            f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
+        )
+    # append mode keeps results written by earlier runs
+    output_fp = open(output_file, 'a')
+
+    logger.info(
+        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
+    )
+
+    # =============================================
+    # filter out finished instances
+    new_bird_tests = []
+    for idx, instance in bird_tests.iterrows():
+        if instance.task_id in finished_instance_ids:
+            logger.info(
+                f'Skipping instance {instance.task_id} as it is already finished.'
+            )
+            continue
+        new_bird_tests.append(instance)
+
+    bird_tests = pd.DataFrame(new_bird_tests)
+    logger.info(
+        f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(bird_tests)}'
+    )
+    # =============================================
+
+    pbar = tqdm(total=len(bird_tests))
+
+    # This function tracks the progress AND write the output to a JSONL file
+    def update_progress(future):
+        pbar.update(1)
+        # future.result() re-raises here if the worker raised, so nothing is written for it
+        output = future.result()
+        pbar.set_description(f'Instance {output["task_id"]}')
+        pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
+        logger.info(
+            f'Finished evaluation for instance {output["task_id"]}: {output["test_result"]["result"]}'
+        )
+        output_fp.write(json.dumps(output) + '\n')
+        # flush after every instance so an interrupted run can be resumed
+        output_fp.flush()
+
+    # This sets the multi-processing
+    num_workers = args.eval_num_workers
+    logger.info(f'Using {num_workers} workers for evaluation.')
+
+    try:
+        with ProcessPoolExecutor(num_workers) as executor:
+            futures = []
+            # This is how we perform multi-processing
+            for row_idx, instance in bird_tests.iterrows():
+                future = executor.submit(
+                    process_instance,
+                    instance,
+                    agent_class,
+                    metadata,
+                    skip_workspace_mount=False,
+                    # the logger is only reset per instance when more than one worker is used
+                    reset_logger=bool(num_workers > 1),
+                )
+                future.add_done_callback(update_progress)
+                futures.append(future)
+
+            # Wait for all futures to complete
+            for future in futures:
+                future.result()
+    except KeyboardInterrupt:
+        print('KeyboardInterrupt received. Cleaning up...')
+        cleanup()
+
+    output_fp.close()
+    logger.info('Evaluation finished.')
--- a/evaluation/bird/scripts/run_infer.sh
+++ b/evaluation/bird/scripts/run_infer.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# Run BIRD inference with OpenDevin.
+# Usage: bash evaluation/bird/scripts/run_infer.sh <model_config> [agent] [eval_limit]
+MODEL_CONFIG=$1
+AGENT=$2
+EVAL_LIMIT=$3
+
+# Without a model config, --llm-config below would swallow the next flag as its value.
+if [ -z "$MODEL_CONFIG" ]; then
+  echo "Usage: $0 <model_config> [agent] [eval_limit]" >&2
+  exit 1
+fi
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
+# We need to track the version of Agent in the evaluation to make sure results are comparable
+AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+# The assignment ends at the closing quote; no trailing line continuation after it.
+COMMAND="poetry run python evaluation/bird/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --max-iterations 5 \
+  --max-chars 10000000 \
+  --eval-num-workers 1 \
+  --eval-note $AGENT_VERSION"
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+# Run the command (quoted so the string is not glob-expanded before eval sees it)
+eval "$COMMAND"
--- a/evaluation/gaia/run_infer.py
+++ b/evaluation/gaia/run_infer.py
@@ -77,122 +77,133 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
    # we will create a workspace directory for EACH process
    # so that different agent don't interfere with each other.
    old_workspace_mount_path = config.workspace_mount_path
-    workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
-    workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
-    pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
-    config.workspace_mount_path = workspace_mount_path

-    # Setup the logger properly, so you can run multi-processing to parallize the evaluation
-    eval_output_dir = metadata['eval_output_dir']
-    if reset_logger:
-        # Set up logger
-        log_file = os.path.join(
-            eval_output_dir, 'logs', f'instance_{instance["task_id"]}.log'
+    try:
+        workspace_mount_path = os.path.join(
+            config.workspace_mount_path, '_eval_workspace'
        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        # add back the console handler to print ONE line
-        logger.addHandler(get_console_handler())
+        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
+        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+        config.workspace_mount_path = workspace_mount_path
+
+        # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+        eval_output_dir = metadata['eval_output_dir']
+        if reset_logger:
+            # Set up logger
+            log_file = os.path.join(
+                eval_output_dir, 'logs', f'instance_{instance["task_id"]}.log'
+            )
+            # Remove all existing handlers from logger
+            for handler in logger.handlers[:]:
+                logger.removeHandler(handler)
+            # add back the console handler to print ONE line
+            logger.addHandler(get_console_handler())
+            logger.info(
+                f'Starting evaluation for instance {instance["task_id"]}.\nLOG:   tail -f {log_file}'
+            )
+            # Remove all existing handlers from logger
+            for handler in logger.handlers[:]:
+                logger.removeHandler(handler)
+            file_handler = logging.FileHandler(log_file)
+            file_handler.setFormatter(
+                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+            )
+            logger.addHandler(file_handler)
+
+        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+        if instance['file_name'] != '':
+            # if this question comes with a file, we need to save it to the workspace
+            src_file = os.path.join(
+                DATASET_CACHE_DIR, '2023', metadata['data_split'], instance['file_name']
+            )
+            extension_name = instance['file_name'].split('.')[-1]
+            dest_file = os.path.join(workspace_mount_path, f'file.{extension_name}')
+            shutil.copyfile(src_file, dest_file)
+            logger.info(f'File copied to {dest_file}')
+        else:
+            dest_file = None
+
+        # Prepare instruction
+        instruction = f"{instance['Question']}\n"
+        logger.info(f'Instruction: {instruction}')
+        if dest_file:
+            instruction += f"\n\nThe mentioned file is provided in the workspace at: {dest_file.split('/')[-1]}"
+
+        instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
+        instruction += 'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
+        instruction += (
+            'For example: The answer to the question is <solution> 42 </solution>.\n'
+        )
+        # NOTE: You can actually set slightly different instruction for different agents
+        instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+        logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
+
+        # Here's how you can run the agent (similar to the `main` function) and get the final task state
+        state: State = asyncio.run(
+            main(
+                instruction,
+                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
+                    agent_class
+                ),
+            )
+        )
+        # ======= Attempt to evaluate the agent's edits =======
+        # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+        # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+
+        if state is None:
+            raise ValueError('State should not be None.')
+
+        model_answer_raw = ''
+        for act, _ in reversed(state.history):
+            if isinstance(act, CmdRunAction) and act.source == 'agent':
+                model_answer_raw = act.thought
+                break
+            elif isinstance(act, MessageAction) and act.source == 'agent':
+                model_answer_raw = act.content
+                break
+
+        # attempt to parse model_answer
+        model_answer = re.findall(r'<solution>(.*?)</solution>', model_answer_raw)
+        if len(model_answer) == 0:
+            logger.warning(f'Failed to parse model answer: {model_answer_raw}')
+            model_answer = model_answer_raw
+        else:
+            model_answer = model_answer[0]
+
        logger.info(
-            f'Starting evaluation for instance {instance["task_id"]}.\nLOG:   tail -f {log_file}'
+            f'Final message: {model_answer} | Ground truth: {instance["Final answer"]}'
        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        file_handler = logging.FileHandler(log_file)
-        file_handler.setFormatter(
-            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        score = question_scorer(
+            model_answer=model_answer, ground_truth=instance['Final answer']
        )
-        logger.addHandler(file_handler)
+        test_result = {
+            'score': score,
+            'model_answer_raw': model_answer_raw,
+            'model_answer': model_answer,
+            'ground_truth': instance['Final answer'],
+        }
+        metrics = state.metrics.get() if state.metrics else None

-    logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
-    if instance['file_name'] != '':
-        # if this question comes with a file, we need to save it to the workspace
-        src_file = os.path.join(
-            DATASET_CACHE_DIR, '2023', metadata['data_split'], instance['file_name']
-        )
-        extension_name = instance['file_name'].split('.')[-1]
-        dest_file = os.path.join(workspace_mount_path, f'file.{extension_name}')
-        shutil.copyfile(src_file, dest_file)
-        logger.info(f'File copied to {dest_file}')
-    else:
-        dest_file = None
-
-    # Prepare instruction
-    instruction = f"{instance['Question']}\n"
-    logger.info(f'Instruction: {instruction}')
-    if dest_file:
-        instruction += f"\n\nThe mentioned file is provided in the workspace at: {dest_file.split('/')[-1]}"
-
-    instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
-    instruction += 'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
-    instruction += (
-        'For example: The answer to the question is <solution> 42 </solution>.\n'
-    )
-    # NOTE: You can actually set slightly different instruction for different agents
-    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
-    logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
-
-    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    state: State = asyncio.run(
-        main(
-            instruction,
-            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
-        )
-    )
-    # ======= Attempt to evaluate the agent's edits =======
-    # If you are working on simplier benchmark that only evaluates the final model output (e.g., in a MessageAction)
-    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
-
-    if state is None:
-        raise ValueError('State should not be None.')
-
-    model_answer_raw = ''
-    for act, _ in reversed(state.history):
-        if isinstance(act, CmdRunAction) and act.source == 'agent':
-            model_answer_raw = act.thought
-            break
-        elif isinstance(act, MessageAction) and act.source == 'agent':
-            model_answer_raw = act.content
-            break
-
-    # attempt to parse model_answer
-    model_answer = re.findall(r'<solution>(.*?)</solution>', model_answer_raw)
-    if len(model_answer) == 0:
-        logger.warning(f'Failed to parse model answer: {model_answer_raw}')
-        model_answer = model_answer_raw
-    else:
-        model_answer = model_answer[0]
-
-    logger.info(
-        f'Final message: {model_answer} | Ground truth: {instance["Final answer"]}'
-    )
-    score = question_scorer(
-        model_answer=model_answer, ground_truth=instance['Final answer']
-    )
-    test_result = {
-        'score': score,
-        'model_answer_raw': model_answer_raw,
-        'model_answer': model_answer,
-        'ground_truth': instance['Final answer'],
-    }
-
-    # Save the output
-    output = {
-        'instance_id': instance['task_id'],
-        'instance': instance,
-        'instruction': instance['Question'],
-        'metadata': metadata,
-        'history': [
-            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
-        ],
-        'error': state.error if state and state.error else None,
-        'test_result': test_result,
-    }
-
-    # Close the sandbox
-    config.workspace_mount_path = old_workspace_mount_path
+        # Save the output
+        output = {
+            'instance_id': instance['task_id'],
+            'instance': instance,
+            'instruction': instance['Question'],
+            'metadata': metadata,
+            'history': [
+                (event_to_dict(action), event_to_dict(obs))
+                for action, obs in state.history
+            ],
+            'metrics': metrics,
+            'error': state.error if state and state.error else None,
+            'test_result': test_result,
+        }
+    except Exception:
+        logger.error('Process instance failed')
+        raise
+    finally:
+        config.workspace_mount_path = old_workspace_mount_path
    return output


@@ -264,7 +275,7 @@ if __name__ == '__main__':
        'max_iterations': max_iterations,
        'eval_output_dir': eval_output_dir,
        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
-        # get the commit id of current repo for reproduciblity
+        # get the commit id of current repo for reproducibility
        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
        .decode('utf-8')
        .strip(),
--- a/evaluation/gaia/scorer.py
+++ b/evaluation/gaia/scorer.py
@@ -17,8 +17,10 @@ def normalize_number_str(number_str: str) -> float:

 def split_string(
    s: str,
-    char_list: list[str] = [',', ';'],
+    char_list: list[str] = None,
 ) -> list[str]:
+    if char_list is None:
+        char_list = [',', ';']
    pattern = f"[{''.join(char_list)}]"
    return re.split(pattern, s)

@@ -51,7 +53,9 @@ def question_scorer(
        # check length is the same
        if len(gt_elems) != len(ma_elems):
            warnings.warn(
-                'Answer lists have different lengths, returning False.', UserWarning
+                'Answer lists have different lengths, returning False.',
+                UserWarning,
+                stacklevel=2,
            )
            return False

--- a/evaluation/gorilla/README.md
+++ b/evaluation/gorilla/README.md
@@ -0,0 +1,41 @@
+# Gorilla APIBench Evaluation with OpenDevin
+
+This folder contains the evaluation harness we built on top of the original [Gorilla APIBench](https://github.com/ShishirPatil/gorilla) ([paper](https://arxiv.org/pdf/2305.15334)).
+
+## Setup Environment
+
+Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
+
+## Configure OpenDevin and your LLM
+
+Run `make setup-config` to set up the `config.toml` file if it does not exist at the root of the workspace.
+
+## Run Inference on APIBench Instances
+
+Make sure your Docker daemon is running, then run this bash script:
+
+```bash
+bash evaluation/gorilla/scripts/run_infer.sh [model_config] [agent] [eval_limit] [hubs]
+```
+
+where `model_config` is mandatory, while all other arguments are optional.
+
+`model_config`, e.g. `llm`, is the config group name for your
+LLM settings, as defined in your `config.toml`.
+
+`agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
+to `CodeActAgent`.
+
+`eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances.
+By default, the script evaluates 1 instance.
+
+`hubs` selects the APIBench hub(s) to evaluate. You can choose one or more of `torch` (or `th`, its abbreviation), `hf` (short for huggingface), and `tf` (short for tensorflow). The default is `hf,torch,tf`.
+
+Note: in order to use `eval_limit`, you must also set `agent`; in order to use `hubs`, you must also set `eval_limit`.
+
+Let's say you'd like to run 10 instances using `llm` and CodeActAgent on `th` test,
+then your command would be:
+
+```bash
+bash evaluation/gorilla/scripts/run_infer.sh llm CodeActAgent 10 th
+```
--- a/evaluation/gorilla/ast_eval_hf.py
+++ b/evaluation/gorilla/ast_eval_hf.py
@@ -0,0 +1,127 @@
+# Copyright 2023 https://github.com/ShishirPatil/gorilla
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is modified from https://github.com/ShishirPatil/gorilla/blob/main/eval/eval-scripts/ast_eval_hf.py
+
+from tree_sitter import Language, Parser
+
+
+# Get all the subtrees given a root_node
+def get_all_sub_trees(root_node):
+    """
+    Walk the tree depth-first and describe the root plus every node that has children.
+
+    Each entry is `[sexp, depth, node, first_child_text]`, where the depth of
+    the root is 1 and `first_child_text` is None for a node without children.
+    """
+    collected = []
+    pending = [(root_node, 1)]
+    while pending:
+        node, level = pending.pop()
+        collected.append(
+            [
+                node.sexp(),
+                level,
+                node,
+                node.children[0].text if node.child_count > 0 else None,
+            ]
+        )
+        # only children that themselves have children are visited later
+        pending.extend(
+            (child, level + 1) for child in node.children if len(child.children) != 0
+        )
+    return collected
+
+
+# Parse the program into AST trees
+def ast_parse(candidate, lang='python'):
+    """
+    Parse `candidate` with the tree-sitter grammar for `lang` and return the root node.
+
+    The grammar is loaded from `evaluation/gorilla/my-languages.so`, a path
+    relative to the current working directory.
+    """
+    grammar = Language('evaluation/gorilla/my-languages.so', lang)
+    ts_parser = Parser()
+    ts_parser.set_language(grammar)
+    return ts_parser.parse(bytes(candidate, 'utf8')).root_node
+
+
+# Get all the arguments in the ast tree
+def get_args(node):
+    """
+    Collect the argument values of the call found at
+    `node.children[0].children[0].children[1]`.
+
+    For a child whose text contains '=', the text of its third child is
+    taken; parentheses and commas are skipped; any other child contributes
+    its own text. Returns a list of bytes, or [] for a node without children.
+    """
+    if node.child_count == 0:
+        return []
+    punctuation = ('(', ')', ',')
+    values = []
+    for arg_node in node.children[0].children[0].children[1].children:
+        token = arg_node.text.decode()
+        if '=' in token:
+            values.append(arg_node.children[2].text)
+        elif token not in punctuation:
+            values.append(arg_node.text)
+    return values
+
+
+# Check if there is an api match
+def ast_check(candidate_subtree_list, base_tree_list):
+    """
+    Return the index of the first reference tree whose arguments all occur in the candidate.
+
+    `candidate_subtree_list` holds entries shaped like the output of
+    `get_all_sub_trees` (index 2 is the node, index 3 the first child's text);
+    `base_tree_list` holds the reference trees. Returns -1 when no reference
+    tree matches.
+    """
+    for idx, base_tree in enumerate(base_tree_list):
+        # skip reference trees whose first statement has no children to read a name from
+        if base_tree.children[0].children[0].child_count == 0:
+            continue
+        api_name = base_tree.children[0].children[0].children[0].text
+        for candidate_tree in candidate_subtree_list:
+            if candidate_tree[3] == api_name:
+                break
+        # NOTE(review): if no entry matched api_name, the loop falls through and
+        # `candidate_tree` is simply the last entry of candidate_subtree_list
+        # (or unbound if the list is empty) - confirm this is intended.
+        # Now we have a sub-tree
+        candidate_tree = candidate_tree[2]
+        args_list = get_args(base_tree)
+        if len(args_list) == 0:
+            continue
+        ast_match = True
+        for arg in args_list:
+            # substring test against the candidate's source text, with surrounding single quotes stripped
+            if arg.decode().lstrip("'").rstrip("'") not in candidate_tree.text.decode():
+                ast_match = False
+                break
+        if ast_match:
+            return idx
+    return -1
+
+
+def ast_eval_hf(api_database, qa_pairs, ast_database, question_id, response):
+    """
+    Score one model response against the reference API database.
+
+    Args:
+        api_database: Reference API entries; each is indexed by `'domain'`.
+        qa_pairs: Question records; `qa_pairs[question_id - 1]['domain']` is
+            the expected domain.
+        ast_database: Reference AST trees, index-aligned with `api_database`.
+        question_id: 1-based id of the question being scored.
+        response: Raw model output text.
+
+    Returns:
+        `(correct, hallucination)`. `hallucination` is True when the response's
+        call matches no reference tree, in which case `correct` is False.
+    """
+    # Index the "api_call" domain
+    output = response.split('api_call')
+    if len(output) == 1:
+        api_call = output[0]
+    else:
+        # Parse the output
+        output = output[1].split('api_provider')[0]
+        if ':' not in output:
+            start = 0
+        else:
+            start = output.index(':')
+        if ')' not in output:
+            end = -2
+        else:
+            end = output.rindex(')')
+        api_call = output[start + 2 : end + 1]
+    # Parse the api_call into AST tree
+    ast_tree = ast_parse(api_call)
+    # Search for a subtree
+    ast_subtree_list = get_all_sub_trees(ast_tree)
+    # Check which ast tree is matching
+    database_index = ast_check(ast_subtree_list, ast_database)
+    # We cannot index this ast in our database
+    if database_index == -1:
+        # Stop here: api_database[-1] would silently pick the LAST reference
+        # entry and could score a hallucinated call as correct.
+        return False, True
+    # We index our reference api_call
+    ref_api_call = api_database[database_index]
+    # Check for functionality
+    correct = False
+    if ref_api_call['domain'] == qa_pairs[question_id - 1]['domain']:
+        correct = True
+    return correct, False
--- a/evaluation/gorilla/ast_eval_tf.py
+++ b/evaluation/gorilla/ast_eval_tf.py
@@ -0,0 +1,127 @@
+# Copyright 2023 https://github.com/ShishirPatil/gorilla
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is modified from https://github.com/ShishirPatil/gorilla/blob/main/eval/eval-scripts/ast_eval_tf.py
+
+from tree_sitter import Language, Parser
+
+
+# Get all the subtrees given a root_node
+def get_all_sub_trees(root_node):
+    """Collect the root node plus every descendant that has children.
+
+    Returns a list of [sexp, depth, node, first_child_text] entries. The root
+    has depth 1, each queued child is one level deeper than its parent, and
+    first_child_text is None for a node without children.
+    """
+    pending = [(root_node, 1)]
+    collected = []
+    while pending:
+        # LIFO order: the most recently queued child is expanded first.
+        node, level = pending.pop()
+        head_text = node.children[0].text if node.child_count > 0 else None
+        collected.append([node.sexp(), level, node, head_text])
+        # Childless nodes are never queued, so only the root can be one here.
+        pending.extend(
+            (kid, level + 1) for kid in node.children if len(kid.children) != 0
+        )
+    return collected
+
+
+# Parse the program into AST trees
+def ast_parse(candidate, lang='python'):
+    """Parse the `candidate` source text with tree-sitter and return its root node."""
+    # The grammar comes from a shared library addressed by a relative path.
+    grammar = Language('evaluation/gorilla/my-languages.so', lang)
+    ts_parser = Parser()
+    ts_parser.set_language(grammar)
+    return ts_parser.parse(bytes(candidate, 'utf8')).root_node
+
+
+# Get all the arguments in the ast tree
+def get_args(node):
+    """Return the texts (bytes) found under node.children[0].children[0].children[1].
+
+    Items containing `model=` yield their third child's text; '(', ')' and ',' are dropped."""
+    if node.child_count == 0:
+        return []
+    collected = []
+    for item in node.children[0].children[0].children[1].children:
+        token = item.text.decode()
+        if 'model=' in token or 'model =' in token:
+            collected.append(item.children[2].text)
+        elif token not in ('(', ')', ','):
+            collected.append(item.text)
+    return collected
+
+
+# Check if there is an api match
+def ast_check(candidate_subtree_list, base_tree_list):
+    """Return the index of the first reference tree whose arguments all occur
+    (outer single quotes stripped) in the chosen candidate sub-tree, else -1."""
+    for position, reference in enumerate(base_tree_list):
+        call_node = reference.children[0].children[0]
+        if call_node.child_count == 0:
+            continue
+        wanted_name = call_node.children[0].text
+        # Stop at the first entry whose first-child text equals wanted_name; when
+        # none does, the loop simply ends on the LAST entry, which is then used.
+        for entry in candidate_subtree_list:
+            if entry[3] == wanted_name:
+                break
+        chosen_node = entry[2]
+        wanted_args = get_args(reference)
+        if wanted_args and all(
+            arg.decode().lstrip("'").rstrip("'") in chosen_node.text.decode()
+            for arg in wanted_args
+        ):
+            return position
+    return -1
+
+
+def ast_eval_tf(api_database, qa_pairs, ast_database, question_id, response):
+    """Score one model response against the reference API database.
+
+    Args:
+        api_database: reference API records; the matched one must have a 'domain'.
+        qa_pairs: per-question reference records, looked up at question_id - 1.
+        ast_database: parsed reference calls; ast_check's result indexes api_database.
+        question_id: 1-based id of the question being scored.
+        response: raw model answer text.
+
+    Returns:
+        (correct, hallucination) booleans; a hallucination is never correct.
+    """
+    # Keep only the text between the 'api_call' and 'api_provider' markers.
+    parts = response.split('api_call')
+    if len(parts) == 1:
+        api_call = parts[0]
+    else:
+        segment = parts[1].split('api_provider')[0]
+        start = segment.index(':') if ':' in segment else 0
+        end = segment.rindex(')') if ')' in segment else -2
+        # start + 2 drops the ':' and the character after it (two characters
+        # are dropped even when no ':' was found); end + 1 keeps the last ')'.
+        api_call = segment[start + 2 : end + 1]
+    # Parse the call and look for a reference API whose arguments it contains.
+    ast_subtree_list = get_all_sub_trees(ast_parse(api_call))
+    database_index = ast_check(ast_subtree_list, ast_database)
+    if database_index == -1:
+        # No reference API matched. Return right away: api_database[-1] would
+        # silently pick the LAST database entry and could report a
+        # hallucinated call as correct.
+        return False, True
+    # Correct only when the matched API belongs to the question's domain.
+    ref_api_call = api_database[database_index]
+    expected_domain = qa_pairs[question_id - 1]['domain']
+    return ref_api_call['domain'] == expected_domain, False
--- a/evaluation/gorilla/ast_eval_th.py
+++ b/evaluation/gorilla/ast_eval_th.py
@@ -0,0 +1,123 @@
+# Copyright 2023 https://github.com/ShishirPatil/gorilla
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is modified from https://github.com/ShishirPatil/gorilla/blob/main/eval/eval-scripts/ast_eval_th.py
+
+from tree_sitter import Language, Parser
+
+
+# Get all the subtrees given a root_node
+def get_all_sub_trees(root_node):
+    """Collect the root node plus every descendant that has children.
+
+    Returns a list of [sexp, depth, node, first_child_text] entries. The root
+    has depth 1, each queued child is one level deeper than its parent, and
+    first_child_text is None for a node without children.
+    """
+    pending = [(root_node, 1)]
+    collected = []
+    while pending:
+        # LIFO order: the most recently queued child is expanded first.
+        node, level = pending.pop()
+        head_text = node.children[0].text if node.child_count > 0 else None
+        collected.append([node.sexp(), level, node, head_text])
+        # Childless nodes are never queued, so only the root can be one here.
+        pending.extend(
+            (kid, level + 1) for kid in node.children if len(kid.children) != 0
+        )
+    return collected
+
+
+# Parse the program into AST trees
+def ast_parse(candidate, lang='python'):
+    """Parse the `candidate` source text with tree-sitter and return its root node."""
+    # The grammar comes from a shared library addressed by a relative path.
+    grammar = Language('evaluation/gorilla/my-languages.so', lang)
+    ts_parser = Parser()
+    ts_parser.set_language(grammar)
+    return ts_parser.parse(bytes(candidate, 'utf8')).root_node
+
+
+# Get all the arguments in the ast tree
+def get_args(node):
+    """Return the third child's text (bytes) of each 'repo_or_dir'/'model' item."""
+    if node.child_count == 0:
+        return []
+    items = node.children[0].children[0].children[1].children
+    wanted = ('repo_or_dir', 'model')
+    hits = (item for item in items if any(key in item.text.decode() for key in wanted))
+    return [hit.children[2].text for hit in hits]
+
+
+# Check if there is an api match
+def ast_check(candidate_subtree_list, base_tree_list):
+    """Return the index of the first reference tree whose arguments all occur
+    (outer single quotes stripped) in the chosen candidate sub-tree, else -1."""
+    for position, reference in enumerate(base_tree_list):
+        call_node = reference.children[0].children[0]
+        if call_node.child_count == 0:
+            continue
+        wanted_name = call_node.children[0].text
+        # Stop at the first entry whose first-child text equals wanted_name; when
+        # none does, the loop simply ends on the LAST entry, which is then used.
+        for entry in candidate_subtree_list:
+            if entry[3] == wanted_name:
+                break
+        chosen_node = entry[2]
+        wanted_args = get_args(reference)
+        if wanted_args and all(
+            arg.decode().lstrip("'").rstrip("'") in chosen_node.text.decode()
+            for arg in wanted_args
+        ):
+            return position
+    return -1
+
+
+def process_response(question_id, output, api_database, qa_pairs, ast_database):
+    """Score one model response; returns a (correct, hallucination) pair.
+
+    Args:
+        question_id: 1-based id used to look up qa_pairs[question_id - 1].
+        output: raw model answer text.
+        api_database: reference API records; the matched one needs a 'domain'.
+        qa_pairs: per-question reference records with a 'domain' key.
+        ast_database: parsed reference calls handed to ast_check.
+
+    Returns:
+        (False, False) when the answer has no 'api_call' marker or the matched
+        API is from another domain, (False, True) when no reference API
+        matches, and (True, False) when the domains agree.
+    """
+    pieces = output.split('api_call')
+    if len(pieces) == 1:
+        return False, False
+    # Text between the 'api_call' and 'api_provider' markers.
+    segment = pieces[1].split('api_provider')[0]
+    start = segment.index(':') if ':' in segment else 0
+    end = segment.rindex(')') if ')' in segment else -2
+    api_call = segment[start + 2 : end + 1]
+
+    # Parse the call, enumerate its sub-trees and match against the references.
+    candidates = get_all_sub_trees(ast_parse(api_call))
+    database_index = ast_check(candidates, ast_database)
+    if database_index == -1:
+        return False, True
+    matched_domain = api_database[database_index]['domain']
+    if matched_domain == qa_pairs[question_id - 1]['domain']:
+        return True, False
+    return False, False
+
+
+def ast_eval_th(api_database, qa_pairs, ast_database, question_id, response):
+    """Score `response` by delegating to process_response (arguments reordered)."""
+    return process_response(question_id, response, api_database, qa_pairs, ast_database)
--- a/evaluation/gorilla/run_infer.py
+++ b/evaluation/gorilla/run_infer.py
@@ -0,0 +1,355 @@
+import asyncio
+import json
+import logging
+import multiprocessing as mp
+import os
+import pathlib
+import subprocess
+import time
+from concurrent.futures import ProcessPoolExecutor
+
+from tqdm import tqdm
+from utils import encode_question, get_data
+
+from opendevin.controller.state.state import State
+from opendevin.core.config import config, get_llm_config_arg, get_parser
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import main
+from opendevin.events.action import MessageAction
+from opendevin.events.serialization.event import event_to_dict
+
+
+def cleanup():  # terminate every active multiprocessing child and wait for it
+    print('Cleaning up child processes...')
+    for worker in mp.active_children():
+        print(f'Terminating child process: {worker.name}')
+        worker.terminate()
+        worker.join()  # block until this child has actually exited
+
+
+def codeact_user_response(state: State) -> str:
+    """Fake user reply for CodeActAgent runs.
+
+    Always tells the agent to run `exit`; once the history already holds at
+    least two user messages, a hint on how to give up is appended.
+    """
+    exit_hint = 'Please run the following command: <execute_bash> exit </execute_bash>.\n'
+    give_up_hint = 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
+    if not state.history:
+        return exit_hint
+    # Count the messages in the history whose source is the user.
+    user_turns = sum(
+        1
+        for action, _ in state.history
+        if isinstance(action, MessageAction) and action.source == 'user'
+    )
+    if user_turns >= 2:
+        return exit_hint + give_up_hint
+    return exit_hint
+
+
+def monologue_user_response(state: State) -> str:  # placeholder: always raises
+    raise NotImplementedError('MonologueAgent should never ask for user responses.')
+
+
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {  # agent class name -> fake-user reply function
+    'CodeActAgent': codeact_user_response,
+    'MonologueAgent': monologue_user_response,
+}
+
+AGENT_CLS_TO_INST_SUFFIX = {  # agent class name -> text appended to the instruction
+    'CodeActAgent': 'When you think you have completed the request, please run the following command: <execute_bash> exit </execute_bash>.\n'
+}
+
+
+def process_instance(
+    question_id, question, agent_class, metadata, reset_logger: bool = True
+):
+    """Run the agent on one APIBench question inside a per-process workspace
+    (so that parallel workers don't interfere with each other), score its last
+    message with the hub's AST evaluator and return the result record (a dict)."""
+    old_workspace_mount_path = config.workspace_mount_path
+    try:
+        workspace_mount_path = os.path.join(
+            config.workspace_mount_path, '_eval_workspace'
+        )
+        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
+        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+        config.workspace_mount_path = workspace_mount_path
+
+        # Set up the logger so that the evaluation can be parallelized with multi-processing
+        eval_output_dir = metadata['eval_output_dir']
+        if reset_logger:
+            # Each instance logs to its own file under <eval_output_dir>/logs
+            log_file = os.path.join(
+                eval_output_dir, 'logs', f'instance_{question_id}.log'
+            )
+            # Remove all existing handlers from logger
+            for handler in logger.handlers[:]:
+                logger.removeHandler(handler)
+            # add back the console handler to print ONE line
+            logger.addHandler(get_console_handler())
+            logger.info(
+                f'Starting evaluation for instance {question_id}.\nLOG:   tail -f {log_file}'
+            )
+            # Remove the console handler again; from here on only the file handler is attached
+            for handler in logger.handlers[:]:
+                logger.removeHandler(handler)
+            file_handler = logging.FileHandler(log_file)
+            file_handler.setFormatter(
+                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+            )
+            logger.addHandler(file_handler)
+        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+
+        # Prepare instruction
+        instruction = encode_question(question, metadata['hub'])
+        instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
+        # NOTE: You can actually set slightly different instruction for different agents
+        instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+        # logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
+
+        # Here's how you can run the agent (similar to the `main` function) and get the final task state
+        state: State = asyncio.run(
+            main(
+                instruction,
+                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
+                    agent_class
+                ),
+            )
+        )
+        # ======= Attempt to evaluate the agent's edits =======
+        # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+        # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+
+        if state is None:
+            raise ValueError('State should not be None.')
+
+        model_answer_raw = ''  # stays empty when the agent never sent a message
+        for act, _ in reversed(state.history):
+            if isinstance(act, MessageAction) and act.source == 'agent':
+                model_answer_raw = act.content
+                break
+        # get_data is called again here (per instance) to obtain the hub's evaluator
+        _, _, ast_eval = get_data(metadata['hub'])
+        correct, hallucination = ast_eval(question_id, model_answer_raw)
+        metrics = state.metrics.get() if state.metrics else None
+        logger.info(
+            f'Final message: {model_answer_raw} | Correctness: {correct} | Hallucination: {hallucination}'
+        )
+        # Save the output
+        output = {
+            'question_id': question_id,
+            'text': model_answer_raw,
+            'correct': correct,
+            'hallucination': hallucination,
+            'answer_id': 'None',
+            'model_id': metadata['model_name'],
+            'metadata': metadata,
+            'history': [
+                (event_to_dict(action), event_to_dict(obs))
+                for action, obs in state.history
+            ],
+            'metrics': metrics,
+            'error': state.error if state and state.error else None,
+        }
+    except Exception:
+        logger.error('Process instance failed')
+        raise
+    finally:
+        config.workspace_mount_path = old_workspace_mount_path  # always restore the global config
+    return output
+
+
+if __name__ == '__main__':  # CLI entry point: evaluate every requested APIBench hub
+    parser = get_parser()
+    parser.add_argument(
+        '--hubs',
+        type=str,
+        help='Which hubs to evaluate from APIBench. APIBench contains 3 hubs, namely huggingface, torch, and tensorflow. You could choose one or more from hf, torch, or tf, separated by commas. For example, the default is --hubs hf,torch,tf.',
+        default='hf,torch,tf',
+    )
+    args, _ = parser.parse_known_args()
+    if args.directory:
+        config.workspace_base = os.path.abspath(args.directory)
+        print(f'Setting workspace base to {config.workspace_base}')
+
+    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
+    # for details of how to set `llm_config`
+    if args.llm_config:
+        specified_llm_config = get_llm_config_arg(args.llm_config)
+        if specified_llm_config:
+            config.llm = specified_llm_config
+    logger.info(f'Config for evaluation: {config}')
+    agent_class = args.agent_cls
+    assert (
+        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
+    ), f'Unsupported agent class: {agent_class}'
+    model_name = config.llm.model.split('/')[-1]
+    max_iterations = args.max_iterations
+    eval_note = ''
+    if args.eval_note is not None:
+        eval_note += '_N_' + args.eval_note
+    eval_output_dir = os.path.join(
+        args.eval_output_dir,
+        'gorilla',
+        agent_class,
+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
+    )
+    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
+        parents=True, exist_ok=True
+    )
+    logger.info(f'Using evaluation output directory: {eval_output_dir}')
+
+    hubs = []  # substring checks on the raw --hubs string; 'th' also selects torch
+    if 'hf' in args.hubs:
+        hubs.append('hf')
+    if 'torch' in args.hubs or 'th' in args.hubs:
+        hubs.append('torch')
+    if 'tf' in args.hubs:
+        hubs.append('tf')
+    if hubs == []:
+        raise ValueError('Please choose at least one from hf, torch, and tf for hubs.')
+
+    for hub in hubs:
+        logger.info(f'Evaluating APIBench {hub} test')
+        questions, question_ids, ast_eval = get_data(hub)
+
+        # TEST METADATA
+        metadata = {
+            'hub': hub,
+            'agent_class': agent_class,
+            'model_name': model_name,
+            'max_iterations': max_iterations,
+            'eval_output_dir': eval_output_dir,
+            'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+            # get the commit id of current repo for reproducibility
+            'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+            .decode('utf-8')
+            .strip(),
+        }
+        logger.info(f'Metadata: {metadata}')
+        with open(os.path.join(eval_output_dir, f'metadata_{hub}.json'), 'w') as f:
+            json.dump(metadata, f)
+
+        # LIMIT EVALUATION
+        eval_n_limit = args.eval_n_limit
+        if eval_n_limit:
+            questions = questions[: (eval_n_limit // len(hubs))]
+            question_ids = question_ids[: (eval_n_limit // len(hubs))]
+            logger.info(
+                f'Limiting evaluation to a total of first {eval_n_limit} instances -> first {eval_n_limit//len(hubs)} instances per hub.'
+            )
+        output_file = os.path.join(eval_output_dir, f'output_{model_name}_{hub}.jsonl')
+        logger.info(f'Writing evaluation output to {output_file}')
+        finished_task_ids = set()
+        if os.path.exists(output_file):
+            with open(output_file, 'r') as f:
+                for line in f:
+                    data = json.loads(line)
+                    for i in range(len(question_ids)):
+                        if question_ids[i] == int(data['question_id']):
+                            finished_task_ids.add(data['question_id'])
+            logger.warning(
+                f'Output file {output_file} already exists. Loaded {len(finished_task_ids)} finished instances.'
+            )
+        output_fp = open(output_file, 'a')
+        logger.info(
+            f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
+        )
+        # =============================================
+        # filter out finished instances
+        new_questions = []
+        new_question_ids = []
+        for i in range(len(question_ids)):
+            if question_ids[i] in finished_task_ids:
+                logger.info(
+                    f'Skipping instance {question_ids[i]} as it is already finished.'
+                )
+                continue
+            new_questions.append(questions[i])
+            new_question_ids.append(question_ids[i])
+
+        finished_task_number = len(finished_task_ids)
+        questions = new_questions
+        question_ids = new_question_ids
+        logger.info(
+            f'Finished instances: {finished_task_number}, Remaining instances: {len(question_ids)}'
+        )
+        # =============================================
+        pbar = tqdm(total=len(question_ids))
+
+        # This function tracks the progress AND write the output to a JSONL file
+        def update_progress(future, pbar, output_fp, finished_task_ids):
+            pbar.update(1)
+            output = future.result()
+            pbar.set_description(f'Instance {output["question_id"]}')
+            pbar.set_postfix_str(f'Test Result: {output["correct"]}')
+            logger.info(
+                f'Finished evaluation for instance {output["question_id"]}: {output["correct"]}'
+            )
+            output_fp.write(json.dumps(output) + '\n')
+            output_fp.flush()
+            finished_task_ids.add(output['question_id'])
+
+        # This sets the multi-processing
+        num_workers = args.eval_num_workers
+        logger.info(f'Using {num_workers} workers for evaluation.')
+        try:
+            with ProcessPoolExecutor(num_workers) as executor:
+                futures = []
+                # This is how we perform multi-processing
+                for i in range(len(question_ids)):
+                    try:
+                        question_id = question_ids[i]
+                        question = questions[i]
+                        future = executor.submit(
+                            process_instance,
+                            question_id,
+                            question,
+                            agent_class,
+                            metadata,
+                            reset_logger=bool(num_workers > 1),
+                        )
+                        future.add_done_callback(  # accepts ONE callable that receives the future
+                            lambda fut: update_progress(fut, pbar, output_fp, finished_task_ids)
+                        )
+                        futures.append(future)
+                    except Exception as e:
+                        logger.error(f'Failed to submit instance {question_ids[i]}: {e}')
+
+                # Wait for all futures to complete
+                for future in futures:
+                    try:
+                        future.result()
+                    except Exception as e:
+                        logger.error(f'Evaluation of an instance failed: {e}')
+        except KeyboardInterrupt:
+            logger.info('KeyboardInterrupt received. Cleaning up...')
+            cleanup()
+
+        output_fp.close()
+        total_correct = total_hallucination = 0
+        output = []
+        with open(output_file, 'r') as f:
+            for line in f:
+                data = json.loads(line)
+                output.append(data)
+                if int(data['question_id']) in finished_task_ids:
+                    if str(data['correct']).lower() == 'true':
+                        total_correct += 1
+                    if str(data['hallucination']).lower() == 'true':
+                        total_hallucination += 1
+        # sort all output by question_id
+        output = sorted(output, key=lambda x: x['question_id'])
+        with open(output_file, 'w') as f:
+            for dat in output:
+                f.write(json.dumps(dat) + '\n')
+
+        total = len(question_ids) + finished_task_number
+        accuracy = total_correct / total if total else 0.0  # no division by zero on an empty run
+        logger.info(
+            f'Evaluation finished for {hub}. Total: {total}; Correct: {total_correct}; Hallucination: {total_hallucination}. Accuracy: {accuracy}'
+        )
--- a/evaluation/gorilla/scripts/run_infer.sh
+++ b/evaluation/gorilla/scripts/run_infer.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+MODEL_CONFIG=$1  # forwarded to --llm-config
+AGENT=$2         # agent class name; defaults to CodeActAgent
+EVAL_LIMIT=$3    # optional; forwarded to --eval-n-limit
+HUBS=$4          # optional; comma-separated hubs, defaults to hf,torch,tf
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+if [ -z "$HUBS" ]; then
+  HUBS="hf,torch,tf"
+  echo "Hubs not specified, use default $HUBS"
+fi
+
+# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
+# We need to track the version of Agent in the evaluation to make sure results are comparable
+AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+echo "HUBS: $HUBS"
+
+COMMAND="poetry run python evaluation/gorilla/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --max-iterations 30 \
+  --hubs $HUBS \
+  --data-split validation \
+  --max-chars 10000000 \
+  --eval-num-workers 1 \
+  --eval-note ${AGENT_VERSION}"
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+# Run the command (the eval note no longer appends the never-defined LEVELS variable)
+eval $COMMAND
--- a/evaluation/gorilla/utils.py
+++ b/evaluation/gorilla/utils.py
@@ -0,0 +1,101 @@
+import json
+from functools import partial
+
+import requests
+from ast_eval_hf import ast_eval_hf, ast_parse
+from ast_eval_tf import ast_eval_tf
+from ast_eval_th import ast_eval_th
+
+
+# This function is modified from Gorilla's APIBench implementations (https://github.com/ShishirPatil/gorilla/blob/main/eval/get_llm_responses.py).
+def encode_question(question, api_name):
+    """Build the full instruction prompt for one APIBench question.
+
+    Args:
+        question: the task description text.
+        api_name: hub key, one of 'torch', 'hf' or 'tf'.
+
+    Returns the prompt string; raises ValueError for an unsupported api_name
+    (the old code printed an error and then crashed on an unbound variable)."""
+    if api_name == 'torch':
+        api_name = 'torchhub'
+        domains = '1. $DOMAIN is inferred from the task description and should include one of {Classification, Semantic Segmentation, Object Detection, Audio Separation, Video Classification, Text-to-Speech}.'
+    elif api_name == 'hf':
+        api_name = 'huggingface'
+        domains = '1. $DOMAIN should include one of {Multimodal Feature Extraction, Multimodal Text-to-Image, Multimodal Image-to-Text, Multimodal Text-to-Video, \
+        Multimodal Visual Question Answering, Multimodal Document Question Answer, Multimodal Graph Machine Learning, Computer Vision Depth Estimation,\
+        Computer Vision Image Classification, Computer Vision Object Detection, Computer Vision Image Segmentation, Computer Vision Image-to-Image, \
+        Computer Vision Unconditional Image Generation, Computer Vision Video Classification, Computer Vision Zero-Shor Image Classification, \
+        Natural Language Processing Text Classification, Natural Language Processing Token Classification, Natural Language Processing Table Question Answering, \
+        Natural Language Processing Question Answering, Natural Language Processing Zero-Shot Classification, Natural Language Processing Translation, \
+        Natural Language Processing Summarization, Natural Language Processing Conversational, Natural Language Processing Text Generation, Natural Language Processing Fill-Mask,\
+        Natural Language Processing Text2Text Generation, Natural Language Processing Sentence Similarity, Audio Text-to-Speech, Audio Automatic Speech Recognition, \
+        Audio Audio-to-Audio, Audio Audio Classification, Audio Voice Activity Detection, Tabular Tabular Classification, Tabular Tabular Regression, \
+        Reinforcement Learning Reinforcement Learning, Reinforcement Learning Robotics }'
+    elif api_name == 'tf':
+        api_name = 'tensorhub'
+        domains = '1. $DOMAIN is inferred from the task description and should include one of {text-sequence-alignment, text-embedding, text-language-model, text-preprocessing, text-classification, text-generation, text-question-answering, text-retrieval-question-answering, text-segmentation, text-to-mel, image-classification, image-feature-vector, image-object-detection, image-segmentation, image-generator, image-pose-detection, image-rnn-agent, image-augmentation, image-classifier, image-style-transfer, image-aesthetic-quality, image-depth-estimation, image-super-resolution, image-deblurring, image-extrapolation, image-text-recognition, image-dehazing, image-deraining, image-enhancemenmt, image-classification-logits, image-frame-interpolation, image-text-detection, image-denoising, image-others, video-classification, video-feature-extraction, video-generation, video-audio-text, video-text, audio-embedding, audio-event-classification, audio-command-detection, audio-paralinguists-classification, audio-speech-to-text, audio-speech-synthesis, audio-synthesis, audio-pitch-extraction}'
+    else:
+        raise ValueError(f'API name {api_name!r} is not supported; expected torch, hf or tf.')
+
+    prompt = (
+        question
+        + '\nWrite a python program in 1 to 2 lines to call API in '
+        + api_name
+        + '.\n\nThe answer should follow the format: <<<domain>>> $DOMAIN, <<<api_call>>>: $API_CALL, <<<api_provider>>>: $API_PROVIDER, <<<explanation>>>: $EXPLANATION, <<<code>>>: $CODE}. Here are the requirements:\n'
+        + domains
+        + '\n2. The $API_CALL should have only 1 line of code that calls api.\n3. The $API_PROVIDER should be the programming framework used.\n4. $EXPLANATION should be a step-by-step explanation.\n5. The $CODE is the python code.\n6. Do not repeat the format in your answer.'
+    )
+    return 'You are a helpful API writer who can write APIs based on requirements.\n' + prompt
+
+
+def get_data(hub):
+    """Download the APIBench data for `hub` and build its evaluator.
+
+    Args:
+        hub: one of 'hf', 'torch' or 'tf'.
+
+    Returns:
+        (questions, question_ids, ast_eval); ast_eval(question_id, response)
+        returns the (correct, hallucination) pair for one model answer.
+
+    Raises:
+        ValueError: if `hub` is not a supported hub name.
+        RuntimeError: if one of the downloads does not answer with HTTP 200.
+    """
+    if hub == 'hf':
+        question_data = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/eval/eval-data/questions/huggingface/questions_huggingface_0_shot.jsonl'
+        api_dataset = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/api/huggingface_api.jsonl'
+        apibench = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/apibench/huggingface_eval.json'
+        ast_eval = ast_eval_hf
+    elif hub == 'torch':
+        question_data = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/eval/eval-data/questions/torchhub/questions_torchhub_0_shot.jsonl'
+        api_dataset = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/api/torchhub_api.jsonl'
+        apibench = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/apibench/torchhub_eval.json'
+        ast_eval = ast_eval_th
+    elif hub == 'tf':
+        question_data = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/eval/eval-data/questions/tensorflowhub/questions_tensorflowhub_0_shot.jsonl'
+        api_dataset = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/api/tensorflowhub_api.jsonl'
+        apibench = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/apibench/tensorflow_eval.json'
+        ast_eval = ast_eval_tf
+    else:
+        raise ValueError(f'Unsupported hub {hub!r}; expected hf, torch or tf.')
+
+    def fetch_records(url):
+        # Fail loudly: continuing with an empty dataset would skew every score.
+        reply = requests.get(url)
+        if reply.status_code != 200:
+            raise RuntimeError(f'Failed to download {url}: HTTP {reply.status_code}')
+        return [json.loads(line) for line in reply.text.splitlines()]
+
+    # Each question line is parsed once and supplies both the text and the id.
+    question_records = fetch_records(question_data)
+    questions = [record['text'] for record in question_records]
+    question_ids = [record['question_id'] for record in question_records]
+    api_database = fetch_records(api_dataset)
+    qa_pairs = [record['api_data'] for record in fetch_records(apibench)]
+
+    # Parse every reference api_call into a syntax tree for ast_check.
+    ast_database = [ast_parse(record['api_call']) for record in api_database]
+    ast_eval = partial(ast_eval, api_database, qa_pairs, ast_database)
+    return questions, question_ids, ast_eval
--- a/evaluation/gpqa/README.md
+++ b/evaluation/gpqa/README.md
@@ -0,0 +1,70 @@
+# Evaluating GPQA (A Graduate-Level Google-Proof Q&A Benchmark) with OpenDevin
+
+Implements the evaluation of agents on the GPQA benchmark introduced in [GPQA: A Graduate-Level Google-Proof Q&A Benchmark](https://arxiv.org/abs/2311.12022).
+
+This code implements the evaluation of agents on the GPQA Benchmark with Open Book setting.
+- The benchmark consists of 448 high-quality and extremely difficult multiple-choice questions in the domains of biology, physics, and chemistry. The questions are intentionally designed to be "Google-proof," meaning that even highly skilled non-expert validators achieve only 34% accuracy despite unrestricted access to the web.
+- Even experts in the corresponding domains achieve only 65% accuracy.
+- State-of-the-art AI systems achieve only 39% accuracy on this challenging dataset.
+
+**Note**
+Accurate solving of above graduate level questions would require both tool use (e.g., python for calculations) and web-search for finding related facts as information required for the questions might not be part of the LLM knowledge / training data.
+
+Further references:
+- https://arxiv.org/pdf/2311.12022
+- https://paperswithcode.com/dataset/gpqa
+- https://github.com/idavidrein/gpqa
+
+## TODOs
+- [ ] Add support for other agents (currently only tested on `CodeActAgent`)
+- [ ] Complete full benchmark evaluation
+- [ ] Fix intermittent `BrowserException: Failed to start browser environment` error
+
+## Setup Environment
+
+Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
+
+
+## Configure OpenDevin and your LLM
+
+Create a `config.toml` file if it does not exist at the root of the workspace.
+
+Add the following configurations:
+
+```toml
+[core]
+max_iterations = 100
+cache_dir = "/tmp/cache"
+ssh_hostname = "localhost"
+enable_auto_lint = true
+
+# TODO: Change these to the model you want to evaluate
+[eval_gpt4_1106_preview]
+model = "gpt-4-1106-preview"
+api_key = "XXX"
+temperature = 0.0
+
+[eval_azure_openai_compatible_model]
+model = "AZURE_OPENAI_EXACT_DEPLOYMENT_MODEL_NAME"
+base_url = "AZURE_OPENAI_ENDPOINT"
+api_key = "AZURE_ENDPOINT_API_KEY"
+temperature = 0.0
+```
+
+## Run Inference on GPQA Benchmark
+Available data splits: `gpqa_main`, `gpqa_diamond`, `gpqa_experts`, `gpqa_extended`.
+From the root of the OpenDevin repo, run the following command:
+```bash
+./evaluation/gpqa/scripts/run_infer.sh [model_config_name] [num_samples_eval] [data_split] [AgentClass]
+```
+You can replace `model_config_name` with any model you set up in `config.toml`.
+
+- `model_config_name`: The model configuration name from `config.toml` that you want to evaluate.
+- `num_samples_eval`: Number of samples to evaluate (useful for testing and debugging).
+- `data_split`: The data split to evaluate on. Must be one of `gpqa_main`, `gpqa_diamond`, `gpqa_experts`, `gpqa_extended`. Defaults to `gpqa_diamond` as done in the paper.
+- `AgentClass`: The agent class to use for evaluation. Currently only `CodeActAgent` is supported.
+
+
+## Benchmark Evaluation Results
+
+- [ ] TODO: Finish the evaluation run across the entire benchmark and compile results
--- a/evaluation/gpqa/init.py
+++ b/evaluation/gpqa/init.py
--- a/evaluation/gpqa/run_infer.py
+++ b/evaluation/gpqa/run_infer.py
@@ -0,0 +1,468 @@
+"""
+Overview:
+This code implements the evaluation of agents on the GPQA Benchmark with Open Book setting.
+- The benchmark consists of 448 high-quality and extremely difficult multiple-choice questions in the domains of biology, physics, and chemistry. The questions are intentionally designed to be "Google-proof," meaning that even highly skilled non-expert validators achieve only 34% accuracy despite unrestricted access to the web.
+- Even experts in the corresponding domains achieve only 65% accuracy.
+- State-of-the-art AI systems achieve only 39% accuracy on this challenging dataset.
+
+Accurate solving of above graduate level questions would require both tool use (e.g., python for calculations) and web-search for finding related facts as information required for the questions might not be part of the LLM knowledge / training data.
+
+Further references:
+- https://arxiv.org/pdf/2311.12022
+- https://paperswithcode.com/dataset/gpqa
+- https://github.com/idavidrein/gpqa
+
+TODOs:
+- Add evaluation on other Agent classes (e.g., MonologueAgent)
+- Batch inference and evaluation of agents on the GPQA Benchmark.
+"""
+
+import asyncio
+import json
+import logging
+import multiprocessing as mp
+import os
+import pathlib
+import random
+import re
+import subprocess
+import time
+from concurrent.futures import ProcessPoolExecutor
+
+import pandas as pd
+from datasets import load_dataset
+from tqdm import tqdm
+
+from opendevin.controller.state.state import State
+from opendevin.core.config import config, get_llm_config_arg, get_parser
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import main
+from opendevin.events.action import MessageAction
+from opendevin.events.serialization.event import event_to_dict
+
+
+def cleanup():
+    """Terminate every live child process and wait for each one to exit."""
+    logger.info('Cleaning up child processes...')
+    children = mp.active_children()
+    for child in children:
+        logger.info(f'Terminating child process: {child.name}')
+        # ask the child to stop, then block until it has actually exited
+        child.terminate()
+        child.join()
+
+
+def codeact_user_response(state: State) -> str:
+    """
+    Build the canned "user" reply that is fed back to CodeActAgent.
+
+    The reply always tells the agent to keep working without human help.
+    Once the history already contains at least two user-sourced messages,
+    an extra sentence explaining how to give up is appended.
+    """
+    base_prompt = (
+        'Please continue working on the task on whatever approach you think is suitable.\n'
+        'Feel free to use all tools for calculations and solving the problem, and web-search for finding relevant facts during the process if needed\n'
+        'If you think you have reliably finished solving the problem, first generate a message reporting the final concise answer to the user. Once that is done, please run the following command: <execute_bash> exit </execute_bash>.\n'
+        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP TO SOLVE THIS TASK.\n'
+    )
+    if not state.history:
+        return base_prompt
+
+    # history entries are (action, observation) pairs; count the user messages
+    prior_user_turns = sum(
+        1
+        for action, _ in state.history
+        if isinstance(action, MessageAction) and action.source == 'user'
+    )
+    if prior_user_turns < 2:
+        return base_prompt
+
+    # enough user turns have passed: tell the agent how it may give up
+    return (
+        base_prompt
+        + 'If you want to give up, just generate a final answer message to the user and in the next turn --> run: <execute_bash> exit </execute_bash>.\n'
+    )
+
+
+def monologue_user_response(state: State) -> str:
+    """Fake-user hook for MonologueAgent; it always raises NotImplementedError."""
+    reason = 'MonologueAgent should never ask for user responses.'
+    raise NotImplementedError(reason)
+
+
+# Maps an agent class name to the function that produces the simulated user
+# reply for that agent. The keys of this table are also what the script's
+# `__main__` section uses to decide whether an agent class is supported.
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+    'MonologueAgent': monologue_user_response,
+}
+
+# Maps an agent class name to extra text appended to the task instruction.
+# Agents without an entry get no suffix (lookups use `.get(agent_class, '')`).
+AGENT_CLS_TO_INST_SUFFIX = {
+    'CodeActAgent': '\n\n SUPER IMPORTANT: When you think you have solved the question, first report it back to the user in the requested format. Only once that is done, in the next turn, please run the following command: <execute_bash> exit </execute_bash>.\n'
+}
+
+
+def parse_final_answer(final_answer: str) -> str:
+    """
+    Extract the answer from the final message generated by the agent.
+
+    The answer is expected to be enclosed in the format:
+    <<FINAL_ANSWER||
+    <insert correct answer here>
+    ||FINAL_ANSWER>>
+
+    Args:
+        final_answer: the agent's final message. A non-string value (for
+            example None, when the agent never produced a message) is
+            tolerated and treated as "no answer found".
+
+    Returns:
+        The text between the markers with surrounding whitespace stripped,
+        or a fixed "not found" sentence when the markers are absent.
+    """
+    not_found = 'No final answer found in the provided string.'
+
+    # `re` raises TypeError on None, so handle a missing message explicitly
+    if not isinstance(final_answer, str):
+        return not_found
+
+    # DOTALL lets the answer span the newlines that surround it in the template
+    pattern = re.compile(r'<<FINAL_ANSWER\|\|(.*?)\|\|FINAL_ANSWER>>', re.DOTALL)
+    match = pattern.search(final_answer)
+    if match is None:
+        return not_found
+    return match.group(1).strip()
+
+
+def compare_answers(predicted_answer, ground_truth):
+    """
+    Report whether the predicted answer is exactly equal to the ground truth.
+    """
+    is_match = predicted_answer == ground_truth
+    return is_match
+
+
+def get_test_result(model_output, ground_truth):
+    """
+    GPQA grading: parse the answer out of the model output and check it
+    against the ground truth. Returns the outcome of that comparison.
+    """
+    # pull the answer out of the model output, then compare it directly
+    return compare_answers(parse_final_answer(model_output), ground_truth)
+
+
+def convert_instance_dict(instance):
+    """
+    Preprocess one dataset row into the format consumed by the agent.
+
+    Reads the question and the four answer texts from the row, shuffles the
+    answers, and records which letter (A-D) the correct answer ended up at.
+    Returns a dict with keys `question`, `choices` and `correct_solution`.
+    """
+    question = instance['Question']
+    answer = instance['Correct Answer']
+    options = [answer, *(instance[f'Incorrect Answer {n}'] for n in (1, 2, 3))]
+
+    # shuffle in place so the correct answer is not always listed first
+    random.shuffle(options)
+
+    # position of the correct answer after shuffling, as a letter A-D
+    letter = 'ABCD'[options.index(answer)]
+
+    return {
+        'question': question,
+        'choices': options,
+        'correct_solution': letter,
+    }
+
+
+def process_instance(
+    instance: dict,
+    agent_class: str,
+    metadata: dict,
+    skip_workspace_mount: bool,
+    eval_output_dir: str,
+    reset_logger: bool = True,
+):
+    """
+    Run the agent on a single dataset instance and grade its final answer.
+
+    Args:
+        instance: one preprocessed dataset row. It is read both by key
+            (`question`, `choices`) and by attribute (`instance_id`,
+            `task_id`, `correct_solution`).
+        agent_class: agent class name, used to pick the fake-user function
+            and the instruction suffix.
+        metadata: run metadata, copied verbatim into the output record.
+        skip_workspace_mount: currently overridden to False inside the
+            function (see the note in the body).
+        eval_output_dir: directory under which `logs/instance_<id>.log` is written.
+        reset_logger: when True, redirect this process's logging to a
+            per-instance log file.
+
+    Returns:
+        A dict with the instruction, serialized history, metrics, error and
+        `test_result` (a dict of the form `{'result': bool}`).
+
+    Raises:
+        ValueError: if the agent run returns no state.
+        Exception: any other failure is logged and re-raised.
+
+    The global `config` workspace paths are temporarily changed and always
+    restored on exit.
+    """
+    old_workspace_mount_path = config.workspace_mount_path
+    old_workspace_base = config.workspace_base
+    try:
+        workspace_mount_path = os.path.join(
+            config.workspace_mount_path, '_eval_workspace'
+        )
+        # create process-specific workspace dir
+        # if `not skip_workspace_mount` - we will create a workspace directory for EACH process
+        # so that different agent don't interfere with each other.
+        # NOTE(review): this assignment discards the caller-supplied flag, so a
+        # per-process workspace directory is always created - confirm this is intended.
+        skip_workspace_mount = False
+        if not skip_workspace_mount:
+            workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
+            pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+
+        # reset workspace to config
+        config.workspace_base = workspace_mount_path
+        config.workspace_mount_path = workspace_mount_path
+
+        # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+        if reset_logger:
+            # Set up logger
+            log_file = os.path.join(
+                eval_output_dir, 'logs', f'instance_{instance.instance_id}.log'
+            )
+            # Remove all existing handlers from logger
+            for handler in logger.handlers[:]:
+                logger.removeHandler(handler)
+            # add back the console handler to print ONE line
+            logger.addHandler(get_console_handler())
+            logger.info(
+                f'Starting evaluation for instance {instance.instance_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
+            )
+            # Remove all existing handlers from logger
+            for handler in logger.handlers[:]:
+                logger.removeHandler(handler)
+            file_handler = logging.FileHandler(log_file)
+            file_handler.setFormatter(
+                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+            )
+            logger.addHandler(file_handler)
+        else:
+            logger.info(f'Starting evaluation for instance {instance.instance_id}.')
+
+        if not skip_workspace_mount:
+            logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+
+        # ======= Run the agent on the instance =======
+        # Prepare instruction for the agent using suggested format in gpqa codebase
+        instruction = f"""
+        What is the correct answer to this question:\n
+        {instance['question']}\n
+
+        Choices:\n
+        (A) {instance['choices'][0]}\n
+        (B) {instance['choices'][1]}\n
+        (C) {instance['choices'][2]}\n
+        (D) {instance['choices'][3]}\n
+        \n\n
+
+        MOST IMPORTANT: Format your response as follows:
+        <<FINAL_ANSWER||
+        <insert correct answer here, must be one of A, B, C, D> (Please dont use any additional characters. Just the letter of the correct answer (A/B/C/D).)
+        ||FINAL_ANSWER>>
+
+        Additional Instructions:
+        - You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.
+        """
+
+        # NOTE: You can actually set slightly different instruction for different agents
+        instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+
+        # Here's how you can run the agent (similar to the `main` function) and get the final task state
+        state: State = asyncio.run(
+            main(
+                instruction,
+                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
+                    agent_class
+                ),
+            )
+        )
+
+        # Validate the state BEFORE touching any of its attributes
+        if state is None:
+            raise ValueError('State should not be None.')
+
+        # ======= Attempt to evaluate the agent's answer =======
+        # `state.history` holds (action, observation) pairs, so unpack each entry
+        # before the isinstance check. Walk backwards to find the most recent
+        # message, skipping the simulated user replies so that only the agent's
+        # own message is graded (default to None if not found).
+        final_message = next(
+            (
+                action.content
+                for action, _ in reversed(state.history)
+                if isinstance(action, MessageAction) and action.source != 'user'
+            ),
+            None,
+        )
+
+        logger.info(f'Final message generated by the agent: {final_message}')
+
+        if final_message is None:
+            # nothing to parse: count the instance as answered incorrectly
+            logger.warning(
+                'No final message from the agent was found; marking the instance as incorrect.'
+            )
+            is_correct = False
+        else:
+            is_correct = get_test_result(final_message, instance.correct_solution)
+
+        metrics = state.metrics.get() if state.metrics else None
+
+        # Save the output
+        output = {
+            'task_id': instance.task_id,
+            'instance_id': instance.instance_id,
+            'instruction': instruction,
+            'metadata': metadata,
+            'history': [
+                (event_to_dict(action), event_to_dict(obs))
+                for action, obs in state.history
+            ],
+            'metrics': metrics,
+            'error': state.error if state and state.error else None,
+            # wrapped in a dict because the progress callback reads test_result['result']
+            'test_result': {'result': is_correct},
+        }
+
+    except Exception:
+        logger.error('Process instance failed')
+        raise
+    finally:
+        # always restore the global config, even when the run failed
+        config.workspace_mount_path = old_workspace_mount_path
+        config.workspace_base = old_workspace_base
+    return output
+
+
+# Script entry point: load the selected GPQA split, skip instances already
+# present in output.jsonl, run the remaining ones through a process pool and
+# append one JSON line per finished instance.
+if __name__ == '__main__':
+    parser = get_parser()
+    # data split must be one of 'gpqa_main', 'gpqa_diamond', 'gpqa_experts', 'gpqa_extended'
+    parser.add_argument(
+        '--data-split',
+        type=str,
+        choices=['gpqa_main', 'gpqa_diamond', 'gpqa_experts', 'gpqa_extended'],
+        default='gpqa_diamond',
+        help='data split to evaluate, eg. gpqa_diamond',
+    )
+    args, _ = parser.parse_known_args()
+
+    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
+    # so we don't need to manage file uploading to OpenDevin's repo
+    dataset = load_dataset('Idavidrein/gpqa', args.data_split)
+    gpqa_dataset = dataset['train']
+    # preprocess the dataset
+    gpqa_dataset = gpqa_dataset.map(convert_instance_dict)
+    gpqa_dataset = gpqa_dataset.to_pandas()
+    # Add a new column 'instance_id' with the index
+    gpqa_dataset['instance_id'] = gpqa_dataset.index
+    gpqa_dataset['task_id'] = gpqa_dataset.index
+    # gpqa_dataset = dataset['train'].to_pandas().sort_values(by='id').reset_index(drop=True)
+
+    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
+    # for details of how to set `llm_config`
+    if args.llm_config:
+        specified_llm_config = get_llm_config_arg(args.llm_config)
+        if specified_llm_config:
+            config.llm = specified_llm_config
+    logger.info(f'Config for evaluation: {config}')
+
+    # TEST METADATA
+    agent_class = args.agent_cls
+    assert (
+        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
+    ), f'Unsupported agent class: {agent_class}'
+    model_name = config.llm.model.split('/')[-1]
+    max_iterations = args.max_iterations
+    eval_note = ''
+    if args.eval_note is not None:
+        eval_note += '_N_' + args.eval_note
+    eval_output_dir = os.path.join(
+        args.eval_output_dir,
+        'gpqa',
+        agent_class,
+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
+    )
+
+    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
+        parents=True, exist_ok=True
+    )
+    logger.info(f'Using evaluation output directory: {eval_output_dir}')
+
+    metadata = {
+        'agent_class': agent_class,
+        'model_name': model_name,
+        'max_iterations': max_iterations,
+        'eval_output_dir': eval_output_dir,
+        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        # get the commit id of current repo for reproducibility
+        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+        .decode('utf-8')
+        .strip(),
+    }
+    logger.info(f'Metadata: {metadata}')
+    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
+        json.dump(metadata, f)
+
+    # LIMIT EVALUATION
+    eval_n_limit = args.eval_n_limit  # NOTE: This is useful for debugging and testing using a smaller subset of the dataset
+    if eval_n_limit:
+        # start_index = 20
+        # gpqa_dataset = gpqa_dataset.iloc[start_index:]
+        gpqa_dataset = gpqa_dataset.head(eval_n_limit)
+        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
+
+    # Report the real number of selected rows; `eval_n_limit` is None when no
+    # limit was requested, which used to be logged as "None instances".
+    logger.info('#############################################')
+    logger.info(f'{len(gpqa_dataset)} instances will be evaluated.')
+    logger.info('#############################################')
+
+    # OUTPUT FILE
+    output_file = os.path.join(eval_output_dir, 'output.jsonl')
+    logger.info(f'Writing evaluation output to {output_file}')
+    finished_instance_ids = set()
+    if os.path.exists(output_file):
+        with open(output_file, 'r') as f:
+            for line in f:
+                data = json.loads(line)
+                finished_instance_ids.add(data['instance_id'])
+        logger.warning(
+            f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
+        )
+    output_fp = open(output_file, 'a')
+
+    logger.info(
+        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
+    )
+
+    # =============================================
+    # filter out finished instances
+    new_gpqa_dataset = []
+    for idx, instance in gpqa_dataset.iterrows():
+        # instance = convert_instance_dict(instance) # preprocessing
+        if instance.instance_id in finished_instance_ids:
+            logger.info(
+                f'Skipping instance {instance.instance_id} as it is already finished.'
+            )
+            continue
+        new_gpqa_dataset.append(instance)
+
+    gpqa_dataset = pd.DataFrame(new_gpqa_dataset)
+    logger.info(
+        f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(gpqa_dataset)}'
+    )
+    # =============================================
+
+    pbar = tqdm(total=len(gpqa_dataset))
+
+    # This function tracks the progress AND write the output to a JSONL file
+    def update_progress(future):
+        pbar.update(1)
+        output = future.result()
+        # `test_result` may be a plain bool or a {'result': ...} dict; accept
+        # both so the callback never fails on subscripting a bool.
+        test_result = output['test_result']
+        result = (
+            test_result['result'] if isinstance(test_result, dict) else test_result
+        )
+        pbar.set_description(f'Instance {output["instance_id"]}')
+        pbar.set_postfix_str(f'Test Result: {result}')
+        logger.info(
+            f'Finished evaluation for instance {output["instance_id"]}: {result}'
+        )
+        output_fp.write(json.dumps(output) + '\n')
+        output_fp.flush()
+
+    # This sets the multi-processing
+    num_workers = args.eval_num_workers
+    logger.info(f'Using {num_workers} workers for evaluation.')
+
+    # This is SWE-Bench specific - CodeActAgent doesn't require mounted workspace to work
+    skip_workspace_mount = agent_class == 'CodeActAgent'
+    logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
+
+    try:
+        with ProcessPoolExecutor(num_workers) as executor:
+            futures = []
+            # This is how we perform multi-processing
+            for row_idx, instance in gpqa_dataset.iterrows():
+                future = executor.submit(
+                    process_instance,
+                    instance,
+                    agent_class,
+                    metadata,
+                    skip_workspace_mount,
+                    eval_output_dir,
+                    reset_logger=bool(num_workers > 1),
+                )
+                future.add_done_callback(update_progress)
+                futures.append(future)
+
+            # Wait for all futures to complete
+            for future in futures:
+                future.result()
+    except KeyboardInterrupt:
+        print('KeyboardInterrupt received. Cleaning up...')
+        cleanup()
+    finally:
+        # close the output file even when a worker raised
+        output_fp.close()
+
+    logger.info('Evaluation finished.')
--- a/evaluation/gpqa/scripts/run_infer.sh
+++ b/evaluation/gpqa/scripts/run_infer.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+# Run GPQA inference.
+# Usage: run_infer.sh [model_config_name] [num_samples_eval] [data_split] [AgentClass]
+MODEL_CONFIG=$1
+EVAL_LIMIT=$2
+DATA_SPLIT=$3
+AGENT=$4
+
+# The model config is required: without it `--llm-config` would have no value.
+if [ -z "$MODEL_CONFIG" ]; then
+  echo "Usage: $0 [model_config_name] [num_samples_eval] [data_split] [AgentClass]" >&2
+  exit 1
+fi
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent ..."
+  AGENT="CodeActAgent"
+fi
+
+# NOTE: if data split is not provided, use the default value 'gpqa_diamond'
+if [ -z "$DATA_SPLIT" ]; then
+  echo "Data split not specified, using default gpqa_diamond ..."
+  DATA_SPLIT="gpqa_diamond"
+fi
+
+# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
+# We need to track the version of Agent in the evaluation to make sure results are comparable
+# The agent name is passed as an argument instead of being spliced into the Python source.
+AGENT_VERSION=v$(poetry run python -c "import sys; import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls(sys.argv[1]).VERSION)" "$AGENT")
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+# Build the command as an array so every argument is passed through verbatim,
+# with no second round of word splitting or shell evaluation.
+COMMAND=(
+  poetry run python evaluation/gpqa/run_infer.py
+  --agent-cls "$AGENT"
+  --llm-config "$MODEL_CONFIG"
+  --max-iterations 10
+  --max-chars 10000000
+  --eval-num-workers 1
+  --data-split "$DATA_SPLIT"
+  --eval-note "$AGENT_VERSION"
+)
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND+=(--eval-n-limit "$EVAL_LIMIT")
+fi
+
+# Run the command
+"${COMMAND[@]}"
--- a/evaluation/humanevalfix/run_infer.py
+++ b/evaluation/humanevalfix/run_infer.py
@@ -140,102 +140,114 @@ def process_instance(
 ):
    old_workspace_mount_path = config.workspace_mount_path
    old_workspace_base = config.workspace_base
-    workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
-    # create process-specific workspace dir
-    # if `not skip_workspace_mount` - we will create a workspace directory for EACH process
-    # so that different agent don't interfere with each other.
-    if not skip_workspace_mount:
-        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
-        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)

-    # reset workspace to config
-    config.workspace_base = workspace_mount_path
-    config.workspace_mount_path = workspace_mount_path
-
-    # Setup the logger properly, so you can run multi-processing to parallize the evaluation
-    if reset_logger:
-        # Set up logger
-        log_file = os.path.join(
-            eval_output_dir,
-            'logs',
-            f'instance_{instance.task_id.replace("/", "__")}.log',
+    try:
+        workspace_mount_path = os.path.join(
+            config.workspace_mount_path, '_eval_workspace'
        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        # add back the console handler to print ONE line
-        logger.addHandler(get_console_handler())
-        logger.info(
-            f'Starting evaluation for instance {instance.task_id}.\nLOG:   tail -f {log_file}'
+        # create process-specific workspace dir
+        # if `not skip_workspace_mount` - we will create a workspace directory for EACH process
+        # so that different agent don't interfere with each other.
+        if not skip_workspace_mount:
+            workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
+            pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+
+        # reset workspace to config
+        config.workspace_base = workspace_mount_path
+        config.workspace_mount_path = workspace_mount_path
+
+        # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+        if reset_logger:
+            # Set up logger
+            log_file = os.path.join(
+                eval_output_dir,
+                'logs',
+                f'instance_{instance.task_id.replace("/", "__")}.log',
+            )
+            # Remove all existing handlers from logger
+            for handler in logger.handlers[:]:
+                logger.removeHandler(handler)
+            # add back the console handler to print ONE line
+            logger.addHandler(get_console_handler())
+            logger.info(
+                f'Starting evaluation for instance {instance.task_id}.\nLOG:   tail -f {log_file}'
+            )
+            # Remove all existing handlers from logger
+            for handler in logger.handlers[:]:
+                logger.removeHandler(handler)
+            file_handler = logging.FileHandler(log_file)
+            file_handler.setFormatter(
+                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+            )
+            logger.addHandler(file_handler)
+
+        if not skip_workspace_mount:
+            logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+
+        # Create file with HumanEvalFix problem
+        # Prompt reference: https://github.com/bigcode-project/bigcode-evaluation-harness/blob/84b96da31b7f840b55c5733325346176140cdb6b/bigcode_eval/tasks/humanevalpack.py#L509
+        problem_statement = (
+            instance.declaration + instance.buggy_solution + '\n' + instance.test
        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        file_handler = logging.FileHandler(log_file)
-        file_handler.setFormatter(
-            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        path = os.path.join(
+            workspace_mount_path, f'{instance.task_id.replace("/", "__")}.py'
        )
-        logger.addHandler(file_handler)
+        with open(path, 'w') as f:
+            f.write(problem_statement)

-    if not skip_workspace_mount:
-        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
-
-    # Create file with HumanEvalFix problem
-    # Prompt reference: https://github.com/bigcode-project/bigcode-evaluation-harness/blob/84b96da31b7f840b55c5733325346176140cdb6b/bigcode_eval/tasks/humanevalpack.py#L509
-    problem_statement = (
-        instance.declaration + instance.buggy_solution + '\n' + instance.test
-    )
-    path = os.path.join(
-        workspace_mount_path, f'{instance.task_id.replace("/", "__")}.py'
-    )
-    with open(path, 'w') as f:
-        f.write(problem_statement)
-
-    # Prepare instruction
-    instruction = (
-        f'Please fix the function in {instance.task_id.replace("/", "__")}.py such that all test cases pass.\n'
-        'Environment has been set up for you to start working. You may assume all necessary tools are installed.\n\n'
-        '# Problem Statement\n'
-        f'{problem_statement}\n\n'
-    )
-    instruction += (
-        'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
-        'You should NOT modify any existing test case files. If needed, you can add new test cases in a NEW file to reproduce the issue.\n'
-        'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
-    )
-    # NOTE: You can actually set slightly different instruction for different agents
-    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
-
-    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    state: State = asyncio.run(
-        main(
-            instruction,
-            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
+        # Prepare instruction
+        instruction = (
+            f'Please fix the function in {instance.task_id.replace("/", "__")}.py such that all test cases pass.\n'
+            'Environment has been set up for you to start working. You may assume all necessary tools are installed.\n\n'
+            '# Problem Statement\n'
+            f'{problem_statement}\n\n'
        )
-    )
+        instruction += (
+            'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
+            'You should NOT modify any existing test case files. If needed, you can add new test cases in a NEW file to reproduce the issue.\n'
+            'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
+        )
+        # NOTE: You can actually set slightly different instruction for different agents
+        instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')

-    # ======= Attempt to evaluate the agent's edits =======
-    test_result = get_test_result(instance, path)
+        # Here's how you can run the agent (similar to the `main` function) and get the final task state
+        state: State = asyncio.run(
+            main(
+                instruction,
+                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
+                    agent_class
+                ),
+            )
+        )

-    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
-    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
-    if state is None:
-        raise ValueError('State should not be None.')
+        # ======= Attempt to evaluate the agent's edits =======
+        test_result = get_test_result(instance, path)

-    # Save the output
-    output = {
-        'task_id': instance.task_id,
-        'instruction': instruction,
-        'metadata': metadata,
-        'history': [
-            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
-        ],
-        'error': state.error if state and state.error else None,
-        'test_result': test_result,
-    }
+        # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+        # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+        if state is None:
+            raise ValueError('State should not be None.')
+        metrics = state.metrics.get() if state.metrics else None

-    config.workspace_mount_path = old_workspace_mount_path
-    config.workspace_base = old_workspace_base
+        # Save the output
+        output = {
+            'task_id': instance.task_id,
+            'instruction': instruction,
+            'metadata': metadata,
+            'history': [
+                (event_to_dict(action), event_to_dict(obs))
+                for action, obs in state.history
+            ],
+            'metrics': metrics,
+            'error': state.error if state and state.error else None,
+            'test_result': test_result,
+        }
+    except Exception:
+        logger.error('Process instance failed')
+        raise
+    finally:
+        config.workspace_mount_path = old_workspace_mount_path
+        config.workspace_base = old_workspace_base
    return output


@@ -284,7 +296,7 @@ if __name__ == '__main__':
        'max_iterations': max_iterations,
        'eval_output_dir': eval_output_dir,
        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
-        # get the commit id of current repo for reproduciblity
+        # get the commit id of current repo for reproducibility
        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
        .decode('utf-8')
        .strip(),
--- a/Show More
+++ b/Show More