Minor SWE-Bench inference config tweak (#2381)

* save infer logs to infer_logs
* set max budget for swebench eval
avoid repeat logging of unneeded messages (#2380)
2026-04-29 03:00:45 -04:00 · 2024-06-10 20:14:22 +00:00 · 2024-06-10 20:08:09 +00:00 · 2024-06-10 19:30:40 +00:00 · 2024-06-10 17:18:40 +00:00 · 2024-06-11 00:32:10 +08:00
153 changed files with 7692 additions and 1106 deletions
--- a/.github/ISSUE_TEMPLATE/question.md
+++ b/.github/ISSUE_TEMPLATE/question.md
@@ -1,16 +0,0 @@
---
-name: Question
-about: Use this template to ask a question regarding the project.
-title: ''
-labels: question
-assignees: ''
-
---
-
-## Describe your question
-
-<!--A clear and concise description of what you want to know.-->
-
-## Additional context
-
-<!--Add any other context about the question here, like what you've tried so far.-->
--- a/.github/workflows/run-unit-tests.yml
+++ b/.github/workflows/run-unit-tests.yml
@@ -23,7 +23,7 @@ jobs:
    name: Test on macOS
    runs-on: macos-13
    env:
-      INSTALL_DOCKER: "0" # Set to '0' to skip Docker installation
+      INSTALL_DOCKER: "1" # Set to '0' to skip Docker installation
    strategy:
      matrix:
        python-version: ["3.11"]
--- a/.github/workflows/solve-issue.yml
+++ b/.github/workflows/solve-issue.yml
@@ -35,15 +35,28 @@ jobs:
        echo "" >> task.txt
        echo "BODY:" >> task.txt
        echo "${ISSUE_BODY}" >> task.txt
+    
+    - name: Set up environment
+      run: |
+        curl -sSL https://install.python-poetry.org | python3 -
+        export PATH="/github/home/.local/bin:$PATH"
+        poetry install --without evaluation
+        poetry run playwright install --with-deps chromium
+

    - name: Run OpenDevin
      env:
        ISSUE_TITLE: ${{ github.event.issue.title }}
        ISSUE_BODY: ${{ github.event.issue.body }}
        LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        SANDBOX_TYPE: exec
      run: |
-        WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE python ./opendevin/core/main.py -i 50 -f task.txt -d $GITHUB_WORKSPACE
+        # Prepend Poetry's install directory to PATH so the `poetry` command can be found
+        export PATH="/github/home/.local/bin:$PATH"
+        # Prepend the current directory to PYTHONPATH so the package imports correctly; note: the working directory must be set first
+        export PYTHONPATH=$(pwd):$PYTHONPATH
+        WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE poetry run python ./opendevin/core/main.py -i 50 -f task.txt -d $GITHUB_WORKSPACE
        rm task.txt

    - name: Setup Git, Create Branch, and Commit Changes
--- a/.gitignore
+++ b/.gitignore
@@ -161,9 +161,14 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
 .vscode/
+.cursorignore

 # evaluation
+evaluation/evaluation_outputs
+evaluation/outputs
+evaluation/swe_bench/eval_workspace*
 evaluation/SWE-bench/data
+evaluation/webarena/scripts/webarena_env.sh

 # frontend

@@ -176,6 +181,8 @@ frontend/yarn.lock

 # testing
 frontend/coverage
+test_results*
+/_test_files_tmp/

 # production
 frontend/build
@@ -204,8 +211,3 @@ cache
 # configuration
 config.toml
 config.toml.bak
-evaluation/swe_bench/eval_workspace*
-evaluation/outputs
-evaluation/evaluation_outputs
-test_results*
-/_test_files_tmp/
--- a/16
+++ b/16
@@ -10,6 +10,7 @@ DEFAULT_WORKSPACE_DIR = "./workspace"
 DEFAULT_MODEL = "gpt-4o"
 CONFIG_FILE = config.toml
 PRECOMMIT_CONFIG_PATH = "./dev_config/python/.pre-commit-config.yaml"
+PYTHON_VERSION = 3.11

 # ANSI color codes
 GREEN=$(shell tput -Txterm setaf 2)
@@ -62,10 +63,10 @@ check-system:

 check-python:
 	@echo "$(YELLOW)Checking Python installation...$(RESET)"
-	@if command -v python3.11 > /dev/null; then \
-		echo "$(BLUE)$(shell python3.11 --version) is already installed.$(RESET)"; \
+	@if command -v python$(PYTHON_VERSION) > /dev/null; then \
+		echo "$(BLUE)$(shell python$(PYTHON_VERSION) --version) is already installed.$(RESET)"; \
 	else \
-		echo "$(RED)Python 3.11 is not installed. Please install Python 3.11 to continue.$(RESET)"; \
+		echo "$(RED)Python $(PYTHON_VERSION) is not installed. Please install Python $(PYTHON_VERSION) to continue.$(RESET)"; \
 		exit 1; \
 	fi

@@ -112,13 +113,13 @@ check-poetry:
 			echo "$(BLUE)$(shell poetry --version) is already installed.$(RESET)"; \
 		else \
 			echo "$(RED)Poetry 1.8 or later is required. You can install poetry by running the following command, then adding Poetry to your PATH:"; \
-			echo "$(RED) curl -sSL https://install.python-poetry.org | python3 -$(RESET)"; \
+			echo "$(RED) curl -sSL https://install.python-poetry.org | python$(PYTHON_VERSION) -$(RESET)"; \
 			echo "$(RED)More detail here: https://python-poetry.org/docs/#installing-with-the-official-installer$(RESET)"; \
 			exit 1; \
 		fi; \
 	else \
 		echo "$(RED)Poetry is not installed. You can install poetry by running the following command, then adding Poetry to your PATH:"; \
-		echo "$(RED) curl -sSL https://install.python-poetry.org | python3.11 -$(RESET)"; \
+		echo "$(RED) curl -sSL https://install.python-poetry.org | python$(PYTHON_VERSION) -$(RESET)"; \
 		echo "$(RED)More detail here: https://python-poetry.org/docs/#installing-with-the-official-installer$(RESET)"; \
 		exit 1; \
 	fi
@@ -130,7 +131,7 @@ pull-docker-image:

 install-python-dependencies:
 	@echo "$(GREEN)Installing Python dependencies...$(RESET)"
-	poetry env use python3.11
+	poetry env use python$(PYTHON_VERSION)
 	@if [ "$(shell uname)" = "Darwin" ]; then \
 		echo "$(BLUE)Installing chroma-hnswlib...$(RESET)"; \
 		export HNSWLIB_NO_NATIVE=1; \
@@ -229,7 +230,7 @@ setup-config:
 setup-config-prompts:
 	@echo "[core]" > $(CONFIG_FILE).tmp

-	@read -p "Enter your workspace directory [default: $(DEFAULT_WORKSPACE_DIR)]: " workspace_dir; \
+	@read -p "Enter your workspace directory (as absolute path) [default: $(DEFAULT_WORKSPACE_DIR)]: " workspace_dir; \
 	 workspace_dir=$${workspace_dir:-$(DEFAULT_WORKSPACE_DIR)}; \
 	 echo "workspace_base=\"$$workspace_dir\"" >> $(CONFIG_FILE).tmp

@@ -238,6 +239,7 @@ setup-config-prompts:
 	 if [ "$$persist_sandbox" = "true" ]; then \
 		 read -p "Enter a password for the sandbox container: " ssh_password; \
 		 echo "ssh_password=\"$$ssh_password\"" >> $(CONFIG_FILE).tmp; \
+		 echo "persist_sandbox=$$persist_sandbox" >> $(CONFIG_FILE).tmp; \
 	 else \
 		echo "persist_sandbox=$$persist_sandbox" >> $(CONFIG_FILE).tmp; \
 	 fi
--- a/README.md
+++ b/README.md
@@ -129,3 +129,16 @@ Distributed under the MIT License. See [`LICENSE`](./LICENSE) for more informati
 [issues-url]: https://github.com/OpenDevin/OpenDevin/issues
 [license-shield]: https://img.shields.io/github/license/opendevin/opendevin?style=for-the-badge
 [license-url]: https://github.com/OpenDevin/OpenDevin/blob/main/LICENSE
+
+## 📚 Cite
+
+```
+@misc{opendevin2024,
+  author       = {{OpenDevin Team}},
+  title        = {{OpenDevin: An Open Platform for AI Software Developers as Generalist Agents}},
+  year         = {2024},
+  version      = {v1.0},
+  howpublished = {\url{https://github.com/OpenDevin/OpenDevin}},
+  note         = {Accessed: ENTER THE DATE YOU ACCESSED THE PROJECT}
+}
+```
--- a/agenthub/README.md
+++ b/agenthub/README.md
@@ -2,15 +2,15 @@

 In this folder, there may exist multiple implementations of `Agent` that will be used by the framework.

-For example, `agenthub/monologue_agent`, `agenthub/metagpt_agent`, `agenthub/codeact_agent`, etc.
+For example, `agenthub/codeact_agent`, etc.
 Contributors from different backgrounds and interests can choose to contribute to any (or all!) of these directions.

 ## Constructing an Agent

-The abstraction for an agent can be found [here](../opendevin/agent.py).
+The abstraction for an agent can be found [here](../opendevin/controller/agent.py).

 Agents are run inside of a loop. At each iteration, `agent.step()` is called with a
-[State](../opendevin/state.py) input, and the agent must output an [Action](../opendevin/action).
+[State](../opendevin/controller/state/state.py) input, and the agent must output an [Action](../opendevin/events/action).

 Every agent also has a `self.llm` which it can use to interact with the LLM configured by the user.
 See the [LiteLLM docs for `self.llm.completion`](https://docs.litellm.ai/docs/completion).
@@ -28,21 +28,19 @@ The `state` contains:

 Here is a list of available Actions, which can be returned by `agent.step()`:

- [`CmdRunAction`](../opendevin/action/bash.py) - Runs a command inside a sandboxed terminal
- [`CmdKillAction`](../opendevin/action/bash.py) - Kills a background command
- [`IPythonRunCellAction`](../opendevin/action/bash.py) - Execute a block of Python code interactively (in Jupyter notebook) and receives `CmdOutputObservation`. Requires setting up `jupyter` [plugin](../opendevin/sandbox/plugins) as a requirement.
- [`FileReadAction`](../opendevin/action/fileop.py) - Reads the content of a file
- [`FileWriteAction`](../opendevin/action/fileop.py) - Writes new content to a file
- [`BrowseURLAction`](../opendevin/action/browse.py) - Gets the content of a URL
- [`AgentRecallAction`](../opendevin/action/agent.py) - Searches memory (e.g. a vector database)
- [`AddTaskAction`](../opendevin/action/tasks.py) - Adds a subtask to the plan
- [`ModifyTaskAction`](../opendevin/action/tasks.py) - Changes the state of a subtask
- [`AgentThinkAction`](../opendevin/action/agent.py) - A no-op that allows the agent to add plaintext to the history (as well as the chat log)
- [`AgentTalkAction`](../opendevin/action/agent.py) - A no-op that allows the agent to add plaintext to the history and talk to the user.
- [`AgentFinishAction`](../opendevin/action/agent.py) - Stops the control loop, allowing the user/delegator agent to enter a new task
- [`AgentRejectAction`](../opendevin/action/agent.py) - Stops the control loop, allowing the user/delegator agent to enter a new task
- [`AgentFinishAction`](../opendevin/action/agent.py) - Stops the control loop, allowing the user to enter a new task
- [`MessageAction`](../opendevin/action/message.py) - Represents a message from an agent or the user
+- [`CmdRunAction`](../opendevin/events/action/commands.py) - Runs a command inside a sandboxed terminal
+- [`CmdKillAction`](../opendevin/events/action/commands.py) - Kills a background command
+- [`IPythonRunCellAction`](../opendevin/events/action/commands.py) - Execute a block of Python code interactively (in Jupyter notebook) and receives `CmdOutputObservation`. Requires setting up `jupyter` [plugin](../opendevin/runtime/plugins) as a requirement.
+- [`FileReadAction`](../opendevin/events/action/files.py) - Reads the content of a file
+- [`FileWriteAction`](../opendevin/events/action/files.py) - Writes new content to a file
+- [`BrowseURLAction`](../opendevin/events/action/browse.py) - Gets the content of a URL
+- [`AgentRecallAction`](../opendevin/events/action/agent.py) - Searches memory (e.g. a vector database)
+- [`AddTaskAction`](../opendevin/events/action/tasks.py) - Adds a subtask to the plan
+- [`ModifyTaskAction`](../opendevin/events/action/tasks.py) - Changes the state of a subtask
+- [`AgentFinishAction`](../opendevin/events/action/agent.py) - Stops the control loop, allowing the user/delegator agent to enter a new task
+- [`AgentRejectAction`](../opendevin/events/action/agent.py) - Stops the control loop, allowing the user/delegator agent to enter a new task
+- [`AgentFinishAction`](../opendevin/events/action/agent.py) - Stops the control loop, allowing the user to enter a new task
+- [`MessageAction`](../opendevin/events/action/message.py) - Represents a message from an agent or the user

 You can use `action.to_dict()` and `action_from_dict` to serialize and deserialize actions.

@@ -54,13 +52,13 @@ in the background).

 Here is a list of available Observations:

- [`CmdOutputObservation`](../opendevin/observation/run.py)
- [`BrowserOutputObservation`](../opendevin/observation/browse.py)
- [`FileReadObservation`](../opendevin/observation/files.py)
- [`FileWriteObservation`](../opendevin/observation/files.py)
- [`AgentRecallObservation`](../opendevin/observation/recall.py)
- [`ErrorObservation`](../opendevin/observation/error.py)
- [`SuccessObservation`](../opendevin/observation/success.py)
+- [`CmdOutputObservation`](../opendevin/events/observation/commands.py)
+- [`BrowserOutputObservation`](../opendevin/events/observation/browse.py)
+- [`FileReadObservation`](../opendevin/events/observation/files.py)
+- [`FileWriteObservation`](../opendevin/events/observation/files.py)
+- [`AgentRecallObservation`](../opendevin/events/observation/recall.py)
+- [`ErrorObservation`](../opendevin/events/observation/error.py)
+- [`SuccessObservation`](../opendevin/events/observation/success.py)

 You can use `observation.to_dict()` and `observation_from_dict` to serialize and deserialize observations.

--- a/agenthub/browsing_agent/browsing_agent.py
+++ b/agenthub/browsing_agent/browsing_agent.py
@@ -1,4 +1,5 @@
 import ast
+import os

 from browsergym.core.action.highlevel import HighLevelActionSet
 from browsergym.utils.obs import flatten_axtree_to_str
@@ -12,6 +13,7 @@ from opendevin.events.action import (
    BrowseInteractiveAction,
    MessageAction,
 )
+from opendevin.events.event import EventSource
 from opendevin.events.observation import BrowserOutputObservation
 from opendevin.llm.llm import LLM
 from opendevin.runtime.plugins import (
@@ -19,21 +21,17 @@ from opendevin.runtime.plugins import (
 )
 from opendevin.runtime.tools import RuntimeTool

+USE_NAV = (
+    os.environ.get('USE_NAV', 'true') == 'true'
+)  # only disable NAV actions when running webarena and miniwob benchmarks
+USE_CONCISE_ANSWER = (
+    os.environ.get('USE_CONCISE_ANSWER', 'false') == 'true'
+)  # only return concise answer when running webarena and miniwob benchmarks

-def parse_response(response: str) -> Action:
-    if '```' not in response:
-        # unexpected response format, message back to user
-        return MessageAction(response)
-    thought = response.split('```')[0].strip()
-    action_str = response.split('```')[1].strip()
-    # handle send message to user function call in BrowserGym
-    for sub_action in action_str.split('\n'):
-        if 'send_msg_to_user(' in sub_action:
-            tree = ast.parse(sub_action)
-            args = tree.body[0].value.args  # type: ignore
-            return MessageAction(args[0].value)
-
-    return BrowseInteractiveAction(browser_actions=action_str, thought=thought)
+if not USE_NAV and USE_CONCISE_ANSWER:
+    EVAL_MODE = True  # NAV actions disabled and only concise answers returned, for webarena and miniwob benchmarks
+else:
+    EVAL_MODE = False


 class BrowsingAgent(Agent):
@@ -56,13 +54,13 @@ class BrowsingAgent(Agent):
        - llm (LLM): The llm to be used by this agent
        """
        super().__init__(llm)
+        # define a configurable action space, with chat functionality, web navigation, and webpage grounding using accessibility tree and HTML.
+        # see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/highlevel.py for more details
+        action_subsets = ['chat', 'bid']
+        if USE_NAV:
+            action_subsets.append('nav')
        self.action_space = HighLevelActionSet(
-            # see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/highlevel.py for more details
-            subsets=[
-                'chat',
-                'bid',
-                'nav',
-            ],  # define a configurable action space, with chat functionality, web navigation, and webpage grounding using accessibility tree and HTML.
+            subsets=action_subsets,
            strict=False,  # less strict on the parsing of the actions
            multiaction=True,  # enable to agent to take multiple actions at once
        )
@@ -75,6 +73,32 @@ class BrowsingAgent(Agent):
        """
        super().reset()
        self.cost_accumulator = 0
+        self.error_accumulator = 0
+
+    def parse_response(self, response: str) -> Action:
+        if '```' not in response:
+            # unexpected response format, message back to user
+            action_str = f'send_msg_to_user("""{response}""")'
+            return BrowseInteractiveAction(
+                browser_actions=action_str,
+                thought=response,
+                browsergym_send_msg_to_user=response,
+            )
+        thought = response.split('```')[0].strip()
+        action_str = response.split('```')[1].strip()
+        # handle send message to user function call in BrowserGym
+        msg_content = ''
+        for sub_action in action_str.split('\n'):
+            if 'send_msg_to_user(' in sub_action:
+                tree = ast.parse(sub_action)
+                args = tree.body[0].value.args  # type: ignore
+                msg_content = args[0].value
+
+        return BrowseInteractiveAction(
+            browser_actions=action_str,
+            thought=thought,
+            browsergym_send_msg_to_user=msg_content,
+        )

    def step(self, state: State) -> Action:
        """
@@ -90,27 +114,66 @@ class BrowsingAgent(Agent):
        - AgentFinishAction() - end the interaction
        """
        goal = state.get_current_user_intent()
+        if goal is None:
+            goal = state.inputs['task']
        messages = []
-        prev_actions = ''
+        prev_actions = []
        cur_axtree_txt = ''
        error_prefix = ''
        last_obs = None
+        last_action = None
+
+        if EVAL_MODE and len(state.history) == 1:
+            # for webarena and miniwob++ eval, we need to retrieve the initial observation already in browser env
+            # initialize and retrieve the first observation by issuing a noop action
+            # For non-benchmark browsing, the browser env starts with a blank page, and the agent is expected to first navigate to desired websites
+            return BrowseInteractiveAction(browser_actions='noop()')
+
        for prev_action, obs in state.history:
            if isinstance(prev_action, BrowseInteractiveAction):
-                prev_actions += f'{prev_action.browser_actions}\n'
+                prev_actions.append(prev_action.browser_actions)
                last_obs = obs
+                last_action = prev_action
            elif (
-                isinstance(prev_action, MessageAction) and prev_action.source != 'user'
+                isinstance(prev_action, MessageAction)
+                and prev_action.source == EventSource.AGENT
            ):
                # agent has responded, task finish.
-                return AgentFinishAction()
+                return AgentFinishAction(outputs={'content': prev_action.content})
+
+        if EVAL_MODE:
+            prev_actions = prev_actions[1:]  # remove the first noop action
+
+        prev_action_str = '\n'.join(prev_actions)
+        # if the final BrowseInteractiveAction executed BrowserGym's send_msg_to_user,
+        # we should also send a message back to the user in OpenDevin and call it a day
+        if (
+            isinstance(last_action, BrowseInteractiveAction)
+            and last_action.browsergym_send_msg_to_user
+        ):
+            return MessageAction(last_action.browsergym_send_msg_to_user)

        if isinstance(last_obs, BrowserOutputObservation):
            if last_obs.error:
                # add error recovery prompt prefix
                error_prefix = f'IMPORTANT! Last action is incorrect:\n{last_obs.last_browser_action}\nThink again with the current observation of the page.\n'
-            cur_axtree_txt = flatten_axtree_to_str(last_obs.axtree_object)
+            try:
+                cur_axtree_txt = flatten_axtree_to_str(
+                    last_obs.axtree_object,
+                    extra_properties=last_obs.extra_element_properties,
+                    with_clickable=True,
+                    filter_visible_only=True,
+                )
+            except Exception as e:
+                logger.error(
+                    'Error when trying to process the accessibility tree: %s', e
+                )
+                return MessageAction('Error encountered when browsing.')

+        if error_prefix:
+            self.error_accumulator += 1
+            if self.error_accumulator > 5:
+                return MessageAction('Too many errors encountered. Task failed.')
        system_msg = f"""\
 # Instructions
 Review the current state of the page and all other information to find the best
@@ -133,7 +196,7 @@ and executed by a program, make sure to follow the formatting instructions.
 {cur_axtree_txt}

 # Previous Actions
-{prev_actions}
+{prev_action_str}

 Here is an example with chain of thought of a valid action when clicking on a button:
 "
@@ -141,16 +204,31 @@ In order to accomplish my goal I need to click on the button with bid 12
 ```click("12")```
 "
 """.strip()
+
+        if USE_CONCISE_ANSWER:
+            concise_instruction = """\
+
+Here is another example with chain of thought of a valid action when providing a concise answer to user:
+"
+In order to accomplish my goal I need to send the information asked back to the user. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I will send a message back to user with the answer.
+```send_msg_to_user("$279.49")```
+"
+"""
+            prompt += concise_instruction
        messages.append({'role': 'user', 'content': prompt})
        response = self.llm.completion(
            messages=messages,
            temperature=0.0,
+            stop=[')```', ')\n```'],
        )
        self.log_cost(response)
-        action_resp = response['choices'][0]['message']['content']
+        action_resp = response['choices'][0]['message']['content'].strip()
+        if not action_resp.endswith('```'):
+            action_resp = action_resp + ')```'
+
        logger.info(prompt)
        logger.info(action_resp)
-        return parse_response(action_resp)
+        return self.parse_response(action_resp)

    def search_memory(self, query: str) -> list[str]:
        raise NotImplementedError('Implement this abstract method')
--- a/agenthub/codeact_agent/action_parser.py
+++ b/agenthub/codeact_agent/action_parser.py
@@ -0,0 +1,182 @@
+import re
+
+from opendevin.controller.action_parser import ActionParser, ResponseParser
+from opendevin.events.action import (
+    Action,
+    AgentDelegateAction,
+    AgentFinishAction,
+    CmdRunAction,
+    IPythonRunCellAction,
+    MessageAction,
+)
+
+
+class CodeActResponseParser(ResponseParser):
+    """
+    Parser action:
+        - CmdRunAction(command) - bash command to run
+        - IPythonRunCellAction(code) - IPython code to run
+        - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
+        - MessageAction(content) - Message action to run (e.g. ask for clarification)
+        - AgentFinishAction() - end the interaction
+    """
+
+    def __init__(
+        self,
+    ):
+        # The order of items in self.action_parsers matters: parse_action uses the first parser whose check_condition matches
+        self.action_parsers = [
+            CodeActActionParserFinish(),
+            CodeActActionParserCmdRun(),
+            CodeActActionParserIPythonRunCell(),
+            CodeActActionParserAgentDelegate(),
+        ]
+        self.default_parser = CodeActActionParserMessage()
+
+    def parse(self, response: str) -> Action:
+        action_str = self.parse_response(response)
+        return self.parse_action(action_str)
+
+    def parse_response(self, response) -> str:
+        action = response.choices[0].message.content
+        for lang in ['bash', 'ipython', 'browse']:
+            if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
+                action += f'</execute_{lang}>'
+        return action
+
+    def parse_action(self, action_str: str) -> Action:
+        for action_parser in self.action_parsers:
+            if action_parser.check_condition(action_str):
+                return action_parser.parse(action_str)
+        return self.default_parser.parse(action_str)
+
+
+class CodeActActionParserFinish(ActionParser):
+    """
+    Parser action:
+        - AgentFinishAction() - end the interaction
+    """
+
+    def __init__(
+        self,
+    ):
+        self.finish_command = None
+
+    def check_condition(self, action_str: str) -> bool:
+        self.finish_command = re.search(r'<finish>.*</finish>', action_str, re.DOTALL)
+        return self.finish_command is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.finish_command is not None
+        ), 'self.finish_command should not be None when parse is called'
+        thought = action_str.replace(self.finish_command.group(0), '').strip()
+        return AgentFinishAction(thought=thought)
+
+
+class CodeActActionParserCmdRun(ActionParser):
+    """
+    Parser action:
+        - CmdRunAction(command) - bash command to run
+        - AgentFinishAction() - end the interaction (returned when the bash command is `exit`)
+    """
+
+    def __init__(
+        self,
+    ):
+        self.bash_command = None
+
+    def check_condition(self, action_str: str) -> bool:
+        self.bash_command = re.search(
+            r'<execute_bash>(.*?)</execute_bash>', action_str, re.DOTALL
+        )
+        return self.bash_command is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.bash_command is not None
+        ), 'self.bash_command should not be None when parse is called'
+        thought = action_str.replace(self.bash_command.group(0), '').strip()
+        # a command was found
+        command_group = self.bash_command.group(1).strip()
+        if command_group.strip() == 'exit':
+            return AgentFinishAction()
+        return CmdRunAction(command=command_group, thought=thought)
+
+
+class CodeActActionParserIPythonRunCell(ActionParser):
+    """
+    Parser action:
+        - IPythonRunCellAction(code) - IPython code to run
+    """
+
+    def __init__(
+        self,
+    ):
+        self.python_code = None
+        self.jupyter_kernel_init_code: str = 'from agentskills import *'
+
+    def check_condition(self, action_str: str) -> bool:
+        self.python_code = re.search(
+            r'<execute_ipython>(.*?)</execute_ipython>', action_str, re.DOTALL
+        )
+        return self.python_code is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.python_code is not None
+        ), 'self.python_code should not be None when parse is called'
+        code_group = self.python_code.group(1).strip()
+        thought = action_str.replace(self.python_code.group(0), '').strip()
+        return IPythonRunCellAction(
+            code=code_group,
+            thought=thought,
+            kernel_init_code=self.jupyter_kernel_init_code,
+        )
+
+
+class CodeActActionParserAgentDelegate(ActionParser):
+    """
+    Parser action:
+        - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
+    """
+
+    def __init__(
+        self,
+    ):
+        self.agent_delegate = None
+
+    def check_condition(self, action_str: str) -> bool:
+        self.agent_delegate = re.search(
+            r'<execute_browse>(.*)</execute_browse>', action_str, re.DOTALL
+        )
+        return self.agent_delegate is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.agent_delegate is not None
+        ), 'self.agent_delegate should not be None when parse is called'
+        thought = action_str.replace(self.agent_delegate.group(0), '').strip()
+        browse_actions = self.agent_delegate.group(1).strip()
+        task = f'{thought}. I should start with: {browse_actions}'
+        return AgentDelegateAction(agent='BrowsingAgent', inputs={'task': task})
+
+
+class CodeActActionParserMessage(ActionParser):
+    """
+    Parser action:
+        - MessageAction(content) - Message action to run (e.g. ask for clarification)
+    """
+
+    def __init__(
+        self,
+    ):
+        pass
+
+    def check_condition(self, action_str: str) -> bool:
+        # We assume the LLM is GOOD enough that when it returns pure natural language
+        # it wants to talk to the user
+        return True
+
+    def parse(self, action_str: str) -> Action:
+        return MessageAction(content=action_str, wait_for_response=True)
--- a/agenthub/codeact_agent/codeact_agent.py
+++ b/agenthub/codeact_agent/codeact_agent.py
@@ -1,5 +1,4 @@
-import re
-
+from agenthub.codeact_agent.action_parser import CodeActResponseParser
 from agenthub.codeact_agent.prompt import (
    COMMAND_DOCS,
    EXAMPLES,
@@ -18,6 +17,7 @@ from opendevin.events.action import (
    MessageAction,
 )
 from opendevin.events.observation import (
+    AgentDelegateObservation,
    BrowserOutputObservation,
    CmdOutputObservation,
    IPythonRunCellObservation,
@@ -33,14 +33,6 @@ from opendevin.runtime.tools import RuntimeTool
 ENABLE_GITHUB = True


-def parse_response(response) -> str:
-    action = response.choices[0].message.content
-    for lang in ['bash', 'ipython', 'browse']:
-        if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
-            action += f'</execute_{lang}>'
-    return action
-
-
 def action_to_str(action: Action) -> str:
    if isinstance(action, CmdRunAction):
        return f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
@@ -89,6 +81,9 @@ def get_observation_message(obs) -> dict[str, str] | None:
    elif isinstance(obs, BrowserOutputObservation):
        content = 'OBSERVATION:\n' + truncate_observation(obs.content)
        return {'role': 'user', 'content': content}
+    elif isinstance(obs, AgentDelegateObservation):
+        content = 'OBSERVATION:\n' + truncate_observation(str(obs.outputs))
+        return {'role': 'user', 'content': content}
    return None


@@ -119,7 +114,7 @@ def get_in_context_example() -> str:


 class CodeActAgent(Agent):
-    VERSION = '1.5'
+    VERSION = '1.6'
    """
    The Code Act Agent is a minimalist agent.
    The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
@@ -164,11 +159,12 @@ class CodeActAgent(Agent):
        JupyterRequirement(),
    ]
    runtime_tools: list[RuntimeTool] = [RuntimeTool.BROWSER]
-    jupyter_kernel_init_code: str = 'from agentskills import *'

    system_message: str = get_system_message()
    in_context_example: str = f"Here is an example of how you can interact with the environment for task solving:\n{get_in_context_example()}\n\nNOW, LET'S START!"

+    action_parser = CodeActResponseParser()
+
    def __init__(
        self,
        llm: LLM,
@@ -199,7 +195,7 @@ class CodeActAgent(Agent):
        Returns:
        - CmdRunAction(command) - bash command to run
        - IPythonRunCellAction(code) - IPython code to run
-        - BrowseInteractiveAction(browsergym_command) - BrowserGym commands to run
+        - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
        - MessageAction(content) - Message action to run (e.g. ask for clarification)
        - AgentFinishAction() - end the interaction
        """
@@ -234,50 +230,10 @@ class CodeActAgent(Agent):
            ],
            temperature=0.0,
        )
-
-        action_str: str = parse_response(response)
        state.num_of_chars += sum(
            len(message['content']) for message in messages
-        ) + len(action_str)
-
-        if finish_command := re.search(r'<finish>.*</finish>', action_str, re.DOTALL):
-            thought = action_str.replace(finish_command.group(0), '').strip()
-            return AgentFinishAction(thought=thought)
-        if bash_command := re.search(
-            r'<execute_bash>(.*?)</execute_bash>', action_str, re.DOTALL
-        ):
-            # remove the command from the action string to get thought
-            thought = action_str.replace(bash_command.group(0), '').strip()
-            # a command was found
-            command_group = bash_command.group(1).strip()
-
-            if command_group.strip() == 'exit':
-                return AgentFinishAction()
-            return CmdRunAction(command=command_group, thought=thought)
-        elif python_code := re.search(
-            r'<execute_ipython>(.*?)</execute_ipython>', action_str, re.DOTALL
-        ):
-            # a code block was found
-            code_group = python_code.group(1).strip()
-            thought = action_str.replace(python_code.group(0), '').strip()
-            return IPythonRunCellAction(
-                code=code_group,
-                thought=thought,
-                kernel_init_code=self.jupyter_kernel_init_code,
-            )
-        elif browse_command := re.search(
-            r'<execute_browse>(.*)</execute_browse>', action_str, re.DOTALL
-        ):
-            # BrowserGym actions was found
-            browse_actions = browse_command.group(1).strip()
-            thought = action_str.replace(browse_command.group(0), '').strip()
-            return BrowseInteractiveAction(
-                browser_actions=browse_actions, thought=thought
-            )
-        else:
-            # We assume the LLM is GOOD enough that when it returns pure natural language
-            # it want to talk to the user
-            return MessageAction(content=action_str, wait_for_response=True)
+        ) + len(response.choices[0].message.content)
+        return self.action_parser.parse(response)

    def search_memory(self, query: str) -> list[str]:
        raise NotImplementedError('Implement this abstract method')
--- a/agenthub/codeact_agent/prompt.py
+++ b/agenthub/codeact_agent/prompt.py
@@ -5,35 +5,41 @@ _AGENT_SKILLS_DOCS = AgentSkillsRequirement.documentation
 COMMAND_DOCS = (
    '\nApart from the standard Python library, the assistant can also use the following functions (already imported) in <execute_ipython> environment:\n'
    f'{_AGENT_SKILLS_DOCS}'
-    "Please note that THE `edit_file` FUNCTION REQUIRES PROPER INDENTATION. If the assistant would like to add the line '        print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run."
+    "Please note that THE `edit_file` and `append_file` FUNCTIONS REQUIRE PROPER INDENTATION. If the assistant would like to add the line '        print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run."
 )

 # ======= SYSTEM MESSAGE =======
 MINIMAL_SYSTEM_PREFIX = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
-The assistant can interact with an interactive Python (Jupyter Notebook) environment and receive the corresponding output when needed. The code should be enclosed using "<execute_ipython>" tag, for example:
+The assistant can use an interactive Python (Jupyter Notebook) environment, executing code with <execute_ipython>.
 <execute_ipython>
 print("Hello World!")
 </execute_ipython>
 The assistant can execute bash commands on behalf of the user by wrapping them with <execute_bash> and </execute_bash>.
+
 For example, you can list the files in the current directory by <execute_bash> ls </execute_bash>.
+Important, however: do not run interactive commands. You do not have access to stdin.
+Also, you need to handle commands that may run indefinitely and not return a result. For such cases, you should redirect the output to a file and run the command in the background to avoid blocking the execution.
+For example, to run a Python script that might run indefinitely without returning immediately, you can use the following format: <execute_bash> python3 app.py > server.log 2>&1 & </execute_bash>
+Also, if a command execution result saying like: Command: "npm start" timed out. Sending SIGINT to the process, you should also retry with running the command in the background.
 """

-BROWSING_PREFIX = """The assistant can browse the Internet with commands on behalf of the user by wrapping them with <execute_browse> and </execute_browse>.
-For example, you can browse a given URL by <execute_browse> goto("<URL>") </execute_browse>.
-The assistant should attempt fewer things at a time instead of putting too much commands OR code in one "execute" block.
+BROWSING_PREFIX = """The assistant can browse the Internet with <execute_browse> and </execute_browse>.
+For example, <execute_browse> Tell me the usa's president using google search </execute_browse>.
+Or <execute_browse> Tell me what is in http://example.com </execute_browse>.
 """
 PIP_INSTALL_PREFIX = """The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them."""

 SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + BROWSING_PREFIX + PIP_INSTALL_PREFIX

-GITHUB_MESSAGE = """To do any activities on GitHub, the assistant should use the token in the $GITHUB_TOKEN environment variable.
-For instance, to push a local branch `my_branch` to the github repo `owner/repo`, the assistant can use the following four commands:
+GITHUB_MESSAGE = """To interact with GitHub, use the $GITHUB_TOKEN environment variable.
+For example, to push a branch `my_branch` to the GitHub repo `owner/repo`:
 <execute_bash> git push https://$GITHUB_TOKEN@github.com/owner/repo.git my_branch </execute_bash>
-If the assistant require access to GitHub but $GITHUB_TOKEN is not set, ask the user to set it."""
+If $GITHUB_TOKEN is not set, ask the user to set it."""

-SYSTEM_SUFFIX = """The assistant's response should be concise.
-The assistant should include ONLY ONE <execute_ipython> or <execute_bash> or <execute_browse> in every one of the responses, unless the assistant is finished with the task or need more input or action from the user in order to proceed.
-IMPORTANT: Whenever possible, execute the code for the user using <execute_ipython> or <execute_bash> or <execute_browse> instead of providing it.
+SYSTEM_SUFFIX = """Responses should be concise.
+The assistant should attempt fewer things at a time instead of putting too much commands OR code in one "execute" block.
+Include ONLY ONE <execute_ipython>, <execute_bash>, or <execute_browse> per response, unless the assistant is finished with the task or need more input or action from the user in order to proceed.
+IMPORTANT: Execute code using <execute_ipython>, <execute_bash>, or <execute_browse> whenever possible.
 """


@@ -41,10 +47,10 @@ IMPORTANT: Whenever possible, execute the code for the user using <execute_ipyth
 EXAMPLES = """
 --- START OF EXAMPLE ---

-USER: Can you create a list of numbers from 1 to 10, and create a web page to display them at port 5000?
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.

 ASSISTANT:
-Sure! Let me create a file first:
+Sure! Let me create a Python file `app.py`:
 <execute_ipython>
 create_file('app.py')
 </execute_ipython>
@@ -68,7 +74,7 @@ def index():

 if __name__ == '__main__':
    app.run(port=5000)\"\"\"
-edit_file(start=1, end=1, content=EDITED_CODE)
+edit_file('app.py', start=1, end=1, content=EDITED_CODE)
 </execute_ipython>

 USER:
@@ -163,7 +169,7 @@ USER: Now browse the newly started server's homepage and show me the content.
 ASSISTANT:
 Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
 <execute_browse>
-goto("http://127.0.0.1:5000")
+Get the content on "http://127.0.0.1:5000"
 </execute_browse>

 USER:
@@ -207,7 +213,7 @@ USER:
 ASSISTANT:
 I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
 <execute_ipython>
-edit_file(start=7, end=7, content="    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'")
+edit_file('app.py', start=7, end=7, content="    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'")
 </execute_ipython>

 USER:
@@ -226,7 +232,7 @@ Observation:
 [File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]

 ASSISTANT:
-The file has been updated. Let me run the Python file again with the new changes:
+Running the updated file:
 <execute_bash>
 python3 app.py > server.log 2>&1 &
 </execute_bash>
@@ -236,14 +242,14 @@ Observation:
 [1] 126

 ASSISTANT:
-The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Free free to let me know if you have any further requests!
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!

 --- END OF EXAMPLE ---
 """

 INVALID_INPUT_MESSAGE = (
    "I don't understand your input. \n"
-    'If you want to execute a bash command, please use <execute_bash> YOUR_COMMAND_HERE </execute_bash>.\n'
-    'If you want to execute a block of Python code, please use <execute_ipython> YOUR_COMMAND_HERE </execute_ipython>.\n'
-    'If you want to browse the Internet, please use <execute_browse> YOUR_COMMAND_HERE </execute_browse>.\n'
+    'For bash commands, use <execute_bash> YOUR_COMMAND </execute_bash>.\n'
+    'For Python code, use <execute_ipython> YOUR_CODE </execute_ipython>.\n'
+    'For browsing, use <execute_browse> YOUR_COMMAND </execute_browse>.\n'
 )
--- a/agenthub/micro/README.md
+++ b/agenthub/micro/README.md
@@ -12,3 +12,6 @@ in the following structure:
 Note that `prompt.md` could use jinja2 template syntax. During runtime, `prompt.md`
 is loaded and rendered, and used together with `agent.yaml` to initialize a
 micro-agent.
+
+Micro-agents can be used independently. You can also use `ManagerAgent` which knows
+how to coordinate the agents and collaboratively finish a task.
--- a/agenthub/micro/_instructions/actions/reject.md
+++ b/agenthub/micro/_instructions/actions/reject.md
@@ -1,2 +1,2 @@
 * `reject` - reject the task. Arguments:
-  * `outputs` - a dictionary representing the outputs of your task, if any
+  * `outputs` - a dictionary with only a `reason` attribute
--- a/agenthub/micro/commit_writer/agent.yaml
+++ b/agenthub/micro/commit_writer/agent.yaml
@@ -3,3 +3,4 @@ description: "Write a git commit message for files in the git staging area"
 inputs: {}
 outputs:
  answer: string
+  reason: string
--- a/agenthub/micro/commit_writer/prompt.md
+++ b/agenthub/micro/commit_writer/prompt.md
@@ -14,7 +14,7 @@ changes. The commit message should include:
 You should find the diff using `git diff --cached`, compile a commit message,
 and call the `finish` action with `outputs.answer` set to the answer. If current
 repo is not a valid git repo, or there is no diff in the staging area, please call
-the `reject` action with `outputs.answer` set to the reason.
+the `reject` action.

 ## History
 {{ instructions.history_truncated }}
--- a/agenthub/micro/manager/agent.yaml
+++ b/agenthub/micro/manager/agent.yaml
@@ -3,4 +3,6 @@ description: Delegates tasks to microagents based on their area of expertise
 generates: Action
 inputs:
  task: string
-outputs: {}
+outputs:
+  summary: string # if finished
+  reason: string # if rejected
--- a/agenthub/micro/manager/prompt.md
+++ b/agenthub/micro/manager/prompt.md
@@ -7,6 +7,15 @@ can do the actual work. A description of each agent is provided below. You MUST
 select one of the delegates below to move towards accomplishing the task, and you MUST
 provide the correct inputs for the delegate you select.

+Note: the delegated agent either returns "finish" or "reject".
+- If the action is "finish", but the full task is not done yet, you should
+continue to delegate to one of the agents below until the full task is finished.
+- If the action is "reject", it means the delegated agent is not capable of the
+task you sent it. You should revisit the input you sent to the delegate, and consider
+whether any other delegate would be able to solve the task. If you cannot find
+a proper delegate agent, or the delegate attempts keep failing, call the `reject`
+action.
+
 ## Agents
 {% for name, details in delegates.items() %}
 ### {{ name }}
@@ -19,9 +28,13 @@ provide the correct inputs for the delegate you select.
 {{ instructions.history_truncated }}
 {{ history_to_json(state.history[-10:]) }}

+If the last item in the history is an error, you should try to fix it. If you
+cannot fix it, call the `reject` action.
+
 ## Available Actions
 {{ instructions.actions.delegate }}
 {{ instructions.actions.finish }}
+{{ instructions.actions.reject }}

 ## Format
 {{ instructions.format.action }}
--- a/containers/app/Dockerfile
+++ b/containers/app/Dockerfile
@@ -10,7 +10,7 @@ RUN npm ci
 COPY ./frontend ./
 RUN npm run make-i18n && npm run build

-FROM python:3.12-slim as backend-builder
+FROM python:3.12.3-slim as backend-builder

 WORKDIR /app
 ENV PYTHONPATH '/app'
@@ -28,7 +28,7 @@ COPY ./pyproject.toml ./poetry.lock ./
 RUN touch README.md
 RUN poetry install --without evaluation --no-root && rm -rf $POETRY_CACHE_DIR

-FROM python:3.12-slim as runtime
+FROM python:3.12.3-slim as runtime

 WORKDIR /app

--- a/docs/modules/usage/about.md
+++ b/docs/modules/usage/about.md
@@ -15,7 +15,7 @@ Achieving full replication of production-grade applications with LLMs is a compl

 ## 🚧 Default Agent

- Our default Agent is currently the MonologueAgent, which has limited capabilities, but is fairly stable. We're working on other Agent implementations, including [SWE Agent](https://swe-agent.com/). You can [read about our current set of agents here](./agents).
+- Our default Agent is currently the CodeActAgent, which is capable of generating code and handling files. We're working on other Agent implementations, including [SWE Agent](https://swe-agent.com/). You can [read about our current set of agents here](./agents).

 ## 🤝 How to Contribute

--- a/docs/modules/usage/troubleshooting/troubleshooting.md
+++ b/docs/modules/usage/troubleshooting/troubleshooting.md
@@ -4,52 +4,53 @@ sidebar_position: 5

 # 🚧 Troubleshooting

-There are some error messages that get reported over and over by users.
-We'll try to make the install process easier, and to make these error messages
-better in the future. But for now, you can look for your error message below,
-and see if there are any workaround.
+There are some error messages that frequently get reported by users.
+
+We'll try to make the install process easier and these error messages
+better in the future. But for now, you can look for your error message below and see if there are any workarounds.

 For each of these error messages **there is an existing issue**. Please do not
-open an new issue--just comment there.
+open a new issue--just comment there.

 If you find more information or a workaround for one of these issues, please
-open a PR to add details to this file.
+open a *PR* to add details to this file.

 :::tip
-If you're running on Windows and having trouble, check out our [guide for Windows users](troubleshooting/windows)
+If you're running on Windows and having trouble, check out our [guide for Windows (WSL) users](troubleshooting/windows).
 :::

-## Unable to connect to docker
+## Unable to connect to Docker

 [GitHub Issue](https://github.com/OpenDevin/OpenDevin/issues/1226)

 ### Symptoms

-```
+```bash
 Error creating controller. Please check Docker is running and visit `https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting` for more debugging information.
 ```

-```
+```bash
 docker.errors.DockerException: Error while fetching server API version: ('Connection aborted.', FileNotFoundError(2, 'No such file or directory'))
 ```

 ### Details

-OpenDevin uses a docker container to do its work safely, without potentially breaking your machine.
+OpenDevin uses a Docker container to do its work safely, without potentially breaking your machine.

 ### Workarounds

 * Run `docker ps` to ensure that docker is running
 * Make sure you don't need `sudo` to run docker [see here](https://www.baeldung.com/linux/docker-run-without-sudo)
-* If you are on a mac, check the [permissions requirements](https://docs.docker.com/desktop/mac/permission-requirements/) and in particular consider enabling the "Allow the default Docker socket to be used" under "Settings > Advanced" in Docker Desktop.
-* If you are on a mac, Upgrade your Docker to the latest version under "Check for Updates"
+* If you are on a Mac, check the [permissions requirements](https://docs.docker.com/desktop/mac/permission-requirements/) and in particular consider enabling the `Allow the default Docker socket to be used` under `Settings > Advanced` in Docker Desktop.
+* In addition, upgrade your Docker to the latest version under `Check for Updates`

 ## Unable to connect to SSH box
+
 [GitHub Issue](https://github.com/OpenDevin/OpenDevin/issues/1156)

 ### Symptoms

-```
+```python
 self.shell = DockerSSHBox(
 ...
 pexpect.pxssh.ExceptionPxssh: Could not establish connection to host
@@ -62,17 +63,19 @@ especially Windows, this seems to fail.

 ### Workarounds

- Restart your computer (sometimes works?)
- Be sure to have the latest versions of WSL and Docker
- Try [this reinstallation guide](https://github.com/OpenDevin/OpenDevin/issues/1156#issuecomment-2064549427)
- Set `-e SANDBOX_TYPE=exec` to switch to the ExecBox docker container
+* Restart your computer (sometimes it does work)
+* Be sure to have the latest versions of WSL and Docker
+* Check that your distribution in WSL is up to date as well
+* Try [this reinstallation guide](https://github.com/OpenDevin/OpenDevin/issues/1156#issuecomment-2064549427)
+* Set `-e SANDBOX_TYPE=exec` to switch to the ExecBox docker container

 ## Unable to connect to LLM
+
 [GitHub Issue](https://github.com/OpenDevin/OpenDevin/issues/1208)

 ### Symptoms

-```
+```python
  File "/app/.venv/lib/python3.12/site-packages/openai/_exceptions.py", line 81, in __init__
    super().__init__(message, response.request, body=body)
                              ^^^^^^^^^^^^^^^^
@@ -83,18 +86,20 @@ AttributeError: 'NoneType' object has no attribute 'request'

 [GitHub Issues](https://github.com/OpenDevin/OpenDevin/issues?q=is%3Aissue+is%3Aopen+404)

-This usually happens with local LLM setups, when OpenDevin can't connect to the LLM server.
+This usually happens with *local* LLM setups, when OpenDevin can't connect to the LLM server.
 See our guide for [local LLMs](llms/localLLMs) for more information.

 ### Workarounds

- Check your `LLM_BASE_URL`
- Check that ollama is running OK
- Make sure you're using `--add-host host.docker.internal:host-gateway` when running in docker
+* Check your `base_url` in your config.toml (if it exists) under the "llm" section
+* Check that ollama (or whatever LLM you're using) is running OK
+* Make sure you're using `--add-host host.docker.internal:host-gateway` when running in Docker
+
+## `404 Resource not found`

-## 404 Resource not found
 ### Symptoms
-```
+
+```python
 Traceback (most recent call last):
  File "/app/.venv/lib/python3.12/site-packages/litellm/llms/openai.py", line 414, in completion
    raise e
@@ -119,18 +124,86 @@ openai.NotFoundError: Error code: 404 - {'error': {'code': '404', 'message': 'Re
 ```

 ### Details
+
 This happens when LiteLLM (our library for connecting to different LLM providers) can't find
-the API you're trying to connect to. Most often this happens for Azure or ollama users.
+the API endpoint you're trying to connect to. Most often this happens for Azure or ollama users.

 ### Workarounds
- Check that you've set `LLM_BASE_URL` properly
- Check that model is set properly, based on the [LiteLLM docs](https://docs.litellm.ai/docs/providers)
-  - If you're running inside the UI, be sure to set the `model` in the settings modal
-  - If you're running headless (via main.py) be sure to set `LLM_MODEL` in your env/config
- Make sure you've followed any special instructions for your LLM provider
-  - [ollama](/OpenDevin/modules/usage/llms/localLLMs)
-  - [Azure](/OpenDevin/modules/usage/llms/azureLLMs)
-  - [Google](/OpenDevin/modules/usage/llms/googleLLMs)
- Make sure your API key is correct
- See if you can connect to the LLM using `curl`
- Try [connecting via LiteLLM directly](https://github.com/BerriAI/litellm) to test your setup
+
+* Check that you've set `LLM_BASE_URL` properly
+* Check that model is set properly, based on the [LiteLLM docs](https://docs.litellm.ai/docs/providers)
+  * If you're running inside the UI, be sure to set the `model` in the settings modal
+  * If you're running headless (via main.py) be sure to set `LLM_MODEL` in your env/config
+* Make sure you've followed any special instructions for your LLM provider
+  * [ollama](/OpenDevin/modules/usage/llms/localLLMs)
+  * [Azure](/OpenDevin/modules/usage/llms/azureLLMs)
+  * [Google](/OpenDevin/modules/usage/llms/googleLLMs)
+* Make sure your API key is correct
+* See if you can connect to the LLM using `curl`
+* Try [connecting via LiteLLM directly](https://github.com/BerriAI/litellm) to test your setup
+
+## `make build` getting stuck on package installations
+
+### Symptoms
+
+Package installation stuck on `Pending...` without any error message:
+
+```bash
+Package operations: 286 installs, 0 updates, 0 removals
+
+  - Installing certifi (2024.2.2): Pending...
+  - Installing h11 (0.14.0): Pending...
+  - Installing idna (3.7): Pending...
+  - Installing sniffio (1.3.1): Pending...
+  - Installing typing-extensions (4.11.0): Pending...
+```
+
+### Details
+
+In rare cases, `make build` can seemingly get stuck on package installations
+without any error message.
+
+### Workarounds
+
+* The package installer Poetry may miss a configuration setting for
+where credentials are to be looked up (keyring).
+
+### Workaround
+
+First check with `env` if a value for `PYTHON_KEYRING_BACKEND` exists.
+If not, run the below command to set it to a known value and retry the build:
+
+```bash
+export PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring
+```
+
+## Sessions are not restored
+
+### Symptoms
+
+OpenDevin usually asks whether to resume or start a new session when opening the UI.
+But clicking "Resume" still starts a fresh new chat.
+
+### Details
+
+With a standard installation as of today session data is stored in memory.
+Currently, if OpenDevin's service is restarted, previous sessions become
+invalid (a new secret is generated) and thus not recoverable.
+
+### Workarounds
+
+* Change configuration to make sessions persistent by editing the `config.toml`
+file (in OpenDevin's root folder) by specifying a `file_store` and an
+absolute `file_store_path`:
+
+```toml
+file_store="local"
+file_store_path="/absolute/path/to/opendevin/cache/directory"
+```
+
+* Add a fixed JWT secret in your .bashrc, like below, so that previous session IDs
+remain valid.
+
+```bash
+export JWT_SECRET=A_CONST_VALUE
+```
--- a/docs/static/img/screenshot.png
+++ b/docs/static/img/screenshot.png
--- a/evaluation/EDA/game.py
+++ b/evaluation/EDA/game.py
@@ -181,7 +181,7 @@ class Q20GameCelebrity(Q20Game):
        user_messages = [
            {
                'role': 'system',
-                'content': f'Based on on your knowledge about the celebrity: {self.item}, '
+                'content': f'Based on your knowledge about the celebrity: {self.item}, '
                f'respond to the following question or guess. '
                f"Limit your respond to only 'Yes.', 'No.' or 'Dunno.', with no explanation or other words. "
                f"Never say the name {self.item} in your response. Do not say 'Dunno.' if it can be answered by 'Yes.' or 'No.' "
--- a/evaluation/EDA/run_infer.py
+++ b/evaluation/EDA/run_infer.py
@@ -45,7 +45,7 @@ def codeact_user_response(state: State) -> str:
    msg = game.generate_user_response(model_guess)
    game.curr_turn += 1
    logger.info(f'Model guess: {model_guess}')
-    logger.info(f'Anwser response: {msg}')
+    logger.info(f'Answer response: {msg}')
    if 'bingo!' in msg.lower():
        return '/exit'
    return msg
@@ -65,7 +65,9 @@ AGENT_CLS_TO_INST_SUFFIX = {
 }


-def process_instance(instance, agent_class, metadata, reset_logger: bool = True):
+def process_instance(
+    instance, agent_class, metadata, openai_api_key, reset_logger: bool = True
+):
    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    eval_output_dir = metadata['eval_output_dir']
    if reset_logger:
@@ -107,7 +109,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
        answerer_model=metadata['answerer_model'],
        guesser_model=None,
        num_turns=metadata['max_iterations'],
-        openai_api_key=metadata['openai_api'],
+        openai_api_key=openai_api_key,
        guesser_kargs=guesser_kargs,
    )

@@ -234,7 +236,6 @@ if __name__ == '__main__':
        'data_split': args.data_split,
        'answerer_model': args.answerer_model,
        'agent_class': agent_class,
-        'openai_api': args.OPENAI_API_KEY,
        'model_name': model_name,
        'max_iterations': max_iterations,
        'eval_output_dir': eval_output_dir,
@@ -317,6 +318,7 @@ if __name__ == '__main__':
                    instance,
                    agent_class,
                    metadata,
+                    args.OPENAI_API_KEY,
                    reset_logger=bool(num_workers > 1),
                )
                future.add_done_callback(update_progress)
--- a/evaluation/agent_bench/run_infer.py
+++ b/evaluation/agent_bench/run_infer.py
@@ -109,7 +109,7 @@ def process_instance(
        # add back the console handler to print ONE line
        logger.addHandler(get_console_handler())
        logger.info(
-            f'Starting evaluation for instance {inst_id}.\nHint: run "tail -f {log_file}" to see live logs in a seperate shell'
+            f'Starting evaluation for instance {inst_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
--- a/evaluation/biocoder/README.md
+++ b/evaluation/biocoder/README.md
@@ -0,0 +1,59 @@
+# BioCoder Evaluation with OpenDevin
+
+Implements evaluation of agents on BioCoder from the BioCoder benchmark introduced in [BioCoder: A Benchmark for Bioinformatics Code Generation with Large Language Models](https://arxiv.org/abs/2308.16458). Please see [here](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/tasks/humanevalpack.py) for the reference implementation used in the paper.
+
+## Setup Environment
+
+Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
+
+
+## Configure OpenDevin and your LLM
+Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md) for how to set this up.
+
+## BioCoder Docker Image
+In the opendevin branch of the Biocoder repository, we have slightly modified our original Docker image to work with the OpenDevin environment. In the Docker image are testing scripts (`/testing/start_test_opendevin.py` and aux files in `/testing_files/`) to assist with evaluation. Additionally, we have installed all dependencies, including OpenJDK, mamba (with Python 3.6), and many system libraries. Notably, we have **not** packaged all repositories into the image, so they are downloaded at runtime.
+
+**Before first execution, pull our Docker image with the following command**
+```bash
+docker pull public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0
+```
+
+To reproduce this image, please see the Dockerfile_Opendevin in the `biocoder` repository.
+
+## Start the evaluation
+
+
+```bash
+./evaluation/biocoder/scripts/run_infer.sh [model_config] [agent] [eval_limit]
+```
+
+where `model_config` is mandatory, while `agent` and `eval_limit` are optional.
+
+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
+LLM settings, as defined in your `config.toml`.
+
+- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
+to `CodeActAgent`.
+
+- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default it infers all instances.
+
+Let's say you'd like to run 1 instance using `eval_gpt4o_2024_05_13` and CodeActAgent,
+then your command would be:
+
+## Examples
+
+```bash
+./evaluation/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 CodeActAgent 1
+```
+
+## Reference
+```
+@misc{tang2024biocoder,
+      title={BioCoder: A Benchmark for Bioinformatics Code Generation with Large Language Models},
+      author={Xiangru Tang and Bill Qian and Rick Gao and Jiakang Chen and Xinyun Chen and Mark Gerstein},
+      year={2024},
+      eprint={2308.16458},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG}
+}
+```
--- a/evaluation/biocoder/biocoder_env_box.py
+++ b/evaluation/biocoder/biocoder_env_box.py
@@ -0,0 +1,396 @@
+import json
+import os
+import re
+import sys
+from collections import defaultdict
+from dataclasses import dataclass
+
+from datasets import load_dataset
+
+from opendevin.core.config import config
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.runtime.docker.ssh_box import DockerSSHBox
+from opendevin.runtime.plugins import (
+    JupyterRequirement,
+    PluginRequirement,
+    SWEAgentCommandsRequirement,
+)
+
+BIOCODER_BENCH_CONTAINER_IMAGE = 'public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0'
+
+
+@dataclass
+class BiocoderData:
+    filePath: str
+    numLines: int
+    lineStart: int
+    lineEnd: int
+    signature: str
+    comment: str
+    content: str
+    repository: str
+    promptSummaryOnly: str
+    contextCode: str
+    goldenCode: str
+    test_case_id: str
+    language: str
+
+    def to_dict(self):
+        return {
+            'filePath': self.filePath,
+            'numLines': self.numLines,
+            'lineStart': self.lineStart,
+            'lineEnd': self.lineEnd,
+            'signature': self.signature,
+            'comment': self.comment,
+            'content': self.content,
+            'repository': self.repository,
+            'promptSummaryOnly': self.promptSummaryOnly,
+            'contextCode': self.contextCode,
+            'goldenCode': self.goldenCode,
+            'test_case_id': self.test_case_id,
+            'language': self.language,
+        }
+
+
+def get_likely_indent_size(array_of_tabs) -> int:
+    sizes = defaultdict(int)
+
+    for i in range(len(array_of_tabs) - 1):
+        diff = array_of_tabs[i + 1] - array_of_tabs[i]
+        if diff > 0:
+            sizes[diff] += 1
+    if len(sizes) == 0:
+        return 4
+    return int(max(sizes, key=sizes.get))
+
+
+class BiocoderSSHBox(DockerSSHBox):
+    def __init__(
+        self,
+        container_image: str,
+        timeout: int = 120,
+        sid: str | None = None,
+        biocoder_instance_id: str | None = None,
+        biocoder_instance: BiocoderData | None = None,
+        skip_workspace_mount: bool = True,
+        sandbox_plugins: list[PluginRequirement] = [],  # noqa: B006
+        biocoder_cache_folder: str = 'biocoder_cache',
+        workspace_dir_name: str | None = None,
+    ):
+        """Record the BioCoder instance settings, then start the underlying SSH box.
+
+        Args:
+            container_image: image handed to ``DockerSSHBox``; must not be None.
+            timeout: forwarded to ``DockerSSHBox``.
+            sid: forwarded to ``DockerSSHBox``.
+            biocoder_instance_id: identifier of the instance; required.
+            biocoder_instance: the benchmark record this box serves.
+            skip_workspace_mount: when true, the ``volumes`` property drops the
+                workspace bind.
+            sandbox_plugins: plugins passed to ``init_plugins`` after start-up.
+            biocoder_cache_folder: folder name used when copying files to
+                ``/testing_files``.
+            workspace_dir_name: stored on the instance as-is.
+
+        Raises:
+            ValueError: if ``biocoder_instance_id`` is None.
+        """
+        if biocoder_instance_id is None:
+            raise ValueError('biocoder_instance_id must be provided')
+        self.biocoder_instance_id = biocoder_instance_id
+        self.biocoder_instance = biocoder_instance
+        self.skip_workspace_mount = skip_workspace_mount
+        self.biocoder_cache_folder = biocoder_cache_folder
+        # set by remove_code(); read by get_changed_code() as its stop marker
+        self.first_line_after_removed = None
+        self.workspace_dir_name = workspace_dir_name
+        # snapshot of the global config values at construction time
+        self.workspace_base = config.workspace_base
+        self.workspace_mount_path = config.workspace_mount_path
+        # self.workspace_dir_name_host = os.path.join(config.workspace_base, workspace_dir_name)
+
+        # filled in by get_box_for_instance() after construction
+        self.context_path = None
+        self.generated_path = None
+        self.golden_path = None
+
+        assert (
+            container_image is not None
+        ), 'container_image is required for BiocoderBenchSSHBox!'
+        # attributes above are assigned before the parent constructor runs
+        super().__init__(container_image, timeout, sid)
+        self.init_plugins(sandbox_plugins)
+
+    @property
+    def volumes(self):
+        if self.skip_workspace_mount:
+            return {
+                k: v
+                for k, v in super().volumes.items()
+                if not v['bind'] == self.sandbox_workspace_dir
+            }
+        return super().volumes
+
+    def get_target_filepath(self):
+        target_filepath = os.path.join(
+            self.workspace_mount_path,
+            self.biocoder_instance.repository.split('/')[1],
+            self.biocoder_instance.filePath,
+        )
+        return target_filepath
+
+    def get_changed_code(self, include_signature=False):
+        # copies changed code into /testing_files/
+        # Note that this does NOT copy the function signature
+        target_filepath = self.get_target_filepath()
+        selected_lines = []
+        offset = 1 if include_signature else 0
+        if self.first_line_after_removed is None:
+            logger.warning('First line after removed is None')
+        with open(target_filepath, 'r') as f:
+            lines = f.read().split('\n')
+            for i in range(self.biocoder_instance.lineStart - offset, len(lines)):
+                if lines[i].strip() == self.first_line_after_removed.strip():
+                    break
+                selected_lines.append(lines[i])
+        text = '\n'.join(selected_lines)
+        return text
+
+    def copy_changed_code(self):
+        changed_code = self.get_changed_code(include_signature=True)
+        with open(self.generated_path, 'w') as f:
+            f.write(changed_code)
+        exit_code, output = self.execute_and_check(
+            f'cp -r /workspace/{self.biocoder_cache_folder}/* /testing_files',
+            'Failed to copy the files',
+        )
+
+    def remove_code(self):
+        comment_prefix = {'python': '#', 'java': '//'}
+
+        target_filepath = self.get_target_filepath()
+        line_start = self.biocoder_instance.lineStart
+        line_end = self.biocoder_instance.lineEnd
+        with open(target_filepath, 'r') as f:
+            lines = f.read().split('\n')
+            # print("="*10+"ORIGINAL"+"="*10)
+            # print("\n".join(lines))
+            signature_line = lines[line_start - 1]
+
+            # get the number of tabs
+            def get_indent_size(s: str):
+                return len(re.match(r'\s*', s).group())
+
+            indent_sizes = list(map(get_indent_size, lines))
+            indent_size = get_likely_indent_size(indent_sizes)
+            comment_indent_size = get_indent_size(signature_line) + indent_size
+            lines = (
+                lines[:line_start]
+                + [
+                    f"{' '*comment_indent_size+comment_prefix[self.biocoder_instance.language.lower()]}TODO: replace with your code here"
+                ]
+                + ([''] * 2)
+                + lines[line_end:]
+            )
+        first_line_after_removed_index = line_start
+        while len(
+            lines[first_line_after_removed_index].strip()
+        ) == 0 and first_line_after_removed_index < len(lines):
+            first_line_after_removed_index += 1
+        self.first_line_after_removed = lines[first_line_after_removed_index]
+        # print("FIRST LINE AFTER REMOVED: ", self.first_line_after_removed)
+
+        with open(target_filepath, 'w') as f:
+            f.write('\n'.join(lines))
+
+        # with open(target_filepath, 'r') as f:
+        #     print("="*10+"MODIFIED"+"="*10)
+        #     print(f.read())
+
+    def execute_and_check(self, cmd: str, error_msg: str) -> tuple[int, str]:
+        exit_code, output = self.execute(cmd)
+        if exit_code != 0:
+            logger.error(error_msg)
+            sys.exit(1)
+        return exit_code, output
+
+    @classmethod
+    def get_box_for_instance(
+        cls,
+        instance,
+        workspace_dir_name=None,
+        skip_workspace_mount: bool = False,
+        workspace_mount_path: str | None = None,
+        sandbox_plugins: list[PluginRequirement] = [],  # noqa: B006
+    ) -> 'BiocoderSSHBox':
+        """This method initializes a container image, then runs some initialization commands"""
+        if workspace_dir_name is None:
+            workspace_dir_name = f'{instance.repository}__{instance.test_case_id[:10]}__{os.getpid()}'.replace(
+                '/', '__'
+            )
+
+        workspace_base = str(os.path.join(config.workspace_base, workspace_dir_name))
+        old_workspace_base = config.workspace_base
+        old_workspace_mount_path = config.workspace_mount_path
+
+        try:
+            config.workspace_base = workspace_base
+            config.workspace_mount_path = workspace_base
+
+            # linting python after editing helps LLM fix indentations
+            config.enable_auto_lint = True
+
+            # create folder for transferring files back/forth
+            biocoder_cache_folder = 'biocoder_cache'
+            if not os.path.exists(os.path.join(workspace_base, biocoder_cache_folder)):
+                os.makedirs(
+                    os.path.join(workspace_base, biocoder_cache_folder), exist_ok=True
+                )
+
+            file_ext = {
+                'python': 'py',
+                'java': 'java',
+                'c': 'c',
+                'cpp': 'cpp',
+                'javascript': 'js',
+                'typescript': 'ts',
+            }[instance.language.lower()]
+
+            context_path = os.path.join(
+                workspace_base, biocoder_cache_folder, 'context.' + file_ext
+            )
+            generated_path = os.path.join(
+                workspace_base, biocoder_cache_folder, 'generated.' + file_ext
+            )
+            golden_path = os.path.join(
+                workspace_base, biocoder_cache_folder, 'golden.' + file_ext
+            )
+
+            # print(instance.contextCode)
+            with open(context_path, 'w') as f:
+                f.write(instance.contextCode)
+            with open(generated_path, 'w') as f:
+                f.write(instance.goldenCode)
+            with open(golden_path, 'w') as f:
+                f.write(instance.goldenCode)
+
+            testcase_json = {
+                'test_case_id': instance.test_case_id,
+                'num_cases': 1000,
+                'language': instance.language.lower(),
+            }
+
+            with open(
+                os.path.join(
+                    workspace_base, biocoder_cache_folder, 'testcase_biocoder.json'
+                ),
+                'w',
+            ) as f:
+                f.write(json.dumps(testcase_json, indent=4))
+
+            # linting python after editing helps LLM fix indentations
+            config.enable_auto_lint = True
+
+            sandbox = cls(
+                container_image=BIOCODER_BENCH_CONTAINER_IMAGE,
+                biocoder_instance_id=instance.test_case_id,
+                biocoder_instance=instance,
+                skip_workspace_mount=skip_workspace_mount,
+                sandbox_plugins=sandbox_plugins,
+                biocoder_cache_folder=biocoder_cache_folder,
+                workspace_dir_name=workspace_dir_name,
+            )
+        except Exception:
+            raise
+        finally:
+            config.workspace_base = old_workspace_base
+            config.workspace_mount_path = old_workspace_mount_path
+
+        sandbox.context_path = context_path
+        sandbox.generated_path = generated_path
+        sandbox.golden_path = golden_path
+
+        logger.info(f'SSH box started for instance {instance.test_case_id}.')
+        # cd to the workspace
+        exit_code, output = sandbox.execute_and_check(
+            'cd /workspace', 'Failed to cd to workspace'
+        )
+        logger.info(f'cd to workspace: {output}')
+
+        # download repository archive
+        repository_url = f"https://biocoder.lilbillbiscuit.com/repos/{instance.repository.split('/')[1]}.zip"
+        exit_code, output = sandbox.execute_and_check(
+            'wget -O repo.zip ' + repository_url, 'Failed to download the repository'
+        )
+        logger.info(f'Downloaded the repository: {output}')
+        exit_code, output = sandbox.execute_and_check(
+            'unzip -o -q repo.zip', 'Failed to unzip the repository'
+        )
+        logger.info(f'Unzipped the repository: {output}')
+
+        # copy the context, generated and golden files to the /testing_files folder
+        exit_code, output = sandbox.execute_and_check(
+            f'cp -r /workspace/{biocoder_cache_folder}/* /testing_files',
+            'Failed to copy the files',
+        )
+
+        # chmod 777
+        exit_code, output = sandbox.execute_and_check(
+            'chmod -R 777 /workspace',
+            'Failed to chmod the files',
+        )
+
+        return sandbox
+
+
+# Manual smoke test: build a sandbox for the first test instance, run the test
+# script against the untouched golden code, then open an interactive shell.
+if __name__ == '__main__':
+    biocoder_dataset = load_dataset('Lilbillbiscuit/biocoder_public')
+    EXAMPLE_INSTANCE = biocoder_dataset['test'][0]
+    EXAMPLE_INSTANCE = BiocoderData(**EXAMPLE_INSTANCE)
+
+    sandbox = BiocoderSSHBox.get_box_for_instance(
+        instance=EXAMPLE_INSTANCE,
+        workspace_mount_path='/home/ubuntu/OpenDevinBioCoder/workspace',
+        skip_workspace_mount=False,
+        sandbox_plugins=[JupyterRequirement(), SWEAgentCommandsRequirement()],
+    )
+
+    # PRE TEST
+    exit_code, output = sandbox.execute_and_check(
+        'cd /testing',
+        'Failed to cd /testing',
+    )
+    logger.info(f'cd $REPO_PATH: {output}')
+
+    exit_code, output = sandbox.execute_and_check(
+        'whoami',
+        'Failed to run whoami',
+    )
+    logger.info(f'whoami: {output}')
+
+    # TEST
+    # get_box_for_instance seeds the generated file with the golden code, so
+    # the test script is expected to succeed here
+    exit_code, output = sandbox.execute(
+        '/home/devin/mambaforge/bin/mamba run -n test python3 /testing/start_test_opendevin.py'
+    )
+    assert exit_code == 0, 'Expected exit code 0 (this should have passed)'
+    logger.info(f'$TEST_CMD:\n{output}')
+
+    exit_code, output = sandbox.execute_and_check(
+        'cat /testing_files/results_biocoder.json', 'Failed to read the result file'
+    )
+
+    print(output)
+    json_obj = json.loads(output)
+    if json_obj['result'] == 'pass':
+        print('PASS')
+    else:
+        print('FAIL')
+
+    # background command used to exercise read_logs / kill_background below
+    bg_cmd = sandbox.execute_in_background(
+        "while true; do echo 'dot ' && sleep 10; done"
+    )
+
+    sys.stdout.flush()
+    # Interactive loop: 'exit' (or EOF / Ctrl-C) quits, 'kill' stops the
+    # background command, anything else is executed in the sandbox.
+    try:
+        while True:
+            try:
+                user_input = input('>>> ')
+            except EOFError:
+                logger.info('Exiting...')
+                break
+            if user_input.lower() == 'exit':
+                logger.info('Exiting...')
+                break
+            if user_input.lower() == 'kill':
+                sandbox.kill_background(bg_cmd.pid)
+                logger.info('Background process killed')
+                continue
+            exit_code, output = sandbox.execute(user_input)
+            logger.info('exit code: %d', exit_code)
+            logger.info(output)
+            if bg_cmd.pid in sandbox.background_commands:
+                logs = sandbox.read_logs(bg_cmd.pid)
+                logger.info('background logs: %s', logs)
+            sys.stdout.flush()
+    except KeyboardInterrupt:
+        logger.info('Exiting...')
+    sandbox.close()
--- a/evaluation/biocoder/run_infer.py
+++ b/evaluation/biocoder/run_infer.py
@@ -0,0 +1,393 @@
+import asyncio
+import json
+import logging
+import multiprocessing as mp
+import os
+import pathlib
+import subprocess
+import time
+from concurrent.futures import ProcessPoolExecutor
+
+import pandas as pd
+from datasets import load_dataset
+from tqdm import tqdm
+
+import agenthub
+from evaluation.biocoder.biocoder_env_box import BiocoderData, BiocoderSSHBox
+from opendevin.controller.state.state import State
+from opendevin.core.config import args, config, get_llm_config_arg
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import main
+from opendevin.events.action import MessageAction
+from opendevin.events.serialization.event import event_to_dict
+
+
+def cleanup():
+    print('Cleaning up child processes...')
+    for process in mp.active_children():
+        print(f'Terminating child process: {process.name}')
+        process.terminate()
+        process.join()
+
+
+def codeact_user_response(state: State) -> str:
+    msg = (
+        'Please continue working on the task on whatever approach you think is suitable.\n'
+        'If you think you have modified the code in a way that fixes the issue, please run the following command: <execute_bash> exit </execute_bash>.\n'
+        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
+    )
+    if state.history:
+        user_msgs = [
+            action
+            for action, _ in state.history
+            if isinstance(action, MessageAction) and action.source == 'user'
+        ]
+        if len(user_msgs) >= 2:
+            # let the agent know that it can give up when it has tried 3 times
+            return (
+                msg
+                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
+            )
+    return msg
+
+
+def monologue_user_response(state: State) -> str:
+    raise NotImplementedError('MonologueAgent should never ask for user responses.')
+
+
+# Agent class name -> function producing the simulated user reply.
+# The script entry point also uses the keys as the list of supported agents.
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+    'MonologueAgent': monologue_user_response,
+}
+
+# Agent class name -> extra text appended to the task instruction.
+# Agents without an entry get no suffix.
+AGENT_CLS_TO_INST_SUFFIX = {
+    'CodeActAgent': 'When you think you have fixed the issue through code changes, please run the following command: <execute_bash> exit </execute_bash>.\n'
+}
+
+
+def get_test_result(instance, sandbox, workspace_dir_name):
+    test_result = {'result': {}, 'metadata': {}}
+    try:
+        code = sandbox.get_changed_code(include_signature=True)
+        sandbox.copy_changed_code()
+        test_result['metadata']['1_copy_change_success'] = True
+        test_result['metadata']['1_copy_change_code'] = code
+    except Exception:
+        logger.error('Error fetching changed code for this instance')
+        test_result['metadata']['1_copy_change_success'] = False
+        test_result['metadata']['1_copy_change_code'] = None
+
+    exit_code, output = sandbox.execute_and_check(
+        'cd /testing',
+        'Failed to cd /testing',
+    )
+    logger.info(f'cd $REPO_PATH: {output}')
+
+    exit_code, output = sandbox.execute_and_check(
+        'whoami',
+        'Failed to run whoami',
+    )
+    logger.info(f'whoami: {output}')
+
+    exit_code, output = sandbox.execute(
+        '/home/devin/mambaforge/bin/mamba run -n test python3 /testing/start_test_opendevin.py'
+    )
+    logger.info(f'$TEST_CMD:\n{output}')
+
+    exit_code, output = sandbox.execute_and_check(
+        'cat /testing_files/results_biocoder.json', 'Failed to read the result file'
+    )
+    if exit_code == 0:
+        test_result['metadata']['2_run_test_success'] = True
+        test_result['metadata']['2_run_test_result'] = str(output)
+    else:
+        test_result['metadata']['2_run_test_success'] = False
+        test_result['metadata']['2_run_test_result'] = str(output)
+    json_obj = json.loads(output)
+    test_result['result'] = json_obj['result']
+
+    return test_result
+
+
+def process_instance(
+    instance,
+    agent_class,
+    metadata,
+    skip_workspace_mount,
+    eval_output_dir,
+    reset_logger: bool = True,
+):
+    instance = BiocoderData(**instance)
+    print(instance)
+    workspace_dir_name = (
+        f'{instance.repository}__{instance.test_case_id[:10]}__{os.getpid()}'.replace(
+            '/', '__'
+        )
+    )
+    workspace_mount_path = os.path.join(config.workspace_base, workspace_dir_name)
+    # create process-specific workspace dir
+    # if `not skip_workspace_mount` - we will create a workspace directory for EACH process
+    # so that different agent don't interfere with each other.
+    if not skip_workspace_mount:
+        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
+        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+
+    # Setup the logger properly, so you can run multi-processing to parallize the evaluation
+    if reset_logger:
+        # Set up logger
+        log_file = os.path.join(
+            eval_output_dir, 'logs', f'instance_{instance.test_case_id}.log'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        # add back the console handler to print ONE line
+        logger.addHandler(get_console_handler())
+        logger.info(
+            f'Starting evaluation for instance {instance.test_case_id}.\nHint: run "tail -f {log_file}" to see live logs in a seperate shell'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setFormatter(
+            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        )
+        logger.addHandler(file_handler)
+
+    if not skip_workspace_mount:
+        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+
+    # NOTE: this is something special we do for SWE-Bench due to the reason described in the previous section
+    # You can omit this if you don't need to setup specialized sandbox
+    workspace_dir_name = f'{instance.repository}__{instance.test_case_id[:10]}'.replace(
+        '/', '__'
+    )
+    sandbox = BiocoderSSHBox.get_box_for_instance(
+        instance,
+        workspace_dir_name,
+        skip_workspace_mount=False,
+        workspace_mount_path=workspace_mount_path,
+        sandbox_plugins=agenthub.Agent.get_cls(agent_class).sandbox_plugins,
+    )
+
+    sandbox.remove_code()
+
+    # Prepare instruction
+    instruction = (
+        f'Please complete the function "{instance.signature}" in the file /workspace/{instance.repository.split("/")[1]}/{instance.filePath}.\n'
+        f'The environment has been set up for you to start working. You may assume all necessary tools are installed.\n'
+        f'To complete the task, you must directly modify the file and fill in the function, keeping in mind that the function signature is on line {instance.lineStart-1}\n\n'
+        f'The function should do the following:\n'
+        f'{instance.promptSummaryOnly}\n\n'
+    )
+
+    instruction += (
+        'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
+        'You should NOT modify any other files other than the file intended. This means that you should NOT write any test cases.\n'
+        'You may need context from other files in the repository to complete this task.'
+        'Do NOT add any import statements or change anything else other than the writing the function body.\n'
+        'You do not need to run the code to check if it works. \n'
+        'Make sure to include proper formatting in Java and Python, including correct braces and/or indentation.\n'
+    )
+
+    # instruction = (
+    #     f'In the file {instance.filePath}, there is a function with a signature and without a body. Your job is to complete the function, according to the given instructions. When you complete the function, respond with the function body, and nothing else.'
+    #     'The repository has cloned for you to start working. You are not allowed to run any bash commands, just modify the files. \n\n'
+    #     '# Problem Statement\n'
+    #     'Complete the following function signature:\n\n'
+    #     f'{instance.signature}'
+    #     'The function should do the following:\n\n'
+    #     f'{instance.promptSummaryOnly}\n\n'
+    # )
+    #
+    # instruction += (
+    #     'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
+    #     'You should NOT modify any other files other than the file intended. This means that you should NOT write any test cases.\n'
+    #     'Do NOT add any import statements or change anything else other than the writing the function body.\n'
+    #     'You do not need to run the code to check if it works. The system will automatically check the correctness of your code.\n'
+    #     'Make sure to include proper formatting in Java and Python, including correct braces and/or indentation.\n'
+    # )
+
+    # NOTE: You can actually set slightly different instruction for different agents
+    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+
+    # Here's how you can run the agent (similar to the `main` function) and get the final task state
+    state: State = asyncio.run(
+        main(
+            instruction,
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
+            sandbox=sandbox,
+        )
+    )
+
+    test_result = get_test_result(instance, sandbox, workspace_dir_name)
+
+    if state is None:
+        raise ValueError('State should not be None.')
+    metrics = state.metrics.get() if state.metrics else None
+
+    # Save the output
+    output = {
+        'test_case_id': instance.test_case_id,
+        'biocoder_instance': instance.to_dict(),
+        'instruction': instruction,
+        'generated': test_result['metadata']['1_copy_change_code'],
+        'metadata': metadata,
+        'history': [
+            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
+        ],
+        'metrics': metrics,
+        'error': state.error if state and state.error else None,
+        'test_result': test_result,
+    }
+
+    # Close the sandbox
+    sandbox.close()
+    return output
+
+
+# Script entry point: load the dataset, prepare the output directory, skip
+# instances already present in output.jsonl, then evaluate the rest in a
+# process pool, appending one JSON line per finished instance.
+if __name__ == '__main__':
+    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
+    # so we don't need to manage file uploading to OpenDevin's repo
+    dataset = load_dataset('lilbillbiscuit/biocoder_public')
+    biocoder_tests = dataset['test'].to_pandas()
+
+    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
+    # for details of how to set `llm_config`
+    if args.llm_config:
+        specified_llm_config = get_llm_config_arg(args.llm_config)
+        if specified_llm_config:
+            config.llm = specified_llm_config
+    logger.info(f'Config for evaluation: {config}')
+
+    # TEST METADATA
+    agent_class = args.agent_cls
+    assert (
+        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
+    ), f'Unsupported agent class: {agent_class}'
+    # keep only the part after the last '/' of the model identifier
+    model_name = config.llm.model.split('/')[-1]
+    max_iterations = args.max_iterations
+    eval_note = ''
+    if args.eval_note is not None:
+        eval_note += '_N_' + args.eval_note
+    # <eval_output_dir>/biocoder/<agent>/<model>_maxiter_<n>[_N_<note>]
+    eval_output_dir = os.path.join(
+        args.eval_output_dir,
+        'biocoder',
+        agent_class,
+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
+    )
+
+    eval_output_dir = str(eval_output_dir)
+
+    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
+        parents=True, exist_ok=True
+    )
+    logger.info(f'Using evaluation output directory: {eval_output_dir}')
+
+    metadata = {
+        'agent_class': agent_class,
+        'model_name': model_name,
+        'max_iterations': max_iterations,
+        'eval_output_dir': eval_output_dir,
+        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        # get the commit id of current repo for reproduciblity
+        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+        .decode('utf-8')
+        .strip(),
+    }
+    logger.info(f'Metadata: {metadata}')
+    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
+        json.dump(metadata, f)
+
+    # LIMIT EVALUATION
+    eval_n_limit = args.eval_n_limit
+    if eval_n_limit:
+        biocoder_tests = biocoder_tests.head(eval_n_limit)
+        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
+
+    # OUTPUT FILE
+    output_file = os.path.join(eval_output_dir, 'output.jsonl')
+    logger.info(f'Writing evaluation output to {output_file}')
+    # test_case_ids found in an existing output file are treated as done
+    finished_test_case_ids = set()
+    if os.path.exists(output_file):
+        with open(output_file, 'r') as f:
+            for line in f:
+                data = json.loads(line)
+                finished_test_case_ids.add(data['test_case_id'])
+        logger.warning(
+            f'Output file {output_file} already exists. Loaded {len(finished_test_case_ids)} finished instances.'
+        )
+    # opened in append mode so earlier results are kept; closed at the end of the script
+    output_fp = open(output_file, 'a')
+
+    logger.info(
+        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
+    )
+
+    # =============================================
+    # filter out finished instances
+    new_biocoder_tests = []
+    for idx, instance in biocoder_tests.iterrows():
+        if instance.test_case_id in finished_test_case_ids:
+            logger.info(
+                f'Skipping instance {instance.test_case_id} as it is already finished.'
+            )
+            continue
+        new_biocoder_tests.append(instance)
+
+    biocoder_tests = pd.DataFrame(new_biocoder_tests)
+    logger.info(
+        f'Finished instances: {len(finished_test_case_ids)}, Remaining instances: {len(biocoder_tests)}'
+    )
+    # =============================================
+
+    pbar = tqdm(total=len(biocoder_tests))
+
+    # This function tracks the progress AND write the output to a JSONL file
+    def update_progress(future):
+        pbar.update(1)
+        # future.result() re-raises any exception from the worker
+        output = future.result()
+        pbar.set_description(f'Instance {output["test_case_id"]}')
+        pbar.set_postfix_str(f'Test Result: {output["test_result"]}')
+        logger.info(
+            f'Finished evaluation for instance {output["test_case_id"]}: {output["test_result"]}'
+        )
+        output_fp.write(json.dumps(output) + '\n')
+        # flush after every record so finished instances are on disk immediately
+        output_fp.flush()
+
+    # This sets the multi-processing
+    num_workers = args.eval_num_workers
+    logger.info(f'Using {num_workers} workers for evaluation.')
+
+    # This is SWE-Bench specific - CodeActAgent doesn't require mounted workspace to work
+    skip_workspace_mount = agent_class == 'CodeActAgent'
+    logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
+
+    try:
+        with ProcessPoolExecutor(num_workers) as executor:
+            futures = []
+            # This is how we perform multi-processing
+            for row_idx, instance in biocoder_tests.iterrows():
+                future = executor.submit(
+                    process_instance,
+                    instance,
+                    agent_class,
+                    metadata,
+                    skip_workspace_mount,
+                    eval_output_dir,
+                    # per-instance log files are only set up when running in parallel
+                    reset_logger=bool(num_workers > 1),
+                )
+                future.add_done_callback(update_progress)
+                futures.append(future)
+
+            # Wait for all futures to complete
+            for future in futures:
+                future.result()
+    except KeyboardInterrupt:
+        print('KeyboardInterrupt received. Cleaning up...')
+        cleanup()
+
+    output_fp.close()
+    logger.info('Evaluation finished.')
--- a/evaluation/biocoder/scripts/run_infer.sh
+++ b/evaluation/biocoder/scripts/run_infer.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+MODEL_CONFIG=$1
+AGENT=$2
+EVAL_LIMIT=$3
+DATASET="biocoder"
+
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
+# We need to track the version of Agent in the evaluation to make sure results are comparable
+AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+echo "DATASET: $DATASET"
+
+COMMAND="poetry run python evaluation/biocoder/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --max-iterations 10 \
+  --max-chars 10000000 \
+  --eval-num-workers 1 \
+  --eval-note ${AGENT_VERSION}_${DATASET}"
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+# Run the command
+echo $COMMAND
+eval $COMMAND
--- a/evaluation/gorilla/README.md
+++ b/evaluation/gorilla/README.md
@@ -0,0 +1,41 @@
+# Gorilla APIBench Evaluation with OpenDevin
+
+This folder contains the evaluation harness we built on top of the original [Gorilla APIBench](https://github.com/ShishirPatil/gorilla) ([paper](https://arxiv.org/pdf/2305.15334)).
+
+## Setup Environment
+
+Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
+
+## Configure OpenDevin and your LLM
+
+Run `make setup-config` to set up the `config.toml` file if it does not exist at the root of the workspace.
+
+## Run Inference on APIBench Instances
+
+Make sure your Docker daemon is running, then run this bash script:
+
+```bash
+bash evaluation/gorilla/scripts/run_infer.sh [model_config] [agent] [eval_limit] [hubs]
+```
+
+where `model_config` is mandatory, while all other arguments are optional.
+
+`model_config`, e.g. `llm`, is the config group name for your
+LLM settings, as defined in your `config.toml`.
+
+`agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
+to `CodeActAgent`.
+
+`eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances.
+By default, the script evaluates 1 instance.
+
+`hubs`, e.g. `hf,torch`, selects which APIBench hubs to evaluate, as a comma-separated list. You can choose one or more of `torch` (or `th`, its abbreviation), `hf` (short for huggingface), and `tf` (short for tensorflow). The default is `hf,torch,tf`.
+
+Note: in order to use `eval_limit`, you must also set `agent`; in order to use `hubs`, you must also set `eval_limit`.
+
+Let's say you'd like to run 10 instances using `llm` and CodeActAgent on `th` test,
+then your command would be:
+
+```bash
+bash evaluation/gorilla/scripts/run_infer.sh llm CodeActAgent 10 th
+```
--- a/evaluation/gorilla/ast_eval_hf.py
+++ b/evaluation/gorilla/ast_eval_hf.py
@@ -0,0 +1,127 @@
+# Copyright 2023 https://github.com/ShishirPatil/gorilla
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is modified from https://github.com/ShishirPatil/gorilla/blob/main/eval/eval-scripts/ast_eval_hf.py
+
+from tree_sitter import Language, Parser
+
+
+# Get all the subtrees given a root_node
+def get_all_sub_trees(root_node):
+    node_stack = []
+    sub_tree_sexp_list = []
+    depth = 1
+    # text = root_node.text
+    node_stack.append([root_node, depth])
+    while len(node_stack) != 0:
+        cur_node, cur_depth = node_stack.pop()
+        if cur_node.child_count > 0:
+            sub_tree_sexp_list.append(
+                [cur_node.sexp(), cur_depth, cur_node, cur_node.children[0].text]
+            )
+        else:
+            sub_tree_sexp_list.append([cur_node.sexp(), cur_depth, cur_node, None])
+        for child_node in cur_node.children:
+            if len(child_node.children) != 0:
+                depth = cur_depth + 1
+                node_stack.append([child_node, depth])
+    return sub_tree_sexp_list
+
+
+# Parse the program into AST trees
+def ast_parse(candidate, lang='python'):
+    LANGUAGE = Language('evaluation/gorilla/my-languages.so', lang)
+    parser = Parser()
+    parser.set_language(LANGUAGE)
+
+    candidate_tree = parser.parse(bytes(candidate, 'utf8')).root_node
+    return candidate_tree
+
+
+# Get all the arguments in the ast tree
+def get_args(node):
+    if node.child_count == 0:
+        return []
+    args_list = []
+    for child in node.children[0].children[0].children[1].children:
+        if '=' in child.text.decode():
+            args_list.append(child.children[2].text)
+        elif (
+            child.text.decode() != '('
+            and child.text.decode() != ')'
+            and child.text.decode() != ','
+        ):
+            args_list.append(child.text)
+    return args_list
+
+
+# Check if there is an api match
+def ast_check(candidate_subtree_list, base_tree_list):
+    """Return the index of the first reference tree matched by the candidate, or -1.
+
+    Args:
+        candidate_subtree_list: entries indexed as ``[_, _, node, first_child_text]``;
+            only positions 2 and 3 are read here.
+        base_tree_list: reference trees, one per API in the database.
+
+    A reference tree matches when every argument returned by ``get_args`` for
+    it (with surrounding single quotes stripped) occurs as a substring of the
+    selected candidate node's text.
+    """
+    for idx, base_tree in enumerate(base_tree_list):
+        # Skip reference trees whose inner node has no children to name an API
+        if base_tree.children[0].children[0].child_count == 0:
+            continue
+        api_name = base_tree.children[0].children[0].children[0].text
+        # NOTE(review): there is no `else` on this loop, so when no entry matches
+        # api_name the variable is left bound to the LAST entry of the list and
+        # the argument check below runs against that entry.
+        for candidate_tree in candidate_subtree_list:
+            if candidate_tree[3] == api_name:
+                break
+        # Now we have a sub-tree
+        candidate_tree = candidate_tree[2]
+        args_list = get_args(base_tree)
+        # A reference call without arguments can never be matched
+        if len(args_list) == 0:
+            continue
+        ast_match = True
+        for arg in args_list:
+            if arg.decode().lstrip("'").rstrip("'") not in candidate_tree.text.decode():
+                ast_match = False
+                break
+        if ast_match:
+            return idx
+    return -1
+
+
+def ast_eval_hf(api_database, qa_pairs, ast_database, question_id, response):
+    # Check correctness
+    correct = False
+    hallucination = False
+    output = response
+    # Index the "api_call" domain
+    output = output.split('api_call')
+    if len(output) == 1:
+        api_call = output[0]
+    else:
+        # Parse the output
+        output = output[1].split('api_provider')[0]
+        if ':' not in output:
+            start = 0
+        else:
+            start = output.index(':')
+        if ')' not in output:
+            end = -2
+        else:
+            end = output.rindex(')')
+        api_call = output[start + 2 : end + 1]
+    # Parse the api_call into AST tree
+    ast_tree = ast_parse(api_call)
+    # Search for a subtree
+    ast_subtree_list = get_all_sub_trees(ast_tree)
+    # Check which ast tree is matching
+    database_index = ast_check(ast_subtree_list, ast_database)
+    # We cannot index this ast in our database
+    if database_index == -1:
+        hallucination = True
+    # We index our reference api_call
+    ref_api_call = api_database[database_index]
+    # Check for functionality
+    if ref_api_call['domain'] == qa_pairs[question_id - 1]['domain']:
+        correct = True
+    return correct, hallucination
--- a/evaluation/gorilla/ast_eval_tf.py
+++ b/evaluation/gorilla/ast_eval_tf.py
@@ -0,0 +1,127 @@
+# Copyright 2023 https://github.com/ShishirPatil/gorilla
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is modified from https://github.com/ShishirPatil/gorilla/blob/main/eval/eval-scripts/ast_eval_tf.py
+
+from tree_sitter import Language, Parser
+
+
+# Get all the subtrees given a root_node
+def get_all_sub_trees(root_node):
+    node_stack = []
+    sub_tree_sexp_list = []
+    depth = 1
+    # text = root_node.text
+    node_stack.append([root_node, depth])
+    while len(node_stack) != 0:
+        cur_node, cur_depth = node_stack.pop()
+        if cur_node.child_count > 0:
+            sub_tree_sexp_list.append(
+                [cur_node.sexp(), cur_depth, cur_node, cur_node.children[0].text]
+            )
+        else:
+            sub_tree_sexp_list.append([cur_node.sexp(), cur_depth, cur_node, None])
+        for child_node in cur_node.children:
+            if len(child_node.children) != 0:
+                depth = cur_depth + 1
+                node_stack.append([child_node, depth])
+    return sub_tree_sexp_list
+
+
+# Parse the program into AST trees
+def ast_parse(candidate, lang='python'):
+    LANGUAGE = Language('evaluation/gorilla/my-languages.so', lang)
+    parser = Parser()
+    parser.set_language(LANGUAGE)
+
+    candidate_tree = parser.parse(bytes(candidate, 'utf8')).root_node
+    return candidate_tree
+
+
+# Get all the arguments in the ast tree
+def get_args(node):
+    if node.child_count == 0:
+        return []
+    args_list = []
+    for child in node.children[0].children[0].children[1].children:
+        if 'model=' in child.text.decode() or 'model =' in child.text.decode():
+            args_list.append(child.children[2].text)
+        elif (
+            child.text.decode() != '('
+            and child.text.decode() != ')'
+            and child.text.decode() != ','
+        ):
+            args_list.append(child.text)
+    return args_list
+
+
+# Check if there is an api match
+def ast_check(candidate_subtree_list, base_tree_list):
+    """Return the index of the first reference tree matched by the candidate, or -1.
+
+    Args:
+        candidate_subtree_list: entries indexed as ``[_, _, node, first_child_text]``;
+            only positions 2 and 3 are read here.
+        base_tree_list: reference trees, one per API in the database.
+
+    A reference tree matches when every argument returned by ``get_args`` for
+    it (with surrounding single quotes stripped) occurs as a substring of the
+    selected candidate node's text.
+    """
+    for idx, base_tree in enumerate(base_tree_list):
+        # Skip reference trees whose inner node has no children to name an API
+        if base_tree.children[0].children[0].child_count == 0:
+            continue
+        api_name = base_tree.children[0].children[0].children[0].text
+        # NOTE(review): there is no `else` on this loop, so when no entry matches
+        # api_name the variable is left bound to the LAST entry of the list and
+        # the argument check below runs against that entry.
+        for candidate_tree in candidate_subtree_list:
+            if candidate_tree[3] == api_name:
+                break
+        # Now we have a sub-tree
+        candidate_tree = candidate_tree[2]
+        args_list = get_args(base_tree)
+        # A reference call without arguments can never be matched
+        if len(args_list) == 0:
+            continue
+        ast_match = True
+        for arg in args_list:
+            if arg.decode().lstrip("'").rstrip("'") not in candidate_tree.text.decode():
+                ast_match = False
+                break
+        if ast_match:
+            return idx
+    return -1
+
+
+def ast_eval_tf(api_database, qa_pairs, ast_database, question_id, response):
+    # Check correctness
+    correct = False
+    hallucination = False
+    output = response
+    # Index the "api_call" domain
+    output = output.split('api_call')
+    if len(output) == 1:
+        api_call = output[0]
+    else:
+        # Parse the output
+        output = output[1].split('api_provider')[0]
+        if ':' not in output:
+            start = 0
+        else:
+            start = output.index(':')
+        if ')' not in output:
+            end = -2
+        else:
+            end = output.rindex(')')
+        api_call = output[start + 2 : end + 1]
+    # Parse the api_call into AST tree
+    ast_tree = ast_parse(api_call)
+    # Search for a subtree
+    ast_subtree_list = get_all_sub_trees(ast_tree)
+    # Check which ast tree is matching
+    database_index = ast_check(ast_subtree_list, ast_database)
+    # We cannot index this ast in our database
+    if database_index == -1:
+        hallucination = True
+    # We index our reference api_call
+    ref_api_call = api_database[database_index]
+    # Check for functionality
+    if ref_api_call['domain'] == qa_pairs[question_id - 1]['domain']:
+        correct = True
+    return correct, hallucination
--- a/evaluation/gorilla/ast_eval_th.py
+++ b/evaluation/gorilla/ast_eval_th.py
@@ -0,0 +1,123 @@
+# Copyright 2023 https://github.com/ShishirPatil/gorilla
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is modified from https://github.com/ShishirPatil/gorilla/blob/main/eval/eval-scripts/ast_eval_th.py
+
+from tree_sitter import Language, Parser
+
+
+# Get all the subtrees given a root_node
+def get_all_sub_trees(root_node):
+    node_stack = []
+    sub_tree_sexp_list = []
+    depth = 1
+    # text = root_node.text
+    node_stack.append([root_node, depth])
+    while len(node_stack) != 0:
+        cur_node, cur_depth = node_stack.pop()
+        if cur_node.child_count > 0:
+            sub_tree_sexp_list.append(
+                [cur_node.sexp(), cur_depth, cur_node, cur_node.children[0].text]
+            )
+        else:
+            sub_tree_sexp_list.append([cur_node.sexp(), cur_depth, cur_node, None])
+        for child_node in cur_node.children:
+            if len(child_node.children) != 0:
+                depth = cur_depth + 1
+                node_stack.append([child_node, depth])
+    return sub_tree_sexp_list
+
+
+# Parse the program into AST trees
+def ast_parse(candidate, lang='python'):
+    LANGUAGE = Language('evaluation/gorilla/my-languages.so', lang)
+    parser = Parser()
+    parser.set_language(LANGUAGE)
+
+    candidate_tree = parser.parse(bytes(candidate, 'utf8')).root_node
+    return candidate_tree
+
+
+# Get all the arguments in the ast tree
+def get_args(node):
+    if node.child_count == 0:
+        return []
+    args_list = []
+    for child in node.children[0].children[0].children[1].children:
+        if 'repo_or_dir' in child.text.decode() or 'model' in child.text.decode():
+            args_list.append(child.children[2].text)
+    return args_list
+
+
+# Check if there is an api match
+def ast_check(candidate_subtree_list, base_tree_list):
+    """Return the index of the first reference tree matched by the candidate, or -1.
+
+    Args:
+        candidate_subtree_list: entries indexed as ``[_, _, node, first_child_text]``;
+            only positions 2 and 3 are read here.
+        base_tree_list: reference trees, one per API in the database.
+
+    A reference tree matches when every argument returned by ``get_args`` for
+    it (with surrounding single quotes stripped) occurs as a substring of the
+    selected candidate node's text.
+    """
+    for idx, base_tree in enumerate(base_tree_list):
+        # Skip reference trees whose inner node has no children to name an API
+        if base_tree.children[0].children[0].child_count == 0:
+            continue
+        api_name = base_tree.children[0].children[0].children[0].text
+        # NOTE(review): there is no `else` on this loop, so when no entry matches
+        # api_name the variable is left bound to the LAST entry of the list and
+        # the argument check below runs against that entry.
+        for candidate_tree in candidate_subtree_list:
+            if candidate_tree[3] == api_name:
+                break
+        # Now we have a sub-tree
+        candidate_tree = candidate_tree[2]
+        args_list = get_args(base_tree)
+        # A reference call without arguments can never be matched
+        if len(args_list) == 0:
+            continue
+        ast_match = True
+        for arg in args_list:
+            if arg.decode().lstrip("'").rstrip("'") not in candidate_tree.text.decode():
+                ast_match = False
+                break
+        if ast_match:
+            return idx
+    return -1
+
+
+def process_response(question_id, output, api_database, qa_pairs, ast_database):
+    # Index the "api_call" domain
+    output = output.split('api_call')
+    if len(output) == 1:
+        return False, False
+    else:
+        output = output[1].split('api_provider')[0]
+    if ':' not in output:
+        start = 0
+    else:
+        start = output.index(':')
+    if ')' not in output:
+        end = -2
+    else:
+        end = output.rindex(')')
+    api_call = output[start + 2 : end + 1]
+
+    # Parse the api_call into AST tree
+    ast_tree = ast_parse(api_call)
+    # Search for a subtree
+    ast_subtree_list = get_all_sub_trees(ast_tree)
+    # Check which ast tree is matching
+    database_index = ast_check(ast_subtree_list, ast_database)
+    # We cannot index this ast in our database
+    if database_index == -1:
+        return False, True
+    # We index our reference api_call
+    ref_api_call = api_database[database_index]
+    # Check for functionality
+    if ref_api_call['domain'] == qa_pairs[question_id - 1]['domain']:
+        return True, False
+    else:
+        return False, False
+
+
+def ast_eval_th(api_database, qa_pairs, ast_database, question_id, response):
+    # Check correctness
+    return process_response(question_id, response, api_database, qa_pairs, ast_database)
--- a/evaluation/gorilla/run_infer.py
+++ b/evaluation/gorilla/run_infer.py
@@ -0,0 +1,355 @@
+import asyncio
+import json
+import logging
+import multiprocessing as mp
+import os
+import pathlib
+import subprocess
+import time
+from concurrent.futures import ProcessPoolExecutor
+
+from tqdm import tqdm
+from utils import encode_question, get_data
+
+from opendevin.controller.state.state import State
+from opendevin.core.config import config, get_llm_config_arg, get_parser
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import main
+from opendevin.events.action import MessageAction
+from opendevin.events.serialization.event import event_to_dict
+
+
+def cleanup():
+    print('Cleaning up child processes...')
+    for process in mp.active_children():
+        print(f'Terminating child process: {process.name}')
+        process.terminate()
+        process.join()
+
+
+def codeact_user_response(state: State) -> str:
+    msg = (
+        #'Please continue working on the task on whatever approach you think is suitable.\n'
+        'Please run the following command: <execute_bash> exit </execute_bash>.\n'
+        #'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
+    )
+    if state.history:
+        user_msgs = [
+            action
+            for action, _ in state.history
+            if isinstance(action, MessageAction) and action.source == 'user'
+        ]
+        if len(user_msgs) >= 2:
+            # let the agent know that it can give up when it has tried 3 times
+            return (
+                msg
+                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
+            )
+    return msg
+
+
+def monologue_user_response(state: State) -> str:
+    """Fake-user hook for MonologueAgent; always raises NotImplementedError.
+
+    Raises:
+        NotImplementedError: unconditionally.
+    """
+    raise NotImplementedError('MonologueAgent should never ask for user responses.')
+
+
+# Maps an agent class name to the function that produces its scripted user
+# replies. The main block also uses the keys as the set of supported agents.
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+    'MonologueAgent': monologue_user_response,
+}
+
+# Maps an agent class name to extra text appended to the task instruction;
+# agents without an entry get no suffix (looked up with .get(..., '')).
+AGENT_CLS_TO_INST_SUFFIX = {
+    'CodeActAgent': 'When you think you have completed the request, please run the following command: <execute_bash> exit </execute_bash>.\n'
+}
+
+
+def process_instance(
+    question_id, question, agent_class, metadata, reset_logger: bool = True
+):
+    """Run the agent on one APIBench question and score its final message.
+
+    Args:
+        question_id: identifier of the question; used in the log file name and
+            passed to the hub's evaluator.
+        question: question payload handed to ``encode_question``.
+        agent_class: agent class name, used to pick the fake-user function and
+            the instruction suffix.
+        metadata: run metadata; the keys ``'eval_output_dir'``, ``'hub'`` and
+            ``'model_name'`` are read here.
+        reset_logger: when True, replace the logger's handlers so that this
+            instance logs to its own file under ``<eval_output_dir>/logs``.
+
+    Returns:
+        A dict with the question id, the agent's final message, the
+        ``correct``/``hallucination`` flags, the serialized history, metrics
+        and error (if any).
+
+    Raises:
+        ValueError: if ``main`` returns no state.
+        Exception: anything raised while running or scoring is logged and
+            re-raised.
+    """
+    # create process-specific workspace dir
+    # we will create a workspace directory for EACH process
+    # so that different agent don't interfere with each other.
+    old_workspace_mount_path = config.workspace_mount_path
+    try:
+        # Resulting layout: <workspace_mount_path>/_eval_workspace/<pid>
+        workspace_mount_path = os.path.join(
+            config.workspace_mount_path, '_eval_workspace'
+        )
+        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
+        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+        config.workspace_mount_path = workspace_mount_path
+
+        # Setup the logger properly, so you can run multi-processing to parallize the evaluation
+        eval_output_dir = metadata['eval_output_dir']
+        if reset_logger:
+            # Set up logger
+            log_file = os.path.join(
+                eval_output_dir, 'logs', f'instance_{question_id}.log'
+            )
+            # Remove all existing handlers from logger
+            for handler in logger.handlers[:]:
+                logger.removeHandler(handler)
+            # add back the console handler to print ONE line
+            logger.addHandler(get_console_handler())
+            logger.info(
+                f'Starting evaluation for instance {question_id}.\nLOG:   tail -f {log_file}'
+            )
+            # Remove all existing handlers from logger
+            for handler in logger.handlers[:]:
+                logger.removeHandler(handler)
+            # From here on this instance logs only to its own file
+            file_handler = logging.FileHandler(log_file)
+            file_handler.setFormatter(
+                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+            )
+            logger.addHandler(file_handler)
+        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+
+        # Prepare instruction
+        instruction = encode_question(question, metadata['hub'])
+        instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
+        # NOTE: You can actually set slightly different instruction for different agents
+        instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+        # logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
+
+        # Here's how you can run the agent (similar to the `main` function) and get the final task state
+        state: State = asyncio.run(
+            main(
+                instruction,
+                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
+                    agent_class
+                ),
+            )
+        )
+        # ======= Attempt to evaluate the agent's edits =======
+        # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+        # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+
+        if state is None:
+            raise ValueError('State should not be None.')
+
+        # Take the most recent agent message; stays '' when the agent sent none
+        model_answer_raw = ''
+        for act, _ in reversed(state.history):
+            if isinstance(act, MessageAction) and act.source == 'agent':
+                model_answer_raw = act.content
+                break
+        # attempt to parse model_answer
+        # Only the evaluator (third item) of get_data's result is needed here
+        _, _, ast_eval = get_data(metadata['hub'])
+        correct, hallucination = ast_eval(question_id, model_answer_raw)
+        metrics = state.metrics.get() if state.metrics else None
+        logger.info(
+            f'Final message: {model_answer_raw} | Correctness: {correct} | Hallucination: {hallucination}'
+        )
+        # Save the output
+        output = {
+            'question_id': question_id,
+            'text': model_answer_raw,
+            'correct': correct,
+            'hallucination': hallucination,
+            'answer_id': 'None',
+            'model_id': metadata['model_name'],
+            'metadata': metadata,
+            'history': [
+                (event_to_dict(action), event_to_dict(obs))
+                for action, obs in state.history
+            ],
+            'metrics': metrics,
+            'error': state.error if state and state.error else None,
+        }
+    except Exception:
+        logger.error('Process instance failed')
+        raise
+    finally:
+        # Always restore the shared config value changed above
+        config.workspace_mount_path = old_workspace_mount_path
+    return output
+
+
+# Script entry point: parse arguments, prepare the output directory and
+# work out which hubs to evaluate.
+if __name__ == '__main__':
+    parser = get_parser()
+    parser.add_argument(
+        '--hubs',
+        type=str,
+        help='Which hubs to evaluate from APIBench. APIBench contains 3 hubs, namely huggingface, torch, and tensorflow. You could choose one or more from hf, torch, or tf, separated by commas. For example, the default is --hub hf,torch,tf.',
+        default='hf,torch,tf',
+    )
+    # parse_known_args: arguments this parser does not define are ignored
+    args, _ = parser.parse_known_args()
+    if args.directory:
+        config.workspace_base = os.path.abspath(args.directory)
+        print(f'Setting workspace base to {config.workspace_base}')
+
+    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
+    # for details of how to set `llm_config`
+    if args.llm_config:
+        specified_llm_config = get_llm_config_arg(args.llm_config)
+        if specified_llm_config:
+            config.llm = specified_llm_config
+    logger.info(f'Config for evaluation: {config}')
+    agent_class = args.agent_cls
+    assert (
+        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
+    ), f'Unsupported agent class: {agent_class}'
+    # Keep only the part of the model identifier after the last '/'
+    model_name = config.llm.model.split('/')[-1]
+    max_iterations = args.max_iterations
+    eval_note = ''
+    if args.eval_note is not None:
+        eval_note += '_N_' + args.eval_note
+    # Output layout: <eval_output_dir>/gorilla/<agent>/<model>_maxiter_<n>[_N_<note>]
+    eval_output_dir = os.path.join(
+        args.eval_output_dir,
+        'gorilla',
+        agent_class,
+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
+    )
+    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
+        parents=True, exist_ok=True
+    )
+    logger.info(f'Using evaluation output directory: {eval_output_dir}')
+
+    # Hubs are selected with substring tests on the raw --hubs string;
+    # 'th' is accepted as an alias for 'torch'.
+    hubs = []
+    if 'hf' in args.hubs:
+        hubs.append('hf')
+    if 'torch' in args.hubs or 'th' in args.hubs:
+        hubs.append('torch')
+    if 'tf' in args.hubs:
+        hubs.append('tf')
+    if hubs == []:
+        raise ValueError('Please choose at least one from hf, torch, and tf for hubs.')
+
+    # Each hub is evaluated independently, with its own metadata and output file
+    for hub in hubs:
+        logger.info(f'Evaluating APIBench {hub} test')
+        questions, question_ids, ast_eval = get_data(hub)
+
+        # TEST METADATA
+        metadata = {
+            'hub': hub,
+            'agent_class': agent_class,
+            'model_name': model_name,
+            'max_iterations': max_iterations,
+            'eval_output_dir': eval_output_dir,
+            'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+            # get the commit id of current repo for reproduciblity
+            'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+            .decode('utf-8')
+            .strip(),
+        }
+        logger.info(f'Metadata: {metadata}')
+        with open(os.path.join(eval_output_dir, f'metadata_{hub}.json'), 'w') as f:
+            json.dump(metadata, f)
+
+        # LIMIT EVALUATION
+        # The limit is a total across hubs, split evenly with integer division;
+        # NOTE(review): a limit smaller than the number of hubs yields 0 per hub.
+        eval_n_limit = args.eval_n_limit
+        if eval_n_limit:
+            questions = questions[: (eval_n_limit // len(hubs))]
+            question_ids = question_ids[: (eval_n_limit // len(hubs))]
+            logger.info(
+                f'Limiting evaluation to a total of first {eval_n_limit} instances -> first {eval_n_limit//len(hubs)} instances per hub.'
+            )
+        output_file = os.path.join(eval_output_dir, f'output_{model_name}_{hub}.jsonl')
+        logger.info(f'Writing evaluation output to {output_file}')
+        # Resume support: collect ids already present in an existing output
+        # file, restricted to the ids selected for this run
+        finished_task_ids = set()
+        if os.path.exists(output_file):
+            with open(output_file, 'r') as f:
+                for line in f:
+                    data = json.loads(line)
+                    for i in range(len(question_ids)):
+                        if question_ids[i] == int(data['question_id']):
+                            finished_task_ids.add(data['question_id'])
+            logger.warning(
+                f'Output file {output_file} already exists. Loaded {len(finished_task_ids)} finished instances.'
+            )
+        # Opened in append mode so earlier results are kept
+        output_fp = open(output_file, 'a')
+        logger.info(
+            f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
+        )
+        # =============================================
+        # filter out finished instances
+        new_questions = []
+        new_question_ids = []
+        for i in range(len(question_ids)):
+            if question_ids[i] in finished_task_ids:
+                logger.info(
+                    f'Skipping instance {question_ids[i]} as it is already finished.'
+                )
+                continue
+            new_questions.append(questions[i])
+            new_question_ids.append(question_ids[i])
+
+        # Remembered separately because finished_task_ids is passed to the
+        # progress callback, which adds ids to it
+        finished_task_number = len(finished_task_ids)
+        questions = new_questions
+        question_ids = new_question_ids
+        logger.info(
+            f'Finished instances: {finished_task_number}, Remaining instances: {len(question_ids)}'
+        )
+        # =============================================
+        pbar = tqdm(total=len(question_ids))
+
+        # This function tracks the progress AND write the output to a JSONL file
+        def update_progress(future, pbar, output_fp, finished_task_ids):
+            pbar.update(1)
+            output = future.result()
+            pbar.set_description(f'Instance {output["question_id"]}')
+            pbar.set_postfix_str(f'Test Result: {output["correct"]}')
+            logger.info(
+                f'Finished evaluation for instance {output["question_id"]}: {output["correct"]}'
+            )
+            output_fp.write(json.dumps(output) + '\n')
+            output_fp.flush()
+            finished_task_ids.add(output['question_id'])
+
+        # This sets the multi-processing
+        num_workers = args.eval_num_workers
+        logger.info(f'Using {num_workers} workers for evaluation.')
+        try:
+            with ProcessPoolExecutor(num_workers) as executor:
+                futures = []
+                # This is how we perform multi-processing
+                for i in range(len(question_ids)):
+                    try:
+                        question_id = question_ids[i]
+                        question = questions[i]
+                        future = executor.submit(
+                            process_instance,
+                            question_id,
+                            question,
+                            agent_class,
+                            metadata,
+                            reset_logger=bool(num_workers > 1),
+                        )
+                        future.add_done_callback(
+                            update_progress, pbar, output_fp, finished_task_ids
+                        )
+                        futures.append(future)
+                    except Exception:
+                        continue
+
+                # Wait for all futures to complete
+                for future in futures:
+                    try:
+                        future.result()
+                    except Exception:
+                        continue
+        except KeyboardInterrupt:
+            logger.info('KeyboardInterrupt received. Cleaning up...')
+            cleanup()
+
+        output_fp.close()
+        total_correct = 0
+        total_hallucination = 0
+        output = []
+        with open(output_file, 'r') as f:
+            for line in f:
+                data = json.loads(line)
+                output.append(data)
+                if int(data['question_id']) in finished_task_ids:
+                    if str(data['correct']).lower() == 'true':
+                        total_correct += 1
+                    if str(data['hallucination']).lower() == 'true':
+                        total_hallucination += 1
+        # sort all output by question_id
+        output = sorted(output, key=lambda x: x['question_id'])
+        with open(output_file, 'w') as f:
+            for dat in output:
+                f.write(json.dumps(dat) + '\n')
+                f.flush()
+
+        logger.info(
+            f'Evaluation finished for {hub}. Total: {len(question_ids)+finished_task_number}; Correct: {total_correct}; Hallucination: {total_hallucination}. Accuracy: {total_correct / (len(question_ids)+finished_task_number)}'
+        )
--- a/evaluation/gorilla/scripts/run_infer.sh
+++ b/evaluation/gorilla/scripts/run_infer.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+MODEL_CONFIG=$1
+AGENT=$2
+EVAL_LIMIT=$3
+HUBS=$4
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+if [ -z "$HUBS" ]; then
+  HUBS="hf,torch,tf"
+  echo "Hubs not specified, use default $HUBS"
+fi
+
+# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
+# We need to track the version of Agent in the evaluation to make sure results are comparable
+AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+echo "HUBS: $HUBS"
+
+COMMAND="poetry run python evaluation/gorilla/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --max-iterations 30 \
+  --hubs $HUBS \
+  --data-split validation \
+  --max-chars 10000000 \
+  --eval-num-workers 1 \
+  --eval-note ${AGENT_VERSION}_${LEVELS}"
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+# Run the command
+eval $COMMAND
--- a/evaluation/gorilla/utils.py
+++ b/evaluation/gorilla/utils.py
@@ -0,0 +1,101 @@
+import json
+from functools import partial
+
+import requests
+from ast_eval_hf import ast_eval_hf, ast_parse
+from ast_eval_tf import ast_eval_tf
+from ast_eval_th import ast_eval_th
+
+
+# This function is modified from Gorilla's APIBench implementations (https://github.com/ShishirPatil/gorilla/blob/main/eval/get_llm_responses.py).
+def encode_question(question, api_name):
+    """Encode multiple prompt instructions into a single string.
+
+    Args:
+        question: The task description placed at the start of the prompt.
+        api_name: Hub identifier; one of 'torch', 'hf' or 'tf'.
+
+    Returns:
+        The full prompt string: a fixed preamble, the question, and the
+        answer-format requirements for the selected hub.
+
+    Raises:
+        ValueError: If `api_name` is not one of the supported hubs.
+    """
+    if api_name == 'torch':
+        api_name = 'torchhub'
+        domains = '1. $DOMAIN is inferred from the task description and should include one of {Classification, Semantic Segmentation, Object Detection, Audio Separation, Video Classification, Text-to-Speech}.'
+    elif api_name == 'hf':
+        api_name = 'huggingface'
+        domains = '1. $DOMAIN should include one of {Multimodal Feature Extraction, Multimodal Text-to-Image, Multimodal Image-to-Text, Multimodal Text-to-Video, \
+        Multimodal Visual Question Answering, Multimodal Document Question Answer, Multimodal Graph Machine Learning, Computer Vision Depth Estimation,\
+        Computer Vision Image Classification, Computer Vision Object Detection, Computer Vision Image Segmentation, Computer Vision Image-to-Image, \
+        Computer Vision Unconditional Image Generation, Computer Vision Video Classification, Computer Vision Zero-Shor Image Classification, \
+        Natural Language Processing Text Classification, Natural Language Processing Token Classification, Natural Language Processing Table Question Answering, \
+        Natural Language Processing Question Answering, Natural Language Processing Zero-Shot Classification, Natural Language Processing Translation, \
+        Natural Language Processing Summarization, Natural Language Processing Conversational, Natural Language Processing Text Generation, Natural Language Processing Fill-Mask,\
+        Natural Language Processing Text2Text Generation, Natural Language Processing Sentence Similarity, Audio Text-to-Speech, Audio Automatic Speech Recognition, \
+        Audio Audio-to-Audio, Audio Audio Classification, Audio Voice Activity Detection, Tabular Tabular Classification, Tabular Tabular Regression, \
+        Reinforcement Learning Reinforcement Learning, Reinforcement Learning Robotics }'
+    elif api_name == 'tf':
+        api_name = 'tensorhub'
+        domains = '1. $DOMAIN is inferred from the task description and should include one of {text-sequence-alignment, text-embedding, text-language-model, text-preprocessing, text-classification, text-generation, text-question-answering, text-retrieval-question-answering, text-segmentation, text-to-mel, image-classification, image-feature-vector, image-object-detection, image-segmentation, image-generator, image-pose-detection, image-rnn-agent, image-augmentation, image-classifier, image-style-transfer, image-aesthetic-quality, image-depth-estimation, image-super-resolution, image-deblurring, image-extrapolation, image-text-recognition, image-dehazing, image-deraining, image-enhancemenmt, image-classification-logits, image-frame-interpolation, image-text-detection, image-denoising, image-others, video-classification, video-feature-extraction, video-generation, video-audio-text, video-text, audio-embedding, audio-event-classification, audio-command-detection, audio-paralinguists-classification, audio-speech-to-text, audio-speech-synthesis, audio-synthesis, audio-pitch-extraction}'
+    else:
+        # Fail fast: without a known hub there is no `domains` text to build the prompt from.
+        raise ValueError(f'API name is not supported: {api_name}')
+
+    prompt = (
+        question
+        + '\nWrite a python program in 1 to 2 lines to call API in '
+        + api_name
+        + '.\n\nThe answer should follow the format: <<<domain>>> $DOMAIN, <<<api_call>>>: $API_CALL, <<<api_provider>>>: $API_PROVIDER, <<<explanation>>>: $EXPLANATION, <<<code>>>: $CODE}. Here are the requirements:\n'
+        + domains
+        + '\n2. The $API_CALL should have only 1 line of code that calls api.\n3. The $API_PROVIDER should be the programming framework used.\n4. $EXPLANATION should be a step-by-step explanation.\n5. The $CODE is the python code.\n6. Do not repeat the format in your answer.'
+    )
+    # The preamble is prepended as plain text; the result is one string, not a message list.
+    return (
+        'You are a helpful API writer who can write APIs based on requirements.\n'
+        + prompt
+    )
+
+
+def get_data(hub):
+    """Download the APIBench questions, API catalogue and reference answers for one hub.
+
+    Args:
+        hub: One of 'hf', 'torch' or 'tf'.
+
+    Returns:
+        A tuple `(questions, question_ids, ast_eval)`. `questions` and
+        `question_ids` are parallel lists read from the hub's question file;
+        `ast_eval` is the hub's evaluator with the API database, the reference
+        QA pairs and the parsed API-call trees already bound via `partial`.
+
+    Raises:
+        ValueError: If `hub` is not a supported hub name.
+    """
+    if hub == 'hf':
+        question_url = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/eval/eval-data/questions/huggingface/questions_huggingface_0_shot.jsonl'
+        api_dataset_url = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/api/huggingface_api.jsonl'
+        apibench_url = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/apibench/huggingface_eval.json'
+        hub_ast_eval = ast_eval_hf
+    elif hub == 'torch':
+        question_url = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/eval/eval-data/questions/torchhub/questions_torchhub_0_shot.jsonl'
+        api_dataset_url = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/api/torchhub_api.jsonl'
+        apibench_url = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/apibench/torchhub_eval.json'
+        hub_ast_eval = ast_eval_th
+    elif hub == 'tf':
+        question_url = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/eval/eval-data/questions/tensorflowhub/questions_tensorflowhub_0_shot.jsonl'
+        api_dataset_url = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/api/tensorflowhub_api.jsonl'
+        apibench_url = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/apibench/tensorflow_eval.json'
+        hub_ast_eval = ast_eval_tf
+    else:
+        # Reject unknown hubs before any network access instead of failing later on unbound names.
+        raise ValueError(f'Unsupported hub: {hub}')
+
+    # NOTE(review): a non-200 response is skipped silently and leaves the
+    # corresponding list empty -- confirm whether callers rely on that.
+
+    # get questions and question_ids
+    questions = []
+    question_ids = []
+    question_response = requests.get(question_url)
+    if question_response.status_code == 200:
+        for line in question_response.text.splitlines():
+            record = json.loads(line)  # parse each JSONL line once
+            questions.append(record['text'])
+            question_ids.append(record['question_id'])
+
+    # get the api dataset
+    api_database = []
+    api_response = requests.get(api_dataset_url)
+    if api_response.status_code == 200:
+        for line in api_response.text.splitlines():
+            api_database.append(json.loads(line))
+
+    # get the question answer pair dataset
+    qa_pairs = []
+    apibench_response = requests.get(apibench_url)
+    if apibench_response.status_code == 200:
+        for line in apibench_response.text.splitlines():
+            qa_pairs.append(json.loads(line)['api_data'])
+
+    # Parse all apis to ast trees
+    ast_database = [ast_parse(data['api_call']) for data in api_database]
+    ast_eval = partial(hub_ast_eval, api_database, qa_pairs, ast_database)
+    return questions, question_ids, ast_eval
--- a/evaluation/gpqa/README.md
+++ b/evaluation/gpqa/README.md
@@ -0,0 +1,70 @@
+# Evaluating GPQA (A Graduate-Level Google-Proof Q&A Benchmark) with OpenDevin
+
+Implements the evaluation of agents on the GPQA benchmark introduced in [GPQA: A Graduate-Level Google-Proof Q&A Benchmark](https://arxiv.org/abs/2311.12022).
+
+This code implements the evaluation of agents on the GPQA Benchmark with Open Book setting.
+- The benchmark consists of 448 high-quality and extremely difficult multiple-choice questions in the domains of biology, physics, and chemistry. The questions are intentionally designed to be "Google-proof," meaning that even highly skilled non-expert validators achieve only 34% accuracy despite unrestricted access to the web.
+- Even experts in the corresponding domains achieve only 65% accuracy.
+- State-of-the-art AI systems achieve only 39% accuracy on this challenging dataset.
+
+**Note**
+Accurately solving these graduate-level questions requires both tool use (e.g., Python for calculations) and web search for related facts, since the information needed may not be part of the LLM's knowledge or training data.
+
+Further references:
+- https://arxiv.org/pdf/2311.12022
+- https://paperswithcode.com/dataset/gpqa
+- https://github.com/idavidrein/gpqa
+
+## TODOs
+- [ ] Add support for other agents (currently only tested on `CodeActAgent`)
+- [ ] Complete full benchmark evaluation
+- [ ] Fix intermittent `BrowserException: Failed to start browser environment` error
+
+## Setup Environment
+
+Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
+
+
+## Configure OpenDevin and your LLM
+
+Create a `config.toml` file if it does not exist at the root of the workspace.
+
+Add the following configurations:
+
+```toml
+[core]
+max_iterations = 100
+cache_dir = "/tmp/cache"
+ssh_hostname = "localhost"
+enable_auto_lint = true
+
+# TODO: Change these to the model you want to evaluate
+[eval_gpt4_1106_preview]
+model = "gpt-4-1106-preview"
+api_key = "XXX"
+temperature = 0.0
+
+[eval_azure_openai_compatible_model]
+model = "AZURE_OPENAI_EXACT_DEPLOYMENT_MODEL_NAME"
+base_url = "AZURE_OPENAI_ENDPOINT"
+api_key = "AZURE_ENDPOINT_API_KEY"
+temperature = 0.0
+```
+
+## Run Inference on GPQA Benchmark
+Available data splits: `gpqa_main`, `gpqa_diamond`, `gpqa_experts`, `gpqa_extended`.
+
+From the root of the OpenDevin repo, run the following command:
+```bash
+./evaluation/gpqa/scripts/run_infer.sh [model_config_name] [num_samples_eval] [data_split] [AgentClass]
+```
+You can replace `model_config_name` with any model you set up in `config.toml`.
+
+- `model_config_name`: The model configuration name from `config.toml` that you want to evaluate.
+- `num_samples_eval`: Number of samples to evaluate (useful for testing and debugging).
+- `data_split`: The data split to evaluate on. Must be one of `gpqa_main`, `gpqa_diamond`, `gpqa_experts`, `gpqa_extended`. Defaults to `gpqa_diamond` as done in the paper.
+- `AgentClass`: The agent class to use for evaluation. Currently only `CodeActAgent` is supported.
+
+
+## Benchmark Evaluation Results
+
+- [ ] TODO: Finish the evaluation run across the entire benchmark and compile results
--- a/evaluation/gpqa/init.py
+++ b/evaluation/gpqa/init.py
--- a/evaluation/gpqa/run_infer.py
+++ b/evaluation/gpqa/run_infer.py
@@ -0,0 +1,468 @@
+"""
+Overview:
+This code implements the evaluation of agents on the GPQA Benchmark with Open Book setting.
+- The benchmark consists of 448 high-quality and extremely difficult multiple-choice questions in the domains of biology, physics, and chemistry. The questions are intentionally designed to be "Google-proof," meaning that even highly skilled non-expert validators achieve only 34% accuracy despite unrestricted access to the web.
+- Even experts in the corresponding domains achieve only 65% accuracy.
+- State-of-the-art AI systems achieve only 39% accuracy on this challenging dataset.
+
+Accurate solving of above graduate level questions would require both tool use (e.g., python for calculations) and web-search for finding related facts as information required for the questions might not be part of the LLM knowledge / training data.
+
+Further references:
+- https://arxiv.org/pdf/2311.12022
+- https://paperswithcode.com/dataset/gpqa
+- https://github.com/idavidrein/gpqa
+
+TODOs:
+- Add evaluation on other Agent classes (e.g., MonologueAgent)
+- Batch inference and evaluation of agents on the GPQA Benchmark.
+"""
+
+import asyncio
+import json
+import logging
+import multiprocessing as mp
+import os
+import pathlib
+import random
+import re
+import subprocess
+import time
+from concurrent.futures import ProcessPoolExecutor
+
+import pandas as pd
+from datasets import load_dataset
+from tqdm import tqdm
+
+from opendevin.controller.state.state import State
+from opendevin.core.config import config, get_llm_config_arg, get_parser
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import main
+from opendevin.events.action import MessageAction
+from opendevin.events.serialization.event import event_to_dict
+
+
+def cleanup():
+    """Terminate every active child process of this process and wait for each to exit."""
+    logger.info('Cleaning up child processes...')
+    for child in mp.active_children():
+        logger.info(f'Terminating child process: {child.name}')
+        child.terminate()
+        # join after terminate so the child is reaped before moving on
+        child.join()
+
+
+def codeact_user_response(state: State) -> str:
+    """Build the canned reply sent to CodeActAgent in place of a human user.
+
+    The reply always tells the agent to keep working and how to finish. Once
+    the history already holds at least two user-sourced messages, a hint on
+    how to give up is appended.
+    """
+    msg = (
+        'Please continue working on the task on whatever approach you think is suitable.\n'
+        'Feel free to use all tools for calculations and solving the problem, and web-search for finding relevant facts during the process if needed\n'
+        'If you think you have reliably finished solving the problem, first generate a message reporting the final concise answer to the user. Once that is done, please run the following command: <execute_bash> exit </execute_bash>.\n'
+        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP TO SOLVE THIS TASK.\n'
+    )
+    if not state.history:
+        return msg
+
+    # history entries are (action, observation) pairs; count the user-sourced messages
+    user_msg_count = sum(
+        1
+        for action, _ in state.history
+        if isinstance(action, MessageAction) and action.source == 'user'
+    )
+    if user_msg_count < 2:
+        return msg
+
+    give_up_hint = 'If you want to give up, just generate a final answer message to the user and in the next turn --> run: <execute_bash> exit </execute_bash>.\n'
+    return msg + give_up_hint
+
+
+def monologue_user_response(state: State) -> str:
+    """Fake-user hook registered for MonologueAgent; it always raises NotImplementedError."""
+    error_message = 'MonologueAgent should never ask for user responses.'
+    raise NotImplementedError(error_message)
+
+
+# Maps an agent class name to the function that produces the stand-in "user"
+# reply for that agent. The keys of this table also serve as the list of
+# agent classes the script accepts.
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+    'MonologueAgent': monologue_user_response,
+}
+
+# Maps an agent class name to extra text appended to the task instruction.
+# Agents without an entry get no suffix.
+AGENT_CLS_TO_INST_SUFFIX = {
+    'CodeActAgent': '\n\n SUPER IMPORTANT: When you think you have solved the question, first report it back to the user in the requested format. Only once that is done, in the next turn, please run the following command: <execute_bash> exit </execute_bash>.\n'
+}
+
+
+def parse_final_answer(final_answer: str) -> str:
+    """
+    Parse the final answer from the final message generated by the agent
+    to extract the final answer. The final answer is usually enclosed in the format:
+    <<FINAL_ANSWER||
+    <insert correct answer here>
+    ||FINAL_ANSWER>>
+
+    Args:
+        final_answer: The agent's final message. `None` or any other non-string
+            value is treated as a message without an answer.
+
+    Returns:
+        The stripped text between the markers, or the fixed sentence
+        'No final answer found in the provided string.' when no answer is present.
+    """
+    not_found = 'No final answer found in the provided string.'
+    # The caller may have no message at all; do not hand a non-string to the regex.
+    if not isinstance(final_answer, str):
+        return not_found
+
+    # DOTALL lets the answer span the line breaks around the markers
+    pattern = re.compile(r'<<FINAL_ANSWER\|\|(.*?)\|\|FINAL_ANSWER>>', re.DOTALL)
+    match = pattern.search(final_answer)
+
+    if match:
+        return match.group(1).strip()
+    return not_found
+
+
+def compare_answers(predicted_answer, ground_truth):
+    """
+    Report whether the predicted answer equals the ground truth answer.
+    """
+    is_match = predicted_answer == ground_truth
+    return is_match
+
+
+def get_test_result(model_output, ground_truth):
+    """
+    GPQA grading step: extract the answer from the model output and
+    compare it with the ground truth.
+
+    Returns whatever `compare_answers` returns for the parsed answer.
+    """
+    # parse first, then compare -- both steps are delegated to the helpers above
+    return compare_answers(parse_final_answer(model_output), ground_truth)
+
+
+def convert_instance_dict(instance):
+    """
+    Preprocess one dataset instance into the shape used by the agent.
+
+    Reads the question, the correct answer and the three incorrect answers,
+    shuffles the four choices, and records which letter (A-D) the correct
+    answer ended up at.
+    """
+    question_text = instance['Question']
+    answer = instance['Correct Answer']
+    options = [answer] + [instance[f'Incorrect Answer {i}'] for i in (1, 2, 3)]
+
+    # Randomize the order of choices in place
+    random.shuffle(options)
+
+    # Position of the correct answer after shuffling, as a letter (index 0-3 -> A-D)
+    answer_letter = 'ABCD'[options.index(answer)]
+
+    return {
+        'question': question_text,
+        'choices': options,
+        'correct_solution': answer_letter,
+    }
+
+
+def process_instance(
+    instance: dict,
+    agent_class: str,
+    metadata: dict,
+    skip_workspace_mount: bool,
+    eval_output_dir: str,
+    reset_logger: bool = True,
+):
+    """
+    Process a single instance from the dataset: run the agent on one question
+    and grade the answer it reports.
+
+    Args:
+        instance: One preprocessed dataset row. It is read both by key
+            (`question`, `choices`) and by attribute (`instance_id`, `task_id`,
+            `correct_solution`).
+        agent_class: Agent class name, used to look up the instruction suffix
+            and the fake user response function.
+        metadata: Run metadata, copied unchanged into the output record.
+        skip_workspace_mount: Currently ignored, see the NOTE(review) below.
+        eval_output_dir: Directory under which `logs/instance_<id>.log` is written.
+        reset_logger: When true, replace the logger's handlers so this instance
+            logs to its own file.
+
+    Returns:
+        A dict with the task/instance ids, the instruction, metadata, the
+        serialized history, metrics, any error, and `test_result` as
+        `{'result': <bool>}`.
+
+    Raises:
+        ValueError: If the agent run returns no state.
+    """
+    old_workspace_mount_path = config.workspace_mount_path
+    old_workspace_base = config.workspace_base
+    try:
+        workspace_mount_path = os.path.join(
+            config.workspace_mount_path, '_eval_workspace'
+        )
+        # create process-specific workspace dir
+        # if `not skip_workspace_mount` - we will create a workspace directory for EACH process
+        # so that different agent don't interfere with each other.
+        # NOTE(review): the argument is overridden here, so a per-process workspace is
+        # always created. Kept as-is to preserve behavior -- confirm whether intended.
+        skip_workspace_mount = False
+        if not skip_workspace_mount:
+            workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
+            pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+
+        # reset workspace to config
+        config.workspace_base = workspace_mount_path
+        config.workspace_mount_path = workspace_mount_path
+
+        # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+        if reset_logger:
+            # Set up logger
+            log_file = os.path.join(
+                eval_output_dir, 'logs', f'instance_{instance.instance_id}.log'
+            )
+            # Remove all existing handlers from logger
+            for handler in logger.handlers[:]:
+                logger.removeHandler(handler)
+            # add back the console handler to print ONE line
+            logger.addHandler(get_console_handler())
+            logger.info(
+                f'Starting evaluation for instance {instance.instance_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
+            )
+            # Remove all existing handlers from logger
+            for handler in logger.handlers[:]:
+                logger.removeHandler(handler)
+            file_handler = logging.FileHandler(log_file)
+            file_handler.setFormatter(
+                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+            )
+            logger.addHandler(file_handler)
+        else:
+            logger.info(f'Starting evaluation for instance {instance.instance_id}.')
+
+        if not skip_workspace_mount:
+            logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+
+        # ======= Run the agent on the instance =======
+        # Prepare instruction for the agent using suggested format in gpqa codebase
+        instruction = f"""
+        What is the correct answer to this question:\n
+        {instance['question']}\n
+
+        Choices:\n
+        (A) {instance['choices'][0]}\n
+        (B) {instance['choices'][1]}\n
+        (C) {instance['choices'][2]}\n
+        (D) {instance['choices'][3]}\n
+        \n\n
+
+        MOST IMPORTANT: Format your response as follows:
+        <<FINAL_ANSWER||
+        <insert correct answer here, must be one of A, B, C, D> (Please dont use any additional characters. Just the letter of the correct answer (A/B/C/D).)
+        ||FINAL_ANSWER>>
+
+        Additional Instructions:
+        - You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.
+        """
+
+        # NOTE: You can actually set slightly different instruction for different agents
+        instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+
+        # Here's how you can run the agent (similar to the `main` function) and get the final task state
+        state: State = asyncio.run(
+            main(
+                instruction,
+                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
+                    agent_class
+                ),
+            )
+        )
+
+        # Check the state before touching any of its attributes.
+        if state is None:
+            raise ValueError('State should not be None.')
+
+        # ======= Attempt to evaluate the agent's answer =======
+        # History entries are (action, observation) pairs. Take the most recent
+        # message action that did not come from the (fake) user, since the
+        # canned user replies never contain an answer. Defaults to None.
+        final_message = next(
+            (
+                action.content
+                for action, _ in reversed(state.history)
+                if isinstance(action, MessageAction) and action.source != 'user'
+            ),
+            None,
+        )
+
+        logger.info(f'Final message generated by the agent: {final_message}')
+
+        # Grade an empty string when there is no message, and store the verdict
+        # under 'result', the key the progress callback reads.
+        test_result = {
+            'result': get_test_result(final_message or '', instance.correct_solution)
+        }
+
+        metrics = state.metrics.get() if state.metrics else None
+
+        # Save the output
+        output = {
+            'task_id': instance.task_id,
+            'instance_id': instance.instance_id,
+            'instruction': instruction,
+            'metadata': metadata,
+            'history': [
+                (event_to_dict(action), event_to_dict(obs))
+                for action, obs in state.history
+            ],
+            'metrics': metrics,
+            'error': state.error if state.error else None,
+            'test_result': test_result,
+        }
+
+    except Exception:
+        logger.error('Process instance failed')
+        raise
+    finally:
+        # Always restore the shared config, even when the run failed.
+        config.workspace_mount_path = old_workspace_mount_path
+        config.workspace_base = old_workspace_base
+    return output
+
+
+if __name__ == '__main__':
+    # Entry point: load one GPQA split, skip instances already present in the
+    # output file, run the remaining ones through `process_instance` in a
+    # process pool, and append each result to `output.jsonl`.
+    parser = get_parser()
+    # data split must be one of 'gpqa_main', 'gpqa_diamond', 'gpqa_experts', 'gpqa_extended'
+    parser.add_argument(
+        '--data-split',
+        type=str,
+        choices=['gpqa_main', 'gpqa_diamond', 'gpqa_experts', 'gpqa_extended'],
+        default='gpqa_diamond',
+        help='data split to evaluate, eg. gpqa_diamond',
+    )
+    args, _ = parser.parse_known_args()
+
+    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
+    # so we don't need to manage file uploading to OpenDevin's repo
+    dataset = load_dataset('Idavidrein/gpqa', args.data_split)
+    gpqa_dataset = dataset['train']
+    # preprocess the dataset
+    gpqa_dataset = gpqa_dataset.map(convert_instance_dict)
+    gpqa_dataset = gpqa_dataset.to_pandas()
+    # Add a new column 'instance_id' with the index
+    gpqa_dataset['instance_id'] = gpqa_dataset.index
+    gpqa_dataset['task_id'] = gpqa_dataset.index
+
+    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
+    # for details of how to set `llm_config`
+    if args.llm_config:
+        specified_llm_config = get_llm_config_arg(args.llm_config)
+        if specified_llm_config:
+            config.llm = specified_llm_config
+    logger.info(f'Config for evaluation: {config}')
+
+    # TEST METADATA
+    agent_class = args.agent_cls
+    assert (
+        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
+    ), f'Unsupported agent class: {agent_class}'
+    model_name = config.llm.model.split('/')[-1]
+    max_iterations = args.max_iterations
+    eval_note = ''
+    if args.eval_note is not None:
+        eval_note += '_N_' + args.eval_note
+    eval_output_dir = os.path.join(
+        args.eval_output_dir,
+        'gpqa',
+        agent_class,
+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
+    )
+
+    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
+        parents=True, exist_ok=True
+    )
+    logger.info(f'Using evaluation output directory: {eval_output_dir}')
+
+    metadata = {
+        'agent_class': agent_class,
+        'model_name': model_name,
+        'max_iterations': max_iterations,
+        'eval_output_dir': eval_output_dir,
+        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        # get the commit id of current repo for reproduciblity
+        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+        .decode('utf-8')
+        .strip(),
+    }
+    logger.info(f'Metadata: {metadata}')
+    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
+        json.dump(metadata, f)
+
+    # LIMIT EVALUATION
+    eval_n_limit = args.eval_n_limit  # NOTE: This is useful for debugging and testing using a smaller subset of the dataset
+    if eval_n_limit:
+        gpqa_dataset = gpqa_dataset.head(eval_n_limit)
+        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
+
+    # Report the size of the selected dataset; `eval_n_limit` may be unset.
+    logger.info('#############################################')
+    logger.info(f'{len(gpqa_dataset)} instances will be evaluated.')
+    logger.info('#############################################')
+
+    # OUTPUT FILE
+    output_file = os.path.join(eval_output_dir, 'output.jsonl')
+    logger.info(f'Writing evaluation output to {output_file}')
+    finished_instance_ids = set()
+    if os.path.exists(output_file):
+        with open(output_file, 'r') as f:
+            for line in f:
+                data = json.loads(line)
+                finished_instance_ids.add(data['instance_id'])
+        logger.warning(
+            f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
+        )
+    # Opened in append mode so results from earlier runs are kept.
+    output_fp = open(output_file, 'a')
+
+    logger.info(
+        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
+    )
+
+    # =============================================
+    # filter out finished instances
+    new_gpqa_dataset = []
+    for idx, instance in gpqa_dataset.iterrows():
+        if instance.instance_id in finished_instance_ids:
+            logger.info(
+                f'Skipping instance {instance.instance_id} as it is already finished.'
+            )
+            continue
+        new_gpqa_dataset.append(instance)
+
+    gpqa_dataset = pd.DataFrame(new_gpqa_dataset)
+    logger.info(
+        f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(gpqa_dataset)}'
+    )
+    # =============================================
+
+    pbar = tqdm(total=len(gpqa_dataset))
+
+    # This function tracks the progress AND write the output to a JSONL file
+    def update_progress(future):
+        pbar.update(1)
+        output = future.result()
+        # `test_result` may be a bare verdict or a dict holding it under 'result';
+        # accept both so a finished instance is always written out.
+        test_result = output['test_result']
+        if isinstance(test_result, dict):
+            test_result = test_result.get('result')
+        pbar.set_description(f'Instance {output["instance_id"]}')
+        pbar.set_postfix_str(f'Test Result: {test_result}')
+        logger.info(
+            f'Finished evaluation for instance {output["instance_id"]}: {test_result}'
+        )
+        output_fp.write(json.dumps(output) + '\n')
+        output_fp.flush()
+
+    # This sets the multi-processing
+    num_workers = args.eval_num_workers
+    logger.info(f'Using {num_workers} workers for evaluation.')
+
+    # This is SWE-Bench specific - CodeActAgent doesn't require mounted workspace to work
+    skip_workspace_mount = agent_class == 'CodeActAgent'
+    logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
+
+    try:
+        with ProcessPoolExecutor(num_workers) as executor:
+            futures = []
+            # This is how we perform multi-processing
+            for row_idx, instance in gpqa_dataset.iterrows():
+                future = executor.submit(
+                    process_instance,
+                    instance,
+                    agent_class,
+                    metadata,
+                    skip_workspace_mount,
+                    eval_output_dir,
+                    reset_logger=bool(num_workers > 1),
+                )
+                future.add_done_callback(update_progress)
+                futures.append(future)
+
+            # Wait for all futures to complete
+            for future in futures:
+                future.result()
+    except KeyboardInterrupt:
+        print('KeyboardInterrupt received. Cleaning up...')
+        cleanup()
+    finally:
+        # Close the output file on every exit path, including worker failures.
+        output_fp.close()
+
+    logger.info('Evaluation finished.')
--- a/evaluation/gpqa/scripts/run_infer.sh
+++ b/evaluation/gpqa/scripts/run_infer.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+# Launch GPQA inference.
+# Positional arguments, in order: model config, eval limit, data split, agent class.
+MODEL_CONFIG="$1"
+EVAL_LIMIT="$2"
+DATA_SPLIT="$3"
+AGENT="$4"
+
+# Fall back to CodeActAgent when no agent class is given
+if [[ -z "$AGENT" ]]; then
+  echo "Agent not specified, use default CodeActAgent ..."
+  AGENT="CodeActAgent"
+fi
+
+# Fall back to the 'gpqa_diamond' split when no data split is given
+if [[ -z "$DATA_SPLIT" ]]; then
+  echo "Data split not specified, using default gpqa_diamond ..."
+  DATA_SPLIT="gpqa_diamond"
+fi
+
+# The agent prompt changes often, so the agent version is recorded with every
+# run to keep results comparable.
+AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+# Assemble the command one option at a time
+COMMAND="poetry run python evaluation/gpqa/run_infer.py"
+COMMAND="$COMMAND --agent-cls $AGENT"
+COMMAND="$COMMAND --llm-config $MODEL_CONFIG"
+COMMAND="$COMMAND --max-iterations 10"
+COMMAND="$COMMAND --max-chars 10000000"
+COMMAND="$COMMAND --eval-num-workers 1"
+COMMAND="$COMMAND --data-split $DATA_SPLIT"
+COMMAND="$COMMAND --eval-note $AGENT_VERSION"
+
+# Only pass a limit when one was requested
+if [[ -n "$EVAL_LIMIT" ]]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+# Run the command
+eval $COMMAND
--- a/evaluation/miniwob/README.md
+++ b/evaluation/miniwob/README.md
@@ -0,0 +1,81 @@
+# MiniWoB++ Evaluation with OpenDevin Browsing Agents
+
+This folder contains evaluation for [MiniWoB++](https://miniwob.farama.org/) benchmark, powered by [BrowserGym](https://github.com/ServiceNow/BrowserGym) for easy evaluation of how well an agent capable of browsing can perform on synthetic web browsing tasks.
+
+## Setup OpenDevin Environment
+
+Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
+
+## Configure OpenDevin and your LLM
+
+Create a `config.toml` file if it does not exist at the root of the workspace.
+
+Add the following configurations:
+
+```toml
+[core]
+max_iterations = 100
+cache_dir = "/tmp/cache"
+sandbox_container_image = "ghcr.io/opendevin/sandbox:latest"
+sandbox_type = "ssh"
+ssh_hostname = "localhost"
+sandbox_timeout = 120
+
+# TODO: Change these to the model you want to evaluate
+[eval_gpt4_1106_preview]
+model = "gpt-4-1106-preview"
+api_key = "XXX"
+temperature = 0.0
+
+[eval_some_openai_compatible_model]
+model = "openai/MODEL_NAME"
+base_url = "https://OPENAI_COMPATIBLE_URL/v1"
+api_key = "XXX"
+temperature = 0.0
+```
+
+## Setup MiniWoB++ Environment and Environment Variables of MiniWoB++
+MiniWoB++ requires its task pages (a static website) to be accessible via a URL from the machine running the OpenDevin agents.
+
+- Clone miniwob (use a specific frozen commit for reproducibility)
+```sh
+git clone git@github.com:Farama-Foundation/miniwob-plusplus.git
+git -C "./miniwob-plusplus" reset --hard 7fd85d71a4b60325c6585396ec4f48377d049838
+```
+
+- Setup Miniwob URL (change `PATH_TO_MINIWOB_CLONED_REPO` here to the absolute path to your `miniwob-plusplus` folder) in `evaluation/miniwob/scripts/run_infer.sh`
+```sh
+export MINIWOB_URL="file://<PATH_TO_MINIWOB_CLONED_REPO>/miniwob/html/miniwob/"
+```
+
+## Test if your environment works
+
+Open the MiniWoB URL above in a browser and check that it loads correctly.
+
+## Run Evaluation
+
+```sh
+bash evaluation/miniwob/scripts/run_infer.sh
+```
+
+Results will be in `evaluation/evaluation_outputs/outputs/miniwob/`
+
+To calculate the average reward, run:
+
+```sh
+poetry run python evaluation/miniwob/get_avg_reward.py evaluation/evaluation_outputs/outputs/miniwob/SOME_AGENT/EXP_NAME/output.jsonl
+```
+
+## Submit your evaluation results
+
+You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
+
+
+## BrowsingAgent V1.0 result
+
+Tested on BrowsingAgent V1.0
+
+MiniWoB++, 125 tasks (3 runs due to random init task), max step 10
+
+- GPT4o: 0.384, 0.416, 0.424, avg: 0.408
+- GPT3.5: 0.288, 0.256, 0.272, avg: 0.272
--- a/evaluation/miniwob/init.py
+++ b/evaluation/miniwob/init.py
--- a/evaluation/miniwob/get_avg_reward.py
+++ b/evaluation/miniwob/get_avg_reward.py
@@ -0,0 +1,42 @@
+"""Print the average reward and average cost of a MiniWoB++ evaluation run."""
+import argparse
+import json
+
+import browsergym.miniwob  # noqa F401 register miniwob tasks as gym environments
+import gymnasium as gym
+
+parser = argparse.ArgumentParser(description='Calculate average reward.')
+parser.add_argument('output_path', type=str, help='path to output.jsonl')
+
+args = parser.parse_args()
+
+if __name__ == '__main__':
+    # Every registered MiniWoB task counts towards the denominator of the average
+    # reward, so tasks that are missing from the output file lower the average.
+    env_ids = [
+        id for id in gym.envs.registry.keys() if id.startswith('browsergym/miniwob')
+    ]
+    total_num = len(env_ids)
+    print('Total number of tasks: ', total_num)
+    total_reward = 0
+    total_cost = 0
+    actual_num = 0
+    with open(args.output_path, 'r') as f:
+        for line in f:
+            if not line.strip():
+                # tolerate blank lines in the JSONL file
+                continue
+            data = json.loads(line)
+            actual_num += 1
+            # `metrics` is null in records for which no metrics were collected
+            metrics = data.get('metrics') or {}
+            total_cost += metrics.get('accumulated_cost', 0)
+            total_reward += data['test_result']
+
+    # guard both divisions so an empty registry or output file prints 0 instead of crashing
+    avg_reward = total_reward / total_num if total_num else 0
+    print('Avg Reward: ', avg_reward)
+
+    avg_cost = total_cost / actual_num if actual_num else 0
+    print('Avg Cost: ', avg_cost)
+    print('Actual number of tasks finished: ', actual_num)
--- a/evaluation/miniwob/run_infer.py
+++ b/evaluation/miniwob/run_infer.py
@@ -0,0 +1,234 @@
+import asyncio
+import json
+import logging
+import os
+import pathlib
+import subprocess
+import time
+
+import browsergym.miniwob  # noqa F401 register miniwob tasks as gym environments
+import gymnasium as gym
+from tqdm import tqdm
+
+from opendevin.controller.state.state import State
+from opendevin.core.config import args, config, get_llm_config_arg
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import main
+from opendevin.events.serialization.event import event_to_dict
+from opendevin.runtime.docker.ssh_box import DockerSSHBox
+from opendevin.runtime.tools import RuntimeTool
+
+SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
+
+
+def process_instance(
+    env_id: str,
+    metadata: dict,
+    eval_output_dir: str,
+    docker_sandbox: DockerSSHBox,
+    reset_logger: bool = True,
+):
+    """Run the agent on one MiniWoB++ task and build its evaluation record.
+
+    Args:
+        env_id: gym environment id of the task; ids used here start with
+            `browsergym/miniwob` and therefore contain a `/`.
+        metadata: run-level metadata stored unchanged in the record.
+        eval_output_dir: directory that holds `logs/` and the per-task
+            `goal.txt` / `rewards.json` files read back below.
+        docker_sandbox: sandbox passed to `main` for this run.
+        reset_logger: if True, send this instance's logs to a dedicated file.
+
+    Returns:
+        A dict with the instance id, instruction, metadata, history, metrics,
+        error and reward (`test_result`) of the run.
+
+    Raises:
+        ValueError: if `main` returns no final state.
+    """
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        # Set up logger. The '/' in env_id is replaced so that the log file lands
+        # directly in `logs/` instead of a sub-directory that does not exist.
+        log_file = os.path.join(
+            eval_output_dir, 'logs', f'instance_{env_id.replace("/", "_")}.log'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        # add back the console handler to print ONE line
+        logger.addHandler(get_console_handler())
+        logger.info(
+            f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setFormatter(
+            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        )
+        logger.addHandler(file_handler)
+    else:
+        logger.info(f'Starting evaluation for instance {env_id}.')
+
+    # Here's how you can run the agent (similar to the `main` function) and get the final task state
+    runtime_tools_config = {
+        RuntimeTool.BROWSER: {
+            'browsergym_eval': env_id,
+            'browsergym_eval_save_dir': eval_output_dir,
+        }
+    }
+
+    state: State = asyncio.run(
+        main(
+            'PLACEHOLDER_GOAL',
+            runtime_tools_config=runtime_tools_config,
+            sandbox=docker_sandbox,
+        )
+    )
+
+    # ======= Attempt to evaluate the agent's environment impact =======
+
+    if state is None:
+        raise ValueError('State should not be None.')
+
+    metrics = state.metrics.get() if state.metrics else None
+    # The goal and rewards are read from the directory named after the task
+    # part of env_id (the text after the '/').
+    browsergym_eval_dir = os.path.join(eval_output_dir, env_id.split('/')[1])
+    # read goal
+    with open(
+        os.path.join(browsergym_eval_dir, 'goal.txt'), 'r', encoding='utf-8'
+    ) as f:
+        instruction = f.read()
+    # read reward
+    with open(
+        os.path.join(browsergym_eval_dir, 'rewards.json'), 'r', encoding='utf-8'
+    ) as f:
+        rewards = json.load(f)
+        reward = max(rewards)
+
+    # Save the output
+    output = {
+        'instance_id': env_id,
+        'instruction': instruction,
+        'metadata': metadata,
+        'history': [
+            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
+        ],
+        'metrics': metrics,
+        'error': state.error if state.error else None,
+        'test_result': reward,
+    }
+
+    return output
+
+
+if __name__ == '__main__':
+    env_ids = [
+        id for id in gym.envs.registry.keys() if id.startswith('browsergym/miniwob')
+    ]
+
+    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
+    # for details of how to set `llm_config`
+    if args.llm_config:
+        specified_llm_config = get_llm_config_arg(args.llm_config)
+        if specified_llm_config:
+            config.llm = specified_llm_config
+    logger.info(f'Config for evaluation: {config}')
+
+    # TEST METADATA
+    agent_class = args.agent_cls
+    assert agent_class in SUPPORTED_AGENT_CLS, f'Unsupported agent class: {agent_class}'
+    model_name = config.llm.model.split('/')[-1]
+    max_iterations = args.max_iterations
+    eval_note = ''
+    if args.eval_note is not None:
+        eval_note += '_N_' + args.eval_note
+    eval_output_dir = os.path.join(
+        args.eval_output_dir,
+        'miniwob',
+        agent_class,
+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
+    )
+
+    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
+        parents=True, exist_ok=True
+    )
+    logger.info(f'Using evaluation output directory: {eval_output_dir}')
+
+    metadata = {
+        'agent_class': agent_class,
+        'model_name': model_name,
+        'max_iterations': max_iterations,
+        'eval_output_dir': eval_output_dir,
+        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        # get the commit id of current repo for reproducibility
+        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+        .decode('utf-8')
+        .strip(),
+    }
+    logger.info(f'Metadata: {metadata}')
+    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
+        json.dump(metadata, f)
+
+    # LIMIT EVALUATION
+    eval_n_limit = args.eval_n_limit
+    if eval_n_limit:
+        env_ids = env_ids[:eval_n_limit]
+        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
+
+    # OUTPUT FILE (records already in it mark instances that a rerun can skip)
+    output_file = os.path.join(eval_output_dir, 'output.jsonl')
+    logger.info(f'Writing evaluation output to {output_file}')
+    finished_instance_ids = set()
+    if os.path.exists(output_file):
+        with open(output_file, 'r') as f:
+            for line in f:
+                data = json.loads(line)
+                finished_instance_ids.add(data['instance_id'])
+        logger.warning(
+            f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
+        )
+
+    logger.info(
+        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
+    )
+
+    # =============================================
+    # filter out finished instances
+    new_env_ids = []
+    for env_id in env_ids:
+        if env_id in finished_instance_ids:
+            logger.info(f'Skipping instance {env_id} as it is already finished.')
+            continue
+        new_env_ids.append(env_id)
+
+    env_ids = new_env_ids
+    logger.info(
+        f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(env_ids)}'
+    )
+
+    # =============================================
+
+    docker_sandbox = DockerSSHBox()
+    # `with` closes the output file even when the run is interrupted (e.g. Ctrl-C)
+    with open(output_file, 'a') as output_fp:
+        for env_id in tqdm(env_ids):
+            try:
+                output = process_instance(
+                    env_id=env_id,
+                    metadata=metadata,
+                    eval_output_dir=eval_output_dir,
+                    docker_sandbox=docker_sandbox,
+                    reset_logger=False,
+                )
+                output_fp.write(json.dumps(output) + '\n')
+                output_fp.flush()
+            except Exception as e:
+                logger.error(f'Error processing instance {env_id}: {e}')
+
+    logger.info('Evaluation finished.')
--- a/evaluation/miniwob/scripts/run_infer.sh
+++ b/evaluation/miniwob/scripts/run_infer.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Run MiniWoB++ inference with an OpenDevin browsing agent.
+#
+# Usage: run_infer.sh [MODEL_CONFIG] [AGENT] [NOTE] [EVAL_LIMIT]
+
+# configure miniwob website: export MINIWOB_URL before running, or change the
+# fallback URL below to yours
+export MINIWOB_URL="${MINIWOB_URL:-file:///home/fangzhex/miniwob-plusplus/miniwob/html/miniwob/}"
+
+# configure browsing agent
+export USE_NAV="false"
+export USE_CONCISE_ANSWER="true"
+
+
+MODEL_CONFIG=$1
+AGENT=$2
+NOTE=$3
+EVAL_LIMIT=$4
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default BrowsingAgent"
+  AGENT="BrowsingAgent"
+fi
+
+# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
+# We need to track the version of Agent in the evaluation to make sure results are comparable
+AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+EVAL_NOTE="${AGENT_VERSION}_${NOTE}"
+
+# Build the command as an array so that each value is passed to Python as a
+# single argument, instead of being re-parsed by `eval` on a flat string.
+COMMAND=(
+  poetry run python evaluation/miniwob/run_infer.py
+  --agent-cls "$AGENT"
+  --max-iterations 10
+  --max-chars 10000000
+  --eval-note "$EVAL_NOTE"
+)
+
+# --llm-config needs a value, so it is only passed when MODEL_CONFIG is set;
+# without it run_infer.py keeps the LLM settings already loaded in `config`.
+if [ -n "$MODEL_CONFIG" ]; then
+  COMMAND+=(--llm-config "$MODEL_CONFIG")
+fi
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND+=(--eval-n-limit "$EVAL_LIMIT")
+fi
+
+# Run the command
+"${COMMAND[@]}"
--- a/evaluation/ml_bench/run_infer.py
+++ b/evaluation/ml_bench/run_infer.py
@@ -116,7 +116,7 @@ def process_instance(
        config.workspace_base = workspace_mount_path
        config.workspace_mount_path = workspace_mount_path

-        # Setup the logger properly, so you can run multi-processing to parallize the evaluation
+        # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
        if reset_logger:
            # Set up logger
            log_file = os.path.join(
@@ -305,7 +305,7 @@ if __name__ == '__main__':
        'max_iterations': max_iterations,
        'eval_output_dir': eval_output_dir,
        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
-        # get the commit id of current repo for reproduciblity
+        # get the commit id of current repo for reproducibility
        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
        .decode('utf-8')
        .strip(),
@@ -366,7 +366,7 @@ if __name__ == '__main__':
    try:
        with ProcessPoolExecutor(num_workers) as executor:
            futures = []
-            for _, instance in ml_bench.iterrows():
+            for _, instance in enumerate(new_instances):
                future = executor.submit(
                    process_instance,
                    instance,
--- a/evaluation/ml_bench/scripts/summarise_results.py
+++ b/evaluation/ml_bench/scripts/summarise_results.py
@@ -7,10 +7,17 @@ def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
    passed = []
    failed = []
    costs = []
+    instance_ids = set()
+    instances = []
    with open(res_file_path, 'r') as file:
        for line in file:
            data = json.loads(line.strip())
            success = data['metrics']['success']
+            if data['instance_id'] in instance_ids:
+                print(f'WARNING: Duplicate instance_id found: {data["instance_id"]}')
+                continue
+            instance_ids.add(data['instance_id'])
+            instances.append(data)
            if success:
                passed.append(
                    {
@@ -36,6 +43,12 @@ def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
                    }
                )
            costs.append(data['metrics']['accumulated_cost'])
+
+        # sort by instance_id
+        instances.sort(key=lambda x: x['instance_id'])
+        with open(res_file_path, 'w') as file:
+            for instance in instances:
+                file.write(json.dumps(instance) + '\n')
        return passed, failed, costs


--- a/evaluation/swe_bench/EVAL_PATCH.md
+++ b/evaluation/swe_bench/EVAL_PATCH.md
@@ -1,256 +0,0 @@
-# Evaluate Generated Patches
-
-## Evaluate patches generated by OpenDevin
-
-This section explains in detail how `evaluation/swe_bench/scripts/eval_infer.sh` described in [SWE-Bench README](./README.md) works.
-
-Use `scripts/setup/get_agent_report.sh` to evaluate patches generated by an OpenDevin agent. This script is available in the container at `/swe_util/get_agent_report.sh`.
-
- `output-file` (*required*): specify the path to your patch file inside the container
- `agent-name` (*required*): your agent name
- `dataset` (*required*): `swe-bench-test-lite` or `swe-bench-test`
- `num-processes`: defaults to 15.
- `experiment-name`: set to `${parent_folder_of_output_fils}_${current_folder_of_output_file}` if not given. E.g., `xxx/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v2_cd/output.jsonl` -> `CodeActAgent_gpt-4-1106-preview_maxiter_50_N_v2_cd` as experiment name.
- `merge_report`: if set, merges the evaluation report into the original output jsonl file and saves as a `.merged.jsonl` file.
-
-An example to run evaluation on the given example agent output (`./examples/example_agent_output.json`).
-
-```shell
-export MINICONDA3=/swe_util/miniforge3
-export OD_SWE_BENCH=/OD-SWE-bench
-export EVAL_DATA_DIR=/swe_util/eval_data
-cd /swe_util && ./get_agent_report.sh --output-file /swe_bench_output/example_agent_output.jsonl \
--agent-name CodeActAgent \
--dataset swe-bench-test-lite \
--experiment-name test_experiment \
--merge-report
-```
-
-You should get the following report:
-```shell
- no_generation: 4
- generated: 26
- with_logs: 26
- install_fail: 0
- reset_failed: 0
- no_apply: 0
- applied: 24
- test_errored: 0
- test_timeout: 0
- resolved: 6
-['sphinx-doc__sphinx-8721', 'sympy__sympy-14774', 'django__django-17087', 'sympy__sympy-20590', 'django__django-11583', 'sympy__sympy-21612']
-Report saved at /swe_util/eval_data/eval_logs/test_experiment/test_experiment_swe-bench-test-lite.report.json
-Agent output with report merged created at /swe_bench_output/example_agent_output.merged.jsonl
-```
-
-An additional `fine_grained_report` field will be added to each instance in the `example_agent_output.merged.jsonl`.
-
-```json
-"fine_grained_report": {
-  "gold_tests": {
-    "FAIL_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_default\"]",
-    "PASS_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_enabled\", \"tests/test_ext_viewcode.py::test_linkcode\", \"tests/test_ext_viewcode.py::test_local_source_files\"]"
-  },
-  "generated": true,
-  "with_logs": true,
-  "applied": true,
-  "test_errored": false,
-  "test_timeout": false,
-  "resolved": true,
-  "log_parse": {
-    "tests/test_ext_viewcode.py::test_viewcode_epub_default": "PASSED",
-    "tests/test_ext_viewcode.py::test_viewcode_epub_enabled": "PASSED",
-    "tests/test_ext_viewcode.py::test_linkcode": "PASSED",
-    "tests/test_ext_viewcode.py::test_local_source_files": "PASSED",
-    "tests/test_ext_viewcode.py::test_viewcode": "FAILED"
-  },
-  "eval_report": {
-    "FAIL_TO_PASS": {
-      "success": [
-        "tests/test_ext_viewcode.py::test_viewcode_epub_default"
-      ],
-      "failure": []
-    },
-    "PASS_TO_PASS": {
-      "success": [
-        "tests/test_ext_viewcode.py::test_viewcode_epub_enabled",
-        "tests/test_ext_viewcode.py::test_linkcode",
-        "tests/test_ext_viewcode.py::test_local_source_files"
-      ],
-      "failure": []
-    },
-    "FAIL_TO_FAIL": {
-      "success": [],
-      "failure": []
-    },
-    "PASS_TO_FAIL": {
-      "success": [],
-      "failure": []
-    }
-  }
-}
-```
-
-## If you already have patches not generated by OpenDevin
-
-### Prepare Output Files
-
-Ensure that model outputs are formatted correctly as below:
-```json
-[
-  {
-    "instance_id": "",
-    "model_patch": "",
-    "model_name_or_path": ""
-  },
-  ...
-]
-```
-An example can be found [here](./examples/example_model_output.json).
-
-Agent output should be adhere to the OpenDevin format. An example can be found [here](./examples/example_agent_output.json).
-
-### Set Up the Environment
-
-Before evaluating generated patches, you need to set up the Docker environment. Run the following command to instantiate the Docker container and mount the directory to your output files on the host:
-
-```shell
-docker run -it \
-v DIR_TO_YOUR_PATCH_FILES_ON_HOST:/swe_bench_output \
-ghcr.io/opendevin/eval-swe-bench:full-v1.2.1 /bin/bash
-```
-
-### Evaluate Model Generated Patches
-
-Use `scripts/get_model_report.sh` to evaluate patches generated by a model. This script is located in the container at `/swe_util/get_model_report.sh`.
-
- `output-file` (*required*): specify the path to your patch file inside the container
- `model-name` (*required*): this must match the `model_name_or_path` in your patch file
- `dataset` (*required*): `swe-bench-test-lite` or `swe-bench-test`
- `num-processes`: defaults to 15.
- `experiment-name`: set to `{model-name}__{dataset}` unless specified
-
-An example to run evaluation on the given example model output (`./examples/example_agent_output.json`).
-
-```shell
-export MINICONDA3=/swe_util/miniforge3
-export OD_SWE_BENCH=/swe_util/OD-SWE-bench
-export EVAL_DATA_DIR=/swe_util/eval_data
-cd /swe_util && ./get_model_report.sh --output-file /swe_bench_output/example_model_output.json \
--model-name opendevin \
--dataset swe-bench-test-lite
-```
-
-You should get the following report:
-```shell
- no_generation: 4
- generated: 26
- with_logs: 26
- install_fail: 0
- reset_failed: 0
- no_apply: 0
- applied: 24
- test_errored: 0
- test_timeout: 0
- resolved: 6
-['sphinx-doc__sphinx-8721', 'sympy__sympy-14774', 'django__django-17087', 'sympy__sympy-20590', 'django__django-11583', 'sympy__sympy-21612']
-Report saved at /swe_util/eval_data/eval_logs/opendevin__swe-bench-test-lite/example_model_output.report.json
-```
-Note: please ignore the `no_apply` in the report for now.
-
-The script will generate a `{experiment_name}` folder under `$EVAL_DATA_DIR/eval_logs`
-```shell
-├── $EVAL_DATA_DIR/eval_logs/$experiment_name
-│   ├── $experiment_name.json
-│   ├── $experiment_name.report.json
-│   ├── $model_name # eval log dir
-```
-
-### Evaluate Agent Generated Patches
-
-Use `scripts/setup/get_agent_report.sh` to evaluate patches generated by an agent. This script is available in the container at `/swe_util/get_agent_report.sh`.
-
- `output-file` (*required*): specify the path to your patch file inside the container
- `agent-name` (*required*): your agent name
- `dataset` (*required*): `swe-bench-test-lite` or `swe-bench-test`
- `num-processes`: defaults to 15.
- `experiment-name`: set to `${parent_folder_of_output_fils}_${current_folder_of_output_file}` if not given. E.g., `xxx/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v2_cd/output.jsonl` -> `CodeActAgent_gpt-4-1106-preview_maxiter_50_N_v2_cd` as experiment name.
- `merge_report`: if set, merges the evaluation report into the original output jsonl file and saves as a `.merged.jsonl` file.
-
-An example to run evaluation on the given example agent output (`./examples/example_agent_output.json`).
-
-```shell
-export MINICONDA3=/swe_util/miniforge3
-export OD_SWE_BENCH=/OD-SWE-bench
-export EVAL_DATA_DIR=/swe_util/eval_data
-cd /swe_util && ./get_agent_report.sh --output-file /swe_bench_output/example_agent_output.jsonl \
--agent-name CodeActAgent \
--dataset swe-bench-test-lite \
--experiment-name test_experiment \
--merge-report
-```
-
-You should get the following report:
-```shell
- no_generation: 4
- generated: 26
- with_logs: 26
- install_fail: 0
- reset_failed: 0
- no_apply: 0
- applied: 24
- test_errored: 0
- test_timeout: 0
- resolved: 6
-['sphinx-doc__sphinx-8721', 'sympy__sympy-14774', 'django__django-17087', 'sympy__sympy-20590', 'django__django-11583', 'sympy__sympy-21612']
-Report saved at /swe_util/eval_data/eval_logs/test_experiment/test_experiment_swe-bench-test-lite.report.json
-Agent output with report merged created at /swe_bench_output/example_agent_output.merged.jsonl
-```
-
-An additional `fine_grained_report` field will be added to each instance in the `example_agent_output.merged.jsonl`.
-
-```json
-"fine_grained_report": {
-  "gold_tests": {
-    "FAIL_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_default\"]",
-    "PASS_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_enabled\", \"tests/test_ext_viewcode.py::test_linkcode\", \"tests/test_ext_viewcode.py::test_local_source_files\"]"
-  },
-  "generated": true,
-  "with_logs": true,
-  "applied": true,
-  "test_errored": false,
-  "test_timeout": false,
-  "resolved": true,
-  "log_parse": {
-    "tests/test_ext_viewcode.py::test_viewcode_epub_default": "PASSED",
-    "tests/test_ext_viewcode.py::test_viewcode_epub_enabled": "PASSED",
-    "tests/test_ext_viewcode.py::test_linkcode": "PASSED",
-    "tests/test_ext_viewcode.py::test_local_source_files": "PASSED",
-    "tests/test_ext_viewcode.py::test_viewcode": "FAILED"
-  },
-  "eval_report": {
-    "FAIL_TO_PASS": {
-      "success": [
-        "tests/test_ext_viewcode.py::test_viewcode_epub_default"
-      ],
-      "failure": []
-    },
-    "PASS_TO_PASS": {
-      "success": [
-        "tests/test_ext_viewcode.py::test_viewcode_epub_enabled",
-        "tests/test_ext_viewcode.py::test_linkcode",
-        "tests/test_ext_viewcode.py::test_local_source_files"
-      ],
-      "failure": []
-    },
-    "FAIL_TO_FAIL": {
-      "success": [],
-      "failure": []
-    },
-    "PASS_TO_FAIL": {
-      "success": [],
-      "failure": []
-    }
-  }
-}
-```
--- a/evaluation/swe_bench/README.md
+++ b/evaluation/swe_bench/README.md
@@ -51,6 +51,7 @@ sandbox_timeout = 120
 use_host_network = false
 run_as_devin = false
 enable_auto_lint = true
+max_budget_per_task = 4 # 4 USD

 # TODO: Change these to the model you want to evaluate
 [eval_gpt4_1106_preview]
@@ -127,6 +128,12 @@ If you want to evaluate existing results, you should first run this to clone exi
 git clone https://huggingface.co/spaces/OpenDevin/evaluation evaluation/evaluation_outputs
 ```

+To prepare for SWE-bench evaluation, pull the evaluation docker images from [OpenDevin/SWE-bench-docker](https://github.com/OpenDevin/SWE-bench-docker) and download the SWE-bench data by running:
+
+```bash
+evaluation/swe_bench/scripts/eval/prep_eval.sh
+```
+
 Then you can run the following:

 ```bash
@@ -135,55 +142,14 @@ Then you can run the following:
 ./evaluation/swe_bench/scripts/eval_infer.sh evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl
 ```

-The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.merged.jsonl`.
+PS: You can also pass in a JSONL with [SWE-Bench format](https://github.com/princeton-nlp/SWE-bench/blob/main/tutorials/evaluation.md#-creating-predictions) to `./evaluation/swe_bench/scripts/eval_infer.sh`, where each line is a JSON of `{"model_patch": "XXX", "model_name_or_path": "YYY", "instance_id": "ZZZ"}`.

-It will contain an additional field `fine_grained_report` (see example below) compared to the `output.jsonl` from the previous inference stage.
+The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/` with the following files/directory (following format of [SWE-bench-docker](https://github.com/aorwall/SWE-bench-docker/tree/main/evaluations/SWE-bench_Lite_golden)):

-```json
-"fine_grained_report": {
-  "gold_tests": {
-    "FAIL_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_default\"]",
-    "PASS_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_enabled\", \"tests/test_ext_viewcode.py::test_linkcode\", \"tests/test_ext_viewcode.py::test_local_source_files\"]"
-  },
-  "generated": true,
-  "with_logs": true,
-  "applied": true,
-  "test_errored": false,
-  "test_timeout": false,
-  "resolved": true,
-  "log_parse": {
-    "tests/test_ext_viewcode.py::test_viewcode_epub_default": "PASSED",
-    "tests/test_ext_viewcode.py::test_viewcode_epub_enabled": "PASSED",
-    "tests/test_ext_viewcode.py::test_linkcode": "PASSED",
-    "tests/test_ext_viewcode.py::test_local_source_files": "PASSED",
-    "tests/test_ext_viewcode.py::test_viewcode": "FAILED"
-  },
-  "eval_report": {
-    "FAIL_TO_PASS": {
-      "success": [
-        "tests/test_ext_viewcode.py::test_viewcode_epub_default"
-      ],
-      "failure": []
-    },
-    "PASS_TO_PASS": {
-      "success": [
-        "tests/test_ext_viewcode.py::test_viewcode_epub_enabled",
-        "tests/test_ext_viewcode.py::test_linkcode",
-        "tests/test_ext_viewcode.py::test_local_source_files"
-      ],
-      "failure": []
-    },
-    "FAIL_TO_FAIL": {
-      "success": [],
-      "failure": []
-    },
-    "PASS_TO_FAIL": {
-      "success": [],
-      "failure": []
-    }
-  }
-}
-```
+- `README.md`: a report showing which instances passed, failed, etc.
+- `logs/`: a directory of test logs
+- `report.json`: a JSON file that contains keys like `"resolved"` pointing to instance IDs that are resolved by the agent.
+- `summary.json`: a JSON file that contains more fine-grained information for each test instance.

 Please refer to [EVAL_PATCH.md](./EVAL_PATCH.md) if you want to learn more about how to evaluate patches that are already generated (e.g., not by OpenDevin).

@@ -192,8 +158,8 @@ Please refer to [EVAL_PATCH.md](./EVAL_PATCH.md) if you want to learn more about
 If you just want to know the resolve rate, and/or a summary of what tests pass and what don't, you could run

 ```bash
-poetry run python ./evaluation/swe_bench/scripts/summarise_results.py <path_to_output_merged_jsonl_file>
-# e.g. poetry run python ./evaluation/swe_bench/scripts/summarise_results.py ./evaluation/evaluation_outputs/outputs/swe_bench_lite/CodeActSWEAgent/gpt-4o-2024-05-13_maxiter_50_N_v1.5-no-hint/output.merged.jsonl
+poetry run python ./evaluation/swe_bench/scripts/summarise_results.py <path_to_report_json_file>
+# e.g. poetry run python ./evaluation/swe_bench/scripts/summarise_results.py ./evaluation/evaluation_outputs/outputs/swe_bench_lite/CodeActSWEAgent/gpt-4o-2024-05-13_maxiter_50_N_v1.5-no-hint/report.json
 ```

 ## Submit your evaluation results
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -209,7 +209,7 @@ def process_instance(
    if reset_logger:
        # Set up logger
        log_file = os.path.join(
-            eval_output_dir, 'logs', f'instance_{instance.instance_id}.log'
+            eval_output_dir, 'infer_logs', f'instance_{instance.instance_id}.log'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
@@ -471,7 +471,7 @@ if __name__ == '__main__':
    def update_progress(future):
        pbar.update(1)
        output = future.result()
-        pbar.set_description(f'Instance {output["instance_id"]}')
+        pbar.set_description(f'Instance {output["instance_id"][:10]}')
        pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
        logger.info(
            f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]["result"]}'
--- a/evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh
+++ b/evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Clone OpenDevin/SWE-bench-docker into the evaluation workspace and run its
+# script that pulls the evaluation docker images.
+# Run from the repository root: the workspace path below is relative.
+
+# -p: do not fail when the workspace already exists from an earlier run
+mkdir -p evaluation/swe_bench/eval_workspace
+# stop if a directory change fails, so nothing runs in the wrong directory
+pushd evaluation/swe_bench/eval_workspace || exit 1
+# reuse an existing checkout instead of failing on a second `git clone`
+if [ ! -d SWE-bench-docker ]; then
+  git clone https://github.com/OpenDevin/SWE-bench-docker.git || exit 1
+fi
+cd SWE-bench-docker || exit 1
+scripts/pull_docker_images.sh docker/ xingyaoww
--- a/evaluation/swe_bench/scripts/eval/convert_od_output_to_swe_json.py
+++ b/evaluation/swe_bench/scripts/eval/convert_od_output_to_swe_json.py
@@ -0,0 +1,26 @@
+import argparse
+import os
+
+import pandas as pd
+
+parser = argparse.ArgumentParser()
+parser.add_argument('od_output_file', type=str)
+args = parser.parse_args()
+output_filepath = args.od_output_file.replace('.jsonl', '.swebench.jsonl')
+print(f'Converting {args.od_output_file} to {output_filepath}')
+
+od_format = pd.read_json(args.od_output_file, orient='records', lines=True)
+# model name is the folder name of od_output_file
+model_name = os.path.basename(os.path.dirname(args.od_output_file))
+
+
+def convert_row_to_swebench_format(row):
+    return {
+        'instance_id': row['instance_id'],
+        'model_patch': row['git_patch'].replace('\r\n', '\n'),
+        'model_name_or_path': model_name,
+    }
+
+
+swebench_format = od_format.apply(convert_row_to_swebench_format, axis=1)
+swebench_format.to_json(output_filepath, lines=True, orient='records')
--- a/evaluation/swe_bench/scripts/eval/download_swe_bench_data.py
+++ b/evaluation/swe_bench/scripts/eval/download_swe_bench_data.py
@@ -0,0 +1,34 @@
+import argparse
+import json
+
+import pandas as pd
+from datasets import load_dataset
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    'output_dir',
+    nargs='?',  # optional positional: without this argparse requires the value and ignores `default`
+    default='eval_data/instances',
+    help='Path to the directory to save the instances.',
+)
+args = parser.parse_args()
+
+dataset = load_dataset('princeton-nlp/SWE-bench')
+test = dataset['test'].to_pandas()
+test['FAIL_TO_PASS'] = test['FAIL_TO_PASS'].apply(json.loads)
+test['PASS_TO_PASS'] = test['PASS_TO_PASS'].apply(json.loads)
+test.to_json(f'{args.output_dir}/swe-bench-test.json', orient='records')
+
+dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
+test = dataset['test'].to_pandas()
+test['FAIL_TO_PASS'] = test['FAIL_TO_PASS'].apply(json.loads)
+test['PASS_TO_PASS'] = test['PASS_TO_PASS'].apply(json.loads)
+test.to_json(f'{args.output_dir}/swe-bench-lite-test.json', orient='records')
+
+dev = dataset['dev'].to_pandas()
+dev['FAIL_TO_PASS'] = dev['FAIL_TO_PASS'].apply(json.loads)
+dev['PASS_TO_PASS'] = dev['PASS_TO_PASS'].apply(json.loads)
+dev.to_json(f'{args.output_dir}/swe-bench-lite-dev.json', orient='records')
+
+all_data = pd.concat([test, dev])
+all_data.to_json(f'{args.output_dir}/swe-bench-lite-all.json', orient='records')
--- a/evaluation/swe_bench/scripts/eval/prep_eval.sh
+++ b/evaluation/swe_bench/scripts/eval/prep_eval.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+echo "Cloning OpenDevin SWE-Bench Fork"
+git clone https://github.com/OpenDevin/SWE-bench.git evaluation/swe_bench/eval_workspace/SWE-bench
+
+echo "Pulling all evaluation dockers..."
+evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh
+
+echo "Downloading SWE-bench data..."
+mkdir -p evaluation/swe_bench/eval_workspace/eval_data/instances
+poetry run python3 evaluation/swe_bench/scripts/eval/download_swe_bench_data.py evaluation/swe_bench/eval_workspace/eval_data/instances
--- a/evaluation/swe_bench/scripts/eval_infer.sh
+++ b/evaluation/swe_bench/scripts/eval_infer.sh
@@ -11,25 +11,91 @@ if [ ! -f $PROCESS_FILEPATH ]; then
    exit 1
 fi

+# If instance_id is empty, it means we want to eval on the whole $PROCESS_FILEPATH
+# otherwise, we want to eval on the instance_id
+INSTANCE_ID=$2
+echo "INSTANCE_ID: $INSTANCE_ID"
+
 PROCESS_FILEPATH=$(realpath $PROCESS_FILEPATH)
 FILE_DIR=$(dirname $PROCESS_FILEPATH)
 FILE_NAME=$(basename $PROCESS_FILEPATH)
-mkdir -p $FILE_DIR/eval_logs
+mkdir -p $FILE_DIR/logs
 mkdir -p $FILE_DIR/swe_bench_format

 echo "Evaluating $FILE_NAME @ $FILE_DIR"
-echo "Merged output file with fine-grained report will be saved to $FILE_DIR"
+DOCKERHUB_NAMESPACE="xingyaoww"
+SWEBENCH_TASKS=$(realpath evaluation/swe_bench/eval_workspace/eval_data/instances/swe-bench-lite-all.json)
+export SWEBENCH_DOCKER_FORK_DIR=$(realpath evaluation/swe_bench/eval_workspace/SWE-bench-docker)

-docker run --rm \
-    -v $FILE_DIR:/swe_bench_output \
-    -e MINICONDA3=/swe_util/miniforge3 \
-    -e OD_SWE_BENCH=/swe_util/OD-SWE-bench \
-    -e EVAL_DATA_DIR=/swe_util/eval_data \
-    -w /swe_util \
-    ghcr.io/opendevin/eval-swe-bench:full-v1.2.1 \
-    bash -c "./get_agent_report.sh --output-file /swe_bench_output/$FILE_NAME \
-    --agent-name CodeActAgent \
-    --dataset swe-bench-test-lite \
-    --experiment-name test_experiment \
-    --merge-report && cp -r /swe_util/eval_data/eval_logs/test_experiment/* /swe_bench_output/eval_logs \
-    && cp -r /swe_util/eval_data/outputs/* /swe_bench_output/swe_bench_format/"
+# ================================================
+# detect whether PROCESS_FILEPATH is in OD format or in SWE-bench format
+echo "=============================================================="
+echo "Detecting whether PROCESS_FILEPATH is in OD format or in SWE-bench format"
+echo "=============================================================="
+# SWE-bench format is a JSONL where every line has three fields: model_name_or_path, instance_id, and model_patch
+function is_swebench_format() {
+    # Usage: is_swebench_format [jsonl_file]   (defaults to $PROCESS_FILEPATH)
+    # Returns 0 when the first line of the file is a JSON object carrying
+    # model_name_or_path, instance_id and model_patch; returns 1 otherwise
+    # (including when the line is not valid JSON or jq itself fails).
+    local target_file="${1:-$PROCESS_FILEPATH}"
+    local first_line
+    read -r first_line < "$target_file"
+    if echo "$first_line" | jq -e '. | has("model_name_or_path") and has("instance_id") and has("model_patch")' > /dev/null; then
+        return 0
+    fi
+    return 1
+}
+# Call the function with the file path
+is_swebench_format "$PROCESS_FILEPATH"
+IS_SWEBENCH_FORMAT=$?
+# Use the result in an if-else statement
+if [ $IS_SWEBENCH_FORMAT -eq 0 ]; then
+    echo "The file IS in SWE-bench format."
+    SWEBENCH_FORMAT_JSONL=$PROCESS_FILEPATH
+else
+    echo "The file IS NOT in SWE-bench format."
+
+    # ==== Convert OD format to SWE-bench format ====
+    echo "Merged output file with fine-grained report will be saved to $FILE_DIR"
+    poetry run python3 evaluation/swe_bench/scripts/eval/convert_od_output_to_swe_json.py $PROCESS_FILEPATH
+    # replace .jsonl with .swebench.jsonl in filename
+    SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl}
+    echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL"
+    # assert that the file exists
+    if [ ! -f $SWEBENCH_FORMAT_JSONL ]; then
+        echo "Error: $SWEBENCH_FORMAT_JSONL does not exist. There is probably an error in the conversion process."
+        exit 1
+    fi
+    SWEBENCH_FORMAT_JSONL=$(realpath $SWEBENCH_FORMAT_JSONL)
+fi
+# ================================================
+
+echo "=============================================================="
+echo "Running SWE-bench evaluation"
+echo "=============================================================="
+
+if [ -z "$INSTANCE_ID" ]; then
+    echo "Running SWE-bench evaluation on the whole input file..."
+
+    poetry run python $SWEBENCH_DOCKER_FORK_DIR/run_evaluation.py \
+        --predictions_path $SWEBENCH_FORMAT_JSONL \
+        --log_dir $FILE_DIR/logs \
+        --swe_bench_tasks $SWEBENCH_TASKS \
+        --namespace $DOCKERHUB_NAMESPACE \
+        --timeout 1800
+
+else
+    echo "Running SWE-bench evaluation on the instance_id: $INSTANCE_ID"
+    poetry run python $SWEBENCH_DOCKER_FORK_DIR/run_single_instance.py \
+        --predictions_path $SWEBENCH_FORMAT_JSONL \
+        --swe_bench_tasks $SWEBENCH_TASKS \
+        --namespace $DOCKERHUB_NAMESPACE \
+        --instance_id $INSTANCE_ID
+fi
+
+poetry run python $SWEBENCH_DOCKER_FORK_DIR/generate_report.py \
+    --predictions_path $SWEBENCH_FORMAT_JSONL \
+    --log_dir $FILE_DIR/logs \
+    --output_dir $FILE_DIR \
+    --swe_bench_tasks $SWEBENCH_TASKS
--- a/evaluation/swe_bench/scripts/summarise_results.py
+++ b/evaluation/swe_bench/scripts/summarise_results.py
@@ -3,37 +3,37 @@ import sys


 def extract_test_results(json_file_path):
-    passed_tests = []
-    failed_tests = []
+    passed_instances = set()
+    all_instances = set()
+
    with open(json_file_path, 'r') as file:
-        for line in file:
-            data = json.loads(line.strip())
-            instance_id = data['instance_id']
-            resolved = False
-            if 'fine_grained_report' in data:
-                resolved = data['fine_grained_report']['resolved']
-            else:
-                resolved = data['test_result']['result']['resolved']
-            if resolved:
-                passed_tests.append(instance_id)
-            else:
-                failed_tests.append(instance_id)
-    return passed_tests, failed_tests
+        report = json.load(file)
+
+        # Add resolved instances
+        for instance_id in report['resolved']:
+            passed_instances.add(instance_id)
+
+        # Add all instances in the report
+        for _, instance_ids in report.items():
+            for instance_id in instance_ids:
+                all_instances.add(instance_id)
+
+    return passed_instances, all_instances


 if __name__ == '__main__':
    if len(sys.argv) != 2:
        print(
-            'Usage: poetry run python summarise_results.py <path_to_output_merged_jsonl_file>'
+            'Usage: poetry run python summarise_results.py <path_to_report_json_file>'
        )
        sys.exit(1)
    json_file_path = sys.argv[1]
-    passed_tests, failed_tests = extract_test_results(json_file_path)
-    succ_rate = len(passed_tests) / (len(passed_tests) + len(failed_tests))
+    passed_instances, all_instances = extract_test_results(json_file_path)
+    succ_rate = len(passed_instances) / len(all_instances)
    print(
-        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {succ_rate}'
+        f'\nPassed {len(passed_instances)} tests, total {len(all_instances)} tests, resolve rate = {succ_rate:.2%}'
    )
    print('PASSED TESTS:')
-    print(passed_tests)
+    print(sorted(list(passed_instances)))
    print('FAILED TESTS:')
-    print(failed_tests)
+    print(sorted(list(all_instances - passed_instances)))
--- a/evaluation/toolqa/README.md
+++ b/evaluation/toolqa/README.md
@@ -0,0 +1,45 @@
+# ToolQA Evaluation with OpenDevin
+
+This folder contains an evaluation harness we built on top of the original [ToolQA](https://github.com/night-chen/ToolQA) ([paper](https://arxiv.org/pdf/2306.13304)).
+
+## Setup Environment
+
+Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
+
+## Configure OpenDevin and your LLM
+
+Run `make setup-config` to set up the `config.toml` file if it does not exist at the root of the workspace.
+
+## Run Inference on ToolQA Instances
+
+Make sure your Docker daemon is running, then run this bash script:
+
+```bash
+bash evaluation/toolqa/scripts/run_infer.sh [model_config] [agent] [eval_limit] [dataset] [hardness] [wolfram_alpha_appid]
+```
+
+where `model_config` is mandatory, while all other arguments are optional.
+
+`model_config`, e.g. `llm`, is the config group name for your
+LLM settings, as defined in your `config.toml`.
+
+`agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
+to `CodeActAgent`.
+
+`eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances.
+By default, the script evaluates 1 instance.
+
+`dataset`, the dataset from ToolQA to evaluate from. You could choose from `agenda`, `airbnb`, `coffee`, `dblp`, `flight`, `gsm8k`, `scirex`, `yelp` for dataset. The default is `flight`.
+
+`hardness`, the hardness to evaluate. You could choose from `easy` and `hard`. The default is `easy`.
+
+`wolfram_alpha_appid` is an optional argument. When given `wolfram_alpha_appid`, the agent will be able to access Wolfram Alpha's APIs.
+
+Note: in order to use `eval_limit`, you must also set `agent`; in order to use `dataset`, you must also set `eval_limit`; in order to use `hardness`, you must also set `dataset`.
+
+Let's say you'd like to run 10 instances using `llm` and CodeActAgent on `coffee` `easy` test,
+then your command would be:
+
+```bash
+bash evaluation/toolqa/scripts/run_infer.sh llm CodeActAgent 10 coffee easy
+```
--- a/evaluation/toolqa/run_infer.py
+++ b/evaluation/toolqa/run_infer.py
@@ -0,0 +1,353 @@
+import asyncio
+import json
+import logging
+import multiprocessing as mp
+import os
+import pathlib
+import subprocess
+import time
+from concurrent.futures import ProcessPoolExecutor
+
+from tqdm import tqdm
+from utils import download_data, download_tools, encode_question, eval_answer, get_data
+
+from opendevin.controller.state.state import State
+from opendevin.core.config import config, get_llm_config_arg, get_parser
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import main
+from opendevin.events.action import MessageAction
+from opendevin.events.serialization.event import event_to_dict
+
+
+def cleanup():
+    print('Cleaning up child processes...')
+    for process in mp.active_children():
+        print(f'Terminating child process: {process.name}')
+        process.terminate()
+        process.join()
+
+
+def codeact_user_response(state: State) -> str:
+    msg = (
+        'Please continue working on the task on whatever approach you think is suitable.\n'
+        'When you think you finished the task, respond with `Finish[answer]` where you include your answer in `[]`\n'
+        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
+    )
+    if state.history:
+        user_msgs = [
+            action
+            for action, _ in state.history
+            if isinstance(action, MessageAction) and action.source == 'user'
+        ]
+        if len(user_msgs) >= 2:
+            # let the agent know that it can give up when it has tried 3 times
+            return (
+                msg
+                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
+            )
+    return msg
+
+
+def monologue_user_response(state: State) -> str:
+    raise NotImplementedError('MonologueAgent should never ask for user responses.')
+
+
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+    'MonologueAgent': monologue_user_response,
+}
+
+AGENT_CLS_TO_INST_SUFFIX = {
+    'CodeActAgent': 'When you think you have completed the request, please run the following command: <execute_bash> exit </execute_bash>.\n'
+}
+
+
+def process_instance(task, agent_class, metadata, reset_logger: bool = True):
+    # create process-specific workspace dir
+    # we will create a workspace directory for EACH process
+    # so that different agent don't interfere with each other.
+    workspace_mount_path = config.workspace_mount_path
+    pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    eval_output_dir = metadata['eval_output_dir']
+    qid = task['qid']
+    question = task['question']
+    answer = task['answer']
+    if reset_logger:
+        # Set up logger
+        log_file = os.path.join(eval_output_dir, 'logs', f'instance_{qid}.log')
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        # add back the console handler to print ONE line
+        logger.addHandler(get_console_handler())
+        logger.info(
+            f'Starting evaluation for instance {qid}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setFormatter(
+            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        )
+        logger.addHandler(file_handler)
+    logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+
+    # Prepare instruction
+    instruction = encode_question(question)
+    instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
+    # NOTE: You can actually set slightly different instruction for different agents
+    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+    # logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
+
+    # Here's how you can run the agent (similar to the `main` function) and get the final task state
+    state: State = asyncio.run(
+        main(
+            instruction,
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
+        )
+    )
+    # ======= Attempt to evaluate the agent's edits =======
+    # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+
+    if state is None:
+        raise ValueError('State should not be None.')
+
+    model_answer_raw = ''
+    for act, _ in reversed(state.history):
+        if isinstance(act, MessageAction) and act.source == 'agent':
+            model_answer_raw = act.content
+            break
+    # attempt to parse model_answer
+    correct = eval_answer(str(model_answer_raw), str(answer))
+    metrics = state.metrics.get() if state.metrics else None
+    logger.info(f'Final message: {model_answer_raw} | Correctness: {correct}')
+    # Save the output
+    output = {
+        'qid': qid,
+        'text': model_answer_raw,
+        'correct': correct,
+        'answer_id': 'None',
+        'model_id': metadata['model_name'],
+        'metadata': metadata,
+        'history': [
+            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
+        ],
+        'metrics': metrics,
+        'error': state.error if state and state.error else None,
+    }
+    return output
+
+
+if __name__ == '__main__':
+    parser = get_parser()
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        help='Which dataset to evaluate from ToolQA. ToolQA contains 8 datasets, namely agenda, airbnb, coffee, dblp, flight, gsm8k, scirex, yelp. For example, the default is --dataset flight.',
+        default='flight',
+    )
+    parser.add_argument(
+        '--hardness',
+        type=str,
+        help='Which level of difficulty to evaluate from ToolQA. ToolQA contains 2 levels of hardness, namely easy and hard. For example, the default is --hardness easy.',
+        default='easy',
+    )
+    parser.add_argument(
+        '--wolfram_alpha_appid',
+        type=str,
+        help='wolfram alpha appid to use for wolfram alpha related tests',
+        default='YOUR_WOLFRAMALPHA_APPID',
+    )
+    args, _ = parser.parse_known_args()
+    if args.directory:
+        config.workspace_base = os.path.abspath(args.directory)
+        print(f'Setting workspace base to {config.workspace_base}')
+    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
+    # for details of how to set `llm_config`
+    if args.llm_config:
+        specified_llm_config = get_llm_config_arg(args.llm_config)
+        if specified_llm_config:
+            config.llm = specified_llm_config
+    logger.info(f'Config for evaluation: {config}')
+    agent_class = args.agent_cls
+    assert (
+        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
+    ), f'Unsupported agent class: {agent_class}'
+    model_name = config.llm.model.split('/')[-1]
+    max_iterations = args.max_iterations
+    eval_note = ''
+    if args.eval_note is not None:
+        eval_note += '_N_' + args.eval_note
+    eval_output_dir = os.path.join(
+        args.eval_output_dir,
+        'toolqa',
+        agent_class,
+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
+    )
+    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
+        parents=True, exist_ok=True
+    )
+    logger.info(f'Using evaluation output directory: {eval_output_dir}')
+
+    dataset = ''
+    hardness = ''
+    # Datasets published by ToolQA; keep in sync with the names in the error message below.
+    dataset_choices = [
+        'agenda',
+        'airbnb',
+        'coffee',
+        'dblp',
+        'flight',
+        'gsm8k',
+        'scirex',
+        'yelp',
+    ]
+    if args.dataset in dataset_choices:
+        dataset = args.dataset
+    else:
+        raise ValueError(
+            'Please choose from agenda, airbnb, coffee, dblp, flight, gsm8k, scirex, yelp for dataset.'
+        )
+    if args.hardness == 'easy':
+        hardness = 'easy'
+    elif args.hardness == 'hard':
+        hardness = 'hard'
+    else:
+        raise ValueError('Please choose from easy and hard for hardness.')
+
+    logger.info(f'Evaluating ToolQA {dataset} {hardness} test')
+    # workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
+    workspace_mount_path = config.workspace_mount_path
+    pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+    toolqa_test = get_data(dataset, hardness)
+    toolqa_data_path = download_data(workspace_mount_path)
+    toolqa_tool_path = download_tools(workspace_mount_path, args.wolfram_alpha_appid)
+
+    # TEST METADATA
+    metadata = {
+        'dataset': dataset,
+        'hardness': hardness,
+        'agent_class': agent_class,
+        'model_name': model_name,
+        'max_iterations': max_iterations,
+        'eval_output_dir': eval_output_dir,
+        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        # get the commit id of current repo for reproduciblity
+        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+        .decode('utf-8')
+        .strip(),
+    }
+    logger.info(f'Metadata: {metadata}')
+    with open(
+        os.path.join(eval_output_dir, f'metadata_{dataset}_{hardness}.json'), 'w'
+    ) as f:
+        json.dump(metadata, f)
+    # LIMIT EVALUATION
+    eval_n_limit = args.eval_n_limit
+    if eval_n_limit:
+        toolqa_test = toolqa_test[:eval_n_limit]
+        logger.info(
+            f'Limiting evaluation to a total of first {eval_n_limit} instances.'
+        )
+    output_file = os.path.join(
+        eval_output_dir, f'output_{model_name}_{dataset}_{hardness}.jsonl'
+    )
+    logger.info(f'Writing evaluation output to {output_file}')
+    finished_task_ids = set()
+    if os.path.exists(output_file):
+        with open(output_file, 'r') as f:
+            for line in f:
+                task = json.loads(line)
+                finished_task_ids.add(task['qid'])
+        logger.warning(
+            f'Output file {output_file} already exists. Loaded {len(finished_task_ids)} finished instances.'
+        )
+    output_fp = open(output_file, 'a')
+    logger.info(
+        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
+    )
+    # =============================================
+    # filter out finished instances
+    new_toolqa_test = []
+    for task in toolqa_test:
+        qid = task['qid']
+        if qid in finished_task_ids:
+            logger.info(f'Skipping instance {qid} as it is already finished.')
+            continue
+        new_toolqa_test.append(task)
+    finished_task_number = len(finished_task_ids)
+    toolqa_test = new_toolqa_test
+    logger.info(
+        f'Finished instances: {finished_task_number}, Remaining instances: {len(toolqa_test)}'
+    )
+
+    # =============================================
+    pbar = tqdm(total=len(toolqa_test))
+
+    # This function tracks the progress AND write the output to a JSONL file
+    def update_progress(future):
+        pbar.update(1)
+        output = future.result()
+        pbar.set_description(f'Instance {output["qid"]}')
+        pbar.set_postfix_str(f'Test Result: {output["correct"]}')
+        logger.info(
+            f'Finished evaluation for instance {output["qid"]}: {output["correct"]}'
+        )
+        output_fp.write(json.dumps(output) + '\n')
+        output_fp.flush()
+        finished_task_ids.add(output['qid'])
+
+    # This sets the multi-processing
+    num_workers = args.eval_num_workers
+    logger.info(f'Using {num_workers} workers for evaluation.')
+    try:
+        with ProcessPoolExecutor(num_workers) as executor:
+            futures = []
+            # This is how we perform multi-processing
+            for task in toolqa_test:
+                try:
+                    future = executor.submit(
+                        process_instance,
+                        task,
+                        agent_class,
+                        metadata,
+                        reset_logger=bool(num_workers > 1),
+                    )
+                    future.add_done_callback(update_progress)
+                    futures.append(future)
+                except Exception:
+                    continue
+            # Wait for all futures to complete
+            for future in futures:
+                try:
+                    future.result()
+                except Exception:
+                    continue
+    except KeyboardInterrupt:
+        logger.info('KeyboardInterrupt received. Cleaning up...')
+        cleanup()
+    output_fp.close()
+    total_correct = 0
+    output = []
+    with open(output_file, 'r') as f:
+        for line in f:
+            data = json.loads(line)
+            output.append(data)
+            if data['qid'] in finished_task_ids and str(data['correct']).lower() == 'true':
+                total_correct += 1
+    # sort all output by question_id, then rewrite the file in that order
+    output = sorted(output, key=lambda x: x['qid'])
+    with open(output_file, 'w') as f:
+        for dat in output:
+            f.write(json.dumps(dat) + '\n')
+    # total = instances scheduled in this run + instances loaded from a previous run
+    total = len(toolqa_test) + finished_task_number
+    accuracy = total_correct / total if total else 0.0  # avoid ZeroDivisionError on an empty run
+    logger.info(
+        f'Evaluation finished for {dataset}-{hardness}. Total: {total}; Correct: {total_correct}; Accuracy: {accuracy}'
+    )
--- a/evaluation/toolqa/scripts/run_infer.sh
+++ b/evaluation/toolqa/scripts/run_infer.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+MODEL_CONFIG=$1
+AGENT=$2
+EVAL_LIMIT=$3
+DATASET=$4
+HARDNESS=$5
+WOLFRAM_APPID=$6
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+if [ -z "$DATASET" ]; then
+  DATASET="flight"
+  echo "Dataset not specified, use default $DATASET"
+fi
+
+if [ -z "$HARDNESS" ]; then
+  HARDNESS="easy"
+  echo "Hardness not specified, use default $HARDNESS"
+fi
+
+if [ -z "$WOLFRAM_APPID" ]; then
+  WOLFRAM_APPID="YOUR_WOLFRAMALPHA_APPID"
+  echo "WOLFRAM_APPID not specified"
+fi
+
+# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
+# We need to track the version of Agent in the evaluation to make sure results are comparable
+AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+echo "DATASET: $DATASET"
+echo "HARDNESS: $HARDNESS"
+echo "WOLFRAM_APPID: $WOLFRAM_APPID"
+# Build the inference command; the eval note carries only the agent version (dataset and hardness are passed as their own flags).
+COMMAND="poetry run python evaluation/toolqa/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --max-iterations 30 \
+  --dataset $DATASET \
+  --hardness $HARDNESS \
+  --wolfram_alpha_appid $WOLFRAM_APPID \
+  --data-split validation \
+  --max-chars 10000000 \
+  --eval-num-workers 1 \
+  --eval-note ${AGENT_VERSION}"
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+# Run the command
+eval $COMMAND
--- a/evaluation/toolqa/utils.py
+++ b/evaluation/toolqa/utils.py
@@ -0,0 +1,112 @@
+import json
+import os
+import re
+import string
+import zipfile
+
+import gdown
+import requests
+
+
+def download_data(dir):
+    data_path = os.path.join(dir, 'data/external_corpus')
+    if os.path.exists(data_path):
+        return data_path
+    url = 'https://drive.google.com/uc?id=1zRbHzPW2x4dDcfmphBWlan8cxUCRNmqk'
+    zip_path = os.path.join(dir, 'data.zip')
+    gdown.download(url, zip_path, quiet=False)
+    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+        zip_ref.extractall(os.path.join(dir, 'data'))
+    if os.path.exists(zip_path):
+        os.remove(zip_path)
+    return data_path
+
+
+def download_tools(dir, wolfram_alpha_appid='YOUR_WOLFRAMALPHA_APPID'):
+    """Download the ToolQA tool scripts into `<dir>/tools` and fill in their placeholders.
+
+    `wolfram_alpha_appid` replaces the app-id placeholder in `calculator.py`; the
+    `/<YOUR_OWN_PATH>/ToolQA/` prefix is stripped from the retriever and DB scripts.
+    Returns the tools directory path, whether it already existed or was just created.
+    """
+    tool_path = os.path.join(dir, 'tools')
+    if os.path.exists(tool_path):
+        return tool_path
+    os.mkdir(tool_path)
+    tools = [
+        'code/sql_interpreter.py',
+        'graph/graphtools.py',
+        'math/calculator.py',
+        'table/mysql_db_create.py',
+        'table/tabtools.py',
+        'text/agenda_retriever.py',
+        'text/scirex_retriever.py',
+    ]
+    for tool in tools:
+        url = f'https://raw.githubusercontent.com/night-chen/ToolQA/main/benchmark/ReAct/code/tools/{tool}'
+        response = requests.get(url)
+        output_file = os.path.join(tool_path, tool.split('/')[1])
+        with open(output_file, 'wb') as f:
+            f.write(response.content)
+    # (script name, placeholder, replacement) substitutions applied to the downloaded files
+    patches = [
+        ('calculator.py', 'YOUR_WOLFRAMALPHA_APPID', wolfram_alpha_appid),
+        ('agenda_retriever.py', '/<YOUR_OWN_PATH>/ToolQA/', ''),
+        ('mysql_db_create.py', '/<YOUR_OWN_PATH>/ToolQA/', ''),
+        ('scirex_retriever.py', '/<YOUR_OWN_PATH>/ToolQA/', ''),
+    ]
+    for file_name, placeholder, replacement in patches:
+        patched_file = os.path.join(tool_path, file_name)
+        with open(patched_file, 'r') as f:
+            content = f.read()
+        with open(patched_file, 'w') as f:
+            f.write(content.replace(placeholder, replacement))
+    return tool_path
+
+
+def get_data(dataset, hardness):
+    """Fetch the ToolQA question file for `dataset`/`hardness` and return one parsed JSON object per line.
+
+    Raises `requests.HTTPError` on a 4xx/5xx response rather than silently returning an empty list.
+    """
+    url = f'https://raw.githubusercontent.com/night-chen/ToolQA/main/data/questions/{hardness}/{dataset}-{hardness}.jsonl'
+    response = requests.get(url)
+    response.raise_for_status()
+    return [json.loads(line) for line in response.text.splitlines()]
+
+
+REACT_INSTRUCTION = """Use tools in the tools directory to solve the task: {question}
+You could use all tools which are under the tools/ directory and all the data under the data/ directory.
+When you think you finished the task, respond with `Finish[answer]` where you include your answer in `[]`.
+IMPORTANT: Make sure that in your final answer, you should not print any additional text/instructions other than the actual answer, which should be a word or a simple phrase.
+"""
+
+
+def encode_question(question):
+    return REACT_INSTRUCTION.format(question=question)
+
+
+# imported from https://github.com/night-chen/ToolQA/tree/main/benchmark/ReAct/code/agents_chatgpt.py
+def normalize_answer(s):
+    """Canonicalise an answer string so predictions and references can be compared.
+
+    Steps, in order: lowercase, strip ASCII punctuation, blank out the words
+    a/an/the/usd, then collapse every whitespace run to a single space.
+    """
+    # 1) case-fold
+    lowered = s.lower()
+    # 2) drop punctuation characters
+    punctuation = set(string.punctuation)
+    stripped = ''.join(ch for ch in lowered if ch not in punctuation)
+    # 3) replace articles and the currency word with a space
+    without_articles = re.sub(r'\b(a|an|the|usd)\b', ' ', stripped)
+    # 4) normalise whitespace
+    return ' '.join(without_articles.split())
+
+
+def eval_answer(pred, answer):
+    """Compare `pred` with `answer` after normalisation, using the `Finish[...]` payload when present."""
+    finish = re.search(r'Finish\[(.*?)\]', pred)
+    # Fall back to the whole prediction when it contains no Finish[...] wrapper.
+    candidate = finish.group(1) if finish else pred
+    return normalize_answer(candidate) == normalize_answer(answer)
--- a/evaluation/webarena/README.md
+++ b/evaluation/webarena/README.md
@@ -0,0 +1,91 @@
+# WebArena Evaluation with OpenDevin Browsing Agents
+
+This folder contains the evaluation harness for the [WebArena](https://github.com/web-arena-x/webarena) benchmark, powered by [BrowserGym](https://github.com/ServiceNow/BrowserGym), which makes it easy to evaluate how well a browsing-capable agent performs on realistic web browsing tasks.
+
+## Setup OpenDevin Environment
+
+Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
+
+## Configure OpenDevin and your LLM
+
+Create a `config.toml` file if it does not exist at the root of the workspace.
+
+Add the following configurations:
+
+```toml
+[core]
+max_iterations = 100
+cache_dir = "/tmp/cache"
+sandbox_container_image = "ghcr.io/opendevin/sandbox:latest"
+sandbox_type = "ssh"
+ssh_hostname = "localhost"
+sandbox_timeout = 120
+
+# TODO: Change these to the model you want to evaluate
+[eval_gpt4_1106_preview]
+model = "gpt-4-1106-preview"
+api_key = "XXX"
+temperature = 0.0
+
+[eval_some_openai_compatible_model]
+model = "openai/MODEL_NAME"
+base_url = "https://OPENAI_COMPATIBLE_URL/v1"
+api_key = "XXX"
+temperature = 0.0
+```
+
+## Setup WebArena Environment
+WebArena requires you to set up websites containing pre-populated content that is accessible via URL to the machine running the OpenDevin agents.
+Follow [this document](https://github.com/web-arena-x/webarena/blob/main/environment_docker/README.md) to set up your own WebArena environment through local servers or AWS EC2 instances.
+Take note of the base URL of the machine where the environment is installed.
+
+## Setup Environment Variables of WebArena Websites
+
+Create a script `webarena_env.sh` under `evaluation/webarena/scripts` with the following:
+
+```bash
+export BASE_URL=<YOUR_SERVER_URL_HERE>
+export SHOPPING="$BASE_URL:7770/"
+export SHOPPING_ADMIN="$BASE_URL:7780/admin"
+export REDDIT="$BASE_URL:9999"
+export GITLAB="$BASE_URL:8023"
+export WIKIPEDIA="$BASE_URL:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing"
+export MAP="$BASE_URL:3000"
+export HOMEPAGE="$BASE_URL:4399"
+export OPENAI_API_KEY="yourkey" # this key is required for some WebArena validators that utilize LLMs
+```
+
+## Test if your environment works
+
+Open the WebArena website URLs above in a browser and check that they load correctly.
+If you cannot access a website, make sure the firewall allows public access to the aforementioned ports on your server.
+Check the network security policy if you are using an AWS machine.
+Follow the WebArena environment setup guide carefully, and make sure the URL fields are populated with the correct base URL of your server.
+
+## Run Evaluation
+
+```sh
+bash evaluation/webarena/scripts/run_infer.sh
+```
+
+Results will be in `evaluation/evaluation_outputs/outputs/webarena/`
+
+To calculate the success rate, run:
+
+```sh
+poetry run python evaluation/webarena/get_success_rate.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl
+```
+
+## Submit your evaluation results
+
+You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
+
+
+## BrowsingAgent V1.0 result
+
+Tested on BrowsingAgent V1.0
+
+WebArena, 812 tasks (high cost, single run due to fixed task), max step 15
+
+- GPT4o: 0.1478
+- GPT3.5: 0.0517
--- a/evaluation/webarena/init.py
+++ b/evaluation/webarena/init.py
--- a/evaluation/webarena/get_success_rate.py
+++ b/evaluation/webarena/get_success_rate.py
@@ -0,0 +1,33 @@
+import argparse
+import json
+
+import browsergym.webarena  # noqa F401 register webarena tasks as gym environments
+import gymnasium as gym
+
+if __name__ == '__main__':
+    # Parse the command line only when run as a script, so that importing this
+    # module has no side effects.
+    parser = argparse.ArgumentParser(description='Calculate average reward.')
+    parser.add_argument('output_path', type=str, help='path to output.jsonl')
+    args = parser.parse_args()
+
+    # Every WebArena task registered as a gym environment.
+    env_ids = [
+        env_id
+        for env_id in gym.envs.registry.keys()
+        if env_id.startswith('browsergym/webarena')
+    ]
+    total_num = len(env_ids)
+    print('Total number of tasks: ', total_num)
+    total_reward = 0
+    total_cost = 0
+    actual_num = 0
+    with open(args.output_path, 'r') as f:
+        for line in f:
+            if not line.strip():
+                # Ignore blank lines instead of failing in json.loads.
+                continue
+            data = json.loads(line)
+            actual_num += 1
+            # `metrics` is written as null when the run produced no metrics.
+            metrics = data.get('metrics') or {}
+            total_cost += metrics.get('accumulated_cost', 0)
+            total_reward += data['test_result']
+
+    # The success rate is taken over ALL registered tasks, so tasks missing
+    # from the output file count as failures.
+    avg_reward = total_reward / total_num if total_num else 0.0
+    print('Success Rate: ', avg_reward)
+
+    # The average cost is taken over the finished tasks only; guard against an
+    # empty output file.
+    avg_cost = total_cost / actual_num if actual_num else 0.0
+    print('Avg Cost: ', avg_cost)
+    print('Actual number of tasks finished: ', actual_num)
--- a/evaluation/webarena/run_infer.py
+++ b/evaluation/webarena/run_infer.py
@@ -0,0 +1,214 @@
+import asyncio
+import json
+import logging
+import os
+import pathlib
+import subprocess
+import time
+
+import browsergym.webarena  # noqa F401 register webarena tasks as gym environments
+import gymnasium as gym
+from tqdm import tqdm
+
+from opendevin.controller.state.state import State
+from opendevin.core.config import args, config, get_llm_config_arg
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import main
+from opendevin.events.serialization.event import event_to_dict
+from opendevin.runtime.docker.ssh_box import DockerSSHBox
+from opendevin.runtime.tools import RuntimeTool
+
+SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
+
+
+def process_instance(
+    env_id: str,
+    metadata: dict,
+    eval_output_dir: str,
+    docker_sandbox: DockerSSHBox,
+    reset_logger: bool = True,
+):
+    """Run the agent on one WebArena task and build its evaluation record.
+
+    Args:
+        env_id: Gym environment id of the task; it must contain a '/' because
+            the part after the first '/' names the task's result directory.
+        metadata: Run-level metadata copied verbatim into the record.
+        eval_output_dir: Directory that holds the `logs` folder and the
+            per-task result directories.
+        docker_sandbox: Sandbox handed to `main` for the agent run.
+        reset_logger: When True, redirect the shared logger to a per-instance
+            log file (after printing one hint line on the console).
+
+    Returns:
+        A dict with the keys `instance_id`, `instruction`, `metadata`,
+        `history`, `metrics`, `error` and `test_result`.
+
+    Raises:
+        ValueError: If `main` returns no state.
+    """
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        # env_id contains '/', which would otherwise turn the log file name
+        # into a path inside a sub-directory that is never created; flatten it
+        # so the file lands directly in the `logs` directory.
+        safe_env_id = env_id.replace('/', '_')
+        log_file = os.path.join(eval_output_dir, 'logs', f'instance_{safe_env_id}.log')
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        # add back the console handler to print ONE line
+        logger.addHandler(get_console_handler())
+        logger.info(
+            f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setFormatter(
+            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        )
+        logger.addHandler(file_handler)
+    else:
+        logger.info(f'Starting evaluation for instance {env_id}.')
+
+    # Here's how you can run the agent (similar to the `main` function) and get the final task state
+    runtime_tools_config = {
+        RuntimeTool.BROWSER: {
+            'browsergym_eval': env_id,
+            'browsergym_eval_save_dir': eval_output_dir,
+        }
+    }
+
+    # The task text passed here is only a placeholder; the real goal is read
+    # back from goal.txt below.
+    state: State = asyncio.run(
+        main(
+            'PLACEHOLDER_GOAL',
+            runtime_tools_config=runtime_tools_config,
+            sandbox=docker_sandbox,
+        )
+    )
+
+    # ======= Attempt to evaluate the agent's environment impact =======
+
+    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+
+    if state is None:
+        raise ValueError('State should not be None.')
+
+    metrics = state.metrics.get() if state.metrics else None
+    browsergym_eval_dir = os.path.join(eval_output_dir, env_id.split('/')[1])
+    # read goal
+    with open(
+        os.path.join(browsergym_eval_dir, 'goal.txt'), 'r', encoding='utf-8'
+    ) as f:
+        instruction = f.read()
+    # read reward
+    with open(
+        os.path.join(browsergym_eval_dir, 'rewards.json'), 'r', encoding='utf-8'
+    ) as f:
+        rewards = json.load(f)
+        # The best reward recorded for the task is reported as its result.
+        reward = max(rewards)
+
+    # Save the output
+    output = {
+        'instance_id': env_id,
+        'instruction': instruction,
+        'metadata': metadata,
+        'history': [
+            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
+        ],
+        'metrics': metrics,
+        # `state` is known to be non-None here (checked above).
+        'error': state.error if state.error else None,
+        'test_result': reward,
+    }
+
+    return output
+
+
+if __name__ == '__main__':
+    # Entry point: evaluate the configured agent on every registered WebArena
+    # task, appending one JSON record per task to output.jsonl. Tasks already
+    # present in that file are skipped, so an interrupted run can be resumed.
+    env_ids = [
+        registered_id
+        for registered_id in gym.envs.registry.keys()
+        if registered_id.startswith('browsergym/webarena')
+    ]
+
+    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
+    # for details of how to set `llm_config`
+    if args.llm_config:
+        specified_llm_config = get_llm_config_arg(args.llm_config)
+        if specified_llm_config:
+            config.llm = specified_llm_config
+    logger.info(f'Config for evaluation: {config}')
+
+    # TEST METADATA
+    agent_class = args.agent_cls
+    assert agent_class in SUPPORTED_AGENT_CLS, f'Unsupported agent class: {agent_class}'
+    model_name = config.llm.model.split('/')[-1]
+    max_iterations = args.max_iterations
+    eval_note = ''
+    if args.eval_note is not None:
+        eval_note += '_N_' + args.eval_note
+    eval_output_dir = os.path.join(
+        args.eval_output_dir,
+        'webarena',
+        agent_class,
+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
+    )
+
+    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
+        parents=True, exist_ok=True
+    )
+    logger.info(f'Using evaluation output directory: {eval_output_dir}')
+
+    metadata = {
+        'agent_class': agent_class,
+        'model_name': model_name,
+        'max_iterations': max_iterations,
+        'eval_output_dir': eval_output_dir,
+        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        # get the commit id of current repo for reproducibility
+        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+        .decode('utf-8')
+        .strip(),
+    }
+    logger.info(f'Metadata: {metadata}')
+    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
+        json.dump(metadata, f)
+
+    # LIMIT EVALUATION
+    # NOTE: the limit is applied before finished instances are filtered out.
+    eval_n_limit = args.eval_n_limit
+    if eval_n_limit:
+        env_ids = env_ids[:eval_n_limit]
+        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
+
+    # OUTPUT FILE
+    output_file = os.path.join(eval_output_dir, 'output.jsonl')
+    logger.info(f'Writing evaluation output to {output_file}')
+    finished_instance_ids = set()
+    if os.path.exists(output_file):
+        with open(output_file, 'r') as f:
+            for line in f:
+                # Tolerate blank lines when resuming from an existing file.
+                if not line.strip():
+                    continue
+                data = json.loads(line)
+                finished_instance_ids.add(data['instance_id'])
+        logger.warning(
+            f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
+        )
+
+    logger.info(
+        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
+    )
+
+    # =============================================
+    # filter out finished instances
+    new_env_ids = []
+    for idx in env_ids:
+        if idx in finished_instance_ids:
+            logger.info(f'Skipping instance {idx} as it is already finished.')
+            continue
+        new_env_ids.append(idx)
+
+    env_ids = new_env_ids
+    logger.info(
+        f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(env_ids)}'
+    )
+
+    # =============================================
+
+    docker_sandbox = DockerSSHBox()
+    # The context manager closes the output file even when the run is
+    # interrupted (e.g. by Ctrl-C), which the previous bare open()/close() did not.
+    with open(output_file, 'a') as output_fp:
+        for env_id in tqdm(env_ids):
+            try:
+                output = process_instance(
+                    env_id=env_id,
+                    metadata=metadata,
+                    eval_output_dir=eval_output_dir,
+                    docker_sandbox=docker_sandbox,
+                    reset_logger=False,
+                )
+                output_fp.write(json.dumps(output) + '\n')
+                # Flush after every task so finished work survives a crash.
+                output_fp.flush()
+            except Exception as e:
+                # One failing task must not abort the whole evaluation.
+                logger.error(f'Error processing instance {env_id}: {e}')
+
+    logger.info('Evaluation finished.')
--- a/evaluation/webarena/scripts/run_infer.sh
+++ b/evaluation/webarena/scripts/run_infer.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# Usage: run_infer.sh [MODEL_CONFIG] [AGENT] [EVAL_LIMIT]
+#   MODEL_CONFIG  name of the LLM config section to use (optional)
+#   AGENT         agent class name (default: BrowsingAgent)
+#   EVAL_LIMIT    evaluate only the first N tasks (optional)
+
+# configure webarena websites and environment
+source evaluation/webarena/scripts/webarena_env.sh
+
+# configure browsing agent
+export USE_NAV="false"
+export USE_CONCISE_ANSWER="true"
+
+MODEL_CONFIG=$1
+AGENT=$2
+EVAL_LIMIT=$3
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default BrowsingAgent"
+  AGENT="BrowsingAgent"
+fi
+
+# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
+# We need to track the version of Agent in the evaluation to make sure results are comparable
+AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+EVAL_NOTE="$AGENT_VERSION"
+
+# Build the command as an array so every argument is passed verbatim,
+# without the word-splitting and re-parsing that `eval` on a string performs.
+COMMAND=(
+  poetry run python evaluation/webarena/run_infer.py
+  --agent-cls "$AGENT"
+  --max-iterations 15
+  --max-chars 10000000
+  --eval-note "$EVAL_NOTE"
+)
+
+# Only pass --llm-config when a model config was given; an empty value used to
+# produce "--llm-config --max-iterations", which the argument parser rejects.
+if [ -n "$MODEL_CONFIG" ]; then
+  COMMAND+=(--llm-config "$MODEL_CONFIG")
+fi
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND+=(--eval-n-limit "$EVAL_LIMIT")
+fi
+
+# Run the command
+"${COMMAND[@]}"
--- a/frontend/package-lock.json
+++ b/frontend/package-lock.json
@@ -12,7 +12,7 @@
        "@nextui-org/react": "^2.4.1",
        "@react-types/shared": "^3.23.1",
        "@reduxjs/toolkit": "^2.2.5",
-        "@vitejs/plugin-react": "^4.3.0",
+        "@vitejs/plugin-react": "^4.3.1",
        "@xterm/addon-fit": "^0.10.0",
        "@xterm/xterm": "^5.4.0",
        "clsx": "^2.1.1",
@@ -34,7 +34,7 @@
        "react-router-dom": "^6.23.1",
        "react-syntax-highlighter": "^15.5.0",
        "tailwind-merge": "^2.3.0",
-        "vite": "^5.2.12",
+        "vite": "^5.2.13",
        "web-vitals": "^3.5.2"
      },
      "devDependencies": {
@@ -64,7 +64,7 @@
        "lint-staged": "^15.2.5",
        "postcss": "^8.4.38",
        "prettier": "^3.3.1",
-        "tailwindcss": "^3.4.2",
+        "tailwindcss": "^3.4.4",
        "typescript": "^5.4.5",
        "vite-tsconfig-paths": "^4.3.2",
        "vitest": "^1.6.0"
@@ -6777,9 +6777,9 @@
      "integrity": "sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ=="
    },
    "node_modules/@vitejs/plugin-react": {
-      "version": "4.3.0",
-      "resolved": "https://registry.npmjs.org/@vitejs/plugin-react/-/plugin-react-4.3.0.tgz",
-      "integrity": "sha512-KcEbMsn4Dpk+LIbHMj7gDPRKaTMStxxWRkRmxsg/jVdFdJCZWt1SchZcf0M4t8lIKdwwMsEyzhrcOXRrDPtOBw==",
+      "version": "4.3.1",
+      "resolved": "https://registry.npmjs.org/@vitejs/plugin-react/-/plugin-react-4.3.1.tgz",
+      "integrity": "sha512-m/V2syj5CuVnaxcUJOQRel/Wr31FFXRFlnOoq1TVtkCxsY5veGMTEmpWHndrhB2U8ScHtCQB1e+4hWYExQc6Lg==",
      "dependencies": {
        "@babel/core": "^7.24.5",
        "@babel/plugin-transform-react-jsx-self": "^7.24.5",
@@ -15673,9 +15673,9 @@
      }
    },
    "node_modules/tailwindcss": {
-      "version": "3.4.3",
-      "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.3.tgz",
-      "integrity": "sha512-U7sxQk/n397Bmx4JHbJx/iSOOv5G+II3f1kpLpY2QeUv5DcPdcTsYLlusZfq1NthHS1c1cZoyFmmkex1rzke0A==",
+      "version": "3.4.4",
+      "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.4.tgz",
+      "integrity": "sha512-ZoyXOdJjISB7/BcLTR6SEsLgKtDStYyYZVLsUtWChO4Ps20CBad7lfJKVDiejocV4ME1hLmyY0WJE3hSDcmQ2A==",
      "dependencies": {
        "@alloc/quick-lru": "^5.2.0",
        "arg": "^5.0.2",
@@ -16433,9 +16433,9 @@
      "integrity": "sha512-dqId9J8K/vGi5Zr7oo212BGii5m3q5Hxlkwy3WpYuKPklmBEvsbMYYyLxAQpSffdLl/gdW0XUpKWFvYmyoWCoQ=="
    },
    "node_modules/vite": {
-      "version": "5.2.12",
-      "resolved": "https://registry.npmjs.org/vite/-/vite-5.2.12.tgz",
-      "integrity": "sha512-/gC8GxzxMK5ntBwb48pR32GGhENnjtY30G4A0jemunsBkiEZFw60s8InGpN8gkhHEkjnRK1aSAxeQgwvFhUHAA==",
+      "version": "5.2.13",
+      "resolved": "https://registry.npmjs.org/vite/-/vite-5.2.13.tgz",
+      "integrity": "sha512-SSq1noJfY9pR3I1TUENL3rQYDQCFqgD+lM6fTRAM8Nv6Lsg5hDLaXkjETVeBt+7vZBCMoibD+6IWnT2mJ+Zb/A==",
      "dependencies": {
        "esbuild": "^0.20.1",
        "postcss": "^8.4.38",
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -11,7 +11,7 @@
    "@nextui-org/react": "^2.4.1",
    "@react-types/shared": "^3.23.1",
    "@reduxjs/toolkit": "^2.2.5",
-    "@vitejs/plugin-react": "^4.3.0",
+    "@vitejs/plugin-react": "^4.3.1",
    "@xterm/addon-fit": "^0.10.0",
    "@xterm/xterm": "^5.4.0",
    "clsx": "^2.1.1",
@@ -33,7 +33,7 @@
    "react-router-dom": "^6.23.1",
    "react-syntax-highlighter": "^15.5.0",
    "tailwind-merge": "^2.3.0",
-    "vite": "^5.2.12",
+    "vite": "^5.2.13",
    "web-vitals": "^3.5.2"
  },
  "scripts": {
@@ -85,7 +85,7 @@
    "lint-staged": "^15.2.5",
    "postcss": "^8.4.38",
    "prettier": "^3.3.1",
-    "tailwindcss": "^3.4.2",
+    "tailwindcss": "^3.4.4",
    "typescript": "^5.4.5",
    "vite-tsconfig-paths": "^4.3.2",
    "vitest": "^1.6.0"
--- a/frontend/src/components/chat/ChatInput.test.tsx
+++ b/frontend/src/components/chat/ChatInput.test.tsx
@@ -3,11 +3,11 @@ import userEvent from "@testing-library/user-event";
 import { act, render, fireEvent } from "@testing-library/react";
 import ChatInput from "./ChatInput";

-afterEach(() => {
-  vi.clearAllMocks();
-});
-
 describe("ChatInput", () => {
+  afterEach(() => {
+    vi.clearAllMocks();
+  });
+
  const onSendMessage = vi.fn();

  it("should render a textarea", () => {
--- a/frontend/src/components/chat/ChatInterface.tsx
+++ b/frontend/src/components/chat/ChatInterface.tsx
@@ -3,7 +3,6 @@ import { useDispatch, useSelector } from "react-redux";
 import { IoMdChatbubbles } from "react-icons/io";
 import { RiArrowRightDoubleLine } from "react-icons/ri";
 import { useTranslation } from "react-i18next";
-import { twMerge } from "tailwind-merge";
 import { VscArrowDown } from "react-icons/vsc";
 import { FaRegThumbsDown, FaRegThumbsUp } from "react-icons/fa";
 import { useDisclosure } from "@nextui-org/react";
@@ -125,14 +124,6 @@ function ChatInterface() {
        >
          <Chat messages={messages} />
        </div>
-        {/* Fade between messages and input */}
-        <div
-          className={twMerge(
-            "absolute bottom-0 left-0 right-0",
-            curAgentState === AgentState.AWAITING_USER_INPUT ? "h-10" : "h-4",
-            "bg-gradient-to-b from-transparent to-neutral-800",
-          )}
-        />
      </div>

      <div className="relative">
--- a/frontend/src/components/file-explorer/FileExplorer.tsx
+++ b/frontend/src/components/file-explorer/FileExplorer.tsx
@@ -167,17 +167,19 @@ function FileExplorer() {
          isHidden ? "min-w-[48px]" : "min-w-[228px]",
        )}
      >
-        <div className="flex p-2 items-center justify-between relative">
+        <div className="flex flex-col p-2 relative">
+          <div className="flex items-center justify-end mb-8">
+            <ExplorerActions
+              isHidden={isHidden}
+              toggleHidden={() => setIsHidden((prev) => !prev)}
+              onRefresh={refreshWorkspace}
+              onUpload={selectFileInput}
+            />
+          </div>
+
          <div style={{ display: isHidden ? "none" : "block" }}>
            <ExplorerTree files={files} defaultOpen />
          </div>
-
-          <ExplorerActions
-            isHidden={isHidden}
-            toggleHidden={() => setIsHidden((prev) => !prev)}
-            onRefresh={refreshWorkspace}
-            onUpload={selectFileInput}
-          />
        </div>
        <input
          data-testid="file-input"
--- a/frontend/src/components/modals/base-modal/BaseModal.tsx
+++ b/frontend/src/components/modals/base-modal/BaseModal.tsx
@@ -39,7 +39,7 @@ function BaseModal({
      size="sm"
      className="bg-neutral-900 rounded-lg"
    >
-      <ModalContent className="max-w-[24rem] p-[40px]">
+      <ModalContent className="max-w-[30rem] p-[40px]">
        {(closeModal) => (
          <>
            <ModalHeader className="flex flex-col p-0">
--- a/frontend/src/components/modals/settings/SettingsModal.test.tsx
+++ b/frontend/src/components/modals/settings/SettingsModal.test.tsx
@@ -26,6 +26,7 @@ vi.mock("#/services/settings", async (importOriginal) => ({
    LLM_MODEL: "gpt-4o",
    AGENT: "MonologueAgent",
    LANGUAGE: "en",
+    LLM_API_KEY: "sk-...",
  }),
  getDefaultSettings: vi.fn().mockReturnValue({
    LLM_MODEL: "gpt-4o",
@@ -103,7 +104,7 @@ describe("SettingsModal", () => {
  describe("onHandleSave", () => {
    const initialSettings: Settings = {
      LLM_MODEL: "gpt-4o",
-      AGENT: "MonologueAgent",
+      AGENT: "CodeActAgent",
      LANGUAGE: "en",
      LLM_API_KEY: "sk-...",
    };
@@ -139,7 +140,6 @@ describe("SettingsModal", () => {
      expect(saveSettings).toHaveBeenCalledWith({
        ...initialSettings,
        LLM_MODEL: "model3",
-        LLM_API_KEY: "", // reset after model change
      });
    });

@@ -196,7 +196,7 @@ describe("SettingsModal", () => {
        await userEvent.click(saveButton);
      });

-      expect(toastSpy).toHaveBeenCalledTimes(2);
+      expect(toastSpy).toHaveBeenCalledTimes(3);
    });

    it("should change the language", async () => {
--- a/frontend/src/components/modals/settings/SettingsModal.tsx
+++ b/frontend/src/components/modals/settings/SettingsModal.tsx
@@ -66,12 +66,9 @@ function SettingsModal({ isOpen, onOpenChange }: SettingsProps) {
  }, []);

  const handleModelChange = (model: string) => {
-    // Needs to also reset the API key.
-    const key = localStorage.getItem(`API_KEY_${model}`);
    setSettings((prev) => ({
      ...prev,
      LLM_MODEL: model,
-      LLM_API_KEY: key || "",
    }));
  };

--- a/frontend/src/services/actions.ts
+++ b/frontend/src/services/actions.ts
@@ -1,4 +1,3 @@
-import { setScreenshotSrc, setUrl } from "#/state/browserSlice";
 import { addAssistantMessage, addUserMessage } from "#/state/chatSlice";
 import { setCode, setActiveFilepath } from "#/state/codeSlice";
 import { appendInput } from "#/state/commandSlice";
@@ -13,18 +12,14 @@ import { getRootTask } from "./taskService";

 const messageActions = {
  [ActionType.BROWSE]: (message: ActionMessage) => {
-    const { url, screenshotSrc } = message.args;
-    store.dispatch(setUrl(url));
-    store.dispatch(setScreenshotSrc(screenshotSrc));
    store.dispatch(addAssistantMessage(message.message));
  },
  [ActionType.BROWSE_INTERACTIVE]: (message: ActionMessage) => {
    if (message.args.thought) {
      store.dispatch(addAssistantMessage(message.args.thought));
+    } else {
+      store.dispatch(addAssistantMessage(message.message));
    }
-    const { url, screenshotSrc } = message.args;
-    store.dispatch(setUrl(url));
-    store.dispatch(setScreenshotSrc(screenshotSrc));
  },
  [ActionType.WRITE]: (message: ActionMessage) => {
    const { path, content } = message.args;
--- a/opendevin/controller/action_parser.py
+++ b/opendevin/controller/action_parser.py
@@ -0,0 +1,76 @@
+from abc import ABC, abstractmethod
+
+from opendevin.events.action import Action
+
+
+class ResponseParser(ABC):
+    """
+    Abstract interface for a response parser: it turns the response returned by
+    the LLM into an Action.
+    """
+
+    def __init__(self):
+        # The order of the items in self.action_parsers matters.
+        self.action_parsers = []
+
+    @abstractmethod
+    def parse(self, response: str) -> Action:
+        """
+        Parse the LLM response all the way to an action.
+
+        Parameters:
+        - response (str): The response from the LLM.
+
+        Returns:
+        - action (Action): The action parsed from the response.
+        """
+
+    @abstractmethod
+    def parse_response(self, response) -> str:
+        """
+        Extract the action string from the LLM response.
+
+        Parameters:
+        - response: The response from the LLM (its type is not constrained by this interface).
+
+        Returns:
+        - action_str (str): The action string extracted from the response.
+        """
+
+    @abstractmethod
+    def parse_action(self, action_str: str) -> Action:
+        """
+        Convert an action string into an action.
+
+        Parameters:
+        - action_str (str): The action string to convert.
+
+        Returns:
+        - action (Action): The action parsed from the action string.
+        """
+
+
+class ActionParser(ABC):
+    """
+    Abstract interface for an action parser: it turns one kind of action string
+    produced by the LLM into an Action.
+    """
+
+    @abstractmethod
+    def check_condition(self, action_str: str) -> bool:
+        """
+        Tell whether this parser is able to parse the given action string.
+        """
+
+    @abstractmethod
+    def parse(self, action_str: str) -> Action:
+        """
+        Build the action described by the action string taken from the LLM response.
+        """
--- a/opendevin/controller/agent_controller.py
+++ b/opendevin/controller/agent_controller.py
@@ -19,6 +19,7 @@ from opendevin.events.action import (
    AddTaskAction,
    AgentDelegateAction,
    AgentFinishAction,
+    AgentRejectAction,
    ChangeAgentStateAction,
    MessageAction,
    ModifyTaskAction,
@@ -164,6 +165,9 @@ class AgentController:
        elif isinstance(event, AgentFinishAction):
            self.state.outputs = event.outputs  # type: ignore[attr-defined]
            await self.set_agent_state_to(AgentState.FINISHED)
+        elif isinstance(event, AgentRejectAction):
+            self.state.outputs = event.outputs  # type: ignore[attr-defined]
+            await self.set_agent_state_to(AgentState.REJECTED)
        elif isinstance(event, Observation):
            if self._pending_action and self._pending_action.id == event.cause:
                await self.add_history(self._pending_action, event)
@@ -252,7 +256,7 @@ class AgentController:
                # propagate error state until an agent or user can handle it
                await self.set_agent_state_to(AgentState.ERROR)
                return
-            delegate_done = delegate_state == AgentState.FINISHED
+            delegate_done = delegate_state in (AgentState.FINISHED, AgentState.REJECTED)
            if delegate_done:
                logger.info(
                    f'[Agent Controller {self.id}] Delegate agent has finished execution'
--- a/opendevin/core/config.py
+++ b/opendevin/core/config.py
@@ -149,7 +149,6 @@ class AppConfig(metaclass=Singleton):
        disable_color: Whether to disable color. For terminals that don't support color.
        sandbox_user_id: The user ID for the sandbox.
        sandbox_timeout: The timeout for the sandbox.
-        github_token: The GitHub token.
        debug: Whether to enable debugging.
        enable_auto_lint: Whether to enable auto linting. This is False by default, for regular runs of the app. For evaluation, please set this to True.
    """
@@ -183,7 +182,6 @@ class AppConfig(metaclass=Singleton):
    persist_sandbox: bool = False
    ssh_port: int = 63710
    ssh_password: str | None = None
-    github_token: str | None = None
    jwt_secret: str = uuid.uuid4().hex
    debug: bool = False
    enable_auto_lint: bool = (
--- a/opendevin/core/main.py
+++ b/opendevin/core/main.py
@@ -1,4 +1,5 @@
 import asyncio
+import os
 import sys
 from typing import Callable, Optional, Type

@@ -34,6 +35,7 @@ async def main(
    exit_on_message: bool = False,
    fake_user_response_fn: Optional[Callable[[Optional[State]], str]] = None,
    sandbox: Optional[Sandbox] = None,
+    runtime_tools_config: Optional[dict] = None,
 ) -> Optional[State]:
    """Main coroutine to run the agent controller with task input flexibility.
    It's only used when you launch opendevin backend directly via cmdline.
@@ -92,7 +94,21 @@ async def main(
    )
    runtime = ServerRuntime(event_stream=event_stream, sandbox=sandbox)
    runtime.init_sandbox_plugins(controller.agent.sandbox_plugins)
-    runtime.init_runtime_tools(controller.agent.runtime_tools, is_async=False)
+    runtime.init_runtime_tools(
+        controller.agent.runtime_tools,
+        is_async=False,
+        runtime_tools_config=runtime_tools_config,
+    )
+
+    # browser eval specific
+    # TODO: move to a better place
+    if runtime.browser and runtime.browser.eval_dir:
+        logger.info(f'Evaluation directory: {runtime.browser.eval_dir}')
+        with open(
+            os.path.join(runtime.browser.eval_dir, 'goal.txt'), 'r', encoding='utf-8'
+        ) as f:
+            task = f.read()
+            logger.info(f'Dynamic Eval task: {task}')

    await event_stream.add_event(MessageAction(content=task), EventSource.USER)

@@ -111,6 +127,7 @@ async def main(
    event_stream.subscribe(EventStreamSubscriber.MAIN, on_event)
    while controller.get_agent_state() not in [
        AgentState.FINISHED,
+        AgentState.REJECTED,
        AgentState.ERROR,
        AgentState.PAUSED,
        AgentState.STOPPED,
--- a/opendevin/core/schema/agent.py
+++ b/opendevin/core/schema/agent.py
@@ -30,6 +30,10 @@ class AgentState(str, Enum):
    """The agent is finished with the current task.
    """

+    REJECTED = 'rejected'
+    """The agent rejects the task.
+    """
+
    ERROR = 'error'
    """An error occurred during the task.
    """
--- a/opendevin/core/schema/config.py
+++ b/opendevin/core/schema/config.py
@@ -41,5 +41,4 @@ class ConfigType(str, Enum):
    USE_HOST_NETWORK = 'USE_HOST_NETWORK'
    SSH_HOSTNAME = 'SSH_HOSTNAME'
    DISABLE_COLOR = 'DISABLE_COLOR'
-    GITHUB_TOKEN = 'GITHUB_TOKEN'
    DEBUG = 'DEBUG'
--- a/opendevin/events/action/browse.py
+++ b/opendevin/events/action/browse.py
@@ -29,6 +29,7 @@ class BrowseURLAction(Action):
 class BrowseInteractiveAction(Action):
    browser_actions: str
    thought: str = ''
+    browsergym_send_msg_to_user: str = ''
    action: str = ActionType.BROWSE_INTERACTIVE
    runnable: ClassVar[bool] = True

--- a/opendevin/events/observation/browse.py
+++ b/opendevin/events/observation/browse.py
@@ -21,6 +21,9 @@ class BrowserOutputObservation(Observation):
    active_page_index: int = -1
    dom_object: dict = field(default_factory=dict, repr=False)  # don't show in repr
    axtree_object: dict = field(default_factory=dict, repr=False)  # don't show in repr
+    extra_element_properties: dict = field(
+        default_factory=dict, repr=False
+    )  # don't show in repr
    last_browser_action: str = ''
    last_browser_action_error: str = ''
    focused_element_bid: str = ''
--- a/opendevin/events/serialization/event.py
+++ b/opendevin/events/serialization/event.py
@@ -20,6 +20,7 @@ DELETE_FROM_MEMORY_EXTRAS = {
    'last_browser_action',
    'last_browser_action_error',
    'focused_element_bid',
+    'extra_element_properties',
 }


--- a/opendevin/llm/llm.py
+++ b/opendevin/llm/llm.py
@@ -61,6 +61,7 @@ class LLM:
        max_output_tokens=None,
        llm_config=None,
        metrics=None,
+        cost_metric_supported=True,
    ):
        """
        Initializes the LLM. If LLMConfig is passed, its values will be the fallback.
@@ -81,6 +82,7 @@ class LLM:
            llm_timeout (int, optional): The maximum time to wait for a response in seconds. Defaults to LLM_TIMEOUT.
            llm_temperature (float, optional): The temperature for LLM sampling. Defaults to LLM_TEMPERATURE.
            metrics (Metrics, optional): The metrics object to use. Defaults to None.
+            cost_metric_supported (bool, optional): Whether the cost metric is supported. Defaults to True.
        """
        if llm_config is None:
            llm_config = config.llm
@@ -127,6 +129,7 @@ class LLM:
        self.llm_timeout = llm_timeout
        self.custom_llm_provider = custom_llm_provider
        self.metrics = metrics
+        self.cost_metric_supported = cost_metric_supported

        # litellm actually uses base Exception here for unknown model
        self.model_info = None
@@ -226,11 +229,12 @@ class LLM:
            cur_cost = self.completion_cost(response)
        except Exception:
            cur_cost = 0
-        logger.info(
-            'Cost: %.2f USD | Accumulated Cost: %.2f USD',
-            cur_cost,
-            self.metrics.accumulated_cost,
-        )
+        if self.cost_metric_supported:
+            logger.info(
+                'Cost: %.2f USD | Accumulated Cost: %.2f USD',
+                cur_cost,
+                self.metrics.accumulated_cost,
+            )

    def get_token_count(self, messages):
        """
@@ -271,6 +275,9 @@ class LLM:
        Returns:
            number: The cost of the response.
        """
+        if not self.cost_metric_supported:
+            return 0.0
+
        extra_kwargs = {}
        if (
            config.llm.input_cost_per_token is not None
@@ -291,6 +298,7 @@ class LLM:
                self.metrics.add_cost(cost)
                return cost
            except Exception:
+                self.cost_metric_supported = False
                logger.warning('Cost calculation not supported for this model.')
        return 0.0

--- a/opendevin/runtime/browser/browser_env.py
+++ b/opendevin/runtime/browser/browser_env.py
@@ -1,7 +1,9 @@
 import atexit
 import base64
 import io
+import json
 import multiprocessing
+import os
 import threading
 import time
 import uuid
@@ -18,15 +20,27 @@ from opendevin.core.logger import opendevin_logger as logger


 class BrowserEnv:
-    def __init__(self, is_async: bool = True):
-        self.html_text_converter = html2text.HTML2Text()
-        # ignore links and images
-        self.html_text_converter.ignore_links = False
-        self.html_text_converter.ignore_images = True
-        # use alt text for images
-        self.html_text_converter.images_to_alt = True
-        # disable auto text wrapping
-        self.html_text_converter.body_width = 0
+    def __init__(
+        self,
+        is_async: bool = True,
+        browsergym_eval: str = '',
+        browsergym_eval_save_dir: str = '',
+    ):
+        self.html_text_converter = self.get_html_text_converter()
+        self.eval_mode = False
+        self.eval_dir = ''
+        # EVAL only: browsergym_eval and browsergym_eval_save_dir must be provided for evaluation
+        self.browsergym_eval = browsergym_eval
+        self.browsergym_eval_save_dir = browsergym_eval_save_dir
+        if self.browsergym_eval:
+            assert (
+                self.browsergym_eval_save_dir
+            ), 'browsergym_eval_save_dir must be provided for evaluation.'
+            self.eval_mode = True
+            self.eval_dir = os.path.join(
+                self.browsergym_eval_save_dir, self.browsergym_eval.split('/')[1]
+            )
+            os.makedirs(self.eval_dir, exist_ok=True)
        # Initialize browser environment process
        multiprocessing.set_start_method('spawn', force=True)
        self.browser_side, self.agent_side = multiprocessing.Pipe()
@@ -39,6 +53,17 @@ class BrowserEnv:
            self.init_browser()
        atexit.register(self.close)

+    def get_html_text_converter(self):
+        html_text_converter = html2text.HTML2Text()
+        # keep links, but ignore images
+        html_text_converter.ignore_links = False
+        html_text_converter.ignore_images = True
+        # use alt text for images
+        html_text_converter.images_to_alt = True
+        # disable auto text wrapping
+        html_text_converter.body_width = 0
+        return html_text_converter
+
    def init_browser(self):
        logger.info('Starting browser env...')
        self.process.start()
@@ -47,14 +72,26 @@ class BrowserEnv:
            raise BrowserInitException('Failed to start browser environment.')

    def browser_process(self):
-        env = gym.make(
-            'browsergym/openended',
-            task_kwargs={'start_url': 'about:blank'},
-            wait_for_user_message=False,
-            headless=True,
-            disable_env_checker=True,
-        )
+        if self.eval_mode:
+            logger.info('Creating browser env for evaluation purpose.')
+            env = gym.make(self.browsergym_eval)
+        else:
+            env = gym.make(
+                'browsergym/openended',
+                task_kwargs={'start_url': 'about:blank', 'goal': 'PLACEHOLDER_GOAL'},
+                wait_for_user_message=False,
+                headless=True,
+                disable_env_checker=True,
+            )
        obs, info = env.reset()
+        # EVAL only: save the goal into file for evaluation
+        if self.eval_mode:
+            rewards = []  # store rewards if in eval mode
+            logger.info(obs['goal'])
+            with open(
+                os.path.join(self.eval_dir, 'goal.txt'), 'w', encoding='utf-8'
+            ) as f:
+                f.write(obs['goal'])
        logger.info('Browser env started.')
        while True:
            try:
@@ -70,6 +107,15 @@ class BrowserEnv:
                        continue
                    action = action_data['action']
                    obs, reward, terminated, truncated, info = env.step(action)
+                    # EVAL only: save the rewards into file for evaluation
+                    if self.eval_mode:
+                        rewards.append(reward)
+                        with open(
+                            os.path.join(self.eval_dir, 'rewards.json'),
+                            'w',
+                            encoding='utf-8',
+                        ) as f:
+                            f.write(json.dumps(rewards))
                    # add text content of the page
                    html_str = flatten_dom_to_str(obs['dom_object'])
                    obs['text_content'] = self.html_text_converter.handle(html_str)
@@ -86,7 +132,7 @@ class BrowserEnv:
                    pass
                return

-    def step(self, action_str: str, timeout: float = 10) -> dict:
+    def step(self, action_str: str, timeout: float = 30) -> dict:
        unique_request_id = str(uuid.uuid4())
        self.agent_side.send((unique_request_id, {'action': action_str}))
        start_time = time.time()
@@ -108,7 +154,6 @@ class BrowserEnv:

    def close(self):
        if not self.process.is_alive():
-            logger.info('BrowserEnv already closed, no need to close again')
            return
        try:
            self.agent_side.send(('SHUTDOWN', None))
--- a/opendevin/runtime/files.py
+++ b/opendevin/runtime/files.py
@@ -1,42 +0,0 @@
-from pathlib import Path
-from typing import Any
-
-
-class WorkspaceFile:
-    name: str
-    children: list['WorkspaceFile']
-
-    def __init__(self, name: str, children: list['WorkspaceFile']):
-        self.name = name
-        self.children = children
-
-    def to_dict(self) -> dict[str, Any]:
-        """Converts the File object to a dictionary.
-
-        Returns:
-            The dictionary representation of the File object.
-        """
-        return {
-            'name': self.name,
-            'children': [child.to_dict() for child in self.children],
-        }
-
-
-def get_folder_structure(workdir: Path) -> WorkspaceFile:
-    """Gets the folder structure of a directory.
-
-    Args:
-        workdir: The directory path.
-
-    Returns:
-        The folder structure.
-    """
-    root = WorkspaceFile(name=workdir.name, children=[])
-    for item in workdir.iterdir():
-        if item.is_dir():
-            dir = get_folder_structure(item)
-            if dir.children:
-                root.children.append(dir)
-        else:
-            root.children.append(WorkspaceFile(name=item.name, children=[]))
-    return root
--- a/opendevin/runtime/plugins/agent_skills/agentskills.py
+++ b/opendevin/runtime/plugins/agent_skills/agentskills.py
@@ -12,13 +12,15 @@ Functions:
 - search_dir(search_term, dir_path='./'): Searches for a term in all files in the specified directory.
 - search_file(search_term, file_path=None): Searches for a term in the specified file or the currently open file.
 - find_file(file_name, dir_path='./'): Finds all files with the given name in the specified directory.
- edit_file(start, end, content): Replaces lines in a file with the given content.
+- edit_file(file_name, start, end, content): Replaces lines in a file with the given content.
+- append_file(file_name, content): Appends given content to a file.
 """

 import base64
 import functools
 import os
 import subprocess
+import tempfile
 from inspect import signature
 from typing import Optional

@@ -34,6 +36,9 @@ WINDOW = 100

 ENABLE_AUTO_LINT = os.getenv('ENABLE_AUTO_LINT', 'false').lower() == 'true'

+# This is also used in unit tests!
+MSG_FILE_UPDATED = '[File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]'
+
 # OPENAI
 OPENAI_API_KEY = os.getenv(
    'OPENAI_API_KEY', os.getenv('SANDBOX_ENV_OPENAI_API_KEY', '')
@@ -63,12 +68,13 @@ def update_pwd_decorator(func):
    return wrapper


-def _lint_file(file_path: str) -> Optional[str]:
+def _lint_file(file_path: str) -> tuple[Optional[str], Optional[int]]:
    """
-    Lint the file at the given path.
+    Lint the file at the given path and return a tuple with the lint report (None if linting passed or was skipped),
+    and the line number of the first error, if any.

    Returns:
-        Optional[str]: A string containing the linting report if the file failed to lint, None otherwise.
+        tuple[Optional[str], Optional[int]]: (lint_error, first_error_line_number)
    """

    if file_path.endswith('.py'):
@@ -88,13 +94,28 @@ def _lint_file(file_path: str) -> Optional[str]:
        )
        if result.returncode == 0:
            # Linting successful. No issues found.
-            return None
-        else:
-            ret = 'ERRORS:\n'
-            ret += result.stdout.decode().strip()
-            return ret.rstrip('\n')
+            return None, None
+
+        # Extract the line number from the first error message
+        error_message = result.stdout.decode().strip()
+        lint_error = 'ERRORS:\n' + error_message
+        first_error_line = None
+        for line in error_message.split('\n'):
+            if line.strip():
+                # The format of the error message is: <filename>:<line>:<column>: <error code> <error message>
+                parts = line.split(':')
+                if len(parts) >= 2:
+                    try:
+                        first_error_line = int(parts[1])
+                        break
+                    except ValueError:
+                        # Not a valid line number, continue to the next line
+                        continue
+
+        return lint_error, first_error_line
+
    # Not a python file, skip linting
-    return None
+    return None, None


 def _print_window(CURRENT_FILE, CURRENT_LINE, WINDOW, return_str=False):
@@ -244,25 +265,26 @@ def create_file(filename: str) -> None:


@update_pwd_decorator
-def edit_file(start: int, end: int, content: str) -> None:
+def edit_file(file_name: str, start: int, end: int, content: str) -> None:
    """Edit a file.

-    It replaces lines `start` through `end` (inclusive) with the given text `content` in the open file. Remember, the file must be open before editing.
+    Replaces in given file `file_name` the lines `start` through `end` (inclusive) with the given text `content`.

    Args:
+        file_name: str: The name of the file to edit.
        start: int: The start line number. Must satisfy start >= 1.
        end: int: The end line number. Must satisfy start <= end <= number of lines in the file.
        content: str: The content to replace the lines with.
    """
    global CURRENT_FILE, CURRENT_LINE, WINDOW
-    if not CURRENT_FILE or not os.path.isfile(CURRENT_FILE):
-        raise FileNotFoundError('No file open. Use the open_file function first.')
+    if not os.path.isfile(file_name):
+        raise FileNotFoundError(f'File {file_name} not found.')

    # Load the file
-    with open(CURRENT_FILE, 'r') as file:
+    with open(file_name, 'r') as file:
        lines = file.readlines()

-    ERROR_MSG = f'[Error editing opened file {CURRENT_FILE}. Please confirm the opened file is correct.]'
+    ERROR_MSG = f'[Error editing file {file_name}. Please confirm the file is correct.]'
    ERROR_MSG_SUFFIX = (
        'Your changes have NOT been applied. Please fix your edit command and try again.\n'
        'You either need to 1) Open the correct file and try again or 2) Specify the correct start/end line arguments.\n'
@@ -293,24 +315,30 @@ def edit_file(start: int, end: int, content: str) -> None:
        return

    edited_content = content + '\n'
-    n_edited_lines = len(edited_content.split('\n'))
    new_lines = lines[: start - 1] + [edited_content] + lines[end:]

    # directly write edited lines to the file
-    with open(CURRENT_FILE, 'w') as file:
+    with open(file_name, 'w') as file:
        file.writelines(new_lines)

+    # set current line to the center of the edited lines
+    CURRENT_LINE = (start + end) // 2
+    first_error_line = None
+
    # Handle linting
    if ENABLE_AUTO_LINT:
        # BACKUP the original file
        original_file_backup_path = os.path.join(
-            os.path.dirname(CURRENT_FILE), f'.backup.{os.path.basename(CURRENT_FILE)}'
+            os.path.dirname(file_name), f'.backup.{os.path.basename(file_name)}'
        )
        with open(original_file_backup_path, 'w') as f:
            f.writelines(lines)

-        lint_error = _lint_file(CURRENT_FILE)
-        if lint_error:
+        lint_error, first_error_line = _lint_file(file_name)
+        if lint_error is not None:
+            if first_error_line is not None:
+                CURRENT_LINE = int(first_error_line)
+            # only change any literal strings here in combination with unit tests!
            print(
                '[Your proposed edit has introduced new syntax error(s). Please understand the errors and retry your edit command.]'
            )
@@ -318,8 +346,8 @@ def edit_file(start: int, end: int, content: str) -> None:

            print('[This is how your edit would have looked if applied]')
            print('-------------------------------------------------')
-            cur_line = (n_edited_lines // 2) + start
-            _print_window(CURRENT_FILE, cur_line, 10)
+            cur_line = first_error_line
+            _print_window(file_name, cur_line, 10)
            print('-------------------------------------------------\n')

            print('[This is the original code before your edit]')
@@ -335,7 +363,7 @@ def edit_file(start: int, end: int, content: str) -> None:

            # recover the original file
            with open(original_file_backup_path, 'r') as fin, open(
-                CURRENT_FILE, 'w'
+                file_name, 'w'
            ) as fout:
                fout.write(fin.read())
            os.remove(original_file_backup_path)
@@ -343,13 +371,123 @@ def edit_file(start: int, end: int, content: str) -> None:

        os.remove(original_file_backup_path)

-    with open(CURRENT_FILE, 'r') as file:
+    # Update the file information and print the updated content
+    with open(file_name, 'r') as file:
        n_total_lines = len(file.readlines())
-    # set current line to the center of the edited lines
-    CURRENT_LINE = (start + end) // 2
+    if first_error_line is not None and int(first_error_line) > 0:
+        CURRENT_LINE = first_error_line
+    else:
+        CURRENT_LINE = n_total_lines
    print(
-        f'[File: {os.path.abspath(CURRENT_FILE)} ({n_total_lines} lines total after edit)]'
+        f'[File: {os.path.abspath(file_name)} ({n_total_lines} lines total after edit)]'
    )
+    CURRENT_FILE = file_name
+    _print_window(CURRENT_FILE, CURRENT_LINE, WINDOW)
+    print(MSG_FILE_UPDATED)
+
+
+@update_pwd_decorator
+def append_file(file_name: str, content: str) -> None:
+    """Append content to the given file.
+
+    It appends text `content` to the end of the specified file.
+
+    Args:
+        file_name: str: The name of the file to append to.
+        content: str: The content to append to the file.
+    """
+    global CURRENT_FILE, CURRENT_LINE, WINDOW
+    if not os.path.isfile(file_name):
+        raise FileNotFoundError(f'File {file_name} not found.')
+
+    # Use a temporary file to write changes
+    temp_file_path = ''
+    first_error_line = None
+    try:
+        # Create a temporary file
+        with tempfile.NamedTemporaryFile('w', delete=False) as temp_file:
+            temp_file_path = temp_file.name
+
+            # Read the original file and check if empty and for a trailing newline
+            with open(file_name, 'r') as original_file:
+                lines = original_file.readlines()
+
+            if lines and not (len(lines) == 1 and lines[0].strip() == ''):
+                if not lines[-1].endswith('\n'):
+                    lines[-1] += '\n'
+                content = ''.join(lines) + content
+            else:
+                content = content
+
+            if not content.endswith('\n'):
+                content += '\n'
+
+            # Write the original content followed by the new content (newline-terminated)
+            temp_file.write(content)
+
+        # Replace the original file with the temporary file atomically
+        os.replace(temp_file_path, file_name)
+
+        # Handle linting
+        if ENABLE_AUTO_LINT:
+            # BACKUP the original file
+            original_file_backup_path = os.path.join(
+                os.path.dirname(file_name),
+                f'.backup.{os.path.basename(file_name)}',
+            )
+            with open(original_file_backup_path, 'w') as f:
+                f.writelines(lines)
+
+            lint_error, first_error_line = _lint_file(file_name)
+            if lint_error is not None:
+                if first_error_line is not None:
+                    CURRENT_LINE = int(first_error_line)
+                print(
+                    '[Your proposed edit has introduced new syntax error(s). Please understand the errors and retry your edit command.]'
+                )
+                print(lint_error)
+
+                print('[This is how your edit would have looked if applied]')
+                print('-------------------------------------------------')
+                _print_window(file_name, CURRENT_LINE, 10)
+                print('-------------------------------------------------\n')
+
+                print('[This is the original code before your edit]')
+                print('-------------------------------------------------')
+                _print_window(original_file_backup_path, CURRENT_LINE, 10)
+                print('-------------------------------------------------')
+
+                print(
+                    'Your changes have NOT been applied. Please fix your edit command and try again.\n'
+                    'You need to correct your added code.\n'
+                    'DO NOT re-run the same failed edit command. Running it again will lead to the same error.'
+                )
+
+                # recover the original file
+                with open(original_file_backup_path, 'r') as fin, open(
+                    file_name, 'w'
+                ) as fout:
+                    fout.write(fin.read())
+                os.remove(original_file_backup_path)
+                return
+
+    except Exception as e:
+        # Clean up the temporary file if an error occurs
+        if temp_file_path and os.path.exists(temp_file_path):
+            os.remove(temp_file_path)
+        raise e
+
+    # Update the file information and print the updated content
+    with open(file_name, 'r', encoding='utf-8') as file:
+        n_total_lines = len(file.readlines())
+    if first_error_line is not None and int(first_error_line) > 0:
+        CURRENT_LINE = first_error_line
+    else:
+        CURRENT_LINE = n_total_lines
+    print(
+        f'[File: {os.path.abspath(file_name)} ({n_total_lines} lines total after edit)]'
+    )
+    CURRENT_FILE = file_name
    _print_window(CURRENT_FILE, CURRENT_LINE, WINDOW)
    print(
        '[File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]'
@@ -670,6 +808,7 @@ __all__ = [
    'scroll_down',
    'scroll_up',
    'create_file',
+    'append_file',
    'edit_file',
    'search_dir',
    'search_file',
--- a/opendevin/runtime/plugins/jupyter/execute_cli
+++ b/opendevin/runtime/plugins/jupyter/execute_cli
@@ -1,3 +1,4 @@
 #!/bin/bash
 # Run the Python script with the specified interpreter
+export JUPYTER_PWD=$(pwd)
 $OPENDEVIN_PYTHON_INTERPRETER /opendevin/plugins/jupyter/execute_cli.py
--- a/opendevin/runtime/runtime.py
+++ b/opendevin/runtime/runtime.py
@@ -1,5 +1,6 @@
 import asyncio
 from abc import abstractmethod
+from typing import Any, Optional

 from opendevin.core.config import config
 from opendevin.core.exceptions import BrowserInitException
@@ -91,12 +92,18 @@ class Runtime:
        self.sandbox.init_plugins(plugins)

    def init_runtime_tools(
-        self, runtime_tools: list[RuntimeTool], is_async: bool = True
+        self,
+        runtime_tools: list[RuntimeTool],
+        runtime_tools_config: Optional[dict[RuntimeTool, Any]] = None,
+        is_async: bool = True,
    ) -> None:
        # if browser in runtime_tools, init it
        if RuntimeTool.BROWSER in runtime_tools:
+            if runtime_tools_config is None:
+                runtime_tools_config = {}
+            browser_env_config = runtime_tools_config.get(RuntimeTool.BROWSER, {})
            try:
-                self.browser = BrowserEnv(is_async)
+                self.browser = BrowserEnv(is_async=is_async, **browser_env_config)
            except BrowserInitException:
                logger.warn(
                    'Failed to start browser environment, web browsing functionality will not work'
--- a/opendevin/runtime/server/browse.py
+++ b/opendevin/runtime/server/browse.py
@@ -30,6 +30,9 @@ async def browse(action, browser: BrowserEnv | None) -> BrowserOutputObservation
            active_page_index=obs['active_page_index'],  # index of the active page
            dom_object=obs['dom_object'],  # DOM object
            axtree_object=obs['axtree_object'],  # accessibility tree object
+            extra_element_properties=obs[
+                'extra_element_properties'
+            ],  # extra element properties
            last_browser_action=obs['last_action'],  # last browser env action performed
            focused_element_bid=obs['focused_element_bid'],  # focused element bid
            screenshot=obs['screenshot'],  # base64-encoded screenshot, png
--- a/opendevin/runtime/server/runtime.py
+++ b/opendevin/runtime/server/runtime.py
@@ -48,7 +48,6 @@ class ServerRuntime(Runtime):
        )

    async def run_ipython(self, action: IPythonRunCellAction) -> Observation:
-        action.code = action.code.replace('`', r'\`')
        obs = self._run_command(
            ("cat > /tmp/opendevin_jupyter_temp.py <<'EOL'\n" f'{action.code}\n' 'EOL'),
            background=False,
--- a/opendevin/server/README.md
+++ b/opendevin/server/README.md
@@ -3,6 +3,7 @@
 This is a WebSocket server that executes tasks using an agent.

 ## Install
+
 Follow the instructions in the base README.md to install dependencies and set up.

 ## Start the Server
@@ -13,7 +14,7 @@ uvicorn opendevin.server.listen:app --reload --port 3000

 ## Test the Server

-You can use `websocat` to test the server: https://github.com/vi/websocat
+You can use [`websocat`](https://github.com/vi/websocat) to test the server.

 ```sh
 websocat ws://127.0.0.1:3000/ws
@@ -24,23 +25,28 @@ websocat ws://127.0.0.1:3000/ws

 ```sh
 LLM_API_KEY=sk-... # Your OpenAI API Key
-LLM_MODEL=gpt-4o # Default model for the agent to use
-WORKSPACE_BASE=/path/to/your/workspace # Default path to model's workspace
+LLM_MODEL=gpt-4o   # Default model for the agent to use
+WORKSPACE_BASE=/path/to/your/workspace # Default absolute path to workspace
 ```

 ## API Schema
+
 There are two types of messages that can be sent to, or received from, the server:
+
 * Actions
 * Observations

 ### Actions
+
 An action has three parts:
+
 * `action`: The action to be taken
 * `args`: The arguments for the action
 * `message`: A friendly message that can be put in the chat log

 There are several kinds of actions. Their arguments are listed below.
 This list may grow over time.
+
 * `initialize` - initializes the agent. Only sent by client.
  * `model` - the name of the model to use
  * `directory` - the path to the workspace
@@ -66,7 +72,9 @@ This list may grow over time.
 * `finish` - agent signals that the task is completed

 ### Observations
+
 An observation has four parts:
+
 * `observation`: The observation type
 * `content`: A string representing the observed data
 * `extras`: additional structured data
@@ -74,6 +82,7 @@ An observation has four parts:

 There are several kinds of observations. Their extras are listed below.
 This list may grow over time.
+
 * `read` - the content of a file
  * `path` - the path of the file read
 * `browse` - the HTML content of a url
--- a/opendevin/server/listen.py
+++ b/opendevin/server/listen.py
@@ -6,6 +6,8 @@ from opendevin.server.data_models.feedback import FeedbackDataModel, store_feedb
 with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    import litellm
+from pathlib import Path
+
 from fastapi import FastAPI, Request, Response, UploadFile, WebSocket, status
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
@@ -190,7 +192,7 @@ async def get_litellm_models():
    bedrock_model_list = bedrock.list_foundation_models()
    model_list = litellm_model_list_without_bedrock + bedrock_model_list

-    return list(set(model_list))
+    return list(sorted(set(model_list)))


@app.get('/api/options/agents')
@@ -203,7 +205,7 @@ async def get_agents():
    curl http://localhost:3000/api/agents
    ```
    """
-    agents = Agent.list_agents()
+    agents = sorted(Agent.list_agents())
    return agents


@@ -223,8 +225,41 @@ def list_files(request: Request, path: str = '/'):
            content={'error': 'Runtime not yet initialized'},
        )

+    exclude_list = (
+        '.git',
+        '.DS_Store',
+        '.svn',
+        '.hg',
+        '.idea',
+        '.vscode',
+        '.settings',
+        '.pytest_cache',
+        '__pycache__',
+        'node_modules',
+        'vendor',
+        'build',
+        'dist',
+        'bin',
+        'logs',
+        'log',
+        'tmp',
+        'temp',
+        'coverage',
+        'venv',
+        'env',
+    )
+
    try:
-        return request.state.session.agent_session.runtime.file_store.list(path)
+        entries = request.state.session.agent_session.runtime.file_store.list(path)
+
+        # Filter entries, excluding special folders
+        if entries:
+            return [
+                entry
+                for entry in entries
+                if Path(entry).parts and Path(entry).parts[-1] not in exclude_list
+            ]
+        return []
    except Exception as e:
        logger.error(f'Error refreshing files: {e}', exc_info=False)
        error_msg = f'Error refreshing files: {e}'
--- a/opendevin/server/session/agent.py
+++ b/opendevin/server/session/agent.py
@@ -114,5 +114,6 @@ class AgentSession:
        try:
            agent_state = State.restore_from_session(self.sid)
            self.controller.set_state(agent_state)
+            logger.info(f'Restored agent state from session, sid: {self.sid}')
        except Exception as e:
            print('Error restoring state', e)
--- a/poetry.lock
+++ b/poetry.lock
@@ -416,17 +416,17 @@ files = [

 [[package]]
 name = "boto3"
-version = "1.34.118"
+version = "1.34.122"
 description = "The AWS SDK for Python"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "boto3-1.34.118-py3-none-any.whl", hash = "sha256:e9edaf979fbe59737e158f2f0f3f0861ff1d61233f18f6be8ebb483905f24587"},
-    {file = "boto3-1.34.118.tar.gz", hash = "sha256:4eb8019421cb664a6fcbbee6152aa95a28ce8bbc1c4ee263871c09cdd58bf8ee"},
+    {file = "boto3-1.34.122-py3-none-any.whl", hash = "sha256:b2d7400ff84fa547e53b3d9acfa3c95d65d45b5886ba1ede1f7df4768d1cc0b1"},
+    {file = "boto3-1.34.122.tar.gz", hash = "sha256:56840d8ce91654d182f1c113f0791fa2113c3aa43230c50b4481f235348a6037"},
 ]

 [package.dependencies]
-botocore = ">=1.34.118,<1.35.0"
+botocore = ">=1.34.122,<1.35.0"
 jmespath = ">=0.7.1,<2.0.0"
 s3transfer = ">=0.10.0,<0.11.0"

@@ -435,13 +435,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]

 [[package]]
 name = "botocore"
-version = "1.34.118"
+version = "1.34.122"
 description = "Low-level, data-driven core of boto 3."
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "botocore-1.34.118-py3-none-any.whl", hash = "sha256:e3f6c5636a4394768e81e33a16f5c6ae7f364f512415d423f9b9dc67fc638df4"},
-    {file = "botocore-1.34.118.tar.gz", hash = "sha256:0a3d1ec0186f8b516deb39474de3d226d531f77f92a0f56ad79b80219db3ae9e"},
+    {file = "botocore-1.34.122-py3-none-any.whl", hash = "sha256:6d75df3af831b62f0c7baa109728d987e0a8d34bfadf0476eb32e2f29a079a36"},
+    {file = "botocore-1.34.122.tar.gz", hash = "sha256:9374e16a36f1062c3e27816e8599b53eba99315dfac71cc84fc3aee3f5d3cbe3"},
 ]

 [package.dependencies]
@@ -450,7 +450,7 @@ python-dateutil = ">=2.1,<3.0.0"
 urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""}

 [package.extras]
-crt = ["awscrt (==0.20.9)"]
+crt = ["awscrt (==0.20.11)"]

 [[package]]
 name = "browsergym"
@@ -1437,6 +1437,23 @@ fastapi = "*"
 typer = ">=0.12.3"
 uvicorn = {version = ">=0.15.0", extras = ["standard"]}

+[[package]]
+name = "fastcore"
+version = "1.5.38"
+description = "Python supercharged for fastai development"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "fastcore-1.5.38-py3-none-any.whl", hash = "sha256:327f011613c986e7f627f63d1d9993c8d6de116c586df94d85806fbfbe45e52a"},
+    {file = "fastcore-1.5.38.tar.gz", hash = "sha256:7732403778de9bc2b25bf52617c7fbb9e7ae96010f534a5f00f7e6dee73f1d39"},
+]
+
+[package.dependencies]
+packaging = "*"
+
+[package.extras]
+dev = ["matplotlib", "nbclassic", "nbdev (>=0.2.39)", "numpy", "pandas", "pillow", "torch"]
+
 [[package]]
 name = "filelock"
 version = "3.14.0"
@@ -1754,6 +1771,25 @@ monitor = ["psutil (>=5.7.0)"]
 recommended = ["cffi (>=1.12.2)", "dnspython (>=1.16.0,<2.0)", "idna", "psutil (>=5.7.0)"]
 test = ["cffi (>=1.12.2)", "coverage (>=5.0)", "dnspython (>=1.16.0,<2.0)", "idna", "objgraph", "psutil (>=5.7.0)", "requests"]

+[[package]]
+name = "ghapi"
+version = "1.0.5"
+description = "A python client for the GitHub API"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "ghapi-1.0.5-py3-none-any.whl", hash = "sha256:24a851b7a256861f173437c807701beac3857a84979067ddc25a8555868ce6dc"},
+    {file = "ghapi-1.0.5.tar.gz", hash = "sha256:57f170d50d4e6cbf475d234056c54b1ea7bb917b96b0a19798f6127d8a0c40b1"},
+]
+
+[package.dependencies]
+fastcore = ">=1.5.4"
+packaging = "*"
+pip = "*"
+
+[package.extras]
+dev = ["jsonref", "matplotlib"]
+
 [[package]]
 name = "gitdb"
 version = "4.0.11"
@@ -2627,13 +2663,13 @@ types-tqdm = "*"

 [[package]]
 name = "litellm"
-version = "1.40.2"
+version = "1.40.7"
 description = "Library to easily interface with LLM API providers"
 optional = false
 python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8"
 files = [
-    {file = "litellm-1.40.2-py3-none-any.whl", hash = "sha256:56ee777eed30ee9acb86e74401d090dcac4adb57b5c8a8714f791b0c97a34afc"},
-    {file = "litellm-1.40.2.tar.gz", hash = "sha256:1f5dc4eab7100962c3a2985c7d8c13070ff5793b341540d19b98a2bd85955cb0"},
+    {file = "litellm-1.40.7-py3-none-any.whl", hash = "sha256:c98dd8733e632aba16f14bf82e56f7159222097a6d085b242a3140b5d3e7baa4"},
+    {file = "litellm-1.40.7.tar.gz", hash = "sha256:557bb19e8e484d0dfe8e4eaa9ccefc888617852988a46d6e7adc41585a2c0600"},
 ]

 [package.dependencies]
@@ -2754,13 +2790,13 @@ query-tools = ["guidance (>=0.0.64,<0.0.65)", "jsonpath-ng (>=1.6.0,<2.0.0)", "l

 [[package]]
 name = "llama-index-embeddings-azure-openai"
-version = "0.1.9"
+version = "0.1.10"
 description = "llama-index embeddings azure openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_embeddings_azure_openai-0.1.9-py3-none-any.whl", hash = "sha256:67c91c953e81b9b83fac8385700aa042bf5a410fdc1ac61b73ea810f0e2c313a"},
-    {file = "llama_index_embeddings_azure_openai-0.1.9.tar.gz", hash = "sha256:dcc1b5b2b37b7b249ae529731a5ed2bc7d325cb270d6d55dde889474dd997ae2"},
+    {file = "llama_index_embeddings_azure_openai-0.1.10-py3-none-any.whl", hash = "sha256:b100b7338bdfb236ea445eab341c52db8945dac3642141134ec77302ac6fa405"},
+    {file = "llama_index_embeddings_azure_openai-0.1.10.tar.gz", hash = "sha256:e772268d064f082c2d276c26505a3c087973e766d3d411d0e12f14f38dd92eaa"},
 ]

 [package.dependencies]
@@ -2981,13 +3017,13 @@ llama-parse = ">=0.4.0,<0.5.0"

 [[package]]
 name = "llama-index-vector-stores-chroma"
-version = "0.1.8"
+version = "0.1.9"
 description = "llama-index vector_stores chroma integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_vector_stores_chroma-0.1.8-py3-none-any.whl", hash = "sha256:77f5081a08bcede4fafb3c47b15b3bd5cacaef8d038750207d1858f73bc2e255"},
-    {file = "llama_index_vector_stores_chroma-0.1.8.tar.gz", hash = "sha256:9c574baf370faf456bcb67b9d5ea273a6fa1f2b4fd205a59c47b68112364b9e7"},
+    {file = "llama_index_vector_stores_chroma-0.1.9-py3-none-any.whl", hash = "sha256:0d900fe97def537c2dd1c2d155287fae014b63848e3aff28902eb38c45e0bc28"},
+    {file = "llama_index_vector_stores_chroma-0.1.9.tar.gz", hash = "sha256:6a5c27ab3ae25cf504bed9513c1f035365dfb576b886fe334d46908ca24a59cf"},
 ]

 [package.dependencies]
@@ -3104,13 +3140,9 @@ files = [
    {file = "lxml-5.2.2-cp36-cp36m-win_amd64.whl", hash = "sha256:edcfa83e03370032a489430215c1e7783128808fd3e2e0a3225deee278585196"},
    {file = "lxml-5.2.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:28bf95177400066596cdbcfc933312493799382879da504633d16cf60bba735b"},
    {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3a745cc98d504d5bd2c19b10c79c61c7c3df9222629f1b6210c0368177589fb8"},
-    {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b590b39ef90c6b22ec0be925b211298e810b4856909c8ca60d27ffbca6c12e6"},
    {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b336b0416828022bfd5a2e3083e7f5ba54b96242159f83c7e3eebaec752f1716"},
-    {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:c2faf60c583af0d135e853c86ac2735ce178f0e338a3c7f9ae8f622fd2eb788c"},
    {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:4bc6cb140a7a0ad1f7bc37e018d0ed690b7b6520ade518285dc3171f7a117905"},
-    {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7ff762670cada8e05b32bf1e4dc50b140790909caa8303cfddc4d702b71ea184"},
    {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:57f0a0bbc9868e10ebe874e9f129d2917750adf008fe7b9c1598c0fbbfdde6a6"},
-    {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:a6d2092797b388342c1bc932077ad232f914351932353e2e8706851c870bca1f"},
    {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:60499fe961b21264e17a471ec296dcbf4365fbea611bf9e303ab69db7159ce61"},
    {file = "lxml-5.2.2-cp37-cp37m-win32.whl", hash = "sha256:d9b342c76003c6b9336a80efcc766748a333573abf9350f4094ee46b006ec18f"},
    {file = "lxml-5.2.2-cp37-cp37m-win_amd64.whl", hash = "sha256:b16db2770517b8799c79aa80f4053cd6f8b716f21f8aca962725a9565ce3ee40"},
@@ -4054,13 +4086,13 @@ sympy = "*"

 [[package]]
 name = "openai"
-version = "1.30.5"
+version = "1.33.0"
 description = "The official Python library for the openai API"
 optional = false
 python-versions = ">=3.7.1"
 files = [
-    {file = "openai-1.30.5-py3-none-any.whl", hash = "sha256:2ad95e926de0d2e09cde632a9204b0a6dca4a03c2cdcc84329b01f355784355a"},
-    {file = "openai-1.30.5.tar.gz", hash = "sha256:5366562eb2c5917e6116ae0391b7ae6e3acd62b0ae3f565ada32b35d8fcfa106"},
+    {file = "openai-1.33.0-py3-none-any.whl", hash = "sha256:621163b56570897ab8389d187f686a53d4771fd6ce95d481c0a9611fe8bc4229"},
+    {file = "openai-1.33.0.tar.gz", hash = "sha256:1169211a7b326ecbc821cafb427c29bfd0871f9a3e0947dd9e51acb3b0f1df78"},
 ]

 [package.dependencies]
@@ -4508,6 +4540,17 @@ tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "pa
 typing = ["typing-extensions"]
 xmp = ["defusedxml"]

+[[package]]
+name = "pip"
+version = "24.0"
+description = "The PyPA recommended tool for installing Python packages."
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "pip-24.0-py3-none-any.whl", hash = "sha256:ba0d021a166865d2265246961bec0152ff124de910c5cc39f1156ce3fa7c69dc"},
+    {file = "pip-24.0.tar.gz", hash = "sha256:ea9bd1a847e8c5774a5777bb398c19e80bcd4e2aa16a4b301b718fe6f593aba2"},
+]
+
 [[package]]
 name = "platformdirs"
 version = "4.2.2"
@@ -5665,28 +5708,28 @@ pyasn1 = ">=0.1.3"

 [[package]]
 name = "ruff"
-version = "0.4.7"
+version = "0.4.8"
 description = "An extremely fast Python linter and code formatter, written in Rust."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "ruff-0.4.7-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:e089371c67892a73b6bb1525608e89a2aca1b77b5440acf7a71dda5dac958f9e"},
-    {file = "ruff-0.4.7-py3-none-macosx_11_0_arm64.whl", hash = "sha256:10f973d521d910e5f9c72ab27e409e839089f955be8a4c8826601a6323a89753"},
-    {file = "ruff-0.4.7-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:59c3d110970001dfa494bcd95478e62286c751126dfb15c3c46e7915fc49694f"},
-    {file = "ruff-0.4.7-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fa9773c6c00f4958f73b317bc0fd125295110c3776089f6ef318f4b775f0abe4"},
-    {file = "ruff-0.4.7-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07fc80bbb61e42b3b23b10fda6a2a0f5a067f810180a3760c5ef1b456c21b9db"},
-    {file = "ruff-0.4.7-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:fa4dafe3fe66d90e2e2b63fa1591dd6e3f090ca2128daa0be33db894e6c18648"},
-    {file = "ruff-0.4.7-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a7c0083febdec17571455903b184a10026603a1de078428ba155e7ce9358c5f6"},
-    {file = "ruff-0.4.7-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ad1b20e66a44057c326168437d680a2166c177c939346b19c0d6b08a62a37589"},
-    {file = "ruff-0.4.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cbf5d818553add7511c38b05532d94a407f499d1a76ebb0cad0374e32bc67202"},
-    {file = "ruff-0.4.7-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:50e9651578b629baec3d1513b2534de0ac7ed7753e1382272b8d609997e27e83"},
-    {file = "ruff-0.4.7-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:8874a9df7766cb956b218a0a239e0a5d23d9e843e4da1e113ae1d27ee420877a"},
-    {file = "ruff-0.4.7-py3-none-musllinux_1_2_i686.whl", hash = "sha256:b9de9a6e49f7d529decd09381c0860c3f82fa0b0ea00ea78409b785d2308a567"},
-    {file = "ruff-0.4.7-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:13a1768b0691619822ae6d446132dbdfd568b700ecd3652b20d4e8bc1e498f78"},
-    {file = "ruff-0.4.7-py3-none-win32.whl", hash = "sha256:769e5a51df61e07e887b81e6f039e7ed3573316ab7dd9f635c5afaa310e4030e"},
-    {file = "ruff-0.4.7-py3-none-win_amd64.whl", hash = "sha256:9e3ab684ad403a9ed1226894c32c3ab9c2e0718440f6f50c7c5829932bc9e054"},
-    {file = "ruff-0.4.7-py3-none-win_arm64.whl", hash = "sha256:10f2204b9a613988e3484194c2c9e96a22079206b22b787605c255f130db5ed7"},
-    {file = "ruff-0.4.7.tar.gz", hash = "sha256:2331d2b051dc77a289a653fcc6a42cce357087c5975738157cd966590b18b5e1"},
+    {file = "ruff-0.4.8-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:7663a6d78f6adb0eab270fa9cf1ff2d28618ca3a652b60f2a234d92b9ec89066"},
+    {file = "ruff-0.4.8-py3-none-macosx_11_0_arm64.whl", hash = "sha256:eeceb78da8afb6de0ddada93112869852d04f1cd0f6b80fe464fd4e35c330913"},
+    {file = "ruff-0.4.8-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aad360893e92486662ef3be0a339c5ca3c1b109e0134fcd37d534d4be9fb8de3"},
+    {file = "ruff-0.4.8-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:284c2e3f3396fb05f5f803c9fffb53ebbe09a3ebe7dda2929ed8d73ded736deb"},
+    {file = "ruff-0.4.8-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a7354f921e3fbe04d2a62d46707e569f9315e1a613307f7311a935743c51a764"},
+    {file = "ruff-0.4.8-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:72584676164e15a68a15778fd1b17c28a519e7a0622161eb2debdcdabdc71883"},
+    {file = "ruff-0.4.8-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9678d5c9b43315f323af2233a04d747409d1e3aa6789620083a82d1066a35199"},
+    {file = "ruff-0.4.8-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704977a658131651a22b5ebeb28b717ef42ac6ee3b11e91dc87b633b5d83142b"},
+    {file = "ruff-0.4.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d05f8d6f0c3cce5026cecd83b7a143dcad503045857bc49662f736437380ad45"},
+    {file = "ruff-0.4.8-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:6ea874950daca5697309d976c9afba830d3bf0ed66887481d6bca1673fc5b66a"},
+    {file = "ruff-0.4.8-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:fc95aac2943ddf360376be9aa3107c8cf9640083940a8c5bd824be692d2216dc"},
+    {file = "ruff-0.4.8-py3-none-musllinux_1_2_i686.whl", hash = "sha256:384154a1c3f4bf537bac69f33720957ee49ac8d484bfc91720cc94172026ceed"},
+    {file = "ruff-0.4.8-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:e9d5ce97cacc99878aa0d084c626a15cd21e6b3d53fd6f9112b7fc485918e1fa"},
+    {file = "ruff-0.4.8-py3-none-win32.whl", hash = "sha256:6d795d7639212c2dfd01991259460101c22aabf420d9b943f153ab9d9706e6a9"},
+    {file = "ruff-0.4.8-py3-none-win_amd64.whl", hash = "sha256:e14a3a095d07560a9d6769a72f781d73259655919d9b396c650fc98a8157555d"},
+    {file = "ruff-0.4.8-py3-none-win_arm64.whl", hash = "sha256:14019a06dbe29b608f6b7cbcec300e3170a8d86efaddb7b23405cb7f7dcaf780"},
+    {file = "ruff-0.4.8.tar.gz", hash = "sha256:16d717b1d57b2e2fd68bd0bf80fb43931b79d05a7131aa477d66fc40fbd86268"},
 ]

 [[package]]
@@ -6178,6 +6221,32 @@ files = [
    {file = "striprtf-0.0.26.tar.gz", hash = "sha256:fdb2bba7ac440072d1c41eab50d8d74ae88f60a8b6575c6e2c7805dc462093aa"},
 ]

+[[package]]
+name = "swebench"
+version = "1.1.5"
+description = "The official SWE-bench package - a benchmark for evaluating LMs on software engineering"
+optional = false
+python-versions = ">=3.8"
+files = []
+develop = false
+
+[package.dependencies]
+beautifulsoup4 = "*"
+chardet = "*"
+datasets = "*"
+ghapi = "*"
+GitPython = "*"
+python-dotenv = "*"
+requests = "*"
+rich = "*"
+tqdm = "*"
+
+[package.source]
+type = "git"
+url = "https://github.com/OpenDevin/SWE-bench.git"
+reference = "HEAD"
+resolved_reference = "7b0c4b1c249ed4b4600a5bba8afb916d543e034a"
+
 [[package]]
 name = "sympy"
 version = "1.12"
@@ -7506,39 +7575,47 @@ test = ["zope.testrunner"]

 [[package]]
 name = "zope-interface"
-version = "6.4"
+version = "6.4.post2"
 description = "Interfaces for Python"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "zope.interface-6.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72faa868fcfde49a29d287dce3c83180322467eecd725dd351098efe96e8d4bb"},
-    {file = "zope.interface-6.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:855b7233fa5d0d1f3be8c14fadf4718dee1c928e1d75f1584bea6ecec6dcc4af"},
-    {file = "zope.interface-6.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:36ee6e507a9fd4f1f0aab8e8dfc801d162e7211c27503cbfb47e1d558941a7fa"},
-    {file = "zope.interface-6.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:604fa920478dfc0c76cdb7c203572400a8317ffcdac288245c408b42b3d9aee9"},
-    {file = "zope.interface-6.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c04bd4ee4766d285e83c6d8c042663a98efb934389e05ccd643fefb066c88a9d"},
-    {file = "zope.interface-6.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4782e173c2fde4f649c2a9a68082445bc1f2c27f41907de06bf1ba82585847f2"},
-    {file = "zope.interface-6.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:646cd83d24065d074f22f61fe101d20dbf4b729ca7831cc782ec986eb9156f93"},
-    {file = "zope.interface-6.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b0f61ccbc26e08031d0e72b6a0cbf9b4030f035913cb2b39f940aa42eb8e0063"},
-    {file = "zope.interface-6.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:414e6dccdf4a5c96c0c98da68ba040dbf9ba7511b61b34e228f11b0ed90c439d"},
-    {file = "zope.interface-6.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f5092f2712e1fd07579fc3101b18e9c95857c853e836847598bf992c8e672434"},
-    {file = "zope.interface-6.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:21732994aa3ca43bbb6b36335c288023428a3c5b7322b637c7b0a03053937578"},
-    {file = "zope.interface-6.4-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe636b49c333bfc5b0913590e36a2f151167c462fb36d9f4acc66029e45c974b"},
-    {file = "zope.interface-6.4-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57f34b7997f8de7d2db08363eaccd05dad20f106e39efe95bed4fac84af2d022"},
-    {file = "zope.interface-6.4-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6494dc0314e782ce4fb0e624b4ce2458f54d074382f50a920c7700c05cbcef28"},
-    {file = "zope.interface-6.4-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7cda82ab32f984985f09e4ec20a4f9665b26779a1b8e443b34a148de256f2052"},
-    {file = "zope.interface-6.4-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f78e1eac48c4f4e0168a91cabcd8d1aedb972836df5c8769071fc6173294a0a3"},
-    {file = "zope.interface-6.4-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:8e246357f52952ae5fa950d19eda8572594c49e6cb1e5462508e6cec561a37de"},
-    {file = "zope.interface-6.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:93f28d84517dcd6c240979bd9b2f262a373832baef856fe663a24b9171d7f04d"},
-    {file = "zope.interface-6.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4cd56eb9a23767958c9a0654306b9a4a74def485f645b3a7378cc6ab661ef31c"},
-    {file = "zope.interface-6.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:502d2c9c4231d022b20225dba5c6c736236ed65e1d7e2f6f402b5aa6a7040ec9"},
-    {file = "zope.interface-6.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ee1e3ca6c98efe213a96dece89100a8aa52e210ac354861d8039d69bd1d6e5ff"},
-    {file = "zope.interface-6.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62e6b756663deade5270f67899753437b39d970f9eecd49e19fae3b880310cf0"},
-    {file = "zope.interface-6.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f33af86ed460eb28dc9da1de1f3305795271a19c665161c1d973a737596b2081"},
-    {file = "zope.interface-6.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:86e85eada0eb551950df05d72dc0e892320f14daa78bc434059e834d4b1f9300"},
-    {file = "zope.interface-6.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3945f4fda92c1b6fb0cb6eaaaf72599e5c2c2059654bdc42bc09c6e711c214c8"},
-    {file = "zope.interface-6.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5fbbb290751f5c4ed81e54ae73fe8557c4a85973f5ab019edbb0f746244ecea6"},
-    {file = "zope.interface-6.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e4cc017206c1429a6d8fdd8a25c6efc15512065eec0a8d45c350df96a0911ed"},
-    {file = "zope_interface-6.4.tar.gz", hash = "sha256:b11f2b67ccc990a1522fa8cd3f5d185a068459f944ab2d0e7a1b15d31bcb4af4"},
+    {file = "zope.interface-6.4.post2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2eccd5bef45883802848f821d940367c1d0ad588de71e5cabe3813175444202c"},
+    {file = "zope.interface-6.4.post2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:762e616199f6319bb98e7f4f27d254c84c5fb1c25c908c2a9d0f92b92fb27530"},
+    {file = "zope.interface-6.4.post2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ef8356f16b1a83609f7a992a6e33d792bb5eff2370712c9eaae0d02e1924341"},
+    {file = "zope.interface-6.4.post2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0e4fa5d34d7973e6b0efa46fe4405090f3b406f64b6290facbb19dcbf642ad6b"},
+    {file = "zope.interface-6.4.post2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d22fce0b0f5715cdac082e35a9e735a1752dc8585f005d045abb1a7c20e197f9"},
+    {file = "zope.interface-6.4.post2-cp310-cp310-win_amd64.whl", hash = "sha256:97e615eab34bd8477c3f34197a17ce08c648d38467489359cb9eb7394f1083f7"},
+    {file = "zope.interface-6.4.post2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:599f3b07bde2627e163ce484d5497a54a0a8437779362395c6b25e68c6590ede"},
+    {file = "zope.interface-6.4.post2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:136cacdde1a2c5e5bc3d0b2a1beed733f97e2dad8c2ad3c2e17116f6590a3827"},
+    {file = "zope.interface-6.4.post2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:47937cf2e7ed4e0e37f7851c76edeb8543ec9b0eae149b36ecd26176ff1ca874"},
+    {file = "zope.interface-6.4.post2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f0a6be264afb094975b5ef55c911379d6989caa87c4e558814ec4f5125cfa2e"},
+    {file = "zope.interface-6.4.post2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47654177e675bafdf4e4738ce58cdc5c6d6ee2157ac0a78a3fa460942b9d64a8"},
+    {file = "zope.interface-6.4.post2-cp311-cp311-win_amd64.whl", hash = "sha256:e2fb8e8158306567a3a9a41670c1ff99d0567d7fc96fa93b7abf8b519a46b250"},
+    {file = "zope.interface-6.4.post2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b912750b13d76af8aac45ddf4679535def304b2a48a07989ec736508d0bbfbde"},
+    {file = "zope.interface-6.4.post2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4ac46298e0143d91e4644a27a769d1388d5d89e82ee0cf37bf2b0b001b9712a4"},
+    {file = "zope.interface-6.4.post2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86a94af4a88110ed4bb8961f5ac72edf782958e665d5bfceaab6bf388420a78b"},
+    {file = "zope.interface-6.4.post2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:73f9752cf3596771c7726f7eea5b9e634ad47c6d863043589a1c3bb31325c7eb"},
+    {file = "zope.interface-6.4.post2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00b5c3e9744dcdc9e84c24ed6646d5cf0cf66551347b310b3ffd70f056535854"},
+    {file = "zope.interface-6.4.post2-cp312-cp312-win_amd64.whl", hash = "sha256:551db2fe892fcbefb38f6f81ffa62de11090c8119fd4e66a60f3adff70751ec7"},
+    {file = "zope.interface-6.4.post2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e96ac6b3169940a8cd57b4f2b8edcad8f5213b60efcd197d59fbe52f0accd66e"},
+    {file = "zope.interface-6.4.post2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cebff2fe5dc82cb22122e4e1225e00a4a506b1a16fafa911142ee124febf2c9e"},
+    {file = "zope.interface-6.4.post2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33ee982237cffaf946db365c3a6ebaa37855d8e3ca5800f6f48890209c1cfefc"},
+    {file = "zope.interface-6.4.post2-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:fbf649bc77510ef2521cf797700b96167bb77838c40780da7ea3edd8b78044d1"},
+    {file = "zope.interface-6.4.post2-cp37-cp37m-win_amd64.whl", hash = "sha256:4c0b208a5d6c81434bdfa0f06d9b667e5de15af84d8cae5723c3a33ba6611b82"},
+    {file = "zope.interface-6.4.post2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d3fe667935e9562407c2511570dca14604a654988a13d8725667e95161d92e9b"},
+    {file = "zope.interface-6.4.post2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a96e6d4074db29b152222c34d7eec2e2db2f92638d2b2b2c704f9e8db3ae0edc"},
+    {file = "zope.interface-6.4.post2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:866a0f583be79f0def667a5d2c60b7b4cc68f0c0a470f227e1122691b443c934"},
+    {file = "zope.interface-6.4.post2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5fe919027f29b12f7a2562ba0daf3e045cb388f844e022552a5674fcdf5d21f1"},
+    {file = "zope.interface-6.4.post2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8e0343a6e06d94f6b6ac52fbc75269b41dd3c57066541a6c76517f69fe67cb43"},
+    {file = "zope.interface-6.4.post2-cp38-cp38-win_amd64.whl", hash = "sha256:dabb70a6e3d9c22df50e08dc55b14ca2a99da95a2d941954255ac76fd6982bc5"},
+    {file = "zope.interface-6.4.post2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:706efc19f9679a1b425d6fa2b4bc770d976d0984335eaea0869bd32f627591d2"},
+    {file = "zope.interface-6.4.post2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3d136e5b8821073e1a09dde3eb076ea9988e7010c54ffe4d39701adf0c303438"},
+    {file = "zope.interface-6.4.post2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1730c93a38b5a18d24549bc81613223962a19d457cfda9bdc66e542f475a36f4"},
+    {file = "zope.interface-6.4.post2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bc2676312cc3468a25aac001ec727168994ea3b69b48914944a44c6a0b251e79"},
+    {file = "zope.interface-6.4.post2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a62fd6cd518693568e23e02f41816adedfca637f26716837681c90b36af3671"},
+    {file = "zope.interface-6.4.post2-cp39-cp39-win_amd64.whl", hash = "sha256:d3f7e001328bd6466b3414215f66dde3c7c13d8025a9c160a75d7b2687090d15"},
+    {file = "zope.interface-6.4.post2.tar.gz", hash = "sha256:1c207e6f6dfd5749a26f5a5fd966602d6b824ec00d2df84a7e9a924e8933654e"},
 ]

 [package.dependencies]
@@ -7552,4 +7629,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "22b73cfd99133089498d0a55cac9fc52bfbdbce5074ef187a774e658a40748c3"
+content-hash = "6ecc369caf1256f86a6cfb642213180173c011eb6de7ffecac002ce5d0b4a661"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,6 +33,7 @@ minio = "^7.2.7"
 gevent = "^24.2.1"
 pyarrow = "16.1.0" # transitive dependency, pinned here to avoid conflicts
 tenacity = "^8.3.0"
+zope-interface = "6.4.post2"

 [tool.poetry.group.llama-index.dependencies]
 llama-index = "*"
@@ -44,7 +45,7 @@ llama-index-embeddings-azure-openai = "*"
 llama-index-embeddings-ollama = "*"

 [tool.poetry.group.dev.dependencies]
-ruff = "0.4.7"
+ruff = "0.4.8"
 mypy = "1.10.0"
 pre-commit = "3.7.1"

@@ -66,11 +67,13 @@ reportlab = "*"
 [tool.coverage.run]
 concurrency = ["gevent"]

+
 [tool.poetry.group.evaluation.dependencies]
 streamlit = "*"
 whatthepatch = "*"
 retry = "*"
 evaluate = "*"
+swebench = { git = "https://github.com/OpenDevin/SWE-bench.git" }

 [build-system]
 build-backend = "poetry.core.masonry.api"
--- a/tests/integration/README.md
+++ b/tests/integration/README.md
@@ -55,6 +55,15 @@ TEST_ONLY=true ./tests/integration/regenerate.sh

 to run all integration tests until the first failure.

+If you only want to run a specific test, set environment variable
+`ONLY_TEST_NAME` to the test name. If you only want to run a specific agent,
+set environment variable `ONLY_TEST_AGENT` to the agent. You could also use both,
+e.g.
+
+```bash
+TEST_ONLY=true ONLY_TEST_NAME="test_simple_task_rejection" ONLY_TEST_AGENT="ManagerAgent" ./tests/integration/regenerate.sh
+```
+

 ## Regenerate Integration Tests
 When you make changes to an agent's prompt, the integration tests will fail. You'll need to regenerate them
@@ -91,7 +100,7 @@ set environment variable `ONLY_TEST_AGENT` to the agent. You could also use both
 e.g.

 ```bash
-ONLY_TEST_NAME="test_write_simple_script" ONLY_TEST_AGENT="MonologueAgent" ./tests/integration/regenerate.sh
+ONLY_TEST_NAME="test_write_simple_script" ONLY_TEST_AGENT="CodeActAgent" ./tests/integration/regenerate.sh
 ```

 ### Force Regenerate with real LLM
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -1,9 +1,8 @@
 import io
 import os
 import re
-import sys
-import tempfile
 import subprocess
+import tempfile
 from functools import partial
 from http.server import HTTPServer, SimpleHTTPRequestHandler
 from threading import Thread
@@ -17,8 +16,23 @@ script_dir = os.path.dirname(os.path.realpath(__file__))
 workspace_path = os.getenv('WORKSPACE_BASE')


+class SecretExit(Exception):
+    pass
+
+
+@pytest.hookimpl(tryfirst=True)
+def pytest_exception_interact(node, call, report):
+    if isinstance(call.excinfo.value, SecretExit):
+        report.outcome = 'failed'
+        report.longrepr = (
+            'SecretExit: Exiting due to an error without revealing secrets.'
+        )
+        call.excinfo = None
+
+
 def filter_out_symbols(input):
-    return ' '.join([char for char in input if char.isalnum()])
+    input = re.sub(r'\\n|\\r\\n|\\r|\s+', '', input)
+    return input


 def get_log_id(prompt_log_name):
@@ -84,13 +98,19 @@ def get_mock_response(test_name: str, messages: str, id: int) -> str:
            print('Mismatched Prompt File path', prompt_file_path)
            print('---' * 10)
            # Create a temporary file to store messages
-            with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as tmp_file:
+            with tempfile.NamedTemporaryFile(
+                delete=False, mode='w', encoding='utf-8'
+            ) as tmp_file:
                tmp_file_path = tmp_file.name
                tmp_file.write(messages)

            try:
                # Use diff command to compare files and capture the output
-                result = subprocess.run(['diff', '-u', prompt_file_path, tmp_file_path], capture_output=True, text=True)
+                result = subprocess.run(
+                    ['diff', '-u', prompt_file_path, tmp_file_path],
+                    capture_output=True,
+                    text=True,
+                )
                if result.returncode != 0:
                    print('Diff:')
                    print(result.stdout)
@@ -136,9 +156,7 @@ def mock_completion(*args, test_name, **kwargs):
    else:
        mock_response = get_mock_response(test_name, message_str, cur_id)
    if mock_response is None:
-        print('Mock response for prompt is not found\n\n')
-        print('Exiting...')
-        sys.exit(1)
+        raise SecretExit('Mock response for prompt is not found')
    response = completion(**kwargs, mock_response=mock_response)
    return response

--- a/tests/integration/mock/BrowsingAgent/test_browse_internet/prompt_002.log
+++ b/tests/integration/mock/BrowsingAgent/test_browse_internet/prompt_002.log
@@ -118,12 +118,11 @@ RootWebArea 'The Ultimate Answer', focused
 	[8] heading 'The Ultimate Answer'
 	[9] paragraph ''
 		StaticText 'Click the button to reveal the answer to life, the universe, and everything.'
-	[10] button 'Click me'
+	[10] button 'Click me', clickable

 # Previous Actions
 goto('http://localhost:8000')

-
 Here is an example with chain of thought of a valid action when clicking on a button:
 "
 In order to accomplish my goal I need to click on the button with bid 12
--- a/tests/integration/mock/BrowsingAgent/test_browse_internet/prompt_003.log
+++ b/tests/integration/mock/BrowsingAgent/test_browse_internet/prompt_003.log
@@ -118,14 +118,13 @@ RootWebArea 'The Ultimate Answer', focused
 	[8] heading 'The Ultimate Answer'
 	[9] paragraph ''
 		StaticText 'Click the button to reveal the answer to life, the universe, and everything.'
-	[10] button 'Click me', focused
+	[10] button 'Click me', clickable, focused
 	StaticText 'The answer is OpenDevin is all you need!'

 # Previous Actions
 goto('http://localhost:8000')
 click("10")

-
 Here is an example with chain of thought of a valid action when clicking on a button:
 "
 In order to accomplish my goal I need to click on the button with bid 12
--- a/tests/integration/mock/CodeActAgent/test_browse_internet/prompt_001.log
+++ b/tests/integration/mock/CodeActAgent/test_browse_internet/prompt_001.log
@@ -0,0 +1,328 @@
+
+
+----------
+
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can use an interactive Python (Jupyter Notebook) environment, executing code with <execute_ipython>.
+<execute_ipython>
+print("Hello World!")
+</execute_ipython>
+The assistant can execute bash commands on behalf of the user by wrapping them with <execute_bash> and </execute_bash>.
+
+For example, you can list the files in the current directory by <execute_bash> ls </execute_bash>.
+Important, however: do not run interactive commands. You do not have access to stdin.
+Also, you need to handle commands that may run indefinitely and not return a result. For such cases, you should redirect the output to a file and run the command in the background to avoid blocking the execution.
+For example, to run a Python script that might run indefinitely without returning immediately, you can use the following format: <execute_bash> python3 app.py > server.log 2>&1 & </execute_bash>
+Also, if a command execution result says something like: Command: "npm start" timed out. Sending SIGINT to the process, you should retry by running the command in the background.
+The assistant can browse the Internet with <execute_browse> and </execute_browse>.
+For example, <execute_browse> Tell me the USA's president using Google search </execute_browse>.
+Or <execute_browse> Tell me what is in http://example.com </execute_browse>.
+The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them.
+To interact with GitHub, use the $GITHUB_TOKEN environment variable.
+For example, to push a branch `my_branch` to the GitHub repo `owner/repo`:
+<execute_bash> git push https://$GITHUB_TOKEN@github.com/owner/repo.git my_branch </execute_bash>
+If $GITHUB_TOKEN is not set, ask the user to set it.
+
+
+Apart from the standard Python library, the assistant can also use the following functions (already imported) in <execute_ipython> environment:
+open_file(path: str, line_number: Optional[int] = None) -> None:
+    Opens the file at the given path in the editor. If line_number is provided, the window will be moved to include that line.
+    Args:
+    path: str: The path to the file to open.
+    line_number: Optional[int]: The line number to move to.
+
+goto_line(line_number: int) -> None:
+    Moves the window to show the specified line number.
+    Args:
+    line_number: int: The line number to move to.
+
+scroll_down() -> None:
+    Moves the window down by 100 lines.
+    Args:
+    None
+
+scroll_up() -> None:
+    Moves the window up by 100 lines.
+    Args:
+    None
+
+create_file(filename: str) -> None:
+    Creates and opens a new file with the given name.
+    Args:
+    filename: str: The name of the file to create.
+
+append_file(file_name: str, content: str) -> None:
+    Append content to the given file.
+    It appends text `content` to the end of the specified file.
+    Args:
+    file_name: str: The name of the file to append to.
+    content: str: The content to append to the file.
+
+edit_file(file_name: str, start: int, end: int, content: str) -> None:
+    Edit a file.
+    Replaces in given file `file_name` the lines `start` through `end` (inclusive) with the given text `content`.
+    Args:
+    file_name: str: The name of the file to edit.
+    start: int: The start line number. Must satisfy start >= 1.
+    end: int: The end line number. Must satisfy start <= end <= number of lines in the file.
+    content: str: The content to replace the lines with.
+
+search_dir(search_term: str, dir_path: str = './') -> None:
+    Searches for search_term in all files in dir_path. If dir_path is not provided, searches in the current directory.
+    Args:
+    search_term: str: The term to search for.
+    dir_path: Optional[str]: The path to the directory to search.
+
+search_file(search_term: str, file_path: Optional[str] = None) -> None:
+    Searches for search_term in file_path. If file_path is not provided, searches in the current open file.
+    Args:
+    search_term: str: The term to search for.
+    file_path: Optional[str]: The path to the file to search.
+
+find_file(file_name: str, dir_path: str = './') -> None:
+    Finds all files with the given name in the specified directory.
+    Args:
+    file_name: str: The name of the file to find.
+    dir_path: Optional[str]: The path to the directory to search.
+
+parse_pdf(file_path: str) -> None:
+    Parses the content of a PDF file and prints it.
+    Args:
+    file_path: str: The path to the file to open.
+
+parse_docx(file_path: str) -> None:
+    Parses the content of a DOCX file and prints it.
+    Args:
+    file_path: str: The path to the file to open.
+
+parse_latex(file_path: str) -> None:
+    Parses the content of a LaTeX file and prints it.
+    Args:
+    file_path: str: The path to the file to open.
+
+parse_pptx(file_path: str) -> None:
+    Parses the content of a pptx file and prints it.
+    Args:
+    file_path: str: The path to the file to open.
+
+Please note that THE `edit_file` and `append_file` FUNCTIONS REQUIRE PROPER INDENTATION. If the assistant would like to add the line '        print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+Responses should be concise.
+The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
+Include ONLY ONE <execute_ipython>, <execute_bash>, or <execute_browse> per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
+IMPORTANT: Execute code using <execute_ipython>, <execute_bash>, or <execute_browse> whenever possible.
+
+
+----------
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
+
+ASSISTANT:
+Sure! Let me create a Python file `app.py`:
+<execute_ipython>
+create_file('app.py')
+</execute_ipython>
+
+USER:
+OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+1|
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+<execute_ipython>
+EDITED_CODE="""from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+    numbers = list(range(1, 11))
+    return str(numbers)
+
+if __name__ == '__main__':
+    app.run(port=5000)"""
+edit_file('app.py', start=1, end=1, content=EDITED_CODE)
+</execute_ipython>
+
+USER:
+OBSERVATION:
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return str(numbers)
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
+[File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+OBSERVATION:
+[1] 121[1]+  Exit 1                  python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server started with PID 121 and then crashed. Let me check the server log:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+OBSERVATION:
+Traceback (most recent call last):
+  File "/workspace/app.py", line 2, in <module>
+    from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+<execute_bash>
+pip install flask
+</execute_bash>
+
+USER:
+OBSERVATION:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+  Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+  Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+  Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+  Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+  Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/opendevin/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/opendevin/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+OBSERVATION:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+OBSERVATION:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+<execute_browse>
+Get the content on "http://127.0.0.1:5000"
+</execute_browse>
+
+USER:
+OBSERVATION:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+<execute_bash>
+kill 124
+</execute_bash>
+
+USER:
+OBSERVATION:
+[1]+  Terminated              python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+<execute_ipython>
+open_file('app.py')
+</execute_ipython>
+
+USER:
+[File: /workspace/app.py (10 lines total)]
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return str(numbers)
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
+<execute_ipython>
+edit_file('app.py', start=7, end=7, content="    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'")
+</execute_ipython>
+
+USER:
+OBSERVATION:
+[File: /workspace/app.py (10 lines total after edit)]
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
+[File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+Running the updated file:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+OBSERVATION:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
+
+--- END OF EXAMPLE ---
+
+
+NOW, LET'S START!
+
+----------
+
+Browse localhost:8000, and tell me the ultimate answer to life. Do not ask me for confirmation at any point.
+
+ENVIRONMENT REMINDER: You have 9 turns left to complete the task.
--- a/Show More
+++ b/Show More