Solved Hugging Face cache issue. (#2277 )

Bump litellm from 1.40.0 to 1.40.2 (#2282 )
Bumps [litellm](https://github.com/BerriAI/litellm) from 1.40.0 to 1.40.2. - [Release notes](https://github.com/BerriAI/litellm/releases) - [Commits](https://github.com/BerriAI/litellm/compare/v1.40.0...v1.40.2) --- updated-dependencies: - dependency-name: litellm dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-04-29 03:00:45 -04:00 · 2024-06-05 21:18:33 +05:30 · 2024-06-05 23:46:00 +08:00 · 2024-06-05 23:45:42 +08:00 · 2024-06-05 23:40:53 +08:00 · 2024-06-05 23:39:41 +08:00
236 changed files with 13922 additions and 3373 deletions
@@ -10,6 +10,9 @@ on:
    - main
  pull_request:

+env:
+  # Do not persist the sandbox, so that each task starts from a clean one.
+  PERSIST_SANDBOX: "false"
+
 jobs:
  test:
    runs-on: ubuntu-latest
@@ -47,11 +47,4 @@ jobs:
      - name: Install pre-commit
        run: pip install pre-commit==3.7.0
      - name: Run pre-commit hooks
-        if: github.ref != 'refs/heads/main'
-        run: |
-          git fetch https://github.com/OpenDevin/OpenDevin.git main:main && \
-          pre-commit run \
-            --files \
-            $(git diff --name-only $(git merge-base main $(git branch --show-current)) $(git branch --show-current) | tr '\n' ' ') \
-            --show-diff-on-failure \
-            --config ./dev_config/python/.pre-commit-config.yaml
+        # `**` only recurses when bash's `globstar` option is enabled, and it is off by
+        # default, so `dir/**/*` would skip nested files. List the tracked files instead.
+        run: pre-commit run --files $(git ls-files opendevin agenthub evaluation) --show-diff-on-failure --config ./dev_config/python/.pre-commit-config.yaml
@@ -44,12 +44,24 @@ jobs:
        echo "" >> task.txt
        echo "Diff file is: ${{ github.event.pull_request.number }}.diff" >> task.txt

+    - name: Set up environment
+      run: |
+        curl -sSL https://install.python-poetry.org | python3 -
+        export PATH="/github/home/.local/bin:$PATH"
+        poetry install --without evaluation
+        poetry run playwright install --with-deps chromium
+
    - name: Run OpenDevin
      env:
        LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        SANDBOX_TYPE: exec
      run: |
-        WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE python ./opendevin/core/main.py -i 50 -f task.txt -d $GITHUB_WORKSPACE
+        # Prepend Poetry's install directory to PATH so that poetry can be launched
+        export PATH="/github/home/.local/bin:$PATH"
+        # Add the current directory to PYTHONPATH so the package imports correctly; note: it must come first
+        export PYTHONPATH=$(pwd):$PYTHONPATH
+        WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE poetry run python ./opendevin/core/main.py -i 50 -f task.txt -d $GITHUB_WORKSPACE
        rm task.txt

    - name: Check if review file is non-empty
@@ -15,6 +15,9 @@ on:
      - 'evaluation/**'
  pull_request:

+env:
+  # Do not persist the sandbox, so that each task starts from a clean one.
+  PERSIST_SANDBOX: "false"
+
 jobs:
  integration-tests-on-linux:
    name: Integration Tests on Linux
@@ -15,6 +15,9 @@ on:
      - 'evaluation/**'
  pull_request:

+env:
+  # Do not persist the sandbox, so that each task starts from a clean one.
+  PERSIST_SANDBOX: "false"
+
 jobs:
  test-on-macos:
    name: Test on macOS
@@ -5,8 +5,8 @@ This guide is for people working on OpenDevin and editing the source code.

 ### 1. Requirements
 * Linux, Mac OS, or [WSL on Windows](https://learn.microsoft.com/en-us/windows/wsl/install)
-* [Docker](https://docs.docker.com/engine/install/)(For those on MacOS, make sure to allow the default Docker socket to be used from advanced settings!)
-* [Python](https://www.python.org/downloads/) >= 3.11
+* [Docker](https://docs.docker.com/engine/install/) (For those on MacOS, make sure to allow the default Docker socket to be used from advanced settings!)
+* [Python](https://www.python.org/downloads/) = 3.11
 * [NodeJS](https://nodejs.org/en/download/package-manager) >= 18.17.1
 * [Poetry](https://python-poetry.org/docs/#installing-with-the-official-installer) >= 1.8

@@ -45,6 +45,7 @@ To configure the LM of your choice, follow these steps:
   make setup-config
   ```
   This command will prompt you to enter the LLM API key, model name, and other variables ensuring that OpenDevin is tailored to your specific needs. Note that the model name will apply only when you run headless. If you use the UI, please set the model in the UI.
+   Set `persist_sandbox` to false if you want to use a clean sandbox for each task. If `persist_sandbox` is set to true, you will also need to set `ssh_password`.

 **Note on Alternative Models:**
 Some alternative models may prove more challenging to tame than others. Fear not, brave adventurer! We shall soon unveil LLM-specific documentation to guide you on your quest. And if you've already mastered the art of wielding a model other than OpenAI's GPT, we encourage you to [share your setup instructions with us](https://github.com/OpenDevin/OpenDevin/issues/417).
@@ -97,5 +98,5 @@ Please refer to [this README](./tests/integration/README.md) for details.

 ### 9. Add or update dependency

-1. Add your dependency in `pyproject.toml` or use `peotry add xxx`
-2. Update the poetry.lock file via `poetry lock --no-update`
+1. Add your dependency in `pyproject.toml` or use `poetry add xxx`
+2. Update the poetry.lock file via `poetry lock --no-update`
@@ -7,7 +7,7 @@ BACKEND_PORT = 3000
 BACKEND_HOST = "127.0.0.1:$(BACKEND_PORT)"
 FRONTEND_PORT = 3001
 DEFAULT_WORKSPACE_DIR = "./workspace"
-DEFAULT_MODEL = "gpt-3.5-turbo"
+DEFAULT_MODEL = "gpt-4o"
 CONFIG_FILE = config.toml
 PRECOMMIT_CONFIG_PATH = "./dev_config/python/.pre-commit-config.yaml"

@@ -142,7 +142,14 @@ install-python-dependencies:
 		poetry run pip install playwright; \
 		poetry run playwright install chromium; \
 	else \
-		poetry run playwright install --with-deps chromium; \
+		if [ ! -f cache/playwright_chromium_is_installed.txt ]; then \
+			echo "Running playwright install --with-deps chromium..."; \
+			poetry run playwright install --with-deps chromium; \
+			mkdir -p cache; \
+			touch cache/playwright_chromium_is_installed.txt; \
+		else \
+			echo "Setup already done. Skipping playwright installation."; \
+		fi \
 	fi
 	@echo "$(GREEN)Python dependencies installed successfully.$(RESET)"

@@ -165,7 +172,7 @@ install-precommit-hooks:

 lint-backend:
 	@echo "$(YELLOW)Running linters...$(RESET)"
-	@poetry run pre-commit run --files $$(git diff --name-only $$(git merge-base main $$(git branch --show-current)) $$(git branch --show-current) | tr '\n' ' ') --show-diff-on-failure --config $(PRECOMMIT_CONFIG_PATH)
+# `**` does not recurse in the default shell (globstar is off), so `dir/**/*` would
+# skip nested files. List the tracked files explicitly instead.
+	@poetry run pre-commit run --files $$(git ls-files opendevin agenthub evaluation) --show-diff-on-failure --config $(PRECOMMIT_CONFIG_PATH)

 lint-frontend:
 	@echo "$(YELLOW)Running linters for frontend...$(RESET)"
@@ -226,6 +233,15 @@ setup-config-prompts:
 	 workspace_dir=$${workspace_dir:-$(DEFAULT_WORKSPACE_DIR)}; \
 	 echo "workspace_base=\"$$workspace_dir\"" >> $(CONFIG_FILE).tmp

+	@read -p "Do you want to persist the sandbox container? [true/false] [default: true]: " persist_sandbox; \
+	 persist_sandbox=$${persist_sandbox:-true}; \
+	 if [ "$$persist_sandbox" = "true" ]; then \
+		 read -p "Enter a password for the sandbox container: " ssh_password; \
+		 echo "ssh_password=\"$$ssh_password\"" >> $(CONFIG_FILE).tmp; \
+	 else \
+		echo "persist_sandbox=$$persist_sandbox" >> $(CONFIG_FILE).tmp; \
+	 fi
+
 	@echo "" >> $(CONFIG_FILE).tmp

 	@echo "[llm]" >> $(CONFIG_FILE).tmp
@@ -49,31 +49,30 @@ OpenDevin agents collaborate with human developers to write code, fix bugs, and
 The easiest way to run OpenDevin is inside a Docker container. It works best with the most recent version of Docker, `26.0.0`.
 You must be using Linux, Mac OS, or WSL on Windows.

-To start the app, run these commands, replacing `$(pwd)/workspace` with the directory you want OpenDevin to work with.
+To start OpenDevin in a docker container, run the following commands in your terminal:
+
+> [!WARNING]
+> When you run the following command, files in `./workspace` may be modified or deleted.

 ```bash
-# The directory you want OpenDevin to work with. MUST be an absolute path!
-export WORKSPACE_BASE=$(pwd)/workspace;
-```
-
-> [!WARNING]  
-> OpenDevin runs bash commands within a Docker sandbox, so it should not affect your machine. 
-> But your workspace directory will be attached to that sandbox, and files in the directory may be modified or deleted.
-
-```bash
-docker run \
-    -it \
+OPENDEVIN_WORKSPACE=$(pwd)/workspace
+docker run -it \
    --pull=always \
    -e SANDBOX_USER_ID=$(id -u) \
-    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-    -v $WORKSPACE_BASE:/opt/workspace_base \
+    -e PERSIST_SANDBOX="true" \
+    -e SSH_PASSWORD="make something up here" \
+    -e WORKSPACE_MOUNT_PATH=$OPENDEVIN_WORKSPACE \
+    -v $OPENDEVIN_WORKSPACE:/opt/workspace_base \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
-    ghcr.io/opendevin/opendevin:0.5
+    --name opendevin-app-$(date +%Y%m%d%H%M%S) \
+    ghcr.io/opendevin/opendevin:0.6
 ```

-You'll find OpenDevin running at [http://localhost:3000](http://localhost:3000).
+You'll find OpenDevin running at [http://localhost:3000](http://localhost:3000) with access to `./workspace`. To have OpenDevin operate on your code, place it in `./workspace`.
+
+OpenDevin will only have access to this workspace folder. The rest of your system will not be affected, as OpenDevin runs in a secured Docker sandbox.

 ## 🚀 Documentation

@@ -100,7 +99,7 @@ For details, please check [CONTRIBUTING.md](./CONTRIBUTING.md).
 Whether you're a developer, a researcher, or simply enthusiastic about OpenDevin, we'd love to have you in our community.
 Let's make software engineering better together!

- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2ggtwn3k5-PvAA2LUmqGHVZ~XzGq~ILw) - Here we talk about research, architecture, and future development.
+- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA) - Here we talk about research, architecture, and future development.
 - [Discord server](https://discord.gg/ESHStjSjD4) - This is a community-run server for general discussion, questions, and feedback.

 ## 📈 Progress
@@ -8,6 +8,7 @@ from opendevin.events.action import (
 )
 from opendevin.events.serialization.event import event_to_memory
 from opendevin.llm.llm import LLM
+from opendevin.runtime.tools import RuntimeTool

 from .parser import parse_command
 from .prompts import (
@@ -27,6 +28,7 @@ class SWEAgent(Agent):

    SWE-agent includes ACI functions like 'goto', 'search_for', 'edit', 'scroll', 'run'
    """
+    runtime_tools: list[RuntimeTool] = [RuntimeTool.BROWSER]

    def __init__(self, llm: LLM):
        super().__init__(llm)
@@ -12,6 +12,7 @@ from . import (  # noqa: E402
    SWE_agent,
    browsing_agent,
    codeact_agent,
+    codeact_swe_agent,
    delegator_agent,
    dummy_agent,
    monologue_agent,
@@ -21,6 +22,7 @@ from . import (  # noqa: E402
 __all__ = [
    'monologue_agent',
    'codeact_agent',
+    'codeact_swe_agent',
    'planner_agent',
    'SWE_agent',
    'delegator_agent',
@@ -17,6 +17,7 @@ from opendevin.llm.llm import LLM
 from opendevin.runtime.plugins import (
    PluginRequirement,
 )
+from opendevin.runtime.tools import RuntimeTool


 def parse_response(response: str) -> Action:
@@ -42,6 +43,7 @@ class BrowsingAgent(Agent):
    """

    sandbox_plugins: list[PluginRequirement] = []
+    runtime_tools: list[RuntimeTool] = [RuntimeTool.BROWSER]

    def __init__(
        self,
@@ -146,7 +146,7 @@ class Shrinkable(PromptElement, abc.ABC):
        """Implement shrinking of this prompt element.

        You need to recursively call all shrinkable elements that are part of
-        this prompt. You can also implement a shriking startegy for this prompt.
+        this prompt. You can also implement a shrinking strategy for this prompt.
        Shrinking is can be called multiple times to progressively shrink the
        prompt until it fits max_tokens. Default max shrink iterations is 20.
        """
@@ -161,7 +161,7 @@ class Truncater(Shrinkable):

    def __init__(self, visible, shrink_speed=0.3, start_truncate_iteration=10):
        super().__init__(visible=visible)
-        self.shrink_speed = shrink_speed  # the percentage shrinked in each iteration
+        self.shrink_speed = shrink_speed  # the percentage shrunk in each iteration
        self.start_truncate_iteration = (
            start_truncate_iteration  # the iteration to start truncating
        )
@@ -494,11 +494,13 @@ def _get_action_space(flags: Flags) -> AbstractActionSet:
            action_space = PythonActionSet(strict=flags.is_strict)
            if flags.multi_actions:
                warn(
-                    f'Flag action_space={repr(flags.action_space)} incompatible with multi_actions={repr(flags.multi_actions)}.'
+                    f'Flag action_space={repr(flags.action_space)} incompatible with multi_actions={repr(flags.multi_actions)}.',
+                    stacklevel=2,
                )
            if flags.demo_mode != 'off':
                warn(
-                    f'Flag action_space={repr(flags.action_space)} incompatible with demo_mode={repr(flags.demo_mode)}.'
+                    f'Flag action_space={repr(flags.action_space)} incompatible with demo_mode={repr(flags.demo_mode)}.',
+                    stacklevel=2,
                )
            return action_space
        case 'bid':
@@ -16,7 +16,7 @@ def yaml_parser(message):
        valid = True
        retry_message = ''
    except yaml.YAMLError as e:
-        warn(str(e))
+        warn(str(e), stacklevel=2)
        value = {}
        valid = False
        retry_message = "Your response is not a valid yaml. Please try again and be careful to the format. Don't add any apology or comment, just the answer."
@@ -28,6 +28,7 @@ from opendevin.runtime.plugins import (
    JupyterRequirement,
    PluginRequirement,
 )
+from opendevin.runtime.tools import RuntimeTool

 ENABLE_GITHUB = True

@@ -105,6 +106,18 @@ def truncate_observation(observation: str, max_chars: int = 10_000) -> str:
    )


+# FIXME: We can tweak these two settings to create MicroAgents specialized toward different areas
+def get_system_message() -> str:
+    """Assemble the system prompt for the agent.
+
+    The GitHub instructions are inserted right after the prefix only when
+    ENABLE_GITHUB is truthy; the command docs and the suffix always follow.
+    """
+    github_section = f'\n{GITHUB_MESSAGE}' if ENABLE_GITHUB else ''
+    return f'{SYSTEM_PREFIX}{github_section}\n\n{COMMAND_DOCS}\n\n{SYSTEM_SUFFIX}'
+
+
+def get_in_context_example() -> str:
+    """Return the in-context example text (the EXAMPLES constant)."""
+    return EXAMPLES
+
+
 class CodeActAgent(Agent):
    VERSION = '1.5'
    """
@@ -150,13 +163,11 @@ class CodeActAgent(Agent):
        AgentSkillsRequirement(),
        JupyterRequirement(),
    ]
+    runtime_tools: list[RuntimeTool] = [RuntimeTool.BROWSER]
    jupyter_kernel_init_code: str = 'from agentskills import *'

-    system_message: str = (
-        f'{SYSTEM_PREFIX}\n{GITHUB_MESSAGE}\n\n{COMMAND_DOCS}\n\n{SYSTEM_SUFFIX}'
-        if ENABLE_GITHUB
-        else f'{SYSTEM_PREFIX}\n\n{COMMAND_DOCS}\n\n{SYSTEM_SUFFIX}'
-    )
+    system_message: str = get_system_message()
+    in_context_example: str = f"Here is an example of how you can interact with the environment for task solving:\n{get_in_context_example()}\n\nNOW, LET'S START!"

    def __init__(
        self,
@@ -194,10 +205,7 @@ class CodeActAgent(Agent):
        """
        messages: list[dict[str, str]] = [
            {'role': 'system', 'content': self.system_message},
-            {
-                'role': 'user',
-                'content': f"Here is an example of how you can interact with the environment for task solving:\n{EXAMPLES}\n\nNOW, LET'S START!",
-            },
+            {'role': 'user', 'content': self.in_context_example},
        ]

        for prev_action, obs in state.history:
@@ -8,17 +8,23 @@ COMMAND_DOCS = (
    "Please note that THE `edit_file` FUNCTION REQUIRES PROPER INDENTATION. If the assistant would like to add the line '        print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run."
 )

-SYSTEM_PREFIX = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+# ======= SYSTEM MESSAGE =======
+MINIMAL_SYSTEM_PREFIX = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
 The assistant can interact with an interactive Python (Jupyter Notebook) environment and receive the corresponding output when needed. The code should be enclosed using "<execute_ipython>" tag, for example:
 <execute_ipython>
 print("Hello World!")
 </execute_ipython>
 The assistant can execute bash commands on behalf of the user by wrapping them with <execute_bash> and </execute_bash>.
 For example, you can list the files in the current directory by <execute_bash> ls </execute_bash>.
-The assistant can browse the Internet with commands on behalf of the user by wrapping them with <execute_browse> and </execute_browse>.
+"""
+
+BROWSING_PREFIX = """The assistant can browse the Internet with commands on behalf of the user by wrapping them with <execute_browse> and </execute_browse>.
 For example, you can browse a given URL by <execute_browse> goto("<URL>") </execute_browse>.
 The assistant should attempt fewer things at a time instead of putting too much commands OR code in one "execute" block.
-The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them."""
+"""
+PIP_INSTALL_PREFIX = """The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them."""
+
+SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + BROWSING_PREFIX + PIP_INSTALL_PREFIX

 GITHUB_MESSAGE = """To do any activities on GitHub, the assistant should use the token in the $GITHUB_TOKEN environment variable.
 For instance, to push a local branch `my_branch` to the github repo `owner/repo`, the assistant can use the following four commands:
@@ -30,6 +36,8 @@ The assistant should include ONLY ONE <execute_ipython> or <execute_bash> or <ex
 IMPORTANT: Whenever possible, execute the code for the user using <execute_ipython> or <execute_bash> or <execute_browse> instead of providing it.
 """

+
+# ======= EXAMPLE MESSAGE =======
 EXAMPLES = """
 --- START OF EXAMPLE ---

@@ -0,0 +1,7 @@
+# CodeAct (SWE Edit Specialized)
+
+This agent is an adaptation of the original [SWE Agent](https://swe-agent.com/) based on CodeAct using the `agentskills` library of OpenDevin.
+
+Its intended use is **solving GitHub issues**.
+
+It removes the web-browsing and GitHub capabilities of the original CodeAct agent to avoid confusing the agent.
@@ -0,0 +1,5 @@
+from opendevin.controller.agent import Agent
+
+from .codeact_swe_agent import CodeActSWEAgent
+
+Agent.register('CodeActSWEAgent', CodeActSWEAgent)
@@ -0,0 +1,248 @@
+import re
+
+from agenthub.codeact_swe_agent.prompt import (
+    COMMAND_DOCS,
+    MINIMAL_SYSTEM_PREFIX,
+    SWE_EXAMPLE,
+    SYSTEM_SUFFIX,
+)
+from opendevin.controller.agent import Agent
+from opendevin.controller.state.state import State
+from opendevin.events.action import (
+    Action,
+    AgentFinishAction,
+    BrowseInteractiveAction,
+    CmdRunAction,
+    IPythonRunCellAction,
+    MessageAction,
+)
+from opendevin.events.observation import (
+    BrowserOutputObservation,
+    CmdOutputObservation,
+    IPythonRunCellObservation,
+)
+from opendevin.llm.llm import LLM
+from opendevin.runtime.plugins import (
+    AgentSkillsRequirement,
+    JupyterRequirement,
+    PluginRequirement,
+)
+from opendevin.runtime.tools import RuntimeTool
+
+
+def parse_response(response) -> str:
+    """Extract the completion text and close any dangling execute tag.
+
+    Reads `response.choices[0].message.content`. For each of the bash, ipython
+    and browse tags, if the opening `<execute_*>` tag is present but the matching
+    closing tag is not, the closing tag is appended to the end of the text.
+
+    Returns:
+    - The completion text, with any missing closing tags appended.
+    """
+    text = response.choices[0].message.content
+    for tag in ('bash', 'ipython', 'browse'):
+        opening = f'<execute_{tag}>'
+        closing = f'</execute_{tag}>'
+        if opening in text and closing not in text:
+            text += closing
+    return text
+
+
+def action_to_str(action: Action) -> str:
+    """Render an action as the text shown in the chat history.
+
+    Bash, IPython and browse actions become the action's thought followed by
+    its payload wrapped in the matching `<execute_*>` tag. A MessageAction
+    becomes its content. Any other action becomes the empty string.
+    """
+    # (action type, tag name, attribute holding the payload), checked in order.
+    wrappers = (
+        (CmdRunAction, 'execute_bash', 'command'),
+        (IPythonRunCellAction, 'execute_ipython', 'code'),
+        (BrowseInteractiveAction, 'execute_browse', 'browser_actions'),
+    )
+    for action_type, tag, attr in wrappers:
+        if isinstance(action, action_type):
+            return f'{action.thought}\n<{tag}>\n{getattr(action, attr)}\n</{tag}>'
+    if isinstance(action, MessageAction):
+        return action.content
+    return ''
+
+
+def get_action_message(action: Action) -> dict[str, str] | None:
+    """Convert an action into a chat message, or None if it is not shown.
+
+    Only browse, bash, IPython and message actions produce a message. The role
+    is 'user' when the action's source is 'user', otherwise 'assistant'.
+    """
+    supported = (
+        BrowseInteractiveAction,
+        CmdRunAction,
+        IPythonRunCellAction,
+        MessageAction,
+    )
+    if not isinstance(action, supported):
+        return None
+    role = 'user' if action.source == 'user' else 'assistant'
+    return {'role': role, 'content': action_to_str(action)}
+
+
+def get_observation_message(obs) -> dict[str, str] | None:
+    """Convert an observation into a user-role chat message.
+
+    Parameters:
+    - obs: the observation to convert
+
+    Returns:
+    - A dict with role 'user' whose content starts with an OBSERVATION: header,
+      for command, IPython and browser observations
+    - None for any other observation type
+    """
+    if isinstance(obs, CmdOutputObservation):
+        content = 'OBSERVATION:\n' + truncate_observation(obs.content)
+        # The status note is a single bracketed line, so it closes with one ']'.
+        content += (
+            f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
+        )
+        return {'role': 'user', 'content': content}
+    elif isinstance(obs, IPythonRunCellObservation):
+        content = 'OBSERVATION:\n' + obs.content
+        # replace base64 images with a placeholder
+        splitted = content.split('\n')
+        for i, line in enumerate(splitted):
+            if '![image](data:image/png;base64,' in line:
+                splitted[i] = (
+                    '![image](data:image/png;base64, ...) already displayed to user'
+                )
+        content = '\n'.join(splitted)
+        # Truncate only after the image lines were replaced by the placeholder.
+        content = truncate_observation(content)
+        return {'role': 'user', 'content': content}
+    elif isinstance(obs, BrowserOutputObservation):
+        content = 'OBSERVATION:\n' + truncate_observation(obs.content)
+        return {'role': 'user', 'content': content}
+    return None
+
+
+def truncate_observation(observation: str, max_chars: int = 10_000) -> str:
+    """
+    Truncate the middle of the observation if it is too long.
+
+    Parameters:
+    - observation (str): the text to shorten
+    - max_chars (int): observations of at most this many characters are returned unchanged
+
+    Returns:
+    - The observation unchanged, or its first and last `max_chars // 2` characters
+      joined by a truncation marker
+    """
+    if len(observation) <= max_chars:
+        return observation
+    half = max_chars // 2
+    # Slice the tail from an explicit start index: `observation[-half:]` would
+    # return the whole string when half is 0 (max_chars < 2).
+    tail_start = len(observation) - half
+    return (
+        observation[:half]
+        + '\n[... Observation truncated due to length ...]\n'
+        + observation[tail_start:]
+    )
+
+
+def get_system_message() -> str:
+    """Build the system prompt from the minimal prefix, the command docs and the suffix."""
+    return f'{MINIMAL_SYSTEM_PREFIX}\n\n{COMMAND_DOCS}\n\n{SYSTEM_SUFFIX}'
+
+
+def get_in_context_example() -> str:
+    """Return the in-context example text (the SWE_EXAMPLE constant)."""
+    return SWE_EXAMPLE
+
+
+class CodeActSWEAgent(Agent):
+    """
+    This agent is an adaptation of the original [SWE Agent](https://swe-agent.com/) based on CodeAct 1.5 using the `agentskills` library of OpenDevin.
+
+    Its intended use is **solving GitHub issues**.
+
+    It removes the web-browsing and GitHub capabilities of the original CodeAct agent to avoid confusing the agent.
+    """
+
+    VERSION = '1.5'
+
+    sandbox_plugins: list[PluginRequirement] = [
+        # NOTE: AgentSkillsRequirement need to go before JupyterRequirement, since
+        # AgentSkillsRequirement provides a lot of Python functions
+        # and it need to be initialized before Jupyter for Jupyter to use those functions.
+        AgentSkillsRequirement(),
+        JupyterRequirement(),
+    ]
+    # NOTE(review): the class docstring says web browsing is removed, yet the
+    # browser tool is still requested here — confirm whether it is needed.
+    runtime_tools: list[RuntimeTool] = [RuntimeTool.BROWSER]
+    jupyter_kernel_init_code: str = 'from agentskills import *'
+
+    # Both prompts are built once, when the class body is executed.
+    system_message: str = get_system_message()
+    in_context_example: str = f"Here is an example of how you can interact with the environment for task solving:\n{get_in_context_example()}\n\nNOW, LET'S START!"
+
+    def __init__(
+        self,
+        llm: LLM,
+    ) -> None:
+        """
+        Initializes a new instance of the CodeActSWEAgent class.
+
+        Parameters:
+        - llm (LLM): The llm to be used by this agent
+        """
+        super().__init__(llm)
+        self.reset()
+
+    def reset(self) -> None:
+        """
+        Resets the CodeAct SWE Agent.
+        """
+        super().reset()
+
+    def step(self, state: State) -> Action:
+        """
+        Performs one step using the CodeAct SWE Agent.
+        This includes gathering info on previous steps and prompting the model to make a command to execute.
+
+        Parameters:
+        - state (State): used to get updated info and background commands
+
+        Returns:
+        - CmdRunAction(command) - bash command to run
+        - IPythonRunCellAction(code) - IPython code to run
+        - BrowseInteractiveAction(browsergym_command) - BrowserGym commands to run
+        - MessageAction(content) - Message action to run (e.g. ask for clarification)
+        - AgentFinishAction() - end the interaction
+        """
+        messages: list[dict[str, str]] = [
+            {'role': 'system', 'content': self.system_message},
+            {'role': 'user', 'content': self.in_context_example},
+        ]
+
+        # Replay the history as alternating action / observation messages.
+        for prev_action, obs in state.history:
+            action_message = get_action_message(prev_action)
+            if action_message:
+                messages.append(action_message)
+
+            obs_message = get_observation_message(obs)
+            if obs_message:
+                messages.append(obs_message)
+
+        # The in-context example is a user message, so this list is never empty.
+        latest_user_message = [m for m in messages if m['role'] == 'user'][-1]
+        if latest_user_message:
+            if latest_user_message['content'].strip() == '/exit':
+                return AgentFinishAction()
+            latest_user_message['content'] += (
+                f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task.'
+            )
+
+        response = self.llm.do_completion(
+            messages=messages,
+            stop=[
+                '</execute_ipython>',
+                '</execute_bash>',
+                '</execute_browse>',
+            ],
+            temperature=0.0,
+        )
+
+        action_str: str = parse_response(response)
+        state.num_of_chars += sum(
+            len(message['content']) for message in messages
+        ) + len(action_str)
+
+        if finish_command := re.search(r'<finish>.*</finish>', action_str, re.DOTALL):
+            thought = action_str.replace(finish_command.group(0), '').strip()
+            return AgentFinishAction(thought=thought)
+        if bash_command := re.search(
+            r'<execute_bash>(.*?)</execute_bash>', action_str, re.DOTALL
+        ):
+            # remove the command from the action string to get thought
+            thought = action_str.replace(bash_command.group(0), '').strip()
+            # a command was found
+            command_group = bash_command.group(1).strip()
+
+            if command_group == 'exit':
+                return AgentFinishAction()
+            return CmdRunAction(command=command_group, thought=thought)
+        elif python_code := re.search(
+            r'<execute_ipython>(.*?)</execute_ipython>', action_str, re.DOTALL
+        ):
+            # a code block was found
+            code_group = python_code.group(1).strip()
+            thought = action_str.replace(python_code.group(0), '').strip()
+            return IPythonRunCellAction(
+                code=code_group,
+                thought=thought,
+                kernel_init_code=self.jupyter_kernel_init_code,
+            )
+        elif browse_command := re.search(
+            # Non-greedy, like the bash and ipython patterns above.
+            r'<execute_browse>(.*?)</execute_browse>',
+            action_str,
+            re.DOTALL,
+        ):
+            # BrowserGym actions was found
+            browse_actions = browse_command.group(1).strip()
+            thought = action_str.replace(browse_command.group(0), '').strip()
+            return BrowseInteractiveAction(
+                browser_actions=browse_actions, thought=thought
+            )
+        else:
+            # We assume the LLM is GOOD enough that when it returns pure natural language
+            # it want to talk to the user
+            return MessageAction(content=action_str, wait_for_response=True)
+
+    def search_memory(self, query: str) -> list[str]:
+        """
+        Not implemented for this agent; always raises NotImplementedError.
+        """
+        raise NotImplementedError('Implement this abstract method')
@@ -0,0 +1,451 @@
+from opendevin.runtime.plugins import AgentSkillsRequirement
+
+_AGENT_SKILLS_DOCS = AgentSkillsRequirement.documentation
+
+COMMAND_DOCS = (
+    '\nApart from the standard Python library, the assistant can also use the following functions (already imported) in <execute_ipython> environment:\n'
+    f'{_AGENT_SKILLS_DOCS}'
+    "Please note that THE `edit_file` FUNCTION REQUIRES PROPER INDENTATION. If the assistant would like to add the line '        print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run."
+)
+
+# ======= SYSTEM MESSAGE =======
+MINIMAL_SYSTEM_PREFIX = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can interact with an interactive Python (Jupyter Notebook) environment and receive the corresponding output when needed. The code should be enclosed using "<execute_ipython>" tag, for example:
+<execute_ipython>
+print("Hello World!")
+</execute_ipython>
+The assistant can execute bash commands on behalf of the user by wrapping them with <execute_bash> and </execute_bash>.
+For example, you can list the files in the current directory by <execute_bash> ls </execute_bash>.
+"""
+
+SYSTEM_SUFFIX = """The assistant's response should be concise.
+The assistant should include ONLY ONE <execute_ipython> or <execute_bash> or <execute_browse> in every one of the responses, unless the assistant is finished with the task or need more input or action from the user in order to proceed.
+IMPORTANT: Whenever possible, execute the code for the user using <execute_ipython> or <execute_bash> or <execute_browse> instead of providing it.
+"""
+
+SWE_EXAMPLE = """
+--- START OF EXAMPLE ---
+
+USER: We are currently solving the following issue within our repository. Here is the issue text:
+--- BEGIN ISSUE ---
+TimeDelta serialization precision
+Hi there!
+
+I just found quite strange behaviour of `TimeDelta` field serialization
+
+```python3
+from marshmallow.fields import TimeDelta
+from datetime import timedelta
+
+td_field = TimeDelta(precision="milliseconds")
+
+obj = dict()
+obj["td_field"] = timedelta(milliseconds=345)
+
+print(td_field.serialize("td_field", obj))
+```
+
+Output of this snippet is `344`, but it seems that `345` is correct.
+
+Looks like a rounding issue here: https://github.com/marshmallow-code/marshmallow/blob/dev/src/marshmallow/fields.py#L1474
+--- END ISSUE ---
+
+[Current directory: /workspace/marshmallow-code__marshmallow]
+
+ASSISTANT:
+Let's first start by reproducing the results of the issue. The issue includes some example code for reproduction, which we can use. We'll create a new file called `reproduce.py` and paste the example code into it.
+<execute_ipython>
+create_file('/workspace/marshmallow-code__marshmallow/reproduce.py')
+</execute_ipython>
+
+USER:
+OBSERVATION:
+[File: /workspace/marshmallow-code__marshmallow/reproduce.py (1 lines total)]
+1|
+[File /workspace/marshmallow-code__marshmallow/reproduce.py created.]
+
+ASSISTANT:
+Now let's paste in the example code from the issue.
+<execute_ipython>
+EDITED_CODE = \"\"\"from marshmallow.fields import TimeDelta
+from datetime import timedelta
+
+td_field = TimeDelta(precision="milliseconds")
+
+obj = dict()
+obj["td_field"] = timedelta(milliseconds=345)
+
+print(td_field.serialize("td_field", obj))
+\"\"\"
+edit_file(start=1, end=1, content=EDITED_CODE)
+</execute_ipython>
+
+USER:
+OBSERVATION:
+[File: /workspace/marshmallow-code__marshmallow/reproduce.py (9 lines total)]
+1|from marshmallow.fields import TimeDelta
+2|from datetime import timedelta
+3|
+4|td_field = TimeDelta(precision="milliseconds")
+5|
+6|obj = dict()
+7|obj["td_field"] = timedelta(milliseconds=345)
+8|
+9|print(td_field.serialize("td_field", obj))
+[File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+Now let's run the code to see if we see the same output as the issue.
+<execute_bash>
+python reproduce.py
+</execute_bash>
+
+USER:
+OBSERVATION:
+344
+
+ASSISTANT:
+We are indeed seeing the same output as the issue. The issue suggests that we should look at line 1474 of the `fields.py` file to see if there is a rounding issue. Let's open the `fields.py` file and navigate to line 1474 to see the relevant code. First we should find the `fields.py` file in the repository. It is likely to be in the `src/marshmallow` directory, but we should check to be sure.
+<execute_bash>
+ls -F
+</execute_bash>
+
+USER:
+OBSERVATION:
+AUTHORS.rst
+CHANGELOG.rst
+CODE_OF_CONDUCT.md
+CONTRIBUTING.rst
+LICENSE
+MANIFEST.in
+NOTICE
+README.rst
+RELEASING.md
+azure-pipelines.yml
+docs/
+examples/
+performance/
+pyproject.toml
+reproduce.py
+setup.cfg
+setup.py
+src/
+tests/
+tox.ini
+
+ASSISTANT:
+It looks like the `src` directory is present, which suggests that the `fields.py` file is likely to be in the `src` directory. Let's use find_file to see where it is.
+<execute_ipython>
+find_file("fields.py", "/workspace/marshmallow-code__marshmallow/src")
+</execute_ipython>
+
+USER:
+OBSERVATION:
+[Found 1 matches for "fields.py" in /workspace/marshmallow-code__marshmallow/src]
+/workspace/marshmallow-code__marshmallow/src/marshmallow/fields.py
+
+ASSISTANT:
+It looks like the `fields.py` file is present in the `./src/marshmallow/` directory. The issue also points to a specific URL with line number 1474. We should navigate to that line in fields.py to see the relevant code for the `TimeDelta` serialization.
+<execute_ipython>
+open_file("/workspace/marshmallow-code__marshmallow/src/marshmallow/fields.py", 1474)
+</execute_ipython>
+
+USER:
+OBSERVATION:
+[File: /workspace/marshmallow-code__marshmallow/src/marshmallow/fields.py (1997 lines total)]
+(1458 more lines above)
+1459|            self.WEEKS,
+1460|        )
+1461|
+1462|        if precision not in units:
+1463|            msg = 'The precision must be {} or "{}".'.format(
+1464|                ", ".join([f'"{each}"' for each in units[:-1]]), units[-1]
+1465|            )
+1466|            raise ValueError(msg)
+1467|
+1468|        self.precision = precision
+1469|        super().__init__(**kwargs)
+1470|
+1471|    def _serialize(self, value, attr, obj, **kwargs):
+1472|        if value is None:
+1473|            return None
+1474|        base_unit = dt.timedelta(**{self.precision: 1})
+1475|        return int(value.total_seconds() / base_unit.total_seconds())
+1476|
+1477|    def _deserialize(self, value, attr, data, **kwargs):
+1478|        try:
+1479|            value = int(value)
+1480|        except (TypeError, ValueError) as error:
+1481|            raise self.make_error("invalid") from error
+1482|
+1483|        kwargs = {self.precision: value}
+1484|
+1485|        try:
+1486|            return dt.timedelta(**kwargs)
+1487|        except OverflowError as error:
+1488|            raise self.make_error("invalid") from error
+1489|
+1490|
+1491|class Mapping(Field):
+1492|    \"\"\"An abstract class for objects with key-value pairs.
+1493|
+1494|    :param keys: A field class or instance for dict keys.
+1495|    :param values: A field class or instance for dict values.
+1496|    :param kwargs: The same keyword arguments that :class:`Field` receives.
+1497|
+1498|    .. note::
+1499|        When the structure of nested data is not known, you may omit the
+1500|        `keys` and `values` arguments to prevent content validation.
+1501|
+1502|    .. versionadded:: 3.0.0rc4
+1503|    \"\"\"
+1504|
+1505|    mapping_type = dict
+1506|
+1507|    #: Default error messages.
+1508|    default_error_messages = {"invalid": "Not a valid mapping type."}
+1509|
+1510|    def __init__(
+1511|        self,
+1512|        keys: typing.Optional[typing.Union[Field, type]] = None,
+1513|        values: typing.Optional[typing.Union[Field, type]] = None,
+1514|        **kwargs,
+1515|    ):
+1516|        super().__init__(**kwargs)
+1517|        if keys is None:
+1518|            self.key_field = None
+1519|        else:
+1520|            try:
+1521|                self.key_field = resolve_field_instance(keys)
+1522|            except FieldInstanceResolutionError as error:
+1523|                raise ValueError(
+1524|                    '"keys" must be a subclass or instance of '
+1525|                    "marshmallow.base.FieldABC."
+1526|                ) from error
+1527|
+1528|        if values is None:
+1529|            self.value_field = None
+1530|        else:
+1531|            try:
+1532|                self.value_field = resolve_field_instance(values)
+1533|            except FieldInstanceResolutionError as error:
+1534|                raise ValueError(
+1535|                    '"values" must be a subclass or instance of '
+1536|                    "marshmallow.base.FieldABC."
+1537|                ) from error
+1538|            if isinstance(self.value_field, Nested):
+1539|                self.only = self.value_field.only
+1540|                self.exclude = self.value_field.exclude
+1541|
+1542|    def _bind_to_schema(self, field_name, schema):
+1543|        super()._bind_to_schema(field_name, schema)
+1544|        if self.value_field:
+1545|            self.value_field = copy.deepcopy(self.value_field)
+1546|            self.value_field._bind_to_schema(field_name, self)
+1547|        if isinstance(self.value_field, Nested):
+1548|            self.value_field.only = self.only
+1549|            self.value_field.exclude = self.exclude
+1550|        if self.key_field:
+1551|            self.key_field = copy.deepcopy(self.key_field)
+1552|            self.key_field._bind_to_schema(field_name, self)
+1553|
+1554|    def _serialize(self, value, attr, obj, **kwargs):
+1555|        if value is None:
+1556|            return None
+1557|        if not self.value_field and not self.key_field:
+1558|            return self.mapping_type(value)
+(439 more lines below)
+
+ASSISTANT:
+We are now looking at the relevant section of the `fields.py` file where the `TimeDelta` serialization occurs. The issue suggests that there is a rounding problem near line 1474. The current implementation uses integer division, which truncates the decimal part and can lead to incorrect rounding on line 1475. To fix this issue, we should ensure that the division results in a float, and then round the result to the nearest integer before converting it to an int. This should preserve the milliseconds precision as expected. Let's make the necessary edit to the code.
+<execute_ipython>
+edit_file(start=1475, end=1475, content="return int(round(value.total_seconds() / base_unit.total_seconds()))  # round to nearest int")
+</execute_ipython>
+
+USER:
+OBSERVATION:
+[Your proposed edit has introduced new syntax error(s). Please understand the errors and retry your edit command.]
+
+ERRORS:
+- E999 IndentationError: unexpected indent
+
+[This is how your edit would have looked if applied]
+-------------------------------------------------
+[File: /marshmallow-code__marshmallow/src/marshmallow/fields.py (1997 lines total)]
+(1468 more lines above)
+1469|        super().__init__(**kwargs)
+1470|
+1471|    def _serialize(self, value, attr, obj, **kwargs):
+1472|        if value is None:
+1473|            return None
+1474|        base_unit = dt.timedelta(**{self.precision: 1})
+1475|return int(round(value.total_seconds() / base_unit.total_seconds()))  # round to nearest int
+1476|
+1477|    def _deserialize(self, value, attr, data, **kwargs):
+1478|        try:
+1479|            value = int(value)
+(518 more lines below)
+-------------------------------------------------
+
+[This is the original code before your edit]
+-------------------------------------------------
+[File: /marshmallow-code__marshmallow/src/marshmallow/fields.py (1997 lines total)]
+(1469 more lines above)
+1470|
+1471|    def _serialize(self, value, attr, obj, **kwargs):
+1472|        if value is None:
+1473|            return None
+1474|        base_unit = dt.timedelta(**{self.precision: 1})
+1475|        return int(value.total_seconds() / base_unit.total_seconds())
+1476|
+1477|    def _deserialize(self, value, attr, data, **kwargs):
+1478|        try:
+1479|            value = int(value)
+1480|        except (TypeError, ValueError) as error:
+(517 more lines below)
+-------------------------------------------------
+Your changes have NOT been applied. Please fix your edit command and try again.
+You either need to 1) Specify the correct start/end line arguments or 2) Correct your edit code.
+DO NOT re-run the same failed edit command. Running it again will lead to the same error.
+
+ASSISTANT:
+My edit command did not use the proper indentation, I will fix my syntax in this follow up edit command.
+<execute_ipython>
+EDITED_CODE="        return int(round(value.total_seconds() / base_unit.total_seconds()))  # round to nearest int"
+edit_file(start=1475, end=1475, content=EDITED_CODE)
+</execute_ipython>
+
+USER:
+OBSERVATION:
+[File: /marshmallow-code__marshmallow/src/marshmallow/fields.py (1997 lines total)]
+(1424 more lines above)
+1425|
+1426|    :param precision: Influences how the integer is interpreted during
+1427|        (de)serialization. Must be 'days', 'seconds', 'microseconds',
+1428|        'milliseconds', 'minutes', 'hours' or 'weeks'.
+1429|    :param kwargs: The same keyword arguments that :class:`Field` receives.
+1430|
+1431|    .. versionchanged:: 2.0.0
+1432|        Always serializes to an integer value to avoid rounding errors.
+1433|        Add `precision` parameter.
+1434|    \"\"\"
+1435|
+1436|    DAYS = "days"
+1437|    SECONDS = "seconds"
+1438|    MICROSECONDS = "microseconds"
+1439|    MILLISECONDS = "milliseconds"
+1440|    MINUTES = "minutes"
+1441|    HOURS = "hours"
+1442|    WEEKS = "weeks"
+1443|
+1444|    #: Default error messages.
+1445|    default_error_messages = {
+1446|        "invalid": "Not a valid period of time.",
+1447|        "format": "{input!r} cannot be formatted as a timedelta.",
+1448|    }
+1449|
+1450|    def __init__(self, precision: str = SECONDS, **kwargs):
+1451|        precision = precision.lower()
+1452|        units = (
+1453|            self.DAYS,
+1454|            self.SECONDS,
+1455|            self.MICROSECONDS,
+1456|            self.MILLISECONDS,
+1457|            self.MINUTES,
+1458|            self.HOURS,
+1459|            self.WEEKS,
+1460|        )
+1461|
+1462|        if precision not in units:
+1463|            msg = 'The precision must be {} or "{}".'.format(
+1464|                ", ".join([f'"{each}"' for each in units[:-1]]), units[-1]
+1465|            )
+1466|            raise ValueError(msg)
+1467|
+1468|        self.precision = precision
+1469|        super().__init__(**kwargs)
+1470|
+1471|    def _serialize(self, value, attr, obj, **kwargs):
+1472|        if value is None:
+1473|            return None
+1474|        base_unit = dt.timedelta(**{self.precision: 1})
+1475|        return int(round(value.total_seconds() / base_unit.total_seconds()))  # round to nearest int
+1476|
+1477|    def _deserialize(self, value, attr, data, **kwargs):
+1478|        try:
+1479|            value = int(value)
+1480|        except (TypeError, ValueError) as error:
+1481|            raise self.make_error("invalid") from error
+1482|
+1483|        kwargs = {self.precision: value}
+1484|
+1485|        try:
+1486|            return dt.timedelta(**kwargs)
+1487|        except OverflowError as error:
+1488|            raise self.make_error("invalid") from error
+1489|
+1490|
+1491|class Mapping(Field):
+1492|    \"\"\"An abstract class for objects with key-value pairs.
+1493|
+1494|    :param keys: A field class or instance for dict keys.
+1495|    :param values: A field class or instance for dict values.
+1496|    :param kwargs: The same keyword arguments that :class:`Field` receives.
+1497|
+1498|    .. note::
+1499|        When the structure of nested data is not known, you may omit the
+1500|        `keys` and `values` arguments to prevent content validation.
+1501|
+1502|    .. versionadded:: 3.0.0rc4
+1503|    \"\"\"
+1504|
+1505|    mapping_type = dict
+1506|
+1507|    #: Default error messages.
+1508|    default_error_messages = {"invalid": "Not a valid mapping type."}
+1509|
+1510|    def __init__(
+1511|        self,
+1512|        keys: typing.Optional[typing.Union[Field, type]] = None,
+1513|        values: typing.Optional[typing.Union[Field, type]] = None,
+1514|        **kwargs,
+1515|    ):
+1516|        super().__init__(**kwargs)
+1517|        if keys is None:
+1518|            self.key_field = None
+1519|        else:
+1520|            try:
+1521|                self.key_field = resolve_field_instance(keys)
+1522|            except FieldInstanceResolutionError as error:
+1523|                raise ValueError(
+1524|                    '"keys" must be a subclass or instance of '
+(473 more lines below)
+[File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+The code has been updated to use the `round` function, which should fix the rounding issue. Before submitting the changes, it would be prudent to run the reproduce.py code again to ensure that this change has the desired effect. I see that "Current directory" is still `/marshmallow-code__marshmallow`, so we can just run `reproduce.py` directly.
+<execute_bash>
+python reproduce.py
+</execute_bash>
+
+USER:
+OBSERVATION:
+345
+
+ASSISTANT:
+The output has changed from 344 to 345, which suggests that the rounding issue has been fixed. Let's remove the reproduce.py file since it is no longer needed.
+<execute_bash>
+rm reproduce.py
+</execute_bash>
+
+USER:
+OBSERVATION:
+
+ASSISTANT:
+rm doesn't have any output when it deletes successfully, so that must have worked. Now that we have fixed the issue, we can exit the current task.
+<execute_bash>
+exit
+</execute_bash>
+
+--- END OF EXAMPLE ---
+"""
@@ -1,2 +1,2 @@
-* `finish` - if you're absolutely certain that you've completed your task and have tested your work, use the finish action to stop working. Arguments:
+* `finish` - if you're absolutely certain that you've completed your task, use the finish action to stop working. Arguments:
  * `outputs` - a dictionary representing the outputs of your task, if any
@@ -55,14 +55,13 @@ class MicroAgent(Agent):
        del self.delegates[self.agent_definition['name']]

    def step(self, state: State) -> Action:
-        latest_user_message = state.get_current_user_intent()
        prompt = self.prompt_template.render(
            state=state,
            instructions=instructions,
            to_json=to_json,
            history_to_json=history_to_json,
            delegates=self.delegates,
-            latest_user_message=latest_user_message,
+            latest_user_message=state.get_current_user_intent(),
        )
        messages = [{'content': prompt, 'role': 'user'}]
        resp = self.llm.do_completion(messages=messages)
@@ -2,5 +2,5 @@ name: CoderAgent
 description: Given a particular task, and a detailed description of the codebase, accomplishes the task
 inputs:
  task: string
-  codebase_summary: string
+  summary: string
 outputs: {}
@@ -2,7 +2,7 @@
 You are a software engineer. You've inherited an existing codebase, which you
 need to modify to complete this task:

-{{ latest_user_message }}
+{{ state.inputs.task }}

 {% if state.inputs.summary %}
 Here's a summary of the codebase, as it relates to this task:
@@ -1,7 +1,7 @@
 # Task
 You are a brilliant mathematician and programmer. You've been given the following problem to solve:

-{{ latest_user_message }}
+`{{ state.inputs.task }}`

 Please write a python script that solves this problem, and prints the answer to stdout.
 ONLY print the answer to stdout, nothing else.
@@ -2,7 +2,7 @@
 You are a database engineer. You are working on an existing Postgres project, and have been given
 the following task:

-{{ latest_user_message }}
+{{ state.inputs.task }}

 You must:
 * Investigate the existing migrations to understand the current schema
@@ -4,7 +4,10 @@ import yaml

 all_microagents = {}

-for dir in os.listdir(os.path.dirname(__file__)):
+# Get the list of directories and sort them to preserve determinism
+dirs = sorted(os.listdir(os.path.dirname(__file__)))
+
+for dir in dirs:
    base = os.path.dirname(__file__) + '/' + dir
    if os.path.isfile(base):
        continue
@@ -1,9 +1,11 @@
 # Task
-You are a software engineer. You've inherited an existing codebase, which you're
-learning about for the first time. You need to study the codebase to find all
-the information needed to complete this task:
+You are a software architect. Your team has inherited an existing codebase, and
+need to finish a project:

-{{ latest_user_message }}
+{{ state.inputs.task }}
+
+As an architect, you need to study the codebase to find all the information that
+might be helpful for your software engineering team.

 ## Available Actions
 {{ instructions.actions.run }}
@@ -11,11 +13,14 @@ the information needed to complete this task:
 {{ instructions.actions.message }}
 {{ instructions.actions.finish }}

-You must ONLY `run` commands that have no side-effects, like `ls` and `grep`.
+You must ONLY `run` commands that have no side-effects, like `ls` and `grep`. You
+MUST NOT modify or write to any file.

 Do NOT finish until you have a complete understanding of which parts of the
-codebase are relevant to the task, including particular files, functions, and classes.
+codebase are relevant to the project, including particular files, functions, and classes.
 When you're done, put your summary in `outputs.summary` in the `finish` action.
+Remember, your task is to explore and study the current repository, not actually
+implement the solution. If the codebase is empty, you should call the `finish` action.

 ## History
 {{ instructions.history_truncated }}
@@ -23,3 +28,36 @@ When you're done, put your summary in `outputs.summary` in the `finish` action.

 ## Format
 {{ instructions.format.action }}
+
+## Examples
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Can you create a list of numbers from 1 to 10, and create a web page to display them at port 5000?
+
+ASSISTANT:
+{
+  "action": "run",
+  "args": {
+    "command": "ls",
+    "background": false
+  }
+}
+
+USER:
+OBSERVATION:
+[]
+
+ASSISTANT:
+{
+  "action": "finish",
+  "args": {
+    "outputs": {
+      "summary": "The codebase appears to be empty. Engineers should start everything from scratch."
+    }
+  }
+}
+
+--- END OF EXAMPLE ---
@@ -1,5 +1,6 @@
 name: TypoFixerAgent
 description: Fixes typos in files in the current working directory
-inputs: {}
+inputs:
+  task: string
 outputs:
  summary: string
@@ -1,5 +1,13 @@
 # Task
-You are a proofreader tasked with fixing typos in the files in your current working directory. Your goal is to:
+You are a proofreader tasked with fixing typos in the files in your current working directory.
+
+{% if state.inputs.task %}
+Specifically, your task is:
+{{ state.inputs.task }}
+{% endif %}
+
+To achieve this goal, you should:
+
 1. Scan the files for typos
 2. Overwrite the files with the typos fixed
 3. Provide a summary of the typos fixed
@@ -13,10 +21,10 @@ You are a proofreader tasked with fixing typos in the files in your current work

 To complete this task:
 1. Use the `read` action to read the contents of the files in your current working directory. Make sure to provide the file path in the format `'./file_name.ext'`.
-2. Use the `think` action to analyze the contents and identify typos.
+2. Use the `message` action to analyze the contents and identify typos.
 3. Use the `write` action to create new versions of the files with the typos fixed.
  - Overwrite the original files with the corrected content. Make sure to provide the file path in the format `'./file_name.ext'`.
-4. Use the `think` action to generate a summary of the typos fixed, including the original and fixed versions of each typo, and the file(s) they were found in.
+4. Use the `message` action to generate a summary of the typos fixed, including the original and fixed versions of each typo, and the file(s) they were found in.
 5. Use the `finish` action to return the summary in the `outputs.summary` field.

 Do NOT finish until you have fixed all the typos and generated a summary.
@@ -2,9 +2,10 @@
 You are a quality assurance engineer. Another engineer has made changes to the
 codebase which are supposed to solve this task:

-{{ latest_user_message }}
+{{ state.inputs.task }}

-Your goal is to verify that the changes are correct and bug-free.
+Note the changes might have already been applied in-line. You should focus on
+validating if the task is solved, nothing else.

 ## Available Actions
 {{ instructions.actions.run }}
@@ -26,6 +26,7 @@ from opendevin.events.observation import (
 from opendevin.events.serialization.event import event_to_memory
 from opendevin.llm.llm import LLM
 from opendevin.memory.condenser import MemoryCondenser
+from opendevin.runtime.tools import RuntimeTool

 if config.agent.memory_enabled:
    from opendevin.memory.memory import LongTermMemory
@@ -46,6 +47,7 @@ class MonologueAgent(Agent):
    initial_thoughts: list[dict[str, str]]
    memory: 'LongTermMemory | None'
    memory_condenser: MemoryCondenser
+    runtime_tools: list[RuntimeTool] = [RuntimeTool.BROWSER]

    def __init__(self, llm: LLM):
        """
@@ -2,6 +2,7 @@ from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
 from opendevin.events.action import Action, AgentFinishAction
 from opendevin.llm.llm import LLM
+from opendevin.runtime.tools import RuntimeTool

 from .prompt import get_prompt, parse_response

@@ -12,6 +13,7 @@ class PlannerAgent(Agent):
    The planner agent utilizes a special prompting strategy to create long term plans for solving problems.
    The agent is given its previous action-observation pairs, current task, and hint based on last action taken at every step.
    """
+    runtime_tools: list[RuntimeTool] = [RuntimeTool.BROWSER]

    def __init__(self, llm: LLM):
        """
@@ -50,6 +50,7 @@ else
    groupadd -g $DOCKER_SOCKET_GID docker
  fi

+  mkdir -p /home/enduser/.cache/huggingface/hub/
  mkdir -p /home/enduser/.cache/ms-playwright/
  mv /home/opendevin/.cache/ms-playwright/ /home/enduser/.cache/

@@ -51,7 +51,6 @@ const config: Config = {
      } satisfies Preset.Options,
    ],
  ],
-
  themeConfig: {
    image: "img/docusaurus.png",
    navbar: {
@@ -81,43 +80,6 @@ const config: Config = {
        },
      ],
    },
-    footer: {
-      style: "dark",
-      links: [
-        {
-          title: "OpenDevin",
-          items: [
-            {
-              label: "Docs",
-              to: "/modules/usage/intro",
-            },
-          ],
-        },
-        {
-          title: "Community",
-          items: [
-            {
-              label: "Slack",
-              href: "https://join.slack.com/t/opendevin/shared_invite/zt-2ggtwn3k5-PvAA2LUmqGHVZ~XzGq~ILw"
-            },
-            {
-              label: "Discord",
-              href: "https://discord.gg/ESHStjSjD4",
-            },
-          ],
-        },
-        {
-          title: "More",
-          items: [
-            {
-              label: "GitHub",
-              href: "https://github.com/OpenDevin/OpenDevin",
-            },
-          ],
-        },
-      ],
-      copyright: `Copyright © ${new Date().getFullYear()} OpenDevin`,
-    },
    prism: {
      theme: prismThemes.oneLight,
      darkTheme: prismThemes.oneDark,
@@ -1,5 +1,5 @@
 ---
-sidebar_position: 6
+sidebar_position: 7
 ---

 # 📚 Misc
@@ -31,7 +31,7 @@ For details, please check [this document](https://github.com/OpenDevin/OpenDevin

 Now we have both Slack workspace for the collaboration on building OpenDevin and Discord server for discussion about anything related, e.g., this project, LLM, agent, etc.

- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2ggtwn3k5-PvAA2LUmqGHVZ~XzGq~ILw)
+- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA)
 - [Discord server](https://discord.gg/ESHStjSjD4)

 If you would love to contribute, feel free to join our community. Let's simplify software engineering together!
@@ -139,4 +139,4 @@ The agent is given its previous action-observation pairs, current task, and hint
 | --------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `__init__`      | Initializes an agent with `llm`                                                                                                                                                           |
 | `step`          | Checks to see if current step is completed, returns `AgentFinishAction` if True. Otherwise, creates a plan prompt and sends to model for inference, adding the result as the next action. |
-| `search_memory` | Not yet implemented                                                                                                                                                                       |
+| `search_memory` | Not yet implemented                                                                                                                                                                       |
@@ -0,0 +1,18 @@
+---
+sidebar_position: 6
+---
+
+# ✅ Providing Feedback
+
+When using OpenDevin, you will undoubtedly encounter cases where things work well, and others where they don't. We encourage you to provide feedback when you use OpenDevin, both to help the development team and, perhaps more importantly, to create an open corpus of coding agent training examples -- Share-OpenDevin!
+
+## 📝 How to Provide Feedback
+
+Providing feedback is easy! When you are using OpenDevin, you can press the thumbs-up or thumbs-down button at any point during your interaction. You will be prompted to provide your email address (e.g. so we can contact you if we want to ask any follow-up questions), and you can choose whether you want to provide feedback publicly or privately.
+
+<iframe width="560" height="315" src="https://www.youtube.com/embed/5rFx-StMVV0?si=svo7xzp6LhGK_GXr" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>
+
+## 📜 Data License and Privacy
+
+* **Public** data will be distributed under the MIT License, like OpenDevin itself, and can be used by the community to train and test models. Obviously, feedback that you can make public will be more valuable for the community as a whole, so when you are not dealing with sensitive information, we would encourage you to choose this option!
+* **Private** data will only be shared with the OpenDevin team for the purpose of improving OpenDevin.
@@ -42,7 +42,7 @@ Explore the codebase of OpenDevin on [GitHub](https://github.com/OpenDevin/OpenD
  />
 </a>
 <br></br>
-<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2ggtwn3k5-PvAA2LUmqGHVZ~XzGq~ILw">
+<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA">
  <img
    src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge"
    alt="Join our Slack community"
@@ -61,38 +61,37 @@ Explore the codebase of OpenDevin on [GitHub](https://github.com/OpenDevin/OpenD
 The easiest way to run OpenDevin is inside a Docker container. It works best with the most recent version of Docker, `26.0.0`.
 You must be using Linux, Mac OS, or WSL on Windows.

-To start the app, run these commands, replacing `$(pwd)/workspace` with the directory you want OpenDevin to work with.
-
-```
-# The directory you want OpenDevin to work with. It MUST be an absolute path!
-export WORKSPACE_BASE=$(pwd)/workspace
-```
+To start OpenDevin in a docker container, run the following commands in your terminal:

 :::warning
-OpenDevin runs bash commands within a Docker sandbox, so it should not affect your machine. But your workspace directory will be attached to that sandbox, and files in the directory may be modified or deleted.
+When you run the following command, files in `./workspace` may be modified or deleted.
 :::

-```
-docker run \
-    -it \
+```bash
+OPENDEVIN_WORKSPACE=$(pwd)/workspace
+docker run -it \
    --pull=always \
-    -e LLM_API_KEY \
    -e SANDBOX_USER_ID=$(id -u) \
-    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-    -v $WORKSPACE_BASE:/opt/workspace_base \
+    -e PERSIST_SANDBOX="true" \
+    -e SSH_PASSWORD="make something up here" \
+    -e WORKSPACE_MOUNT_PATH=$OPENDEVIN_WORKSPACE \
+    -v $OPENDEVIN_WORKSPACE:/opt/workspace_base \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
-    ghcr.io/opendevin/opendevin:0.5
+    --name opendevin-app-$(date +%Y%m%d%H%M%S) \
+    ghcr.io/opendevin/opendevin:0.6
 ```

-You'll find OpenDevin running at [http://localhost:3000](http://localhost:3000).
+You'll find OpenDevin running at [http://localhost:3000](http://localhost:3000) with access to `./workspace`. To have OpenDevin operate on your code, place it in `./workspace`.
+
+OpenDevin will only have access to this workspace folder. The rest of your system will not be affected as it runs in a secured docker sandbox.

 :::tip
 If you want to use the **(unstable!)** bleeding edge, you can use `ghcr.io/opendevin/opendevin:main` as the image (last line).
 :::

-See [Development.md](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) for instructions on running OpenDevin without Docker.
+For the development workflow, see [Development.md](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md).

 Are you having trouble? Check out our [Troubleshooting Guide](https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting).

@@ -133,4 +133,4 @@ the API you're trying to connect to. Most often this happens for Azure or ollama
  - [Google](/OpenDevin/modules/usage/llms/googleLLMs)
 - Make sure your API key is correct
 - See if you can connect to the LLM using `curl`
- Try [connecting via LiteLLM directly](https://github.com/BerriAI/litellm) to test your setup
+- Try [connecting via LiteLLM directly](https://github.com/BerriAI/litellm) to test your setup
@@ -5,32 +5,72 @@ Please be sure to run all commands inside your WSL terminal.

 ## Troubleshooting

+### Error: 'docker' could not be found in this WSL 2 distro.
+
+If you are using Docker Desktop, make sure to start it before calling any docker command from inside WSL.
+Docker also needs to have the WSL integration option activated.
+
+### Recommendation: Do not run as root user
+
+For security reasons, it is highly recommended not to run OpenDevin as the root user, but as a user with a non-zero UID.
+In addition, persistent sandboxes are not supported when running as root, and an appropriate message may appear when OpenDevin starts.
+
+References:
+
+* [Why it is bad to login as root](https://askubuntu.com/questions/16178/why-is-it-bad-to-log-in-as-root)
+* [Set default user in WSL](https://www.tenforums.com/tutorials/128152-set-default-user-windows-subsystem-linux-distro-windows-10-a.html#option2)  
+Hint about the 2nd reference: for Ubuntu users, the command could actually be "ubuntupreview" instead of "ubuntu".
+
 ### Failed to create opendevin user

-If you encounter the following error during setup: `Exception: Failed to create opendevin user in sandbox: b'useradd: UID 0 is not unique\n'`.
+If you encounter the following error during setup:
+
+```sh
+Exception: Failed to create opendevin user in sandbox: 'useradd: UID 0 is not unique'
+ ```
+
 You can resolve it by running:
-`    export SANDBOX_USER_ID=1000
-   `
+
+```sh
+export SANDBOX_USER_ID=1000
+```

 ### Poetry Installation

-If you face issues running Poetry even after installing it during the build process, you may need to add its binary path to your environment:
-`    export PATH="$HOME/.local/bin:$PATH"
-   `
+* If you face issues running Poetry even after installing it during the build process, you may need to add its binary path to your environment:
+
+```sh
+export PATH="$HOME/.local/bin:$PATH"
+```
+
+* If `make build` stops on an error like this:
+
+```sh
+ModuleNotFoundError: no module named <module-name>
+```
+
+This could be an issue with Poetry's cache.
+Try running these two commands one after another:
+
+```sh
+rm -r ~/.cache/pypoetry
+make build
+```

 ### NoneType object has no attribute 'request'

 If you are experiencing issues related to networking, such as `NoneType object has no attribute 'request'` when executing `make run`, you may need to configure your WSL2 networking settings. Follow these steps:

- Open or create the `.wslconfig` file located at `C:\Users\%username%\.wslconfig` on your Windows host machine.
- Add the following configuration to the `.wslconfig` file:
+* Open or create the `.wslconfig` file located at `C:\Users\%username%\.wslconfig` on your Windows host machine.
+* Add the following configuration to the `.wslconfig` file:

-```
+```sh
 [wsl2]
 networkingMode=mirrored
 localhostForwarding=true
 ```

- Save the `.wslconfig` file.
- Restart WSL2 completely by exiting any running WSL2 instances and executing the command `wsl --shutdown` in your command prompt or terminal.
- After restarting WSL, attempt to execute `make run` again. The networking issue should be resolved.
+* Save the `.wslconfig` file.
+* Restart WSL2 completely by exiting any running WSL2 instances and executing the command `wsl --shutdown` in your command prompt or terminal.
+* After restarting WSL, attempt to execute `make run` again.  
+The networking issue should be resolved.
@@ -16,16 +16,15 @@
  },
  "dependencies": {
    "@docusaurus/core": "3.2.1",
+    "@docusaurus/plugin-content-pages": "^3.3.2",
    "@docusaurus/preset-classic": "3.2.1",
    "@mdx-js/react": "^3.0.0",
-    "autoprefixer": "^10.4.19",
    "clsx": "^2.0.0",
-    "postcss": "^8.4.38",
    "prism-react-renderer": "^2.3.0",
    "react": "^18.0.0",
    "react-dom": "^18.0.0",
-    "react-use": "^17.5.0",
-    "tailwindcss": "^3.4.3"
+    "react-icons": "^5.2.1",
+    "react-use": "^17.5.0"
  },
  "devDependencies": {
    "@docusaurus/module-type-aliases": "3.2.1",
@@ -0,0 +1,13 @@
+export default function tailwindPlugin(context, options) {
+  return {
+    name: 'tailwind-plugin',
+    configurePostCss(postcssOptions) {
+      postcssOptions.plugins = [
+        require('postcss-import'),
+        require('tailwindcss'),
+        require('autoprefixer'),
+      ];
+      return postcssOptions;
+    },
+  };
+}
@@ -0,0 +1,35 @@
+import React from "react";
+import { FaSlack, FaDiscord, FaGithub } from "react-icons/fa";
+import "../css/footer.css"; // Importing the CSS file
+
+function CustomFooter() {
+  return (
+    <footer className="custom-footer">
+      <div className="footer-content">
+        <div className="footer-top">
+          <div className="footer-title">OpenDevin</div>
+          <div className="footer-link">
+            <a href="/modules/usage/intro">Docs</a>
+          </div>
+        </div>
+        <div className="footer-community">Community</div>
+        <div className="footer-icons">
+          <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA" target="_blank" rel="noopener noreferrer">
+            <FaSlack />
+          </a>
+          <a href="https://discord.gg/ESHStjSjD4" target="_blank" rel="noopener noreferrer">
+            <FaDiscord />
+          </a>
+          <a href="https://github.com/OpenDevin/OpenDevin" target="_blank" rel="noopener noreferrer">
+            <FaGithub />
+          </a>
+        </div>
+        <div className="footer-bottom">
+          <p>Copyright &copy; {new Date().getFullYear()} OpenDevin</p>
+        </div>
+      </div>
+    </footer>
+  );
+}
+
+export default CustomFooter;
@@ -2,18 +2,18 @@ import Link from "@docusaurus/Link";
 import useDocusaurusContext from "@docusaurus/useDocusaurusContext";
 import Heading from "@theme/Heading";
 import { Demo } from "../Demo/Demo";
-import styles from "./index.module.css";
+import "../../css/homepageHeader.css"; // Importing the CSS file

 export function HomepageHeader() {
  const { siteConfig } = useDocusaurusContext();
  return (
-    <div className={styles.headerContainer}>
-      <div className={styles.header}>
-        <Heading as="h1" className="hero__title">
+    <div className="homepage-header">
+      <div className="header-content">
+        <Heading as="h1" className="header-title">
          {siteConfig.title}
        </Heading>
-        <p className="hero__subtitle">{siteConfig.tagline}</p>
-        <div className={styles.buttons}>
+        <p className="header-subtitle">{siteConfig.tagline}</p>
+        <div className="header-buttons">
          <Link
            className="button button--secondary button--lg"
            to="/modules/usage/intro"
@@ -21,8 +21,9 @@ export function HomepageHeader() {
            Get Started
          </Link>
        </div>
-      </div>{" "}
-      <Demo />
+        <Demo />
+      </div>
    </div>
  );
 }
+
@@ -1,37 +0,0 @@
-.headerContainer {
-  background: radial-gradient(circle, var(--secondary), var(--secondary-light));
-  background-size: 200% 200%;
-  animation: gradientAnimation 10s linear infinite;
-  display: flex;
-  justify-content: center;
-}
-
-@media only screen and (max-width: 600px) {
-  .headerContainer {
-    flex-direction: column;
-  }
-}
-
-@keyframes gradientAnimation {
-  0% {
-    background-position: left center;
-  }
-  50% {
-    background-position: right center;
-  }
-  100% {
-    background-position: left center;
-  }
-}
-.header {
-  max-width: 1300px;
-  color: white;
-  display: flex;
-  margin-left: 100px;
-  margin-right: 100px;
-  flex-direction: column;
-  align-items: center;
-  justify-content: center;
-  overflow: hidden;
-  padding: 70px 30px 30px;
-}
@@ -1,11 +1,12 @@
-import styles from "./styles.module.css";
+import React from "react";
+import "../../css/welcome.css";  // Importing the CSS file

 export function Welcome() {
  return (
-    <div className={styles.container}>
-      <div className={styles.innerContainer}>
-        <img src="img/logo.png" className={styles.sidebarImage} />
-        <p className={styles.welcomeText}>
+    <div className="text-white">
+      <div className="welcome-container">
+        <img src="img/logo.png" className="welcome-logo" />
+        <p className="welcome-text">
          Welcome to OpenDevin, an open-source project aiming to replicate
          Devin, an autonomous AI software engineer who is capable of executing
          complex engineering tasks and collaborating actively with users on
@@ -1,27 +0,0 @@
-.container {
-  display: flex;
-  flex-direction: column;
-  padding-top: 25px;
-  padding-bottom: 25px;
-  width: 100%;
-}
-
-.innerContainer {
-  padding: 50px;
-  width: 100%;
-  max-width: 1300px;
-  padding-top: 30px;
-  margin: auto;
-  display: flex;
-  align-items: center;
-}
-
-.sidebarImage {
-  max-width: 400px;
-  padding-right: 30px;
-}
-
-.welcomeText {
-  text-align: justify;
-  font-size: larger;
-}
@@ -5,6 +5,7 @@
 */

 /* You can override the default Infima variables here. */
+
 :root {
  --ifm-color-primary: #4465db;
  --ifm-code-font-size: 95%;
@@ -33,4 +34,4 @@

 .a {
  text-decoration: underline;
-}
+}
@@ -0,0 +1,66 @@
+/* faq.css */
+
+.faq-container {
+    margin: auto;
+    padding: 24px;
+    display: flex;
+    flex-direction: column;
+    gap: 8px;
+    margin-bottom: 24px;
+  }
+  
+  .faq-title {
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    font-size: 2rem;
+    padding: 8px;
+    text-transform: uppercase;
+    font-weight: bold;
+  }
+  
+  @media (min-width: 1024px) {
+    .faq-title {
+      font-size: 6rem;
+    }
+  }
+  
+  .faq-section {
+    display: flex;
+    flex-direction: column;
+    gap: 8px;
+    width: 100%;
+    margin-bottom: 24px;
+  }
+  
+  .faq-section-title {
+    text-transform: uppercase;
+    font-weight: bold;
+    font-size: 2rem;
+    letter-spacing: 0.1em;
+  }
+  
+  .highlight {
+    font-weight: 600;
+    color: var(--logo);
+  }
+  
+  .faq-steps ol {
+    padding-left: 24px;
+  }
+  
+  .command-box {
+    display: flex;
+    flex-direction: column;
+    padding: 8px;
+    background-color: #e0e0e0;
+    border-radius: 0.375rem;
+    height: 6vh;
+    text-transform: uppercase;
+    color: #4a5568;
+  }
+  
+  .command-box + .command-box {
+    height: 8vh;
+  }
+  
@@ -0,0 +1,72 @@
+/* customFooter.css */
+
+.custom-footer {
+    background-color: dark;
+    color: white;
+    height: 25vh;
+    /* background: linear-gradient(to bottom, #1a1a1a, #1a1a1a); */
+    background: linear-gradient(to bottom, #1f2937, #000000);
+
+  }
+  
+  .footer-content {
+    display: flex;
+    flex-direction: column;
+    justify-content: space-between;
+    align-items: center;
+    padding: 8px;
+    height: 100%;
+  }
+  
+  .footer-top {
+    display: flex;
+    gap: 8px;
+    align-items: center;
+  }
+  
+  .footer-title {
+    font-weight: bold;
+    font-size: 1.125rem;
+  }
+  
+  @media (min-width: 768px) {
+    .footer-title {
+      font-size: 1.875rem;
+    }
+  }
+  
+  .footer-link a {
+    font-size: 0.875rem;
+    text-decoration: none;
+    color: gray;
+    transition: color 0.3s ease;
+  }
+  
+  .footer-link a:hover {
+    color: white;
+  }
+  
+  .footer-community {
+    text-transform: uppercase;
+    font-weight: 300;
+  }
+  
+  .footer-icons {
+    display: flex;
+    gap: 24px;
+    font-size: 1.875rem;
+  }
+  
+  .footer-icons a {
+    color:gray;
+    transition: color 0.3s ease;
+  }
+  
+  .footer-icons a:hover {
+    color: white;
+  }
+  
+  .footer-bottom {
+    text-transform: uppercase;
+  }
+  
@@ -0,0 +1,36 @@
+/* homepageHeader.css */
+
+.homepage-header {
+    height: 100vh;
+    color: white;
+    background: linear-gradient(to top, #64748b, #000000);
+  }
+  
+  .header-content {
+    display: flex;
+    flex-direction: column;
+    gap: 8px;
+    align-items: center;
+    padding: 24px;
+    font-weight: 300;
+    width: 100%;
+  }
+  
+  .header-title {
+    font-size: 3rem;
+  }
+  
+  @media (min-width: 768px) {
+    .header-title {
+      font-size: 5rem;
+    }
+  }
+  
+  .header-subtitle {
+    font-size: 1.25rem;
+  }
+  
+  .header-buttons {
+    margin-top: 24px;
+  }
+  
@@ -0,0 +1,53 @@
+/* welcome.css */
+
+.text-white {
+    color: white;
+  }
+
+  .welcome-container {
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    flex-direction: column;
+    background: linear-gradient(to bottom, #64748b, #1f2937);
+  }
+
+  @media (min-width: 768px) {
+    .welcome-container {
+      flex-direction: row;
+      background: linear-gradient(to bottom, #64748b, #1f2937);
+    }
+  }
+
+  .welcome-logo {
+    height: 45vh;
+    width: 45vw;
+  }
+
+  @media (max-width: 640px) {
+    .welcome-logo {
+      height: 40vw;
+      width: 40vw;
+    }
+  }
+
+  @media (min-width: 768px) {
+    .welcome-logo {
+      height: auto;
+      width: 350px;
+    }
+  }
+
+  .welcome-text {
+    padding: 24px;
+    margin-bottom: 24px;
+    font-weight: 300;
+    font-size: 1.125rem;
+  }
+
+  @media (min-width: 768px) {
+    .welcome-text {
+      padding: 8px;
+      font-size: 1.5rem;
+    }
+  }
@@ -0,0 +1,6 @@
+import React from 'react';
+import CustomFooter from '../components/CustomFooter';
+
+export default function Footer() {
+  return <CustomFooter />;
+}
@@ -1,76 +1,78 @@
 import Layout from "@theme/Layout";
+import CustomFooter from "../components/CustomFooter";
+import "../css/faq.css"; 

 export default function FAQ() {
  return (
-    <Layout title="FAQ" description="Frequently Asked Questions">
-      <div
-        id="faq"
-        style={{
-          maxWidth: "900px",
-          margin: "0px auto",
-          padding: "40px",
-          textAlign: "justify",
-        }}
-      >
-        <h1 style={{ fontSize: "3rem" }}>Frequently Asked Questions</h1>
-        <h2 style={{ fontSize: "2rem" }}>Support</h2>
-        <h3>How can I report an issue with OpenDevin?</h3>
-        <p>
-          Please file a bug on{" "}
-          <a href="https://github.com/OpenDevin/OpenDevin/issues">GitHub</a> if
-          you notice a problem that likely affects others.
-          If you're having trouble installing, or have general questions, reach out on{" "}
-          <a href="https://discord.gg/mBuDGRzzES">Discord</a> or{" "}
-          <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2ggtwn3k5-PvAA2LUmqGHVZ~XzGq~ILw">Slack</a>.
-        </p>
-        <h2 style={{ fontSize: "2rem" }}>General</h2>
-        <h3>What is Devin?</h3>
-        <p>
-          <span style={{ fontWeight: "600", color: "var(--logo)" }}>Devin</span>{" "}
-          represents a cutting-edge autonomous agent designed to navigate the
-          complexities of software engineering. It leverages a combination of
-          tools such as a shell, code editor, and web browser, showcasing the
-          untapped potential of LLMs in software development. Our goal is to
-          explore and expand upon Devin's capabilities, identifying both its
-          strengths and areas for improvement, to guide the progress of open
-          code models.
-        </p>
-        <h3>Why OpenDevin?</h3>
-        <p>
-          The{" "}
-          <span style={{ fontWeight: "600", color: "var(--logo)" }}>
-            OpenDevin
-          </span>{" "}
-          project is born out of a desire to replicate, enhance, and innovate
-          beyond the original Devin model. By engaging the{" "}
-          <a href="https://github.com/OpenDevin/OpenDevin">
-            open-source community
-          </a>
-          , we aim to tackle the challenges faced by Code LLMs in practical
-          scenarios, producing works that significantly contribute to the
-          community and pave the way for future advancements.
-        </p>
-        <h3>How to fix an issue on OpenDevin?</h3>
-        <p>
-          To fix an issue on GitHub using OpenDevin, send a prompt to OpenDevin asking it to follow these steps:
-          <ol>
-            <li>Read the issue on <a href="https://github.com/OpenDevin/OpenDevin/issues/1611">GitHub</a></li>
-            <li>Clone the repository and check out a new branch</li>
-            <li>Based on the instructions in the issue description, modify files to fix the issue</li>
-            <li>Push the resulting output to GitHub using the GITHUB_TOKEN environment variable</li>
-            <li>Tell me the link that I need to go to to send a pull request</li>
-          </ol>
-          Before you run OpenDevin, you can do:
-          <pre>
-            export SANDBOX_ENV_GITHUB_TOKEN=XXX
-          </pre>
-          where XXX is a GitHub token that you created that has permissions to push to the OpenDevin repo. If you don’t have write permission to the OpenDevin repo, you might need to change that to:
-          <pre>
-            4. Push the resulting output to my fork at https://github.com/USERNAME/OpenDevin/ using the GITHUB_TOKEN environment variable
-          </pre>
-          where USERNAME is your GitHub username.
-        </p>
-      </div>
-    </Layout>
+    <>
+      <Layout title="FAQ" description="Frequently Asked Questions">
+        <div id="faq" className="faq-container">
+          <div className="faq-title">Frequently Asked Questions</div>
+          <div className="faq-section">
+            <div className="faq-section-title">Support</div>
+            <div>How can I report an issue with OpenDevin?</div>
+            <div>
+              Please file a bug on{" "}
+              <a href="https://github.com/OpenDevin/OpenDevin/issues" target="_blank">GitHub</a> if
+              you notice a problem that likely affects others.
+              If you're having trouble installing, or have general questions, reach out on{" "}
+              <a href="https://discord.gg/mBuDGRzzES" target="_blank">Discord</a> or{" "}
+              <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA" target="_blank">Slack</a>.
+            </div>
+          </div>
+          <div className="faq-section">
+            <div className="faq-section-title">General</div>
+            <div>What is Devin?</div>
+            <div>
+              <span className="highlight">Devin</span>{" "}
+              represents a cutting-edge autonomous agent designed to navigate the
+              complexities of software engineering. It leverages a combination of
+              tools such as a shell, code editor, and web browser, showcasing the
+              untapped potential of LLMs in software development. Our goal is to
+              explore and expand upon Devin's capabilities, identifying both its
+              strengths and areas for improvement, to guide the progress of open
+              code models.
+            </div>
+          </div>
+          <div className="faq-section">
+            <div className="faq-section-title">Why OpenDevin?</div>
+            <p>
+              The{" "}
+              <span className="highlight">OpenDevin</span>{" "}
+              project is born out of a desire to replicate, enhance, and innovate
+              beyond the original Devin model. By engaging the{" "}
+              <a href="https://github.com/OpenDevin/OpenDevin">
+                open-source community
+              </a>
+              , we aim to tackle the challenges faced by Code LLMs in practical
+              scenarios, producing works that significantly contribute to the
+              community and pave the way for future advancements.
+            </p>
+          </div>
+          <div className="faq-section">
+            <div className="faq-section-title">How to fix an issue on OpenDevin?</div>
+            <div className="faq-steps">
+              To fix an issue on GitHub using OpenDevin, send a prompt to OpenDevin asking it to follow these steps:
+              <ol>
+                <li>Read the issue on <a href="https://github.com/OpenDevin/OpenDevin/issues/1611">GitHub</a></li>
+                <li>Clone the repository and check out a new branch</li>
+                <li>Based on the instructions in the issue description, modify files to fix the issue</li>
+                <li>Push the resulting output to GitHub using the GITHUB_TOKEN environment variable</li>
+                <li>Tell me the link that I need to go to to send a pull request</li>
+              </ol>
+              Before you run OpenDevin, you can do:
+              <div className="command-box">
+                export SANDBOX_ENV_GITHUB_TOKEN=XXX
+              </div>
+              where XXX is a GitHub token that you created that has permissions to push to the OpenDevin repo. If you don’t have write permission to the OpenDevin repo, you might need to change that to:
+              <div className="command-box">
+                Push the resulting output to my fork at https://github.com/USERNAME/OpenDevin/ using the GITHUB_TOKEN environment variable
+              </div>
+              where USERNAME is your GitHub username.
+            </div>
+          </div>
+        </div>
+      </Layout>
+    </>
  );
 }
@@ -1,23 +0,0 @@
-/**
- * CSS files with the .module.css suffix will be treated as CSS modules
- * and scoped locally.
- */
-
-.heroBanner {
-  padding: 4rem 0;
-  text-align: center;
-  position: relative;
-  overflow: hidden;
-}
-
-@media screen and (max-width: 996px) {
-  .heroBanner {
-    padding: 2rem;
-  }
-}
-
-.buttons {
-  display: flex;
-  align-items: center;
-  justify-content: center;
-}
@@ -1,12 +1,12 @@
 import useDocusaurusContext from "@docusaurus/useDocusaurusContext";
 import Layout from "@theme/Layout";
-
 import { HomepageHeader } from "../components/HomepageHeader/HomepageHeader";
 import { Welcome } from "../components/Welcome/Welcome";

 export function Header({ title, summary, description }): JSX.Element {
  return (
    <div>
+      <h1>{title}</h1>
      <h2 style={{ fontSize: "40px" }}>{summary}</h2>
      <h3 className="headerDescription">{description}</h3>
    </div>
@@ -16,8 +16,9 @@ export function Header({ title, summary, description }): JSX.Element {
 export default function Home(): JSX.Element {
  const { siteConfig } = useDocusaurusContext();
  return (
+    <>
    <Layout
-      title={`Hello from ${siteConfig.title}`}
+      title={`${siteConfig.title}`}
      description="AI-powered code generation for software engineering."
    >
      <div>
@@ -27,5 +28,6 @@ export default function Home(): JSX.Element {
        </div>
      </div>
    </Layout>
+    </>
  );
 }
@@ -0,0 +1,12 @@
+import React from 'react';
+import OriginalLayout from '@theme-original/Layout';
+import Footer from '@site/src/pages/_footer';
+
+export default function Layout(props) {
+  return (
+    <>
+      <OriginalLayout {...props} />
+      <Footer />
+    </>
+  );
+}
@@ -1,31 +1,15 @@
-import json
 import logging
-import os
 import re
 from typing import Optional

 import openai
 import requests.exceptions
-import torch
 from openai import OpenAI
 from retry import retry
-from transformers import AutoModelForCausalLM, AutoTokenizer

 LOGGER = logging.getLogger(__name__)


-def load_model(path):
-    print('Loading model...')
-    tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False)
-    print('Tokenizer loaded.')
-    model = AutoModelForCausalLM.from_pretrained(
-        path, low_cpu_mem_usage=True, torch_dtype=torch.float16
-    ).cuda()
-    print('Model loaded.')
-    # model.half().cuda()
-    return model, tokenizer
-
-
 class Q20Game:
    def __init__(
        self,
@@ -36,8 +20,10 @@ class Q20Game:
        temperature: float = 0.8,
        openai_api: bool = True,
        openai_api_key: Optional[str] = None,
-        guesser_kargs={},
+        guesser_kargs=None,
    ) -> None:
+        if guesser_kargs is None:
+            guesser_kargs = {}
        self.item = item
        self.answerer_model = answerer_model
        self.guesser_model = guesser_model
@@ -70,124 +56,11 @@ class Q20Game:

        self.guesser_messages = []

-    def confusion_matrix(self, path):
-        self.reset()
-        with open(path) as f:
-            raw_messages = json.load(f)
-            self.item = path.split('/')[-1].split('_')[0]
-            roles = ['assistant', 'user']
-            for i, message in enumerate(raw_messages):
-                self.guesser_messages.append(
-                    {'role': roles[i % 2], 'content': message['content']}
-                )
-
-        self.guesser_messages = self.guesser_messages[:-2]
-        self.guesser_messages[-1]['content'] = (
-            self.guesser_messages[-1]['content'] + " You must guess now, what's it?"
-        )
-        guesser_msg = self.guesser(self.guesser_messages)
-        self.guesser_messages.append(guesser_msg)
-        guesser_question = guesser_msg['content'].strip()
-        self.guesser_messages[-1]['content'] = (
-            self.guesser_messages[-1]['content'] + ' Is it right?'
-        )
-        usr_msg = self.answerer(guesser_question)
-        self.guesser_messages.append(
-            {'role': 'user', 'content': f"{usr_msg['content'].strip()}"}
-        )
-
-        if 'bingo' in self.guesser_messages[-1]['content'].lower():
-            self.guesser_win = True
-            return True
-
-        return False
-
-    @retry(
-        (
-            openai.Timeout,
-            requests.exceptions.ReadTimeout,
-            openai.RateLimitError,
-            openai.APIError,
-            requests.exceptions.HTTPError,
-            openai.APIConnectionError,
-        ),
-        tries=5,
-        delay=0.5,
-        backoff=0.5,
-        max_delay=2,
-        logger=LOGGER,
-    )
-    def guesser(self, messages):
-        if not self.guesser_model.startswith('gpt'):  # hf model
-            self.guesser_model, self.guesser_tokenizer = load_model(self.guesser_model)
-
-            # """Wraps hf's `generate` adding some specific method's defaults"""
-            assert not self.openai_api
-            prompt = self.dialog_history() + ' ASSISTANT:'
-            input_ids = torch.tensor(
-                [self.guesser_tokenizer.encode(prompt, add_special_tokens=True)]
-            )  # TODO check if huggingface is using the same format.
-            input_ids = input_ids.to(self.guesser_model.base_model.device)
-            attention_mask = None
-
-            with torch.no_grad():
-                gen = self.guesser_model.generate(
-                    input_ids=input_ids,
-                    attention_mask=attention_mask,
-                    **self.guesser_kargs,
-                )
-                gen_str = (
-                    self.guesser_tokenizer.decode(gen[0][input_ids[0].shape[0] :])
-                    .split('</s>')[0]
-                    .split('USER')[0]
-                    .lstrip()
-                    .strip()
-                )
-
-                return {
-                    'role': 'assistant',
-                    'content': gen_str,
-                }
-        else:
-            openai.api_base = self.guesser_api_base
-            client = OpenAI(api_key=openai.api_key)
-            response = client.chat.completions.create(
-                model=self.guesser_model,
-                messages=messages,
-                max_tokens=64,
-                n=1,
-                stop=None,
-                temperature=self.temperature,
-            )
-            return {
-                'role': 'assistant',
-                'content': response.choices[0].message.to_dict()['content'].strip(),
-            }
-
-    def dialog_history(self):
-        history = self.vicuna_prompt + ' '
-        for item in self.guesser_messages:
-            if item['role'].upper() == 'USER':
-                history += 'USER: ' + item['content']
-            elif item['role'].upper() == 'ASSISTANT':
-                history += ' ' + 'ASSISTANT: ' + item['content'] + '</s>'
-        return history
-
-
-    def preprocess_response(self,response):
-        response = re.sub(
-            r'the entity you are thinking of', 'it', response
-        )
-        response = re.sub(
-            r"the entity you're thinking of", 'it', response
-        )
-        response = re.sub(
-            r" you're thinking of", '', response
-        )
-        response = re.sub(
-            r' you are thinking of', '', response
-        )
-        self.guesser_messages.append(response)
+    def preprocess_response(self, response):
+        response = re.sub(r'the entity you are thinking of', 'it', response)
+        response = re.sub(r"the entity you're thinking of", 'it', response)
+        response = re.sub(r" you're thinking of", '', response)
+        response = re.sub(r' you are thinking of', '', response)
        return response

    def judge_winner(self, response):
@@ -195,101 +68,39 @@ class Q20Game:

        if self.curr_turn == self.num_turns - 1:
            guesser_question += ' Is it right?'
+
+        self.guesser_messages.append({'role': 'assistant', 'content': guesser_question})
        # ask for answer
        usr_msg = self.answerer(guesser_question)

+        self.guesser_messages.append(
+            {'role': 'user', 'content': f"{usr_msg['content'].strip()}"}
+        )
+
        if 'bingo' in usr_msg['content'].lower():
            self.guesser_win = True
-            return True, ""
-        
+            return True, ''
+
        return False, usr_msg['content'].strip()
-    
+
    def generate_user_response(self, response):
        response = self.preprocess_response(response)
        # others
        bingo, anwser_reply = self.judge_winner(response)
        if bingo:
-            return "You are bingo! quit now, run: <execute_bash> exit </execute_bash>.\n"
+            return (
+                'You are bingo! quit now, run: <execute_bash> exit </execute_bash>.\n'
+            )
        if self.curr_turn == self.num_turns - 2:
            anwser_reply += " You must guess now, what's it?"
        return anwser_reply

-    def game_play(self, user_mode=False):
-        self.reset()
-        # print(f"Item: {self.item}")
-        for t in range(self.num_turns):
-            # System asking a question
-            if (not user_mode) or user_mode is None:
-                guesser_msg = self.guesser(self.guesser_messages)
-                guesser_msg['content'] = re.sub(
-                    r'the entity you are thinking of', 'it', guesser_msg['content']
-                )
-                guesser_msg['content'] = re.sub(
-                    r"the entity you're thinking of", 'it', guesser_msg['content']
-                )
-                guesser_msg['content'] = re.sub(
-                    r" you're thinking of", '', guesser_msg['content']
-                )
-                guesser_msg['content'] = re.sub(
-                    r' you are thinking of', '', guesser_msg['content']
-                )
-            else:
-                user_q = input(
-                    f'Type in your questions for turn {t+1}. (e.g. Is it a living thing?)\n'
-                )
-                guesser_msg = {'role': 'assistant', 'content': user_q}
-            self.guesser_messages.append(guesser_msg)
-            guesser_question = guesser_msg['content'].strip()
-
-            if t == self.num_turns - 1:
-                self.guesser_messages[-1]['content'] = (
-                    self.guesser_messages[-1]['content'] + ' Is it right?'
-                )
-
-            usr_msg = self.answerer(guesser_question)
-            self.guesser_messages.append(
-                {'role': 'user', 'content': f"{usr_msg['content'].strip()}"}
-            )
-
-            if 'bingo' in usr_msg['content'].lower():
-                self.guesser_win = True
-                return True
-
-            if t == self.num_turns - 2:
-                self.guesser_messages[-1]['content'] = (
-                    self.guesser_messages[-1]['content']
-                    + " You must guess now, what's it?"
-                )
-
-        return False
-
-    def save_session(self, path):
-        # Print the conversation
-        if not os.path.exists(path):
-            os.makedirs(path)
-        output_file = os.path.join(path, f'{self.item}.txt')
-        with open(output_file, 'w') as out_f:
-            out_f.write(f'item: {self.item}\n')
-            for t, message in enumerate(self.guesser_messages):
-                out_f.write(
-                    f"Turn {(t+1)//2}, {message['role'].capitalize()}: {message['content'].lstrip()}\n"
-                )
-
    def reward(self):
        if self.guesser_win:
            n_turns = (len(self.guesser_messages) + 1) // 2
            return 1 - max(n_turns - 5, 0) * 0.02
        return 0

-    def num_success(self):
-        return 1 if self.guesser_win else 0
-
-    def num_yes(self):
-        n_yes = sum(
-            ['yes' in msg['content'].lower() for msg in self.guesser_messages[2::2]]
-        )
-        return n_yes
-
    @retry(
        (
            openai.Timeout,
@@ -339,16 +150,6 @@ class Q20Game:
            response.choices[0].message.content = 'Bingo!'
        return response.choices[0].message.to_dict()

-    def reset(self):
-        # Initialize the conversation
-        self.curr_turn = 0
-        self.guesser_messages = [
-            {
-                'role': 'user',
-                'content': self.first_user_utterance,
-            }
-        ]
-

 class Q20GameCelebrity(Q20Game):
    def __init__(self, item: str, **kwargs) -> None:
@@ -376,6 +177,7 @@ class Q20GameCelebrity(Q20Game):
    )
    def answerer(self, question):
        openai.api_base = self.user_api_base
+        client = OpenAI(api_key=openai.api_key)
        user_messages = [
            {
                'role': 'system',
@@ -391,7 +193,7 @@ class Q20GameCelebrity(Q20Game):
            },
        ]

-        response = openai.ChatCompletion.create(
+        response = client.chat.completions.create(
            model=self.answerer_model,
            messages=user_messages,
            max_tokens=6,
@@ -402,12 +204,3 @@ class Q20GameCelebrity(Q20Game):
        if re.search(rf'(?:^|\W){self.item.lower()}(?:$|\W)', question.lower()):
            response.choices[0].message.content = 'Bingo!'
        return response.choices[0].message.to_dict()
-
-    def reset(self):
-        # Initialize the conversation
-        self.guesser_messages = [
-            {
-                'role': 'user',
-                'content': self.first_user_utterance,
-            }
-        ]
@@ -46,6 +46,8 @@ def codeact_user_response(state: State) -> str:
    game.curr_turn += 1
    logger.info(f'Model guess: {model_guess}')
    logger.info(f'Anwser response: {msg}')
+    if 'bingo!' in msg.lower():
+        return '/exit'
    return msg


@@ -64,7 +66,7 @@ AGENT_CLS_TO_INST_SUFFIX = {


 def process_instance(instance, agent_class, metadata, reset_logger: bool = True):
-    # Setup the logger properly, so you can run multi-processing to parallize the evaluation
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    eval_output_dir = metadata['eval_output_dir']
    if reset_logger:
        # Set up logger
@@ -125,7 +127,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
        )
    )
    # ======= Attempt to evaluate the agent's edits =======
-    # If you are working on simplier benchmark that only evaluates the final model output (e.g., in a MessageAction)
+    # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.

    if state is None:
@@ -139,6 +141,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)

    logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
    test_result = game.reward()
+    metrics = state.metrics.get() if state.metrics else None

    # Save the output
    output = {
@@ -149,6 +152,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
        'history': [
            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
        ],
+        'metrics': metrics,
        'error': state.error if state and state.error else None,
        'test_result': {
            'success': test_result,
@@ -235,7 +239,7 @@ if __name__ == '__main__':
        'max_iterations': max_iterations,
        'eval_output_dir': eval_output_dir,
        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
-        # get the commit id of current repo for reproduciblity
+        # get the commit id of current repo for reproducibility
        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
        .decode('utf-8')
        .strip(),
@@ -46,4 +46,5 @@ if [ -n "$EVAL_LIMIT" ]; then
 fi

 # Run the command
+echo $COMMAND
 eval $COMMAND
@@ -13,9 +13,14 @@ all the preprocessing/evaluation/analysis scripts.
 ## Supported Benchmarks

 - SWE-Bench: [`evaluation/swe_bench`](./swe_bench)
+- ML-Bench: [`evaluation/ml_bench`](./ml_bench)
 - HumanEvalFix: [`evaluation/humanevalfix`](./humanevalfix)
 - GAIA: [`evaluation/gaia`](./gaia)
 - Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA)
+- MINT: [`evaluation/mint`](./mint)
+- AgentBench: [`evaluation/agent_bench`](./agent_bench)
+- BIRD: [`evaluation/bird`](./bird)
+- LogicReasoning: [`evaluation/logic_reasoning`](./logic_reasoning)

 ### Result Visualization

@@ -0,0 +1,60 @@
+# AgentBench Evaluation
+
+This folder contains the evaluation harness for evaluating agents on
+[AgentBench: Evaluating LLMs as Agents](https://arxiv.org/abs/2308.03688).
+
+## Configure OpenDevin and your LLM
+
+Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md)
+for how to set this up.
+
+Here is an example `config.toml` file:
+
+```toml
+[core]
+max_iterations = 100
+cache_dir = "/path/to/cache"
+
+workspace_base = "/path/to/workspace"
+workspace_mount_path = "/path/to/workspace"
+
+sandbox_container_image = "ghcr.io/opendevin/sandbox:latest"
+sandbox_type = "ssh"
+sandbox_timeout = 120
+ssh_hostname = "localhost"
+
+use_host_network = false
+# AgentBench specific
+run_as_devin = true
+enable_auto_lint = true
+
+[eval_gpt35_turbo]
+model = "gpt-3.5-turbo"
+api_key = "sk-123"
+temperature = 0.0
+
+[eval_gpt4o]
+model = "gpt-4o"
+api_key = "sk-123"
+temperature = 0.0
+```
+
+## Start the evaluation
+
+```bash
+./evaluation/agent_bench/scripts/run_infer.sh [model_config] [agent] [eval_limit]
+```
+
+The command above is the basic way to start the evaluation. Currently, only the `osbench` subset is evaluated.
+
+You can update the arguments in the script `evaluation/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers` and so on.
+
+- `--agent-cls`: the agent to use. For example, `CodeActAgent`.
+- `--llm-config`: the LLM configuration to use. For example, `eval_gpt4o`.
+- `--max-iterations`: the number of iterations to run the evaluation. For example, `30`.
+- `--eval-num-workers`: the number of workers to use for evaluation. For example, `5`.
+- `--eval-n-limit`: the number of examples to evaluate. For example, `100`.
+
+```bash
+./evaluation/agent_bench/scripts/run_infer.sh eval_gpt35_turbo CodeActAgent 1
+```
@@ -0,0 +1,61 @@
+import os
+import re
+
+from opendevin.events.action import CmdRunAction, MessageAction
+
+
+def analysis_size(size_str):
+    """Convert a human-readable size string into a number of bytes.
+
+    Args:
+        size_str: An integer optionally followed by a unit suffix, e.g.
+            ``'42'``, ``'10K'``, ``'10KB'`` or ``'3Byte'``. Surrounding
+            whitespace is ignored. Units are powers of 1024.
+
+    Returns:
+        The size in bytes as an ``int``.
+
+    Raises:
+        ValueError: If the numeric part is not a valid integer.
+    """
+    size_str = size_str.strip()
+    avails = {
+        'B': 1,
+        'Byte': 1,
+        'K': 1024,
+        'KB': 1024,
+        'M': 1024**2,
+        'MB': 1024**2,
+        'G': 1024**3,
+        'GB': 1024**3,
+        'T': 1024**4,
+        'TB': 1024**4,
+        'P': 1024**5,
+        'PB': 1024**5,
+    }
+    # Try the longest suffixes first: every two-letter unit ends in 'B', so
+    # matching 'B' before 'KB' would leave 'K' in the numeric part and make
+    # int() fail on inputs such as '10KB'.
+    for size_unit in sorted(avails, key=len, reverse=True):
+        if size_str.endswith(size_unit):
+            return int(size_str[: -len(size_unit)]) * avails[size_unit]
+    return int(size_str)
+
+
+def compare_results(check_method: str, model_answer: str, final_ans: str) -> bool:
+    """Decide whether the agent's answer matches the expected answer.
+
+    Args:
+        check_method: Name of the comparison to apply. Integer and size
+            comparisons are selected by their check-script names; any other
+            value falls back to a text comparison.
+        model_answer: Answer produced by the agent.
+        final_ans: Expected answer.
+
+    Returns:
+        True when the answers match under the selected comparison. Any
+        exception raised while comparing is treated as a mismatch.
+    """
+
+    def _normalized(text):
+        # Unify line endings, then drop surrounding whitespace.
+        return text.replace('\r\n', '\n').replace('\r', '\n').strip()
+
+    try:
+        if check_method == 'check/integer-match.py':
+            return int(model_answer) == int(final_ans)
+        if check_method == 'check/size-match.py':
+            return analysis_size(model_answer) == analysis_size(final_ans)
+        return _normalized(model_answer) == _normalized(final_ans)
+    except Exception:
+        return False
+
+
+def create_sh_file(filename: str, cmds: str) -> None:
+    """Write ``cmds`` to ``filename`` as a script and set its mode to 0o755.
+
+    Windows-style ``\\r\\n`` line endings in ``cmds`` are converted to ``\\n``
+    before the text is written.
+    """
+    with open(filename, mode='w', encoding='utf-8') as script:
+        unix_cmds = cmds.replace('\r\n', '\n')
+        script.write(unix_cmds)
+    os.chmod(filename, 0o755)
+
+
+def try_parse_answer(act) -> str | None:
+    """Extract the text inside the first ``<solution>`` tag of an agent action.
+
+    Args:
+        act: The action to inspect. Only agent-sourced ``MessageAction``
+            (its ``content``) and ``CmdRunAction`` (its ``thought``) are
+            searched.
+
+    Returns:
+        The stripped text of the first ``<solution>...</solution>`` pair, or
+        ``None`` when the action is of another kind or contains no such pair.
+    """
+    if isinstance(act, MessageAction) and act.source == 'agent':
+        text = act.content
+    elif isinstance(act, CmdRunAction) and act.source == 'agent':
+        text = act.thought
+    else:
+        return None
+    found = re.search(r'<solution>(.*?)</solution>', text)
+    return found.group(1).strip() if found else None
@@ -0,0 +1,405 @@
+import asyncio
+import json
+import logging
+import multiprocessing as mp
+import os
+import pathlib
+import re
+import shutil
+import subprocess
+import time
+from concurrent.futures import ProcessPoolExecutor
+
+import docker
+from datasets import load_dataset
+from tqdm import tqdm
+
+from evaluation.agent_bench.helper import (
+    compare_results,
+    create_sh_file,
+    try_parse_answer,
+)
+from opendevin.controller.state.state import State
+from opendevin.core.config import args, config, get_llm_config_arg
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import main
+from opendevin.events.action import CmdRunAction, MessageAction
+from opendevin.events.serialization.event import event_to_dict
+from opendevin.runtime.docker.ssh_box import DockerSSHBox
+
+
+def cleanup():
+    """Terminate every active child process and wait for each one to exit."""
+    print('Cleaning up child processes...')
+    for child in mp.active_children():
+        print(f'Terminating child process: {child.name}')
+        child.terminate()
+        child.join()
+
+
+def codeact_user_response(state: State) -> str:
+    """Produce the simulated user reply for the CodeAct agent.
+
+    Returns ``'/exit'`` when the most recent action already contains a
+    parsable ``<solution>`` answer. Otherwise returns a reminder to keep
+    working, extended with a give-up hint once the history holds at least
+    two user messages.
+    """
+    reminder = (
+        'Please continue working on the task on whatever approach you think is suitable.\n'
+        'If you think you have solved the task, please first send your answer to user through '
+        'message and then <execute_bash> exit </execute_bash>.\n'
+        'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
+        'For example: The answer to the question is <solution> 42 </solution>.\n'
+        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
+    )
+    if not state.history:
+        return reminder
+
+    # An answer in the latest action means the agent is done: exit early.
+    latest_action, _ = state.history[-1]
+    if try_parse_answer(latest_action) is not None:
+        return '/exit'
+
+    user_turns = sum(
+        1
+        for action, _ in state.history
+        if isinstance(action, MessageAction) and action.source == 'user'
+    )
+    if user_turns < 2:
+        return reminder
+    # After repeated user prompts, tell the agent it is allowed to give up.
+    return reminder + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
+
+
+# Maps an agent class name to the function that produces the simulated user
+# reply for that agent.
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+}
+
+# Maps an agent class name to extra text appended to the task instruction.
+AGENT_CLS_TO_INST_SUFFIX = {
+    'CodeActAgent': 'When you think you have solved the question, '
+    'please first send your answer to user through message and then exit.\n'
+}
+
+
+def process_instance(
+    instance,
+    agent_class,
+    metadata,
+    eval_output_dir,
+    reset_logger: bool = True,
+):
+    """Run the agent on one AgentBench instance and score its answer.
+
+    Args:
+        instance: Record describing the task. This function reads its
+            ``instance_id``, ``description``, ``init``, ``get_agent_result``,
+            ``ground_truth``, ``get_ground_truth`` and ``comparison_method``
+            attributes and calls ``to_dict()`` on it.
+        agent_class: Agent class name, used to look up the fake-user response
+            function and the instruction suffix.
+        metadata: Run metadata; stored unchanged in the returned output.
+        eval_output_dir: Directory whose ``logs`` sub-directory receives the
+            per-instance log file when ``reset_logger`` is true.
+        reset_logger: When true, replace the logger's handlers with a
+            per-instance file handler.
+
+    Returns:
+        A dict with the instance, instruction, metadata, event history,
+        metrics, error and ``test_result`` for this instance.
+
+    Raises:
+        ValueError: If running the agent yields no state.
+    """
+    # =============================================
+    # preparation
+    # =============================================
+
+    inst_id = instance.instance_id
+    question = instance.description
+    # create a directory for the instance's workspace
+    instance_workspace = str(os.path.join(config.workspace_base, inst_id))
+    container_inst_workspace = str(
+        os.path.join(config.workspace_mount_path_in_sandbox, inst_id)
+    )
+    # Start from an empty workspace: drop leftovers from a previous run.
+    if os.path.exists(instance_workspace):
+        shutil.rmtree(instance_workspace)
+    os.makedirs(instance_workspace, exist_ok=True)
+
+    # Set up the logger properly, so you can run multiprocessing to parallelize the evaluation
+    if reset_logger:
+        # Set up logger
+        log_file = os.path.join(eval_output_dir, 'logs', f'instance_{inst_id}.log')
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        # add back the console handler to print ONE line
+        logger.addHandler(get_console_handler())
+        logger.info(
+            f'Starting evaluation for instance {inst_id}.\nHint: run "tail -f {log_file}" to see live logs in a seperate shell'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        # From here on, log records go only to the per-instance log file.
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setFormatter(
+            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        )
+        logger.addHandler(file_handler)
+
+    # =============================================
+    # build instruction
+    # =============================================
+
+    # Prepare instruction
+    instruction = (
+        f'Please fix the following issue.\n'
+        'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
+        'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
+        'For example: The answer to the question is <solution> 42 </solution>.\n'
+        '# Problem \n'
+        f'{question}\n\n'
+    )
+    # The IMPORTANT reminder is repeated after the problem statement.
+    instruction += (
+        'IMPORTANT: You should ONLY interact with the environment provided '
+        'to you AND NEVER ASK FOR HUMAN HELP.\n'
+    )
+    # NOTE: You can actually set slightly different instruction for different agents
+    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+
+    # =============================================
+    # create sandbox and run the agent
+    # =============================================
+
+    sandbox = DockerSSHBox()
+    # NOTE(review): relative `cd`; presumably the sandbox shell starts in the
+    # mounted workspace root - confirm.
+    sandbox.execute(f'cd {inst_id}')
+
+    init_cmd = instance.init
+    if init_cmd is not None:
+        # The script is written to the host-side workspace and executed via
+        # its path inside the sandbox; presumably the workspace is mounted
+        # there - confirm.
+        scpt_name = f'{instance.instance_id}_init.sh'
+        scpt_path = os.path.join(container_inst_workspace, scpt_name)
+        host_scpt_path = os.path.join(instance_workspace, scpt_name)
+        create_sh_file(host_scpt_path, init_cmd)
+        logger.info(f'Running init script: {scpt_path}')
+        _, init_res = sandbox.execute(scpt_path)
+        logger.info(f'Init script result: {init_res}')
+
+    # Here's how you can run the agent (similar to the `main` function) and get the final task state
+    state: State = asyncio.run(
+        main(
+            instruction,
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
+            sandbox=sandbox,
+        )
+    )
+
+    if state is None:
+        raise ValueError('State should not be None.')
+
+    # get the ground truth
+    # OSBenchSSHBox.get_ground_truth(instance, state)
+
+    # =============================================
+    # result evaluation
+    # =============================================
+
+    # The agent's answer comes either from a checker script run in the
+    # sandbox or, when the instance has none, from the event history.
+    agent_answer = ''
+    get_agent_result_cmd = instance.get_agent_result
+    if get_agent_result_cmd is not None:
+        scpt_name = f'{instance.instance_id}_get_agent_result.sh'
+        scpt_path = os.path.join(container_inst_workspace, scpt_name)
+        host_scpt_path = os.path.join(instance_workspace, scpt_name)
+        create_sh_file(host_scpt_path, get_agent_result_cmd)
+        logger.info(f'Running get agent result cmd: {scpt_path}')
+        _, agent_answer = sandbox.execute(scpt_path)
+    else:
+        logger.info('Retrieving agent answer from history.')
+        raw_ans = ''
+        # Walk backwards to the most recent agent message or command.
+        for act, _ in reversed(state.history):
+            if isinstance(act, MessageAction) and act.source == 'agent':
+                raw_ans = act.content
+                break
+            if isinstance(act, CmdRunAction) and act.source == 'agent':
+                raw_ans = act.thought
+                break
+        agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans)
+        if len(agent_answer) == 0:
+            # No <solution> tag found: fall back to the raw text.
+            logger.warning(f'Failed to parse model answer: {raw_ans}')
+            agent_answer = raw_ans
+        else:
+            agent_answer = agent_answer[0]
+
+    # The expected answer is either stored on the instance or produced by
+    # running the instance's ground-truth script in the sandbox.
+    final_ans = ''
+    if instance.ground_truth is not None:
+        final_ans = instance.ground_truth
+    else:
+        get_ground_truth_cmd = instance.get_ground_truth
+        if get_ground_truth_cmd is not None:
+            scpt_name = f'{instance.instance_id}_get_ground_truth.sh'
+            scpt_path = os.path.join(container_inst_workspace, scpt_name)
+            host_scpt_path = os.path.join(instance_workspace, scpt_name)
+            create_sh_file(host_scpt_path, get_ground_truth_cmd)
+            logger.info(f'Running get ground truth cmd: {scpt_path}')
+            sandbox.execute(f'cd {container_inst_workspace}')
+            _, final_ans = sandbox.execute(scpt_path)
+
+    comparison_method = instance.comparison_method
+    logger.info(
+        f'Final message: {agent_answer} | Ground truth: {final_ans} | Comparison method: {comparison_method}'
+    )
+    test_result = compare_results(comparison_method, agent_answer, final_ans)
+
+    histories = [
+        (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
+    ]
+    metrics = state.metrics.get() if state.metrics else None
+
+    # Save the output
+    output = {
+        'instance_id': inst_id,
+        'instance': instance.to_dict(),
+        'instruction': instruction,
+        'metadata': metadata,
+        'history': histories,
+        'metrics': metrics,
+        'error': state.error if state and state.error else None,
+        'test_result': {
+            'agent_answer': agent_answer,
+            'final_answer': final_ans,
+            'check_method': comparison_method,
+            'result': test_result,
+        },
+    }
+
+    # clean up
+    # NOTE(review): not wrapped in try/finally, so an exception raised earlier
+    # skips both the workspace removal and the sandbox shutdown.
+    if os.path.exists(instance_workspace):
+        shutil.rmtree(instance_workspace)
+    # Close the sandbox
+    try:
+        sandbox.close()
+    except docker.errors.NotFound as e:
+        logger.error(f'Failed to close sandbox: {e}')
+    return output
+
+
+# Script entry point: load the benchmark, prepare the output directory and
+# metadata, skip instances already present in output.jsonl, then evaluate the
+# remaining instances in a process pool.
+if __name__ == '__main__':
+    # =============================================
+    # load datasets
+    # =============================================
+
+    dataset = load_dataset('iFurySt/AgentBench')
+    # Only the 'osbench' split is evaluated.
+    agent_bench_tests = dataset['osbench'].to_pandas()
+    logger.info(f'Loaded {len(agent_bench_tests)} tests.')
+
+    # =============================================
+    # handle arguments and prepare for evaluation
+    # =============================================
+
+    # Override the default LLM config when one is named on the command line.
+    if args.llm_config:
+        specified_llm_config = get_llm_config_arg(args.llm_config)
+        if specified_llm_config:
+            config.llm = specified_llm_config
+    logger.info(f'Config for evaluation: {config}')
+
+    # TEST METADATA
+    agent_cls = args.agent_cls
+    assert (
+        agent_cls in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
+    ), f'Unsupported agent class: {agent_cls}'
+    # Keep only the last path component of the model name for directory naming.
+    model_name = config.llm.model.split('/')[-1]
+    max_iterations = args.max_iterations
+    eval_note = ''
+    if args.eval_note is not None:
+        eval_note += '_N_' + args.eval_note
+    # Output layout: <eval_output_dir>/agent_bench/<agent>/<model>_maxiter_<n>[_N_<note>]
+    eval_op_dir = str(
+        os.path.join(
+            args.eval_output_dir,
+            'agent_bench',
+            agent_cls,
+            model_name + '_maxiter_' + str(max_iterations) + eval_note,
+        )
+    )
+
+    pathlib.Path(eval_op_dir).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(str(os.path.join(eval_op_dir, 'logs'))).mkdir(
+        parents=True, exist_ok=True
+    )
+    logger.info(f'Using evaluation output directory: {eval_op_dir}')
+
+    meta = {
+        'agent_class': agent_cls,
+        'model_name': model_name,
+        'max_iterations': max_iterations,
+        'eval_output_dir': eval_op_dir,
+        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        # get the commit id of current repo for reproducibility
+        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+        .decode('utf-8')
+        .strip(),
+    }
+    logger.info(f'Metadata: {meta}')
+    with open(os.path.join(eval_op_dir, 'metadata.json'), 'w') as f:
+        json.dump(meta, f)
+
+    # LIMIT EVALUATION
+    eval_n_limit = args.eval_n_limit
+    if eval_n_limit:
+        agent_bench_tests = agent_bench_tests[:eval_n_limit]
+        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
+
+    # OUTPUT FILE
+    output_file = os.path.join(eval_op_dir, 'output.jsonl')
+    logger.info(f'Writing evaluation output to {output_file}')
+    # Instance ids already recorded in an existing output file are collected
+    # so those instances can be skipped below.
+    finished_instance_ids = set()
+    if os.path.exists(output_file):
+        with open(output_file, 'r') as f:
+            for line in f:
+                data = json.loads(line)
+                finished_instance_ids.add(data['instance_id'])
+        logger.warning(
+            f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
+        )
+    # Append mode keeps results from earlier runs.
+    output_fp = open(output_file, 'a')
+
+    logger.info(
+        f'Evaluation started with Agent {agent_cls}, model {model_name}, max iterations {max_iterations}.'
+    )
+
+    # =============================================
+    # filter out finished instances
+    # =============================================
+
+    new_agent_bench_tests = []
+    for idx, inst in agent_bench_tests.iterrows():
+        if inst.instance_id in finished_instance_ids:
+            logger.info(
+                f'Skipping instance {inst.instance_id} as it is already finished.'
+            )
+            continue
+        new_agent_bench_tests.append(inst)
+
+    # From here on this is a plain list of rows rather than a DataFrame.
+    agent_bench_tests = new_agent_bench_tests
+    logger.info(
+        f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(agent_bench_tests)}'
+    )
+
+    # =============================================
+    # start task
+    # =============================================
+
+    pbar = tqdm(total=len(agent_bench_tests))
+
+    # This function tracks the progress AND write the output to a JSONL file
+    def update_progress(fut):
+        pbar.update(1)
+        output = fut.result()
+        pbar.set_description(f'Instance {output["instance_id"]}')
+        pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
+        logger.info(
+            f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]["result"]}'
+        )
+        # One JSON object per line; flush so finished results reach the file
+        # right away.
+        output_fp.write(json.dumps(output) + '\n')
+        output_fp.flush()
+
+    # This sets the multiprocessing
+    num_workers = args.eval_num_workers
+    logger.info(f'Using {num_workers} workers for evaluation.')
+
+    try:
+        with ProcessPoolExecutor(num_workers) as executor:
+            futures = []
+            # This is how we perform multiprocessing
+            for inst in agent_bench_tests:
+                # Per-instance log files are only set up when more than one
+                # worker is used.
+                future = executor.submit(
+                    process_instance,
+                    inst,
+                    agent_cls,
+                    meta,
+                    eval_op_dir,
+                    reset_logger=bool(num_workers > 1),
+                )
+                future.add_done_callback(update_progress)
+                futures.append(future)
+
+            # Wait for all futures to complete
+            for future in futures:
+                future.result()
+    except KeyboardInterrupt:
+        print('KeyboardInterrupt received. Cleaning up...')
+        cleanup()
+
+    output_fp.close()
+    logger.info('Evaluation finished.')
@@ -0,0 +1,33 @@
+#!/bin/bash
+# Launch the AgentBench evaluation.
+#
+# Positional arguments:
+#   $1  MODEL_CONFIG  value passed to --llm-config
+#   $2  AGENT         agent class name; defaults to CodeActAgent when empty
+#   $3  EVAL_LIMIT    optional; when set, passed as --eval-n-limit
+MODEL_CONFIG=$1
+AGENT=$2
+EVAL_LIMIT=$3
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
+# We need to track the version of Agent in the evaluation to make sure results are comparable
+AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+# Build the command as a string; \$PYTHONPATH is escaped so it is expanded
+# when the command is eval'd rather than here. The agent version is passed
+# as the evaluation note.
+COMMAND="export PYTHONPATH=evaluation/agent_bench:\$PYTHONPATH && poetry run python evaluation/agent_bench/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --max-iterations 30 \
+  --max-chars 10000000 \
+  --eval-num-workers 5 \
+  --eval-note $AGENT_VERSION"
+
+# Only add the instance limit when one was given.
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+# Run the command
+eval $COMMAND
@@ -0,0 +1,37 @@
+import json
+import sys
+
+
+def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
+    """Split the instance ids of a results JSONL file into passed and failed.
+
+    Args:
+        res_file_path: Path to a file holding one JSON object per line. Each
+            object must have an ``instance_id``; its outcome is read from
+            ``test_result.result`` and counts as failed when that is missing.
+
+    Returns:
+        A ``(passed, failed)`` pair of instance-id lists, in file order.
+    """
+    passed: list[str] = []
+    failed: list[str] = []
+    with open(res_file_path, 'r') as results:
+        for raw_line in results:
+            record = json.loads(raw_line.strip())
+            instance_id = record['instance_id']
+            has_result = 'test_result' in record and 'result' in record['test_result']
+            resolved = record['test_result']['result'] if has_result else False
+            bucket = passed if resolved else failed
+            bucket.append(instance_id)
+    return passed, failed
+
+
+# Command-line entry point: print pass/fail counts, the resolve rate and the
+# instance ids of each group for one results file.
+if __name__ == '__main__':
+    # Exactly one argument is expected: the path to the results JSONL file.
+    if len(sys.argv) != 2:
+        print(
+            'Usage: poetry run python summarise_results.py <path_to_output_jsonl_file>'
+        )
+        sys.exit(1)
+    json_file_path = sys.argv[1]
+    passed_tests, failed_tests = extract_test_results(json_file_path)
+    total_tests = len(passed_tests) + len(failed_tests)
+    # An empty results file would otherwise raise ZeroDivisionError; report a
+    # resolve rate of 0.0 in that case.
+    succ_rate = len(passed_tests) / total_tests if total_tests else 0.0
+    print(
+        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {succ_rate}'
+    )
+    print('PASSED TESTS:')
+    print(passed_tests)
+    print('FAILED TESTS:')
+    print(failed_tests)
@@ -0,0 +1,517 @@
+import asyncio
+import json
+import logging
+import multiprocessing as mp
+import os
+import pathlib
+import re
+import shutil
+import sqlite3
+import subprocess
+import time
+from concurrent.futures import ProcessPoolExecutor
+
+import pandas as pd
+from datasets import load_dataset
+from func_timeout import FunctionTimedOut, func_timeout
+from tqdm import tqdm
+
+from opendevin.controller.state.state import State
+from opendevin.core.config import args, config, get_llm_config_arg
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import main
+from opendevin.events.action import MessageAction
+from opendevin.events.serialization.event import event_to_dict
+
+
+def cleanup():
+    logger.info('Cleaning up child processes...')
+    for process in mp.active_children():
+        logger.info(f'Terminating child process: {process.name}')
+        process.terminate()
+        process.join()
+
+
+def codeact_user_response(state: State) -> str:
+    msg = (
+        'Please continue working on the task on whatever approach you think is suitable.\n'
+        'If you think you have completed the SQL, please run the following command: <execute_bash> exit </execute_bash>.\n'
+        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
+    )
+    if state.history:
+        user_msgs = [
+            action
+            for action, _ in state.history
+            if isinstance(action, MessageAction) and action.source == 'user'
+        ]
+        if len(user_msgs) >= 2:
+            # let the agent know that it can give up when it has tried 3 times
+            return (
+                msg
+                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
+            )
+    return msg
+
+
+def monologue_user_response(state: State) -> str:
+    raise NotImplementedError('MonologueAgent should never ask for user responses.')
+
+
+# Agent class name -> callback passed to `main` as `fake_user_response_fn`.
+# The entry point also uses membership in this dict to reject unsupported agents.
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+    'MonologueAgent': monologue_user_response,
+}
+
+# Agent class name -> text appended to the end of the task instruction.
+# Agents without an entry get no suffix (looked up with `.get(agent_class, '')`).
+# NOTE(review): the CodeActAgent wording mentions "fixed the issue through code
+# changes", which reads like another benchmark's prompt rather than a
+# text-to-SQL task -- confirm this is intended.
+AGENT_CLS_TO_INST_SUFFIX = {
+    'CodeActAgent': 'When you think you have fixed the issue through code changes, please run the following command: <execute_bash> exit </execute_bash>.\n'
+}
+
+
+def execute_sql(db_path, gen_sql, gold_sql):
+    """
+    Execute the generated SQL and the ground truth SQL and compare the results.
+    """
+    with sqlite3.connect(db_path) as conn:
+        cursor = conn.cursor()
+        cursor.execute(gen_sql)
+        predicted_res = cursor.fetchall()
+        cursor.execute(gold_sql)
+        ground_truth_res = cursor.fetchall()
+        res = 0
+        if set(predicted_res) == set(ground_truth_res):
+            res = 1
+    return res
+
+
+def get_test_result(instance, path, timeout=30):
+    test_result = {'result': {}, 'metadata': {}}
+
+    # Read the generated python file
+    with open(path, 'r') as f:
+        gen_file = f.read()
+
+    # Extract the SQL from the python file
+    gen_sql = ''
+    pattern = r'sql\s*=\s*"([^"]+)"'
+    match = re.search(pattern, gen_file)
+    if match:
+        gen_sql = match.group(1)
+    else:
+        print('No match found.')
+
+    gold_sql = instance.SQL
+    # Execute the SQL
+    try:
+        res = func_timeout(
+            timeout, execute_sql, args=(instance.db_path, gen_sql, gold_sql)
+        )
+        status = 'success'
+    except FunctionTimedOut:
+        res = 0
+        status = 'timeout'
+    except Exception as e:
+        res = 0
+        status = 'error'
+        logger.error(f'Error: {e}')
+
+    # Save the test result
+    test_result['result'] = {'passed': res, 'status': status}
+    test_result['metadata'] = {
+        'timeout': timeout,
+        'gen_sql': gen_sql,
+        'gold_sql': gold_sql,
+    }
+    return test_result
+
+
+def process_instance(
+    instance, agent_class, metadata, skip_workspace_mount, reset_logger: bool = True
+):
+    workspace_mount_path = os.path.join(
+        config.workspace_mount_path, 'bird_eval_workspace'
+    )
+    # create process-specific workspace dir
+    # if `not skip_workspace_mount` - we will create a workspace directory for EACH process
+    # so that different agent don't interfere with each other.
+    if not skip_workspace_mount:
+        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
+        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+
+    # reset workspace to config
+    config.workspace_mount_path = workspace_mount_path
+
+    # Copy the database to the workspace
+    db_root = os.path.join(
+        config.workspace_base, 'evaluation_bird/dev/dev_databases', instance.db_id
+    )
+    target_path = os.path.join(workspace_mount_path, f'{instance.db_id}')
+    if not os.path.exists(target_path):
+        logger.info(f'Copying database from {db_root} to {target_path}...')
+        shutil.copytree(db_root, target_path)
+
+    # Set up the database path
+    database_path = os.path.join(instance.db_id, f'{instance.db_id}.sqlite')
+
+    # Set up the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        # Set up logger
+        log_file = os.path.join(
+            eval_output_dir,
+            'logs',
+            f'instance_{instance.task_id.replace("/", "__")}.log',
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        # add back the console handler to print ONE line
+        logger.addHandler(get_console_handler())
+        logger.info(
+            f'Starting evaluation for instance {instance.task_id}.\nLOG:   tail -f {log_file}'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setFormatter(
+            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        )
+        logger.addHandler(file_handler)
+
+    if not skip_workspace_mount:
+        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+
+    # Create file with BIRD instance
+    statements = f"""
+    import sqlite3
+    def execute_sql(db_path, sql):
+        with sqlite3.connect(db_path) as conn:
+            cursor = conn.cursor()
+            cursor.execute(sql)
+            result = cursor.fetchall()
+            return result
+
+    if __name__ == '__main__':
+        sql = "" # fill in your SQL here
+        db_path = "{database_path}"
+        print(db_path)
+        result = execute_sql(db_path, sql)
+        print(result)
+    """
+    path = os.path.join(
+        config.workspace_mount_path, f'{instance.task_id.replace("/", "__")}.py'
+    )
+    instruction = (
+        f'You are a SQL expert and need to complete the following text-to-SQL tasks.'
+        f'\n\n{instance.instruction}\n\n'
+        'Please write the SQL in one line without line breaks.'
+        f'And write a new python file named {instance.task_id.replace("/", "__")}.py to call the SQL you wrote.'
+        'You need to follow the code template below:'
+        f'\n\n{statements}\n\n'
+        'Environment has been set up for you to start working.'
+        'You may assume all necessary tools are installed.\n\n'
+    )
+    instruction += (
+        'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
+        'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
+    )
+    # NOTE: You can actually set slightly different instruction for different agents
+    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+    # Here's how you can run the agent (similar to the `main` function) and get the final task state
+    state: State = asyncio.run(
+        main(
+            instruction,
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
+        )
+    )
+
+    # ======= Attempt to evaluate the agent's edits =======
+    test_result = get_test_result(instance, path)
+
+    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+    if state is None:
+        raise ValueError('State should not be None.')
+    metrics = state.metrics.get() if state.metrics else None
+
+    # Save the output
+    output = {
+        'task_id': instance.task_id,
+        'instruction': instruction,
+        'metadata': metadata,
+        'history': [
+            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
+        ],
+        'metrics': metrics,
+        'error': state.error if state and state.error else None,
+        'test_result': test_result,
+    }
+    return output
+
+
+def load_bird():
+    """
+    Main function to handle the flow of downloading, processing, and loading the bird dataset.
+    """
+    raw_dataset_path = download_bird()
+    bird_dataset = process_bird(raw_dataset_path)
+    return bird_dataset
+
+
+def download_bird():
+    """
+    Downloads and extracts the bird dataset from a specified URL into a local directory.
+    """
+    dataset_path = os.path.join(config.workspace_base, 'evaluation_bird')
+    devset_path = os.path.join(dataset_path, 'dev')
+    if not os.path.exists(dataset_path):
+        logger.info(
+            f'{dataset_path} folder does not exist, starting download and extraction...'
+        )
+        os.makedirs(dataset_path, exist_ok=True)
+        download_url = 'https://bird-bench.oss-cn-beijing.aliyuncs.com/dev.zip'
+        download_path = os.path.join(dataset_path, 'dev.zip')
+        logger.info('Start Downloading...')
+        subprocess.run(['wget', download_url, '-O', download_path])
+        logger.info('Download completed.')
+        logger.info('Start Extracting...')
+        subprocess.run(['unzip', download_path, '-d', dataset_path])
+        # extract databases
+        devset_path = os.path.join(dataset_path, 'dev')
+        database_path = os.path.join(devset_path, 'dev_databases.zip')
+        subprocess.run(['unzip', database_path, '-d', devset_path])
+        logger.info('Extraction completed.')
+    else:
+        logger.info(f'{dataset_path} folder already exists.')
+    return devset_path
+
+
+def process_bird(dataset_path):
+    """
+    Processes the raw bird dataset into a structured format and saves it as JSON.
+    """
+    processed_path = os.path.join(dataset_path, 'processed_dev.json')
+    if not os.path.exists(processed_path):
+        logger.info(f'{processed_path} folder does not exist, starting processing...')
+        raw_data_path = os.path.join(dataset_path, 'dev.json')
+        database_path = os.path.join(dataset_path, 'dev_databases')
+        processed_data = []
+        with pathlib.Path(raw_data_path).open('r') as f:
+            data = json.load(f)
+            for e in tqdm(data):
+                item = {
+                    'task_id': f'{len(processed_data)}',
+                    'db_path': os.path.join(
+                        database_path, e['db_id'], f"{e['db_id']}.sqlite"
+                    ),
+                    'db_id': e['db_id'],
+                    'instruction': create_prompt(e, database_path),
+                    'SQL': e['SQL'],
+                }
+                processed_data.append(item)
+
+        with pathlib.Path(processed_path).open('w') as f:
+            json.dump(processed_data, f, indent=2)
+            logger.info(f'Processed data saved to {processed_path}')
+    else:
+        logger.info(f'{processed_path} folder already exists.')
+    bird_dataset = load_dataset('json', data_files={'test': processed_path})
+    return bird_dataset
+
+
+def extract_create_table_prompt(db_path, limit_value=0):
+    """
+    Generates a SQL prompt with CREATE TABLE statements and sample data from the database.
+    """
+    table_query = "SELECT * FROM sqlite_master WHERE type='table';"
+    tables = sqlite3.connect(db_path).cursor().execute(table_query).fetchall()
+    prompt = ''
+    for table in tables:
+        table_name = table[1]
+        create_table_statement = table[-1]
+
+        table_info_query = f'PRAGMA table_info(`{table_name}`);'
+        top_k_row_query = f'SELECT * FROM {table_name} LIMIT {limit_value};'
+        try:
+            headers = [
+                x[1]
+                for x in sqlite3.connect(db_path)
+                .cursor()
+                .execute(table_info_query)
+                .fetchall()
+            ]
+        except Exception:
+            logger.error(f'Error Connection: {table_info_query}, {top_k_row_query}')
+            exit(0)
+
+        prompt += create_table_statement + ';\n'
+        if limit_value > 0:
+            top_k_rows = (
+                sqlite3.connect(db_path).cursor().execute(top_k_row_query).fetchall()
+            )
+            prompt += (
+                f"/*\n3 example rows:\n{top_k_row_query}\n{'    '.join(headers)}\n"
+            )
+            for row in top_k_rows:
+                row = [str(x) for x in row]
+                row = [x if x is not None else '' for x in row]
+                prompt += '    '.join(row) + '\n'
+            prompt += '*/\n'
+        prompt += '\n'
+    return prompt
+
+
+def create_prompt(e, database_path):
+    """
+    Create a prompt for the given example
+    """
+    db_id = e['db_id']
+    db_path = pathlib.Path(database_path) / db_id / f'{db_id}.sqlite'
+
+    # Extract the CREATE TABLE statements and sample data from the database
+    prompt = extract_create_table_prompt(db_path)
+    prompt += f"-- External Knowledge: {e['evidence']}\n\n"
+    prompt += '-- Using valid SQLite and understanding External Knowledge, answer the following questions for the tables provided above.\n\n'
+    prompt += '-- Using valid SQLite, answer the following questions for the tables provided above.\n'
+    prompt += f"Question: {e['question']}\n"
+
+    return prompt
+
+
+# Script entry point: loads the BIRD dev set, prepares the output directory,
+# skips instances already present in output.jsonl, then evaluates the remaining
+# ones in a process pool and appends one JSON line per finished instance.
+if __name__ == '__main__':
+    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
+    # so we don't need to manage file uploading to OpenDevin's repo
+    # Due to the large size of the BIRD database, it cannot be hosted on huggingface datasets, so it needs to be downloaded
+    bird_dataset = load_bird()
+    bird_tests = bird_dataset['test'].to_pandas()
+
+    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/humanevalfix/README.md#configure-opendevin-and-your-llm
+    # for details of how to set `llm_config`
+    if args.llm_config:
+        specified_llm_config = get_llm_config_arg(args.llm_config)
+        if specified_llm_config:
+            config.llm = specified_llm_config
+    logger.info(f'Config for evaluation: {config}')
+
+    # TEST METADATA
+    agent_class = args.agent_cls
+    assert (
+        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
+    ), f'Unsupported agent class: {agent_class}'
+    # Keep only the part of the model identifier after the last '/'.
+    model_name = config.llm.model.split('/')[-1]
+    max_iterations = args.max_iterations
+    eval_note = ''
+    if args.eval_note is not None:
+        eval_note += '_N_' + args.eval_note
+    # Output layout: <eval_output_dir>/bird/<agent>/<model>_maxiter_<n>[_N_<note>]
+    eval_output_dir = os.path.join(
+        args.eval_output_dir,
+        'bird',
+        agent_class,
+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
+    )
+
+    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
+        parents=True, exist_ok=True
+    )
+    logger.info(f'Using evaluation output directory: {eval_output_dir}')
+
+    # Recorded in metadata.json and copied into every output record.
+    metadata = {
+        'agent_class': agent_class,
+        'model_name': model_name,
+        'max_iterations': max_iterations,
+        'eval_output_dir': eval_output_dir,
+        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        # get the commit id of current repo for reproducibility
+        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+        .decode('utf-8')
+        .strip(),
+    }
+    logger.info(f'Metadata: {metadata}')
+    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
+        json.dump(metadata, f)
+
+    # LIMIT EVALUATION
+    eval_n_limit = args.eval_n_limit
+    if eval_n_limit:
+        bird_tests = bird_tests.head(eval_n_limit)
+        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
+
+    # OUTPUT FILE
+    output_file = os.path.join(eval_output_dir, 'output.jsonl')
+    logger.info(f'Writing evaluation output to {output_file}')
+    # Resume support: collect the task ids already written by a previous run.
+    finished_instance_ids = set()
+    if os.path.exists(output_file):
+        with open(output_file, 'r') as f:
+            for line in f:
+                data = json.loads(line)
+                finished_instance_ids.add(data['task_id'])
+        logger.warning(
+            f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
+        )
+    # Append mode, so results of earlier runs are kept.
+    output_fp = open(output_file, 'a')
+
+    logger.info(
+        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
+    )
+
+    # =============================================
+    # filter out finished instances
+    new_bird_tests = []
+    for idx, instance in bird_tests.iterrows():
+        if instance.task_id in finished_instance_ids:
+            logger.info(
+                f'Skipping instance {instance.task_id} as it is already finished.'
+            )
+            continue
+        new_bird_tests.append(instance)
+
+    bird_tests = pd.DataFrame(new_bird_tests)
+    logger.info(
+        f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(bird_tests)}'
+    )
+    # =============================================
+
+    pbar = tqdm(total=len(bird_tests))
+
+    # This function tracks the progress AND write the output to a JSONL file
+    # It is registered below as the done-callback of every submitted future.
+    def update_progress(future):
+        pbar.update(1)
+        # NOTE(review): `future.result()` re-raises an exception from the worker,
+        # in which case nothing is written for that instance -- confirm intended.
+        output = future.result()
+        pbar.set_description(f'Instance {output["task_id"]}')
+        pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
+        logger.info(
+            f'Finished evaluation for instance {output["task_id"]}: {output["test_result"]["result"]}'
+        )
+        output_fp.write(json.dumps(output) + '\n')
+        # Flush after every record so finished results reach the file immediately.
+        output_fp.flush()
+
+    # This sets the multi-processing
+    num_workers = args.eval_num_workers
+    logger.info(f'Using {num_workers} workers for evaluation.')
+
+    try:
+        with ProcessPoolExecutor(num_workers) as executor:
+            futures = []
+            # This is how we perform multi-processing
+            for row_idx, instance in bird_tests.iterrows():
+                future = executor.submit(
+                    process_instance,
+                    instance,
+                    agent_class,
+                    metadata,
+                    skip_workspace_mount=False,
+                    # per-instance log files are only used with more than one worker
+                    reset_logger=bool(num_workers > 1),
+                )
+                future.add_done_callback(update_progress)
+                futures.append(future)
+
+            # Wait for all futures to complete
+            for future in futures:
+                future.result()
+    except KeyboardInterrupt:
+        print('KeyboardInterrupt received. Cleaning up...')
+        cleanup()
+
+    output_fp.close()
+    logger.info('Evaluation finished.')
@@ -0,0 +1,33 @@
+#!/bin/bash
+MODEL_CONFIG=$1
+AGENT=$2
+EVAL_LIMIT=$3
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
+# We need to track the version of Agent in the evaluation to make sure results are comparable
+AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+COMMAND="poetry run python evaluation/bird/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --max-iterations 5 \
+  --max-chars 10000000 \
+  --eval-num-workers 1 \
+  --eval-note $AGENT_VERSION" \
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+# Run the command
+eval $COMMAND
@@ -77,122 +77,133 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
    # we will create a workspace directory for EACH process
    # so that different agent don't interfere with each other.
    old_workspace_mount_path = config.workspace_mount_path
-    workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
-    workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
-    pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
-    config.workspace_mount_path = workspace_mount_path

-    # Setup the logger properly, so you can run multi-processing to parallize the evaluation
-    eval_output_dir = metadata['eval_output_dir']
-    if reset_logger:
-        # Set up logger
-        log_file = os.path.join(
-            eval_output_dir, 'logs', f'instance_{instance["task_id"]}.log'
+    try:
+        workspace_mount_path = os.path.join(
+            config.workspace_mount_path, '_eval_workspace'
        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        # add back the console handler to print ONE line
-        logger.addHandler(get_console_handler())
+        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
+        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+        config.workspace_mount_path = workspace_mount_path
+
+        # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+        eval_output_dir = metadata['eval_output_dir']
+        if reset_logger:
+            # Set up logger
+            log_file = os.path.join(
+                eval_output_dir, 'logs', f'instance_{instance["task_id"]}.log'
+            )
+            # Remove all existing handlers from logger
+            for handler in logger.handlers[:]:
+                logger.removeHandler(handler)
+            # add back the console handler to print ONE line
+            logger.addHandler(get_console_handler())
+            logger.info(
+                f'Starting evaluation for instance {instance["task_id"]}.\nLOG:   tail -f {log_file}'
+            )
+            # Remove all existing handlers from logger
+            for handler in logger.handlers[:]:
+                logger.removeHandler(handler)
+            file_handler = logging.FileHandler(log_file)
+            file_handler.setFormatter(
+                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+            )
+            logger.addHandler(file_handler)
+
+        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+        if instance['file_name'] != '':
+            # if this question comes with a file, we need to save it to the workspace
+            src_file = os.path.join(
+                DATASET_CACHE_DIR, '2023', metadata['data_split'], instance['file_name']
+            )
+            extension_name = instance['file_name'].split('.')[-1]
+            dest_file = os.path.join(workspace_mount_path, f'file.{extension_name}')
+            shutil.copyfile(src_file, dest_file)
+            logger.info(f'File copied to {dest_file}')
+        else:
+            dest_file = None
+
+        # Prepare instruction
+        instruction = f"{instance['Question']}\n"
+        logger.info(f'Instruction: {instruction}')
+        if dest_file:
+            instruction += f"\n\nThe mentioned file is provided in the workspace at: {dest_file.split('/')[-1]}"
+
+        instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
+        instruction += 'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
+        instruction += (
+            'For example: The answer to the question is <solution> 42 </solution>.\n'
+        )
+        # NOTE: You can actually set slightly different instruction for different agents
+        instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+        logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
+
+        # Here's how you can run the agent (similar to the `main` function) and get the final task state
+        state: State = asyncio.run(
+            main(
+                instruction,
+                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
+                    agent_class
+                ),
+            )
+        )
+        # ======= Attempt to evaluate the agent's edits =======
+        # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+        # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+
+        if state is None:
+            raise ValueError('State should not be None.')
+
+        model_answer_raw = ''
+        for act, _ in reversed(state.history):
+            if isinstance(act, CmdRunAction) and act.source == 'agent':
+                model_answer_raw = act.thought
+                break
+            elif isinstance(act, MessageAction) and act.source == 'agent':
+                model_answer_raw = act.content
+                break
+
+        # attempt to parse model_answer
+        model_answer = re.findall(r'<solution>(.*?)</solution>', model_answer_raw)
+        if len(model_answer) == 0:
+            logger.warning(f'Failed to parse model answer: {model_answer_raw}')
+            model_answer = model_answer_raw
+        else:
+            model_answer = model_answer[0]
+
        logger.info(
-            f'Starting evaluation for instance {instance["task_id"]}.\nLOG:   tail -f {log_file}'
+            f'Final message: {model_answer} | Ground truth: {instance["Final answer"]}'
        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        file_handler = logging.FileHandler(log_file)
-        file_handler.setFormatter(
-            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        score = question_scorer(
+            model_answer=model_answer, ground_truth=instance['Final answer']
        )
-        logger.addHandler(file_handler)
+        test_result = {
+            'score': score,
+            'model_answer_raw': model_answer_raw,
+            'model_answer': model_answer,
+            'ground_truth': instance['Final answer'],
+        }
+        metrics = state.metrics.get() if state.metrics else None

-    logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
-    if instance['file_name'] != '':
-        # if this question comes with a file, we need to save it to the workspace
-        src_file = os.path.join(
-            DATASET_CACHE_DIR, '2023', metadata['data_split'], instance['file_name']
-        )
-        extension_name = instance['file_name'].split('.')[-1]
-        dest_file = os.path.join(workspace_mount_path, f'file.{extension_name}')
-        shutil.copyfile(src_file, dest_file)
-        logger.info(f'File copied to {dest_file}')
-    else:
-        dest_file = None
-
-    # Prepare instruction
-    instruction = f"{instance['Question']}\n"
-    logger.info(f'Instruction: {instruction}')
-    if dest_file:
-        instruction += f"\n\nThe mentioned file is provided in the workspace at: {dest_file.split('/')[-1]}"
-
-    instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
-    instruction += 'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
-    instruction += (
-        'For example: The answer to the question is <solution> 42 </solution>.\n'
-    )
-    # NOTE: You can actually set slightly different instruction for different agents
-    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
-    logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
-
-    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    state: State = asyncio.run(
-        main(
-            instruction,
-            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
-        )
-    )
-    # ======= Attempt to evaluate the agent's edits =======
-    # If you are working on simplier benchmark that only evaluates the final model output (e.g., in a MessageAction)
-    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
-
-    if state is None:
-        raise ValueError('State should not be None.')
-
-    model_answer_raw = ''
-    for act, _ in reversed(state.history):
-        if isinstance(act, CmdRunAction) and act.source == 'agent':
-            model_answer_raw = act.thought
-            break
-        elif isinstance(act, MessageAction) and act.source == 'agent':
-            model_answer_raw = act.content
-            break
-
-    # attempt to parse model_answer
-    model_answer = re.findall(r'<solution>(.*?)</solution>', model_answer_raw)
-    if len(model_answer) == 0:
-        logger.warning(f'Failed to parse model answer: {model_answer_raw}')
-        model_answer = model_answer_raw
-    else:
-        model_answer = model_answer[0]
-
-    logger.info(
-        f'Final message: {model_answer} | Ground truth: {instance["Final answer"]}'
-    )
-    score = question_scorer(
-        model_answer=model_answer, ground_truth=instance['Final answer']
-    )
-    test_result = {
-        'score': score,
-        'model_answer_raw': model_answer_raw,
-        'model_answer': model_answer,
-        'ground_truth': instance['Final answer'],
-    }
-
-    # Save the output
-    output = {
-        'instance_id': instance['task_id'],
-        'instance': instance,
-        'instruction': instance['Question'],
-        'metadata': metadata,
-        'history': [
-            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
-        ],
-        'error': state.error if state and state.error else None,
-        'test_result': test_result,
-    }
-
-    # Close the sandbox
-    config.workspace_mount_path = old_workspace_mount_path
+        # Save the output
+        output = {
+            'instance_id': instance['task_id'],
+            'instance': instance,
+            'instruction': instance['Question'],
+            'metadata': metadata,
+            'history': [
+                (event_to_dict(action), event_to_dict(obs))
+                for action, obs in state.history
+            ],
+            'metrics': metrics,
+            'error': state.error if state and state.error else None,
+            'test_result': test_result,
+        }
+    except Exception:
+        logger.error('Process instance failed')
+        raise
+    finally:
+        config.workspace_mount_path = old_workspace_mount_path
    return output


@@ -264,7 +275,7 @@ if __name__ == '__main__':
        'max_iterations': max_iterations,
        'eval_output_dir': eval_output_dir,
        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
-        # get the commit id of current repo for reproduciblity
+        # get the commit id of current repo for reproducibility
        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
        .decode('utf-8')
        .strip(),
@@ -17,8 +17,10 @@ def normalize_number_str(number_str: str) -> float:

 def split_string(
    s: str,
-    char_list: list[str] = [',', ';'],
+    char_list: list[str] = None,
 ) -> list[str]:
+    """Split `s` on any of the single-character delimiters in `char_list`.
+
+    Args:
+        s: The string to split.
+        char_list: Delimiter characters; `[',', ';']` is used when None.
+
+    Returns:
+        The pieces of `s` found between delimiters, as returned by `re.split`.
+    """
+    if char_list is None:
+        char_list = [',', ';']
-    pattern = f"[{''.join(char_list)}]"
+    # Escape every delimiter so that characters such as ']', '^' or '-' are
+    # matched literally instead of changing the meaning of the character class.
+    pattern = f"[{''.join(re.escape(c) for c in char_list)}]"
    return re.split(pattern, s)

@@ -51,7 +53,9 @@ def question_scorer(
        # check length is the same
        if len(gt_elems) != len(ma_elems):
            warnings.warn(
-                'Answer lists have different lengths, returning False.', UserWarning
+                'Answer lists have different lengths, returning False.',
+                UserWarning,
+                stacklevel=2,
            )
            return False

@@ -140,102 +140,114 @@ def process_instance(
 ):
    old_workspace_mount_path = config.workspace_mount_path
    old_workspace_base = config.workspace_base
-    workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
-    # create process-specific workspace dir
-    # if `not skip_workspace_mount` - we will create a workspace directory for EACH process
-    # so that different agent don't interfere with each other.
-    if not skip_workspace_mount:
-        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
-        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)

-    # reset workspace to config
-    config.workspace_base = workspace_mount_path
-    config.workspace_mount_path = workspace_mount_path
-
-    # Setup the logger properly, so you can run multi-processing to parallize the evaluation
-    if reset_logger:
-        # Set up logger
-        log_file = os.path.join(
-            eval_output_dir,
-            'logs',
-            f'instance_{instance.task_id.replace("/", "__")}.log',
+    try:
+        workspace_mount_path = os.path.join(
+            config.workspace_mount_path, '_eval_workspace'
        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        # add back the console handler to print ONE line
-        logger.addHandler(get_console_handler())
-        logger.info(
-            f'Starting evaluation for instance {instance.task_id}.\nLOG:   tail -f {log_file}'
+        # create process-specific workspace dir
+        # if `not skip_workspace_mount` - we will create a workspace directory for EACH process
+        # so that different agent don't interfere with each other.
+        if not skip_workspace_mount:
+            workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
+            pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+
+        # reset workspace to config
+        config.workspace_base = workspace_mount_path
+        config.workspace_mount_path = workspace_mount_path
+
+        # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+        if reset_logger:
+            # Set up logger
+            log_file = os.path.join(
+                eval_output_dir,
+                'logs',
+                f'instance_{instance.task_id.replace("/", "__")}.log',
+            )
+            # Remove all existing handlers from logger
+            for handler in logger.handlers[:]:
+                logger.removeHandler(handler)
+            # add back the console handler to print ONE line
+            logger.addHandler(get_console_handler())
+            logger.info(
+                f'Starting evaluation for instance {instance.task_id}.\nLOG:   tail -f {log_file}'
+            )
+            # Remove all existing handlers from logger
+            for handler in logger.handlers[:]:
+                logger.removeHandler(handler)
+            file_handler = logging.FileHandler(log_file)
+            file_handler.setFormatter(
+                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+            )
+            logger.addHandler(file_handler)
+
+        if not skip_workspace_mount:
+            logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+
+        # Create file with HumanEvalFix problem
+        # Prompt reference: https://github.com/bigcode-project/bigcode-evaluation-harness/blob/84b96da31b7f840b55c5733325346176140cdb6b/bigcode_eval/tasks/humanevalpack.py#L509
+        problem_statement = (
+            instance.declaration + instance.buggy_solution + '\n' + instance.test
        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        file_handler = logging.FileHandler(log_file)
-        file_handler.setFormatter(
-            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        path = os.path.join(
+            workspace_mount_path, f'{instance.task_id.replace("/", "__")}.py'
        )
-        logger.addHandler(file_handler)
+        with open(path, 'w') as f:
+            f.write(problem_statement)

-    if not skip_workspace_mount:
-        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
-
-    # Create file with HumanEvalFix problem
-    # Prompt reference: https://github.com/bigcode-project/bigcode-evaluation-harness/blob/84b96da31b7f840b55c5733325346176140cdb6b/bigcode_eval/tasks/humanevalpack.py#L509
-    problem_statement = (
-        instance.declaration + instance.buggy_solution + '\n' + instance.test
-    )
-    path = os.path.join(
-        workspace_mount_path, f'{instance.task_id.replace("/", "__")}.py'
-    )
-    with open(path, 'w') as f:
-        f.write(problem_statement)
-
-    # Prepare instruction
-    instruction = (
-        f'Please fix the function in {instance.task_id.replace("/", "__")}.py such that all test cases pass.\n'
-        'Environment has been set up for you to start working. You may assume all necessary tools are installed.\n\n'
-        '# Problem Statement\n'
-        f'{problem_statement}\n\n'
-    )
-    instruction += (
-        'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
-        'You should NOT modify any existing test case files. If needed, you can add new test cases in a NEW file to reproduce the issue.\n'
-        'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
-    )
-    # NOTE: You can actually set slightly different instruction for different agents
-    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
-
-    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    state: State = asyncio.run(
-        main(
-            instruction,
-            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
+        # Prepare instruction
+        instruction = (
+            f'Please fix the function in {instance.task_id.replace("/", "__")}.py such that all test cases pass.\n'
+            'Environment has been set up for you to start working. You may assume all necessary tools are installed.\n\n'
+            '# Problem Statement\n'
+            f'{problem_statement}\n\n'
        )
-    )
+        instruction += (
+            'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
+            'You should NOT modify any existing test case files. If needed, you can add new test cases in a NEW file to reproduce the issue.\n'
+            'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
+        )
+        # NOTE: You can actually set slightly different instruction for different agents
+        instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')

-    # ======= Attempt to evaluate the agent's edits =======
-    test_result = get_test_result(instance, path)
+        # Here's how you can run the agent (similar to the `main` function) and get the final task state
+        state: State = asyncio.run(
+            main(
+                instruction,
+                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
+                    agent_class
+                ),
+            )
+        )

-    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
-    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
-    if state is None:
-        raise ValueError('State should not be None.')
+        # ======= Attempt to evaluate the agent's edits =======
+        test_result = get_test_result(instance, path)

-    # Save the output
-    output = {
-        'task_id': instance.task_id,
-        'instruction': instruction,
-        'metadata': metadata,
-        'history': [
-            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
-        ],
-        'error': state.error if state and state.error else None,
-        'test_result': test_result,
-    }
+        # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+        # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+        if state is None:
+            raise ValueError('State should not be None.')
+        metrics = state.metrics.get() if state.metrics else None

-    config.workspace_mount_path = old_workspace_mount_path
-    config.workspace_base = old_workspace_base
+        # Save the output
+        output = {
+            'task_id': instance.task_id,
+            'instruction': instruction,
+            'metadata': metadata,
+            'history': [
+                (event_to_dict(action), event_to_dict(obs))
+                for action, obs in state.history
+            ],
+            'metrics': metrics,
+            'error': state.error if state and state.error else None,
+            'test_result': test_result,
+        }
+    except Exception:
+        logger.error('Process instance failed')
+        raise
+    finally:
+        config.workspace_mount_path = old_workspace_mount_path
+        config.workspace_base = old_workspace_base
    return output


@@ -284,7 +296,7 @@ if __name__ == '__main__':
        'max_iterations': max_iterations,
        'eval_output_dir': eval_output_dir,
        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
-        # get the commit id of current repo for reproduciblity
+        # get the commit id of current repo for reproducibility
        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
        .decode('utf-8')
        .strip(),
@@ -0,0 +1,12 @@
+Cold(Bob, True)
+Quiet(Bob, True)
+Red(Bob, True)
+Smart(Bob, True)
+Kind(Charlie, True)
+Quiet(Charlie, True)
+Red(Charlie, True)
+Rough(Charlie, True)
+Cold(Dave, True)
+Kind(Dave, True)
+Smart(Dave, True)
+Quiet(Fiona, True)
@@ -0,0 +1,52 @@
+fact1
+	foreach
+		facts.Quiet($x, True)
+		facts.Cold($x, True)
+	assert
+		facts.Smart($x, True)
+
+fact2
+	foreach
+		facts.Red($x, True)
+		facts.Cold($x, True)
+	assert
+		facts.Round($x, True)
+
+fact3
+	foreach
+		facts.Kind($x, True)
+		facts.Rough($x, True)
+	assert
+		facts.Red($x, True)
+
+fact4
+	foreach
+		facts.Quiet($x, True)
+	assert
+		facts.Rough($x, True)
+
+fact5
+	foreach
+		facts.Cold($x, True)
+		facts.Smart($x, True)
+	assert
+		facts.Red($x, True)
+
+fact6
+	foreach
+		facts.Rough($x, True)
+	assert
+		facts.Cold($x, True)
+
+fact7
+	foreach
+		facts.Red($x, True)
+	assert
+		facts.Rough($x, True)
+
+fact8
+	foreach
+		facts.Smart(Dave, True)
+		facts.Kind(Dave, True)
+	assert
+		facts.Quiet(Dave, True)
@@ -0,0 +1,35 @@
+# Logic Reasoning Evaluation
+
+This folder contains the evaluation harness for evaluating agents on the logic reasoning benchmarks [ProntoQA](https://github.com/asaparov/prontoqa) and [ProofWriter](https://allenai.org/data/proofwriter).
+
+## Configure OpenDevin and your LLM
+
+Create a `config.toml` file if it does not exist at the root of the workspace.
+
+Add the following configurations:
+
+```toml
+[core]
+max_iterations = 100
+cache_dir = "/tmp/cache"
+ssh_hostname = "localhost"
+enable_auto_lint = true
+
+# TODO: Change these to the model you want to evaluate
+[eval_gpt4_1106_preview]
+model = "gpt-4-1106-preview"
+api_key = "XXX"
+temperature = 0.0
+
+[eval_some_openai_compatible_model]
+model = "openai/MODEL_NAME"
+base_url = "https://OPENAI_COMPATIBLE_URL/v1"
+api_key = "XXX"
+temperature = 0.0
+```
+
+## Run Inference on logic_reasoning
+The following code will run inference on the first example of the ProntoQA dataset with model gpt-4o.
+```bash
+./evaluation/logic_reasoning/scripts/run_infer.sh ProntoQA gpt-4o 1
+```
@@ -0,0 +1,19 @@
+You are a helpful assistant assigned a logic reasoning task. You need to determine the correctness of a query given some facts and rules.
+You can interact with an interactive Python (Jupyter Notebook) environment and receive the corresponding output when needed. The code should be enclosed in the "<execute_ipython>" tag.
+In this task, you need to use the code in [[logic_inference_path.py]] to help you. Specifically, you first need to instantiate a **LogicInferenceEngine** class and use the **safe_execute_program** method to prove the **logic programs**. You should receive *answer*, *flag*, *error_message* from the output.
+
+An example would look like this:
+    <execute_ipython>
+    import sys
+    sys.path.append(workspace_mount_path)
+    engine = LogicInferenceEngine(dataset_name, workspace_mount_path)
+    answer, flag, error_message = engine.safe_execute_program(logic_programs)
+    </execute_ipython>
+
+Please send the *answer* variable through message.
+
+dataset_name:
+[[dataset_name]]
+
+logic_programs:
+[[logic_programs]]
@@ -0,0 +1,220 @@
+import os
+import random
+import re
+import shutil
+
+from pyke import knowledge_engine
+
+
+class PykeProgram:
+    """Turn a textual logic program into Pyke fact/rule files and query it.
+
+    The program text is searched for the sections `Predicates:`, `Facts:`,
+    `Rules:` and `Query:`. Each section is stored on the instance under the
+    same name without the colon (`self.Facts`, `self.Rules`, ...) as a list of
+    stripped lines, or as None when that section could not be split out.
+
+    After construction `self.flag` is True when the fact and rule files were
+    written to `<workspace_mount_path>/.cache_program`, and False otherwise.
+    """
+
+    def __init__(
+        self, logic_program: str, dataset_name='ProntoQA', workspace_mount_path='./'
+    ) -> None:
+        self.logic_program = logic_program
+        # Parsing populates self.Query / self.Rules / self.Facts / self.Predicates.
+        # The flag returned here is overwritten below by the file-creation result.
+        self.flag = self.parse_logic_program()
+        self.dataset_name = dataset_name
+        self.cache_dir = os.path.join(workspace_mount_path, '.cache_program')
+
+        # prepare the files for facts and rules
+        try:
+            # Make sure the cache directory exists; without it both writes below
+            # fail and a perfectly valid program is reported as unparsable.
+            os.makedirs(self.cache_dir, exist_ok=True)
+            self.create_fact_file(self.Facts)
+            self.create_rule_file(self.Rules)
+            self.flag = True
+        except Exception:
+            self.flag = False
+
+        # dataset name -> function mapping the proved value to an answer letter
+        self.answer_map = {
+            'ProntoQA': self.answer_map_prontoqa,
+            'ProofWriter': self.answer_map_proofwriter,
+        }
+
+    def parse_logic_program(self):
+        """Split the program into its sections and return `validate_program()`.
+
+        Sections are peeled off from the end of the text (Query first), each
+        time keeping the text before the keyword for the next iteration. A
+        section that cannot be split out is stored as None.
+        """
+        keywords = ['Query:', 'Rules:', 'Facts:', 'Predicates:']
+        program_str = self.logic_program
+        for keyword in keywords:
+            attr_name = keyword[:-1]  # drop the trailing ':'
+            try:
+                program_str, segment_list = self._parse_segment(program_str, keyword)
+                setattr(self, attr_name, segment_list)
+            except Exception:
+                setattr(self, attr_name, None)
+
+        return self.validate_program()
+
+    def _parse_segment(self, program_str, key_phrase):
+        """Return (text before `key_phrase`, cleaned lines after it).
+
+        Raises ValueError (from tuple unpacking) unless `key_phrase` occurs
+        exactly once. Anything after ':::' on a line is discarded.
+        """
+        remain_program_str, segment = program_str.split(key_phrase)
+        segment_list = [
+            line.split(':::')[0].strip() for line in segment.strip().split('\n')
+        ]
+        return remain_program_str, segment_list
+
+    # check if the program is valid; if not, try to fix it
+    def validate_program(self):
+        """Return True when both Rules and Facts start with a non-empty line.
+
+        Otherwise the available statements (Facts if present, else Rules) are
+        re-sorted into rules (lines containing '>>>') and facts, and False is
+        returned.
+        """
+        if self.Rules is not None and self.Facts is not None:
+            if self.Rules[0] != '' and self.Facts[0] != '':
+                return True
+        # try to fix the program
+        statements = self.Facts if self.Facts is not None else self.Rules
+        if statements is None:
+            return False
+
+        tmp_rules = []
+        tmp_facts = []
+        for statement in statements:
+            if '>>>' in statement:  # this is a rule
+                tmp_rules.append(statement)
+            else:
+                tmp_facts.append(statement)
+        self.Rules = tmp_rules
+        self.Facts = tmp_facts
+        return False
+
+    def create_fact_file(self, facts):
+        """Write `facts` to `facts.kfb` in the cache dir, one per line.
+
+        Facts that contain the variable '$x' are treated as invalid and skipped.
+        """
+        with open(os.path.join(self.cache_dir, 'facts.kfb'), 'w') as f:
+            for fact in facts:
+                # check for invalid facts
+                if '$x' not in fact:
+                    f.write(fact + '\n')
+
+    def create_rule_file(self, rules):
+        """Convert `rules` to Pyke forward rules and write them to `rules.krb`."""
+        pyke_rules = [
+            self.parse_forward_rule(idx, rule) for idx, rule in enumerate(rules, 1)
+        ]
+
+        with open(os.path.join(self.cache_dir, 'rules.krb'), 'w') as f:
+            f.write('\n\n'.join(pyke_rules))
+
+    # example rule: Furry($x, True) && Quite($x, True) >>> White($x, True)
+    def parse_forward_rule(self, f_index, rule):
+        """Render `premise >>> conclusion` as the Pyke rule named `fact<f_index>`.
+
+        Both sides may contain several facts joined with '&&'.
+        """
+        premise, conclusion = rule.split('>>>')
+        # split the premise / conclusion into multiple facts if needed
+        premise_list = [p.strip() for p in premise.strip().split('&&')]
+        conclusion_list = [c.strip() for c in conclusion.strip().split('&&')]
+
+        # create the Pyke rule
+        pyke_rule = f"""fact{f_index}\n\tforeach"""
+        for p in premise_list:
+            pyke_rule += f"""\n\t\tfacts.{p}"""
+        pyke_rule += """\n\tassert"""
+        for c in conclusion_list:
+            pyke_rule += f"""\n\t\tfacts.{c}"""
+        return pyke_rule
+
+    def check_specific_predicate(self, subject_name, predicate_name, engine):
+        """Prove `predicate_name(subject_name, $label)` and return the label.
+
+        For example: Is Marvin from Mars?
+        Query: FromMars(Marvin, $label)
+
+        Both the `facts` and the `rules` knowledge bases are queried. Returns
+        the single label found, the logical `and` of two labels, or None when
+        there are no results (or more than two).
+        """
+        results = []
+        for kb_name in ('facts', 'rules'):
+            with engine.prove_goal(
+                f'{kb_name}.{predicate_name}({subject_name}, $label)'
+            ) as gen:
+                for vars, plan in gen:
+                    results.append(vars['label'])
+
+        if len(results) == 1:
+            return results[0]
+        if len(results) == 2:
+            return results[0] and results[1]
+        return None
+
+    def parse_query(self, query):
+        """Parse a query into (predicate name, subject, expected boolean).
+
+        Input Example: Metallic(Wren, False)
+
+        Raises:
+            ValueError: if `query` does not look like `Name(arg1, arg2)`.
+        """
+        pattern = r'(\w+)\(([^,]+),\s*([^)]+)\)'
+        match = re.match(pattern, query)
+        if not match:
+            raise ValueError(f'Invalid query: {query}')
+        function_name = match.group(1)
+        arg1 = match.group(2)
+        # Only the exact text 'True' counts as True.
+        arg2 = match.group(3) == 'True'
+        return function_name, arg1, arg2
+
+    def execute_program(self):
+        """Run the Pyke engine on the cached files and answer the first query.
+
+        Returns:
+            (answer, '') on success, or (None, exception) when anything fails.
+        """
+        # delete the compiled_krb dir
+        compiled_krb_dir = './models/compiled_krb'
+        if os.path.exists(compiled_krb_dir):
+            print('removing compiled_krb')
+            shutil.rmtree(compiled_krb_dir)
+
+        try:
+            engine = knowledge_engine.engine(self.cache_dir)
+            engine.reset()
+            engine.activate('rules')
+            engine.get_kb('facts')
+
+            # parse the logic query into pyke query
+            predicate, subject, value_to_check = self.parse_query(self.Query[0])
+            result = self.check_specific_predicate(subject, predicate, engine)
+            answer = self.answer_map[self.dataset_name](result, value_to_check)
+        except Exception as err:
+            return None, err
+
+        return answer, ''
+
+    def answer_mapping(self, answer):
+        """Return `answer` unchanged."""
+        return answer
+
+    def answer_map_prontoqa(self, result, value_to_check):
+        """Return 'A' when the proved value equals the queried one, else 'B'."""
+        if result == value_to_check:
+            return 'A'
+        else:
+            return 'B'
+
+    def answer_map_proofwriter(self, result, value_to_check):
+        """Return 'C' when nothing was proved, 'A' on a match, otherwise 'B'."""
+        if result is None:
+            return 'C'
+        elif result == value_to_check:
+            return 'A'
+        else:
+            return 'B'
+
+
+class LogicInferenceEngine:
+    """Run a logic program through `PykeProgram`, guessing when that fails."""
+
+    def __init__(self, dataset_name, workspace_mount_path):
+        self.dataset_name = dataset_name
+        self.workspace_mount_path = workspace_mount_path
+
+    def random_backup(self):
+        """Pick a random answer letter for the dataset (None if it is unknown)."""
+        options_by_dataset = {
+            'ProntoQA': ['A', 'B'],
+            'ProofWriter': ['A', 'B', 'C'],
+        }
+        options = options_by_dataset.get(self.dataset_name)
+        if options is None:
+            return None
+        return random.choice(options)
+
+    def safe_execute_program(self, logic_program):
+        """Return (answer, status, error_message) for `logic_program`.
+
+        `status` is 'parsing error', 'execution error' or 'success'; in the
+        two error cases the answer is a random guess from `random_backup`.
+        """
+        program = PykeProgram(
+            logic_program, self.dataset_name, self.workspace_mount_path
+        )
+        # cannot parse the program
+        if not program.flag:
+            return self.random_backup(), 'parsing error', ''
+
+        # execute the program
+        answer, error_message = program.execute_program()
+        if answer is None:
+            # not executable
+            return self.random_backup(), 'execution error', error_message
+
+        # successfully executed
+        return program.answer_mapping(answer), 'success', ''
@@ -0,0 +1,453 @@
+import asyncio
+import json
+import logging
+import multiprocessing as mp
+import os
+import pathlib
+import shutil
+import time
+from concurrent.futures import ProcessPoolExecutor
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from evaluation.swe_bench.swe_env_box import DockerSSHBox
+from opendevin.controller.state.state import State
+from opendevin.core.config import config, get_llm_config_arg, get_parser
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import main
+from opendevin.events.action import MessageAction
+from opendevin.events.serialization.event import event_to_dict
+
+
+def cleanup():
+    """Terminate and join every child process that is still active."""
+    logger.info('Cleaning up child processes...')
+    children = mp.active_children()
+    for child in children:
+        logger.info(f'Terminating child process: {child.name}')
+        child.terminate()
+        child.join()
+
+
+def codeact_user_response(state: State) -> str:
+    """Build the canned "user" reply sent to CodeActAgent.
+
+    The base message tells the agent to keep going and how to exit. Once the
+    history holds at least two user `MessageAction`s, a hint that the agent
+    may give up is appended.
+    """
+    msg = (
+        'Please continue working on the task on whatever approach you think is suitable.\n'
+        'If you think you have solved the task, please run the following command: <execute_bash> exit </execute_bash>.\n'
+        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
+    )
+    if not state.history:
+        return msg
+
+    user_msg_count = sum(
+        1
+        for action, _ in state.history
+        if isinstance(action, MessageAction) and action.source == 'user'
+    )
+    if user_msg_count < 2:
+        return msg
+
+    # let the agent know that it can give up after repeated user replies
+    give_up_hint = 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
+    return msg + give_up_hint
+
+
+def monologue_user_response(state: State) -> str:
+    """Placeholder reply function for MonologueAgent; always raises."""
+    reason = 'MonologueAgent should never ask for user responses.'
+    raise NotImplementedError(reason)
+
+
+# Agent class name -> function that produces the simulated user reply.
+# The keys also act as the list of supported agents: the __main__ block
+# asserts that the requested agent class is present here.
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+    'MonologueAgent': monologue_user_response,
+}
+
+# Agent class name -> extra text appended to the instruction for that agent.
+# Agents without an entry get no suffix.
+AGENT_CLS_TO_INST_SUFFIX = {
+    'CodeActAgent': 'When you think you have solved the question, please first send your answer to user through message and then exit.\n'
+}
+
+
+def get_choice(answer_str):
+    """Extract a multiple-choice letter from the start of `answer_str`.
+
+    If the text starts with one of the letters 'A'..'H', that letter is
+    returned (so 'A', 'A)' and 'A.' all yield 'A'). If it starts with ':',
+    the text with every ':' and '.' removed and surrounding whitespace
+    stripped is returned. Otherwise the result is None.
+    """
+    option_letters = tuple('ABCDEFGH')
+    if answer_str.startswith(option_letters):
+        return answer_str[0]
+
+    if not answer_str.startswith(':'):
+        return None
+    return answer_str.replace(':', '').replace('.', '').strip()
+
+
+def get_test_result(
+    model_answer: str,
+    ground_truth: str,
+) -> dict[str, bool]:
+    """Compare the model's answer with the ground truth.
+
+    Args:
+        model_answer: Raw answer text from the model; None is treated as ''.
+        ground_truth: Gold answer; parentheses and surrounding whitespace are
+            removed before comparing.
+
+    Returns:
+        `{'result': <bool>}`, True when the extracted choice equals the gold
+        answer. (The function returns a dict, not a bare bool.)
+    """
+    gold_answer = ground_truth.replace('(', '').replace(')', '').strip()
+    answer_str = model_answer if model_answer is not None else ''
+    prediction = get_choice(answer_str)
+
+    indicators = [
+        'the correct option is',
+        'the correct answer is',
+        'The correct answer is',
+        'The correct option is',
+        'Thus, the answer is',
+    ]
+    if prediction is None:
+        # No choice at the very start: retry on the text that follows the
+        # first indicator phrase found (indicators are tried in list order).
+        for indicator in indicators:
+            if indicator in answer_str:
+                answer_str = answer_str.split(indicator)[1].strip()
+                prediction = get_choice(answer_str)
+                break
+
+    isTrue = prediction == gold_answer
+    test_result = {'result': isTrue}
+    return test_result
+
+
+def process_instance(
+    instance,
+    agent_class,
+    # metadata,
+    dataset_name,
+    skip_workspace_mount,
+    eval_output_dir,
+    reset_logger: bool = True,
+):
+    """Run the agent on one logic-reasoning instance and collect the result.
+
+    Args:
+        instance: Dataset row; `id`, `raw_logic_programs` and `answer` are read.
+        agent_class: Agent class name, used to pick the fake-user-response
+            function and the instruction suffix.
+        dataset_name: Dataset name substituted into the instruction.
+        skip_workspace_mount: When False, a per-process workspace directory
+            (named after the PID) is created and used.
+        eval_output_dir: Directory whose `logs` sub-directory receives the
+            per-instance log file.
+        reset_logger: When True, the logger's handlers are replaced by a
+            file handler for this instance.
+
+    Returns:
+        A dict with the instance, instruction, event history, metrics, the
+        extracted final message, all observation contents, error and test result.
+
+    Raises:
+        ValueError: if the agent run returns no state.
+        Exception: any other failure is logged and re-raised.
+
+    The global `config` workspace paths are always restored and the sandbox,
+    once created, is always closed, including when an exception is raised.
+    """
+    old_workspace_mount_path = config.workspace_mount_path
+    old_workspace_base = config.workspace_base
+    # Created inside the try block; tracked here so `finally` can close it.
+    sandbox = None
+
+    try:
+        workspace_mount_path = os.path.join(
+            config.workspace_mount_path, '_eval_workspace'
+        )
+        # create process-specific workspace dir
+        # if `not skip_workspace_mount` - we will create a workspace directory for EACH process
+        # so that different agent don't interfere with each other.
+        if not skip_workspace_mount:
+            workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
+            pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+
+        # reset workspace to config
+        config.workspace_base = workspace_mount_path
+        config.workspace_mount_path = workspace_mount_path
+
+        # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+        if reset_logger:
+            # Set up logger
+            log_file = os.path.join(
+                eval_output_dir, 'logs', f'instance_{instance["id"]}.log'
+            )
+            # Remove all existing handlers from logger
+            for handler in logger.handlers[:]:
+                logger.removeHandler(handler)
+            # add back the console handler to print ONE line
+            logger.addHandler(get_console_handler())
+            logger.info(
+                f'Starting evaluation for instance {instance["id"]}.\nLOG:   tail -f {log_file}'
+            )
+            # Remove all existing handlers from logger
+            for handler in logger.handlers[:]:
+                logger.removeHandler(handler)
+            file_handler = logging.FileHandler(log_file)
+            file_handler.setFormatter(
+                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+            )
+            logger.addHandler(file_handler)
+
+        if not skip_workspace_mount:
+            logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+
+        logic_inference_path = os.path.join(workspace_mount_path, 'logic_inference.py')
+        if not os.path.exists(logic_inference_path):
+            shutil.copyfile(
+                './evaluation/logic_reasoning/logic_inference.py', logic_inference_path
+            )
+        logger.info(f'logic_inference.py copied to {workspace_mount_path}')
+
+        cache_dir = os.path.join(workspace_mount_path, '.cache_program')
+        if not os.path.exists(cache_dir):
+            os.makedirs(cache_dir)
+
+        # Prepare instruction
+
+        with open('./evaluation/logic_reasoning/instruction.txt', 'r') as f:
+            instruction = f.read()
+
+        instance_logic_programs = instance['raw_logic_programs'][0].strip()
+        instruction = instruction.replace('[[dataset_name]]', dataset_name)
+        instruction = instruction.replace('[[logic_programs]]', instance_logic_programs)
+        instruction = instruction.replace(
+            '[[logic_inference_path.py]]', logic_inference_path
+        )
+
+        # NOTE: You can actually set slightly different instruction for different agents
+        instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+
+        sandbox = DockerSSHBox()
+        exit_code, command_output = sandbox.execute('pip install scitools-pyke')
+        if exit_code != 0:
+            # Keep going (as before), but leave a trace: without pyke the
+            # inference engine cannot run inside the sandbox.
+            logger.warning(
+                f'pip install scitools-pyke exited with code {exit_code}: {command_output}'
+            )
+
+        # Here's how you can run the agent (similar to the `main` function) and get the final task state
+        state: State = asyncio.run(
+            main(
+                instruction,
+                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
+                    agent_class
+                ),
+                sandbox=sandbox,
+            )
+        )
+        # ======= Attempt to evaluate the agent's edits =======
+        # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+        # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+
+        if state is None:
+            raise ValueError('State should not be None.')
+
+        # Walk the history backwards and stop at the most recent observation
+        # whose content is exactly a quoted answer letter.
+        final_message = ''
+        messages = []
+        for action, obs in reversed(state.history):
+            messages.append(obs.content)
+            if str(obs.content) in ["'A'", "'B'", "'C'"]:
+                final_message = obs.content
+                break
+
+        final_message = final_message.strip("'")
+        logger.info(
+            f'Predicted answer: {final_message}, Ground truth: {instance["answer"]}'
+        )
+
+        test_result = get_test_result(
+            model_answer=final_message, ground_truth=instance['answer']
+        )
+        metrics = state.metrics.get() if state.metrics else None
+
+        # Save the output
+        output = {
+            'id': instance['id'],
+            'instance': instance,
+            'instruction': instruction,
+            # 'metadata': metadata,
+            'history': [
+                (event_to_dict(action), event_to_dict(obs))
+                for action, obs in state.history
+            ],
+            'metrics': metrics,
+            'final_message': final_message,
+            'messages': messages,
+            'error': state.error if state and state.error else None,
+            'test_result': test_result,
+        }
+    except Exception:
+        logger.error('Process instance failed')
+        raise
+    finally:
+        config.workspace_mount_path = old_workspace_mount_path
+        config.workspace_base = old_workspace_base
+        # Close the sandbox on every exit path so a failed run does not leak it.
+        if sandbox is not None:
+            sandbox.close()
+
+    return output
+
+
+if __name__ == '__main__':
+    parser = get_parser()
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        help='the logic reasoning dataset to evaluate on {ProntoQA, ProofWriter}',
+        default='ProntoQA',
+    )
+    parser.add_argument(
+        '--data_split',
+        type=str,
+        help='data split to evaluate on {validation}',  # right now we only support validation split
+        default='validation',
+    )
+
+    args, _ = parser.parse_known_args()
+    if args.directory:
+        config.workspace_base = os.path.abspath(args.directory)
+        print(f'Setting workspace base to {config.workspace_base}')
+    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
+    # so we don't need to manage file uploading to OpenDevin's repo
+
+    dataset_name = args.dataset
+    data_split = args.data_split
+    dataset = load_dataset(f'renma/{dataset_name}')
+    logic_reasoning_tests = dataset[data_split]
+    logger.info(f'Evaluating logic reasoning dataset {dataset_name} {data_split} split')
+
+    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
+    # for details of how to set `llm_config`
+    if args.llm_config:
+        specified_llm_config = get_llm_config_arg(args.llm_config)
+        if specified_llm_config:
+            config.llm = specified_llm_config
+    logger.info(f'Config for evaluation: {config}')
+
+    # TEST METADATA
+    agent_class = args.agent_cls
+    assert (
+        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
+    ), f'Unsupported agent class: {agent_class}'
+    model_name = config.llm.model.split('/')[-1]
+    max_iterations = args.max_iterations
+    eval_note = ''
+    if args.eval_note is not None:
+        eval_note += '_N_' + args.eval_note
+
+    eval_output_dir = os.path.join(
+        args.eval_output_dir,
+        'logic_reasoning',
+        agent_class,
+        dataset_name,
+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
+    )
+
+    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
+        parents=True, exist_ok=True
+    )
+    logger.info(f'Using evaluation output directory: {eval_output_dir}')
+
+    # LIMIT EVALUATION
+    eval_n_limit = args.eval_n_limit
+    if eval_n_limit:
+        logic_reasoning_tests = logic_reasoning_tests.select(list(range(eval_n_limit)))
+        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
+
+    start_time = time.strftime('%Y-%m-%d %H:%M:%S')
+
+    # OUTPUT FILE
+    output_file = os.path.join(eval_output_dir, 'output.jsonl')
+    logger.info(f'Writing evaluation output to {output_file}')
+    finished_task_ids = set()
+    if os.path.exists(output_file):
+        with open(output_file, 'r') as f:
+            for line in f:
+                data = json.loads(line)
+                finished_task_ids.add(data['id'])
+        logger.warning(
+            f'Output file {output_file} already exists. Loaded {len(finished_task_ids)} finished instances.'
+        )
+    output_fp = open(output_file, 'a')
+
+    logger.info(
+        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
+    )
+
+    # =============================================
+    # filter out finished instances
+    new_logic_reasoning_tests = []
+    for instance in logic_reasoning_tests:
+        if instance['id'] in finished_task_ids:
+            logger.info(
+                f'Skipping instance {instance["id"]} as it is already finished.'
+            )
+            continue
+        new_logic_reasoning_tests.append(instance)
+
+    logic_reasoning_tests = new_logic_reasoning_tests
+    logger.info(
+        f'Finished instances: {len(finished_task_ids)}, Remaining instances: {len(logic_reasoning_tests)}'
+    )
+    # =============================================
+
+    pbar = tqdm(total=len(logic_reasoning_tests))
+
+    # This function tracks the progress AND write the output to a JSONL file
+    def update_progress(future):
+        pbar.update(1)
+        output = future.result()
+        pbar.set_description(f'Instance {output["id"]}')
+        pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
+        logger.info(
+            f'Finished evaluation for instance {output["id"]}: {output["test_result"]["result"]}'
+        )
+        output_fp.write(json.dumps(output) + '\n')
+        # json.dump(output, output_fp, indent=4)
+        output_fp.flush()
+
+    # This sets the multi-processing
+    num_workers = args.eval_num_workers
+    # num_workers = 1
+    logger.info(f'Using {num_workers} workers for evaluation.')
+
+    # This is SWE-Bench specific - CodeActAgent doesn't require mounted workspace to work
+    skip_workspace_mount = False
+    logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
+
+    try:
+        with ProcessPoolExecutor(num_workers) as executor:
+            futures = []
+            # This is how we perform multi-processing
+            for instance in logic_reasoning_tests:
+                future = executor.submit(
+                    process_instance,
+                    instance,
+                    agent_class,
+                    dataset_name,
+                    skip_workspace_mount,
+                    eval_output_dir,
+                    reset_logger=bool(num_workers > 1),
+                )
+                future.add_done_callback(update_progress)
+                futures.append(future)
+
+            # Wait for all futures to complete
+            for future in futures:
+                future.result()
+    except KeyboardInterrupt:
+        print('KeyboardInterrupt received. Cleaning up...')
+        cleanup()
+
+    output_fp.close()
+
+    with open(output_file, 'r') as f:
+        test_result = [(json.loads(line))['test_result']['result'] for line in f]
+
+    metadata = {
+        'Dataset': dataset_name,
+        'Data split': data_split,
+        'Number of Samples': len(test_result),
+        'Agent class': agent_class,
+        'Model name': model_name,
+        'Start_time': start_time,
+        'End_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        'Final Accuracy': f'{sum(test_result)/len(test_result):.2f}',
+    }
+
+    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
+        json.dump(metadata, f, indent=4)
+
+    logger.info(f'Metadata: {json.dumps(metadata, indent=4)}')
+    logger.info(
+        f'Evaluation finished. Metadata saved to {eval_output_dir}/metadata.json'
+    )
@@ -0,0 +1,37 @@
+#!/bin/bash
+DATASET=$1
+MODEL_CONFIG=$2
+EVAL_LIMIT=$3
+AGENT=$4
+
+# ################################################################################
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
+# We need to track the version of Agent in the evaluation to make sure results are comparable
+AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+COMMAND="poetry run python evaluation/logic_reasoning/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --dataset $DATASET \
+  --max-iterations 10 \
+  --max-chars 10000000 \
+  --eval-num-workers 1 \
+  --eval-note $AGENT_VERSION"
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+# Run the command
+eval $COMMAND
@@ -0,0 +1 @@
+!requirements.txt
@@ -0,0 +1,47 @@
+# MINT Benchmark
+
+This folder contains the evaluation harness for the [MINT benchmark](https://arxiv.org/abs/2309.10691) on LLMs' ability to solve tasks with multi-turn interactions.
+
+## Configure OpenDevin and LM
+
+Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md) for how to set this up.
+
+## Start the evaluation
+
+We are using the MINT dataset hosted on [Hugging Face](https://huggingface.co/datasets/ryanhoangt/xingyaoww-mint-bench).
+
+Following is the basic command to start the evaluation. Currently, the only agent supported with MINT is `CodeActAgent`.
+
+```bash
+./evaluation/mint/scripts/run_infer.sh [model_config] [subset] [eval_limit]
+```
+
+where `model_config` is mandatory, while `subset` and `eval_limit` are optional.
+
+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your LLM settings, as defined in your `config.toml`.
+
+- `subset`, e.g. `math`, is the subset of the MINT benchmark to evaluate on, defaulting to `math`. It can be one of: `math`, `gsm8k`, `mmlu`, `theoremqa`, `mbpp`, `humaneval`.
+
+- `eval_limit`, e.g. `2`, limits the evaluation to the first `eval_limit` instances, defaulting to all instances.
+
+Note: in order to use `eval_limit`, you must also set `subset`.
+
+Let's say you'd like to run 3 instances on the `gsm8k` subset using `eval_gpt4_1106_preview`,
+then your command would be:
+
+```bash
+./evaluation/mint/scripts/run_infer.sh eval_gpt4_1106_preview gsm8k 3
+```
+
+## Reference
+
+```
+@misc{wang2024mint,
+    title={MINT: Evaluating LLMs in Multi-turn Interaction with Tools and Language Feedback},
+    author={Xingyao Wang and Zihan Wang and Jiateng Liu and Yangyi Chen and Lifan Yuan and Hao Peng and Heng Ji},
+    year={2024},
+    eprint={2309.10691},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
@@ -0,0 +1,9 @@
+# Per-subset task information, keyed by MINT subset name.
+# 'class' holds the name of the task class for the subset and 'type' its
+# broad category ('reasoning' or 'code_generation').
+TASK_INFO_MAP = {
+    # === Reasoning ===
+    'gsm8k': {'class': 'ReasoningTask', 'type': 'reasoning'},
+    'math': {'class': 'ReasoningTask', 'type': 'reasoning'},
+    'mmlu': {'class': 'MultipleChoiceTask', 'type': 'reasoning'},
+    'theoremqa': {'class': 'TheoremqaTask', 'type': 'reasoning'},
+    # === Code generation ===
+    'mbpp': {'class': 'MBPPTask', 'type': 'code_generation'},
+    'humaneval': {'class': 'HumanEvalTask', 'type': 'code_generation'},
+}
@@ -0,0 +1,86 @@
+import enum
+from typing import Any, Dict, Tuple
+
+
+class TaskState:
+    def __init__(
+        self,
+        finished: bool = False,
+        success: bool = False,
+        agent_action_count: dict = None,
+        terminate_reason: str = None,
+        latest_output: Dict[str, Any] = None,
+    ):
+        self.finished = finished
+        self.success = success
+        self.agent_action_count: Dict[str, int] = (
+            agent_action_count
+            if agent_action_count
+            else {
+                'propose_solution': 0,
+                'use_tool': 0,
+                'invalid_action': 0,
+            }
+        )
+        self.terminate_reason = terminate_reason
+        self.latest_output = latest_output
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            'finished': self.finished,
+            'success': self.success,
+            'agent_action_count': self.agent_action_count,
+            'terminate_reason': self.terminate_reason,
+            'latest_output': self.latest_output,
+        }
+
+
+class ParseError(Exception):
+    pass
+
+
+class FeedbackType(enum.Enum):
+    """Feedback modes, each identified by a string value."""
+
+    # NOTE(review): "GT" presumably stands for "ground truth" - confirm.
+    FEEDBACK_WITH_GT = 'feedback_with_gt'
+    FEEDBACK_WO_GT = 'feedback_wo_gt'
+    NO_FEEDBACK = 'no_feedback'
+
+
+class StepOutput:
+    def __init__(
+        self,
+        observation: str = None,
+        success: bool = False,
+        extra: Dict[str, Any] = None,
+        turn_info: Tuple[int, int] = None,
+    ):
+        self.observation: str = observation
+        self.success: bool = success
+        self.extra: Dict[str, Any] = extra
+        self.turn_info = turn_info
+
+    def __repr__(self) -> str:
+        return self.observation
+
+    def to_str(self) -> str:
+        output = 'Observation:\n'
+        if self.observation is not None:
+            output += self.observation + '\n'
+        else:
+            if not self.success:
+                output += 'Your answer is wrong.\n'
+
+        if self.turn_info is not None:
+            n_steps_left, n_propose_solution_left = self.turn_info
+            output += 'You have {} steps left and {} chances to propose solution left.\n'.format(
+                n_steps_left, n_propose_solution_left
+            )
+            if n_steps_left <= 1:
+                output += 'You should take the last step to propose a solution.\n'
+
+        return output
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            'observation': self.observation,
+            'success': self.success,
+        }
@@ -0,0 +1,131 @@
+import re
+import traceback
+from typing import Dict, Optional
+
+from datatypes import ParseError, StepOutput, TaskState
+from tasks.base import Task
+
+from opendevin.controller.state.state import State
+
+
+class SimplifiedEnv:
+    INVALID_INPUT_MESSAGE = (
+        "I don't understand your input. \n"
+        'If you want to execute code, please use <execute_ipython> YOUR_CODE_HERE </execute_ipython>.\n'
+        'If you want to give me an answer, please use <solution> YOUR_SOLUTION_HERE </solution>.\n'
+        'For example: The answer to the question is <solution> 42 </solution>. \n'
+    )
+
+    def __init__(self, agent_state: State, task: Task, task_config: Dict[str, int]):
+        self.agent_state = agent_state
+        self.task = task
+
+        agent_action_count = {
+            'propose_solution': 0,
+            'use_tool': 0,
+            'invalid_action': 0,
+        }
+        # check if agent_state has attribute turn_info set
+        if hasattr(self.agent_state, 'propose_solution_count'):
+            agent_action_count['propose_solution'] = (
+                self.agent_state.propose_solution_count
+            )
+
+        self.task_state = TaskState(agent_action_count=agent_action_count)
+
+        self.task_config = task_config
+
+    def step(self, lm_message: str):
+        observation = self.handle_propose_solution(lm_message)
+
+        self.check_max_iteration()
+
+        turn_info = (
+            self.task_config['max_iterations'] - self.agent_state.iteration,
+            self.task_config['max_propose_solution']
+            - self.task_state.agent_action_count['propose_solution'],
+        )
+
+        output = StepOutput(
+            observation=observation,
+            success=self.task_state.success,
+            turn_info=turn_info,
+        )
+
+        self.agent_state.propose_solution_count = self.task_state.agent_action_count[
+            'propose_solution'
+        ]
+        self.log_output(output)
+        return self.task_state
+
+    def handle_propose_solution(self, lm_message) -> Optional[str]:
+        """Propose answer to check the task success.
+
+        It might set self.state.finished = True if the task is successful.
+        """
+        self.task_state.agent_action_count['propose_solution'] += 1
+        try:
+            parsed = self.parse_propose_solution(lm_message)
+            task_success = self.check_task_success(parsed['answer'])
+            if task_success:
+                self.task_state.finished = True
+                self.task_state.success = True
+                self.task_state.terminate_reason = 'task_success'
+                # NOTE: should not return the function now, because we need to log the output
+                # Set state.finished = True will terminate the episode
+        except ParseError:
+            return SimplifiedEnv.INVALID_INPUT_MESSAGE
+        except Exception:
+            error_traceback = traceback.format_exc()
+            return f'{error_traceback}'
+
+    def parse_propose_solution(self, lm_message: str) -> dict:
+        """Define the parsing logic."""
+        lm_output = '\n' + lm_message + '\n'
+
+        answer = '\n'.join(
+            [
+                i.strip()
+                for i in re.findall(r'<solution>(.*?)</solution>', lm_output, re.DOTALL)
+            ]
+        )
+        if answer == '':
+            raise ParseError('No answer found.')
+
+        return {'answer': answer}
+
+    def log_output(self, output: StepOutput) -> None:
+        if self.task_state.finished:
+            return
+
+        content = output.to_str()
+        # self.state.history.append({"role": "user", "content": content})
+        self.task_state.latest_output = output.to_dict()
+        self.task_state.latest_output['content'] = content
+
+    def check_task_success(self, answer: str) -> bool:
+        # log_message.info(f"STUDENT ANSWER: [{answer}]")
+        # log_message.info(f"REFERENCE ANSWER: [{self.task.reference}]")
+        return self.task.success(answer)
+
+    def check_max_iteration(self):
+        """Check if the agent has reached the max iteration limit.
+
+        It might set self.state.finished = True if the agent has reached the max iteration limit.
+        """
+        if self.task_state.finished:
+            # ignore if the episode is already finished (e.g., task success)
+            return
+
+        if (
+            # propose solution > max output solution
+            self.task_state.agent_action_count['propose_solution']
+            >= self.task_config['max_propose_solution']
+        ):
+            self.task_state.finished = True
+            self.task_state.success = False
+            self.task_state.terminate_reason = 'max_propose_steps'
+        elif self.agent_state.iteration >= self.task_config['max_iterations']:
+            self.task_state.finished = True
+            self.task_state.success = False
+            self.task_state.terminate_reason = 'max_iterations'
@@ -0,0 +1,25 @@
+import os
+
+from utils import load_file
+
+PROMPT_DIR = os.path.dirname(__file__)
+TEMPLATE_WITH_TOOL = load_file(os.path.join(PROMPT_DIR, 'template_with_tool.txt'))
+
+
+class PromptTemplate:
+    """A prompt template."""
+
+    def __init__(self, template: str):
+        self.template: str = template
+
+    def __call__(self, **kwargs) -> str:
+        return self.template.format(**kwargs)
+
+
+class ToolPromptTemplate(PromptTemplate):
+    def __init__(self, use_tool: bool):
+        if use_tool:
+            template = TEMPLATE_WITH_TOOL
+        else:
+            raise NotImplementedError('Evaluation without tool is not supported yet.')
+        super().__init__(template)
@@ -0,0 +1,19 @@
+You are a helpful assistant assigned with the task of problem-solving.
+To solve the task, you can only interact with the interactive Python (Jupyter Notebook) environment using <execute_ipython> tag. Other tools cannot be used.
+At each turn, you should first provide your step-by-step thinking for solving the task. Your thought process should be enclosed using "<thought>" tag, for example: <thought> I need to print "Hello World!" </thought>.
+
+After that, you have two options:
+1) Interact with a Python programming environment and receive the corresponding output.
+2) Directly provide a solution by sending your answer to user through message that adheres to the required format for the given task. Your solution should be enclosed using "<solution>" tag, and it must be only the result, no explanation needed. For example: The answer is <solution> A </solution>.
+Either you choose to interact with the Python environment or provide a solution, you need to send a message to the user to evaluate your response and provide feedback.
+
+You have {max_total_steps} chances to interact with the environment or propose a solution. You can only propose a solution {max_propose_solution} times.
+
+---
+
+{in_context_example}
+
+---
+
+# Problem statement:
+{task_prompt}
@@ -0,0 +1,14 @@
+pandas==1.4.4
+opencv-python
+networkx
+scipy==1.10.1
+ipython
+matplotlib
+nltk
+pyyaml
+pytz
+visdom
+sympy
+seaborn
+python-dateutil
+statsmodels
@@ -0,0 +1,362 @@
+import asyncio
+import functools
+import json
+import logging
+import multiprocessing as mp
+import os
+import pathlib
+import subprocess
+import time
+from concurrent.futures import ProcessPoolExecutor
+from typing import Dict
+
+import tasks
+from config_variables import TASK_INFO_MAP
+from datasets import load_dataset
+from datatypes import TaskState
+from env import SimplifiedEnv
+from prompts import ToolPromptTemplate
+from tasks import Task
+from tqdm import tqdm
+
+from evaluation.swe_bench.swe_env_box import DockerSSHBox
+from opendevin.controller.state.state import State
+from opendevin.core.config import config, get_llm_config_arg, get_parser
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import main
+from opendevin.events.serialization.event import event_to_dict
+
+
+def cleanup():
+    print('Cleaning up child processes...')
+    for process in mp.active_children():
+        print(f'Terminating child process: {process.name}')
+        process.terminate()
+        process.join()
+
+
+def codeact_user_response(state: State, task: Task, task_config: Dict[str, int]):
+    logger.info(f'Gold reference: {task.reference}')
+    logger.info(f'Task config: {task_config}')
+
+    env = SimplifiedEnv(
+        agent_state=state,
+        task=task,
+        task_config=task_config,
+    )
+    last_action, _ = state.history[-1]
+    result_state: TaskState = env.step(last_action.message)
+
+    state.task_state = result_state
+
+    if not result_state.latest_output:
+        # Task is finished
+        msg = '/exit'
+    else:
+        msg = result_state.latest_output['content']
+
+    logger.info('User response:' + msg)
+    return msg
+
+
+def monologue_user_response(state: State) -> str:
+    raise NotImplementedError('MonologueAgent should never ask for user responses.')
+
+
+# Maps an agent class name to the function that produces the simulated user
+# reply for that agent. Only agent classes listed here can be evaluated.
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+    'MonologueAgent': monologue_user_response,
+}
+
+# Extra instruction text appended to the prompt for specific agent classes;
+# agents without an entry get no suffix.
+AGENT_CLS_TO_INST_SUFFIX = {
+    'CodeActAgent': '\nIMPORTANT: When your answer is confirmed by the user to be correct, you can exit using the following command: <execute_bash> exit </execute_bash>.\n'
+}
+
+
+def process_instance(
+    instance: Task,
+    agent_class,
+    metadata,
+    skip_workspace_mount,
+    eval_output_dir,
+    reset_logger: bool = True,
+):
+    workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
+    # create process-specific workspace dir
+    # if `not skip_workspace_mount` - we will create a workspace directory for EACH process
+    # so that different agent don't interfere with each other.
+    if not skip_workspace_mount:
+        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
+        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        # Set up logger
+        log_file = os.path.join(
+            eval_output_dir, 'logs', f'instance_{instance.task_id}.log'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        # add back the console handler to print ONE line
+        logger.addHandler(get_console_handler())
+        logger.info(
+            f'Starting evaluation for instance {instance.task_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setFormatter(
+            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        )
+        logger.addHandler(file_handler)
+
+    if not skip_workspace_mount:
+        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+
+    sandbox = DockerSSHBox()
+
+    requirements_host_src = 'evaluation/mint/requirements.txt'
+    requirements_sandbox_dest = '/opendevin/plugins/mint/requirements.txt'
+    sandbox.copy_to(
+        host_src=requirements_host_src,
+        sandbox_dest=requirements_sandbox_dest,
+        recursive=False,
+    )
+    logger.info(
+        f'Copied files from [{requirements_host_src}] to [{requirements_sandbox_dest}] inside sandbox.'
+    )
+    exit_code, output = sandbox.execute(f'pip install -r {requirements_sandbox_dest}')
+
+    # Prepare instruction
+    instruction = ToolPromptTemplate(use_tool=True)(
+        max_total_steps=metadata['max_iterations'],
+        max_propose_solution=metadata['max_propose_solution'],
+        in_context_example=instance.in_context_example(
+            use_tool=True, with_feedback=False
+        ),
+        task_prompt='Task:\n' + instance.prompt,
+    )
+    instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you or provide the concise RESULT inside <solution> tag AND NEVER ASK FOR HUMAN HELP.\n'
+
+    # NOTE: You can actually set slightly different instruction for different agents
+    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+
+    # Here's how you can run the agent (similar to the `main` function) and get the final task state
+    fake_user_response_fn = functools.partial(
+        AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
+        task=instance,
+        task_config={
+            'max_iterations': metadata['max_iterations'],
+            'max_propose_solution': metadata['max_propose_solution'],
+        },
+    )
+
+    state: State = asyncio.run(
+        main(
+            instruction,
+            fake_user_response_fn=fake_user_response_fn,
+            sandbox=sandbox,
+        )
+    )
+
+    if state is None:
+        raise ValueError('State should not be None.')
+
+    task_state = None
+    if hasattr(state, 'task_state'):
+        task_state = state.task_state
+        logger.info('Task state: ' + str(task_state.to_dict()))
+
+    metrics = state.metrics.get() if state.metrics else None
+
+    # Save the output
+    output = {
+        'id': instance.task_id,
+        'instance': instance.to_dict(),
+        'instruction': instruction,
+        'metadata': metadata,
+        'history': [
+            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
+        ],
+        'metrics': metrics,
+        'error': state.error if state and state.error else None,
+        'test_result': task_state.success if task_state else False,
+    }
+
+    # Close the sandbox
+    sandbox.close()
+
+    return output
+
+
+if __name__ == '__main__':
+    parser = get_parser()
+
+    parser.add_argument(
+        '--subset',
+        default='math',
+        choices=['math', 'gsm8k', 'mmlu', 'theoremqa', 'mbpp', 'humaneval'],
+        type=str,
+        help='subset of the dataset to be used',
+    )
+    parser.add_argument(
+        '--max-propose-solution',
+        default=2,
+        type=int,
+        help='maximum number of times the agent can propose a solution',
+    )
+
+    args, _ = parser.parse_known_args()
+
+    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
+    # so we don't need to manage file uploading to OpenDevin's repo
+    mint_dataset = load_dataset(
+        'ryanhoangt/xingyaoww-mint-bench', name=args.subset, split='test'
+    )
+    logger.info(f'Evaluating MINT - {args.subset} subset')
+
+    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
+    # for details of how to set `llm_config`
+    if args.llm_config:
+        specified_llm_config = get_llm_config_arg(args.llm_config)
+        if specified_llm_config:
+            config.llm = specified_llm_config
+    logger.info(f'Config for evaluation: {config}')
+
+    # TEST METADATA
+    agent_class = args.agent_cls
+    assert (
+        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
+    ), f'Unsupported agent class: {agent_class}'
+    model_name = config.llm.model.split('/')[-1]
+    max_iterations = args.max_iterations
+    eval_note = ''
+    if args.eval_note is not None:
+        eval_note += '_N_' + args.eval_note
+    eval_output_dir = os.path.join(
+        args.eval_output_dir,
+        'mint',
+        agent_class,
+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
+        args.subset,
+    )
+
+    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
+        parents=True, exist_ok=True
+    )
+    logger.info(f'Using evaluation output directory: {eval_output_dir}')
+
+    metadata = {
+        'agent_class': agent_class,
+        'model_name': model_name,
+        'max_iterations': max_iterations,
+        'max_propose_solution': args.max_propose_solution,
+        'eval_output_dir': eval_output_dir,
+        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        # get the commit id of current repo for reproducibility
+        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+        .decode('utf-8')
+        .strip(),
+    }
+    logger.info(f'Metadata: {metadata}')
+    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
+        json.dump(metadata, f)
+
+    # LIMIT EVALUATION
+    eval_n_limit = args.eval_n_limit
+    if eval_n_limit:
+        mint_dataset = mint_dataset.select(range(eval_n_limit))
+        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
+
+    # OUTPUT FILE
+    output_file = os.path.join(eval_output_dir, 'output.jsonl')
+    logger.info(f'Writing evaluation output to {output_file}')
+    finished_instance_ids = set()
+    if os.path.exists(output_file):
+        with open(output_file, 'r') as f:
+            for line in f:
+                data = json.loads(line)
+                finished_instance_ids.add(data['id'])
+        logger.warning(
+            f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
+        )
+    output_fp = open(output_file, 'a')
+
+    logger.info(
+        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}, max propose solution {args.max_propose_solution}.'
+    )
+
+    # =============================================
+    # filter out finished instances
+    task_class: Task = getattr(tasks, TASK_INFO_MAP[args.subset]['class'])
+    new_mint_tests: list[Task] = []
+
+    for instance in mint_dataset:
+        if instance['id'] in finished_instance_ids:
+            logger.info(
+                f'Skipping instance {instance["id"]} as it is already finished.'
+            )
+            continue
+        # convert to Task object
+        instance = task_class(**instance)
+        new_mint_tests.append(instance)
+
+    mint_dataset = new_mint_tests
+    logger.info(
+        f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(mint_dataset)}'
+    )
+    # =============================================
+
+    pbar = tqdm(total=len(mint_dataset))
+
+    # This function tracks the progress AND write the output to a JSONL file
+    def update_progress(future):
+        pbar.update(1)
+        output = future.result()
+        # logger.info('Output: ', output)
+        # pbar.set_description(f'Instance {output["instance_id"]}')
+        # pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
+        # logger.info(
+        #     f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]["result"]}'
+        # )
+        output_fp.write(json.dumps(output) + '\n')
+        output_fp.flush()
+
+    # This sets the multi-processing
+    num_workers = args.eval_num_workers
+    logger.info(f'Using {num_workers} workers for evaluation.')
+
+    # This is SWE-Bench specific - CodeActAgent doesn't require mounted workspace to work
+    skip_workspace_mount = agent_class == 'CodeActAgent'
+    logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
+
+    try:
+        with ProcessPoolExecutor(num_workers) as executor:
+            futures = []
+            # This is how we perform multi-processing
+            for instance in mint_dataset:
+                future = executor.submit(
+                    process_instance,
+                    instance,
+                    agent_class,
+                    metadata,
+                    skip_workspace_mount,
+                    eval_output_dir,
+                    reset_logger=bool(num_workers > 1),
+                )
+                future.add_done_callback(update_progress)
+                futures.append(future)
+
+            # Wait for all futures to complete
+            for future in futures:
+                future.result()
+    except KeyboardInterrupt:
+        print('KeyboardInterrupt received. Cleaning up...')
+        cleanup()
+
+    output_fp.close()
+    logger.info('Evaluation finished.')
@@ -0,0 +1,38 @@
+#!/bin/bash
+#
+# Run MINT inference (./evaluation/mint/run_infer.py) with CodeActAgent.
+#
+# Positional arguments:
+#   $1 MODEL_CONFIG  value passed to --llm-config
+#   $2 SUBSET        value passed to --subset ("math" is used when empty)
+#   $3 EVAL_LIMIT    value passed to --eval-n-limit (flag omitted when empty)
+
+MODEL_CONFIG=$1
+SUBSET=$2
+EVAL_LIMIT=$3
+# Only 'CodeActAgent' is supported for MINT now
+AGENT="CodeActAgent"
+
+# We need to track the version of Agent in the evaluation to make sure results are comparable
+AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+
+export PYTHONPATH="$(pwd)"
+
+# Build the command as an array instead of a string that is later eval'ed:
+# every argument stays exactly one word, and nothing in a user-supplied value
+# is re-parsed by the shell (no word splitting, globbing or command execution).
+# ${MODEL_CONFIG:+...} expands to nothing when MODEL_CONFIG is empty, which
+# matches what the previous unquoted string expansion did in that case.
+COMMAND=(
+  poetry run python ./evaluation/mint/run_infer.py
+  --llm-config ${MODEL_CONFIG:+"$MODEL_CONFIG"}
+  --max-iterations 5
+  --max-propose-solution 2
+  --eval-note "$AGENT_VERSION"
+)
+
+if [ -n "$SUBSET" ]; then
+  echo "SUBSET: $SUBSET"
+  COMMAND+=(--subset "$SUBSET")
+# otherwise default to use the math subset
+else
+  echo "SUBSET: math"
+  COMMAND+=(--subset math)
+fi
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND+=(--eval-n-limit "$EVAL_LIMIT")
+fi
+
+# Run the command
+"${COMMAND[@]}"
@@ -0,0 +1,12 @@
+from .base import Task
+from .codegen import HumanEvalTask, MBPPTask
+from .reasoning import MultipleChoiceTask, ReasoningTask, TheoremqaTask
+
+# Names exported by a star-import of this package; every entry is one of the
+# classes imported above from the sibling modules.
+__all__ = [
+    'Task',
+    'MultipleChoiceTask',
+    'ReasoningTask',
+    'TheoremqaTask',
+    'MBPPTask',
+    'HumanEvalTask',
+]
@@ -0,0 +1,91 @@
+import json
+import logging
+import os
+from abc import ABC, abstractmethod
+from typing import List, Optional, Tuple
+
+from utils import load_file
+
+LOGGER = logging.getLogger('MINT')
+
+
+class Task(ABC):
+    """Base class for a task instance.
+
+    Subclasses are expected to set ``_id``, ``_prompt`` and ``_reference``
+    (the ``task_id``, ``prompt`` and ``reference`` properties assert that they
+    exist) and must implement ``extract_answer`` and ``success``.
+    """
+
+    # Sub-directory of ``in_context_example_dir`` holding this task's examples.
+    task_name: str = 'base'
+    # 'in_context_examples' directory located next to this module.
+    in_context_example_dir = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)),
+        'in_context_examples',
+    )
+
+    def __init__(self, **kwargs) -> None:
+        """Store the optional history and pre-load the in-context example.
+
+        Args:
+            **kwargs: Only ``loaded_history`` is read here (``None`` when it is
+                absent); other keywords are ignored by this base class.
+        """
+        self.loaded_history = kwargs.get('loaded_history')
+        # pre-load the in-context example
+        task_dir = os.path.join(self.in_context_example_dir, self.task_name)
+        self._in_context_example = {
+            'with_tool': load_file(os.path.join(task_dir, 'with_tool.txt')),
+        }
+        self.metadata = {}
+
+    @property
+    def task_id(self) -> str:
+        """Return the task id."""
+        assert hasattr(self, '_id'), 'Task does not have an id.'
+        return self._id
+
+    def in_context_example(
+        self, use_tool: bool = True, with_feedback: bool = False
+    ) -> str:
+        """Return the in-context example for the task.
+
+        Only the ``use_tool=True, with_feedback=False`` combination is
+        available; any other combination raises ``NotImplementedError``.
+        """
+        if use_tool and not with_feedback:
+            return self._in_context_example['with_tool']
+        else:
+            raise NotImplementedError
+
+    @property
+    def prompt(self) -> str:
+        """Return the task prompt."""
+        assert hasattr(self, '_prompt'), 'Task does not have a prompt.'
+        return self._prompt
+
+    @property
+    def reference(self) -> str:
+        """Return the reference solution for the task."""
+        assert hasattr(self, '_reference'), 'Task does not have a reference solution.'
+        return self._reference
+
+    @abstractmethod
+    def extract_answer(self, solution: str) -> Optional[str]:
+        """Extract the answer from the given solution."""
+        pass
+
+    @abstractmethod
+    def success(self, solution: str) -> bool:
+        """This checks whether the given solution can complete the current task.
+
+        Can be used to provide binary feedback. The default body (reachable
+        through ``super().success``) compares the extracted answer with the
+        reference for equality.
+        """
+        answer = self.extract_answer(solution)
+        return answer == self.reference
+
+    @classmethod
+    def load_tasks(cls, path: str) -> Tuple[List['Task'], int]:
+        """Load all the tasks from a given jsonl file.
+
+        Each non-blank line is parsed as one JSON object whose keys are passed
+        as keyword arguments to ``cls``.
+
+        Args:
+            path: File path; must end with ``.jsonl`` or ``.json``.
+
+        Returns:
+            A tuple ``(tasks, number_of_tasks)``.
+        """
+        assert path.endswith('.jsonl') or path.endswith('.json')
+        with open(path, 'r') as f:
+            # Blank lines (typically a trailing newline at the end of the file)
+            # are skipped; json.loads would raise on them.
+            tasks = [cls(**json.loads(line)) for line in f if line.strip()]
+        LOGGER.info(f'Loaded {len(tasks)} tasks from {path}')
+        return tasks, len(tasks)
+
+    def to_dict(self) -> dict:
+        """Convert the task to a dictionary."""
+        return {
+            'task_name': self.task_name,
+            'task_id': self.task_id,
+            'prompt': self.prompt,
+            'reference': self.reference,
+            'metadata': self.metadata,
+        }
@@ -0,0 +1,83 @@
+import logging
+from typing import Optional
+
+from utils import check_correctness
+
+from .base import Task
+
+LOGGER = logging.getLogger('MINT')
+
+
+class CodeGenTask(Task):
+    """Code generation task whose reference is executed as test code."""
+
+    def __init__(self, id: str, prompt: str, reference: str, **kwargs):
+        """Record the task id, prompt and reference test code.
+
+        Remaining keyword arguments are forwarded to ``Task.__init__``.
+        """
+        super().__init__(**kwargs)
+        self._id, self._prompt, self._reference = id, prompt, reference
+
+    def success(self, solution: str) -> bool:
+        """Tell whether ``solution`` passes the reference test code.
+
+        The answer extracted from ``solution`` is handed to
+        ``check_correctness`` together with the reference (10 second timeout)
+        and the ``'success'`` entry of its result is returned.
+        """
+        candidate = self.extract_answer(solution)
+        LOGGER.debug(f'CODE_TO_EXEC:\n{candidate}')
+        LOGGER.debug(f'TEST_CODE:\n{self._reference}')
+        return check_correctness(
+            solution_code=candidate, test_code=self._reference, timeout=10
+        )['success']
+
+
+class MBPPTask(CodeGenTask):
+    """MBPP flavour of the code generation task."""
+
+    task_name = 'mbpp'
+
+    @property
+    def prompt(self) -> str:
+        """Return the stored prompt without its triple double-quote markers.
+
+        Every occurrence of three consecutive double quotes is removed and the
+        result is stripped of surrounding whitespace.
+        """
+        without_quotes = self._prompt.replace('"""', '')
+        return without_quotes.strip()
+
+    def extract_answer(self, solution: str) -> Optional[str]:
+        """Return ``solution`` unchanged.
+
+        A stop-word based truncation, modified from
+        https://github.com/bigcode-project/bigcode-evaluation-harness/blob/d61afde130005ecc65cf800ad8eca790a9bc2115/lm_eval/tasks/mbpp.py#L67
+        is kept below for reference but is disabled.
+        """
+        # Disabled truncation:
+        # STOP_WORDS = ["\nclass", "\nassert", '\n"""', "\nprint", "\nif", "\n<|/"]
+        # return re.split("|".join(STOP_WORDS), solution)[0].rstrip()
+        return solution
+
+
+class HumanEvalTask(CodeGenTask):
+    """HumanEval flavour of the code generation task."""
+
+    task_name = 'humaneval'
+
+    @property
+    def prompt(self) -> str:
+        """Return the stored prompt prefixed with a completion instruction."""
+        instruction = 'Complete the following code:\n\n'
+        return instruction + self._prompt
+
+    def extract_answer(self, solution: str) -> Optional[str]:
+        """Return ``solution`` unchanged.
+
+        A stop-word based truncation, modified from
+        https://github.com/bigcode-project/bigcode-evaluation-harness/blob/d61afde130005ecc65cf800ad8eca790a9bc2115/lm_eval/tasks/humaneval.py#L56
+        is kept below for reference but is disabled.
+        """
+        # Disabled truncation:
+        # STOP_WORDS = ["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif"]
+        # # Remove the last block of the code containing stop_words for HumanEval
+        # string_list = re.split("(%s)" % "|".join(STOP_WORDS), solution)
+        # # last string should be ""
+        # return "".join(string_list[:-2])
+        return solution
@@ -0,0 +1,51 @@
+Task:
+Complete the following code:
+
+from typing import Tuple
+def similar_elements(test_tup1: Tuple[int], test_tup2: Tuple[int]) -> Tuple[int]:
+    """
+    Write a function to find the similar elements from the given two tuple lists.
+    assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)
+    """
+
+Assistant:
+<thought> The similar elements here are just elements that are in both lists. I should run a solution on the test case before proposing a solution. </thought>
+<execute>
+from typing import Tuple
+def similar_elements(test_tup1: Tuple[int], test_tup2: Tuple[int]) -> Tuple[int]:
+    res = tuple(set(test_tup1) | set(test_tup2))
+    return res
+res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10))
+assert res == (4, 5), "Expected (4, 5) but got {}".format(res)
+</execute>
+
+Observation:
+---------------------------------------------------------------------------
+AssertionError                            Traceback (most recent call last)
+Cell In[4], line 5
+      3     return res
+      4 res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10))
+----> 5 assert res == (4, 5), "Expected (4, 5) but got {}".format(res)
+
+AssertionError: Expected (4, 5) but got (3, 4, 5, 6, 7, 10)
+
+Assistant:
+<thought> To find the similar elements, I should find the intersection of the two sets. I should use the & operator instead of the | operator. </thought>
+<execute>
+def similar_elements(test_tup1: Tuple[int], test_tup2: Tuple[int]) -> Tuple[int]:
+    res = tuple(set(test_tup1) & set(test_tup2))
+    return res
+res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10))
+assert res == (4, 5), "Expected (4, 5) but got {}".format(res)
+</execute>
+
+Observation:
+[Executed Successfully with No Output]
+
+Assistant:
+<thought> There is no more AssertionError. I can now submit the solution. </thought>
+<solution>
+def similar_elements(test_tup1: Tuple[int], test_tup2: Tuple[int]) -> Tuple[int]:
+    res = tuple(set(test_tup1) & set(test_tup2))
+    return res
+</solution>
@@ -0,0 +1,44 @@
+Task:
+Write a function to find the similar elements from the given two tuple lists.
+assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)
+
+Assistant:
+<thought> The similar elements here are just elements that are in both lists. I should run a solution on the test case before proposing a solution. </thought>
+<execute>
+def similar_elements(test_tup1, test_tup2):
+    res = tuple(set(test_tup1) | set(test_tup2))
+    return res
+res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10))
+assert res == (4, 5), "Expected (4, 5) but got {}".format(res)
+</execute>
+
+Observation:
+---------------------------------------------------------------------------
+AssertionError                            Traceback (most recent call last)
+Cell In[4], line 5
+      3     return res
+      4 res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10))
+----> 5 assert res == (4, 5), "Expected (4, 5) but got {}".format(res)
+
+AssertionError: Expected (4, 5) but got (3, 4, 5, 6, 7, 10)
+
+Assistant:
+<thought> To find the similar elements, I should find the intersection of the two sets. I should use the & operator instead of the | operator. </thought>
+<execute>
+def similar_elements(test_tup1, test_tup2):
+    res = tuple(set(test_tup1) & set(test_tup2))
+    return res
+res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10))
+assert res == (4, 5), "Expected (4, 5) but got {}".format(res)
+</execute>
+
+Observation:
+[Executed Successfully with No Output]
+
+Assistant:
+<thought> There is no more AssertionError. I can now submit the solution. </thought>
+<solution>
+def similar_elements(test_tup1, test_tup2):
+    res = tuple(set(test_tup1) & set(test_tup2))
+    return res
+</solution>
@@ -0,0 +1,48 @@
+Task:
+Evaluate $\int\int_C (x+y)^2 dx\,dy$, where C is the circle defined by $x^2 + y^2 = M$, and M is the birth year of Li Bai.
+Output format requirement: Round to three decimal places, using half up rounding rule.
+
+Assistant:
+<thought> To solve this problem, let's start by determining Li Bai's birth year. </thought>
+<execute_ipython> print(wikipedia_search('Li Bai')) </execute_ipython>
+
+Observation:
+Page: Li Bai
+Summary: Li Bai (Chinese: 李白; pinyin: Lǐ Bái, 701–762), also pronounced as Li Bo, courtesy name Taibai (Chinese: 太白), was a Chinese poet, acclaimed from his own time to the present as a brilliant and romantic figure who took traditional poetic forms to new heights.
+
+Assistant:
+<thought>
+From the wikipedia_search result, we can see that Li Bai was born in 701, thus M = 701.
+To evaluate the double integral of $(x+y)^2$ over the region $C$, where $C$ is the circle defined by $x^2 + y^2 = 701$, we should use polar coordinates.
+In polar coordinates, $x = r\cos(\theta)$ and $y = r\sin(\theta)$, and the differential area element $dx\,dy$ becomes $r\,dr\,d\theta$.
+$\int\int_C (x+y)^2 dx\,dy = \int_0^{2\pi}\int_0^{\sqrt{701}} (r\cos(\theta) + r\sin(\theta))^2 r\,dr\,d\theta$
+Let's use the SymPy package to calculate this.
+</thought>
+<execute_ipython>
+from sympy import symbols, integrate, cos, sin, pi
+
+r, theta = symbols('r theta')
+
+# the integrand
+integrand = (r*cos(theta) + r*sin(theta))**2 * r
+
+# perform the double integration
+double_integral = integrate(integrate(integrand, (r, 0, 701 ** 0.5)), (theta, 0, 2*pi))
+print(double_integral)
+</execute_ipython>
+
+Observation:
+Out[2]: 245700.5*pi
+
+Assistant:
+<thought> Let's round this result to three decimal places using half up rounding rule. </thought>
+<execute_ipython>
+import math
+print(round(245700.5*math.pi, 3))
+</execute_ipython>
+
+Observation:
+771890.886
+
+Assistant:
+The answer is <solution> 771890.886 </solution>.
@@ -0,0 +1,357 @@
+import ast
+import logging
+import re
+import traceback
+from typing import Any, Optional
+
+import numpy as np
+from sympy import Rational
+
+from tasks.base import Task
+
+LOGGER = logging.getLogger('MINT')
+
+
+class ReasoningTask(Task):
+    """Reasoning task graded by a 5% numeric tolerance or a substring match."""
+
+    task_name = 'reasoning'
+
+    def __init__(self, id: str, prompt: str, reference: str, **kwargs):
+        """Store the id, the stripped prompt and the normalised reference.
+
+        The reference is converted to ``str``, stripped and lower-cased.
+        Remaining keyword arguments are forwarded to ``Task.__init__``.
+        """
+        super().__init__(**kwargs)
+        self._id = id
+        self._prompt = prompt.strip()
+        normalised_reference = str(reference).strip()
+        self._reference = normalised_reference.lower()
+
+    def extract_answer(self, solution: str) -> Optional[str]:
+        """Return the solution lower-cased and stripped of outer whitespace."""
+        lowered = solution.lower()
+        return lowered.strip()
+
+    def compare_w_digits(self, reference: str, answer: str) -> bool:
+        """Compare ``reference`` and ``answer``, numerically when possible.
+
+        When both parse as floats the answer must lie within 5% of the
+        reference. When ``float()`` raises ``ValueError`` the check falls back
+        to ``reference in answer``. Any other conversion error is re-raised as
+        ``ValueError``.
+        """
+        try:
+            ref_value = float(reference)
+            ans_value = float(answer)
+        except ValueError:
+            return reference in answer
+        except Exception:
+            raise ValueError(f'Cannot compare {reference} and {answer}')
+        return abs(ref_value - ans_value) <= 0.05 * abs(ref_value)
+
+    def success(self, solution: str) -> bool:
+        """Tell whether the extracted answer matches the stored reference."""
+        return self.compare_w_digits(self._reference, self.extract_answer(solution))
+
+
+class MultipleChoiceTask(Task):
+    """Subclass of Task for multiple choice tasks.
+
+    The prompt is expected to contain an ``Options: `` section whose entries
+    look like ``<letter>) <content>`` and are separated by `` , ``.
+    """
+
+    task_name = 'reasoning'
+
+    def __init__(self, id, prompt: str, reference: str, **kwargs):
+        """Parse the options out of ``prompt`` and store the reference letter.
+
+        Args:
+            id: Task identifier.
+            prompt: Question text including the ``Options:`` section.
+            reference: Reference answer; stored stripped and lower-cased.
+            **kwargs: ``hide_options`` (default ``False``) removes the options
+                section from the stored prompt; everything is also forwarded
+                to ``Task.__init__``.
+        """
+        super().__init__(**kwargs)
+        self._id = id
+        self.hide_options = kwargs.get('hide_options', False)
+        if self.hide_options:
+            self._prompt = prompt.split('Options:')[0].strip()
+        else:
+            self._prompt = prompt
+        self._reference = reference.strip().lower()
+        self._options = self.extract_options(prompt)
+        # if all options can be converted to float, strictly perform hide options
+        # NOTE(review): only the flag is updated here; self._prompt was already
+        # assigned above and is not re-stripped - confirm this is intended.
+        try:
+            for option in self._options.values():
+                float(option)
+            self.hide_options = True
+        except ValueError:
+            pass
+        self.metadata.update({'options': self._options})
+
+    def extract_answer(self, solution: str) -> Optional[str]:
+        """Return the first option letter found in ``solution``.
+
+        Letters are tried in alphabetical order and recognised as ``x)`` or
+        ``x )``. When no letter is found the lower-cased, stripped solution is
+        returned instead.
+        """
+        solution = solution.lower().strip()
+        for letter in 'abcdefghijklmnopqrstuvwxyz':
+            if f'{letter})' in solution or f'{letter} )' in solution:
+                LOGGER.debug(f'SOLUTION {letter}')
+                return letter
+        # The fallback belongs after the loop: returning from inside it would
+        # stop at 'a' and never look at the other letters.
+        LOGGER.debug(f'SOLUTION {solution}')
+        return solution
+
+    def compare_w_digits(self, reference: str, answer: str) -> bool:
+        """Compare numerically (5% tolerance) when both strings are digits,
+        otherwise test whether ``reference`` occurs in ``answer``.
+        """
+        if reference.isdigit() and answer.isdigit():
+            return abs(float(reference) - float(answer)) <= 0.05 * float(reference)
+        else:
+            return reference in answer
+
+    def success(self, solution: str) -> bool:
+        """Tell whether ``solution`` selects the reference option.
+
+        The extracted answer is first compared with the reference letter. If
+        that fails, the option contents are used: the answer is wrong as soon
+        as it matches a wrong option, and right if it matches the correct one.
+
+        Raises:
+            KeyError: If the reference is not one of the parsed option letters.
+        """
+        answer = self.extract_answer(solution)
+        if self.compare_w_digits(self._reference, answer):
+            return True
+
+        correct_option = self._options[self._reference]
+        # Options contained in the correct option (including the correct option
+        # itself) must not count as wrong answers. Build a new list instead of
+        # removing from the one being iterated, which skips elements.
+        wrong_options = [
+            option
+            for option in self._options.values()
+            if option not in correct_option
+        ]
+        LOGGER.debug(f'OPTIONS {correct_option} {wrong_options}')
+        LOGGER.debug(f'ANSWER {answer}')
+        for option in wrong_options:
+            if self.compare_w_digits(option, answer) or (option in answer):
+                return False
+        return bool(
+            self.compare_w_digits(correct_option, answer)
+            or (correct_option in answer)
+        )
+
+    def extract_options(self, prompt: str) -> dict:
+        """Parse the ``Options: `` section of ``prompt`` into ``{letter: content}``.
+
+        Letters and contents are lower-cased; a trailing
+        "which option/one is correct?" question is removed from the content.
+        """
+        # Keep only the text after the last 'Options: ' marker
+        prompt = prompt.split('Options: ')[-1]
+        # Extract the options using the delimiter
+        options_match = prompt.split(' , ')
+        options = {}
+        for raw_option in options_match:
+            # NOTE(review): only the text between the first and second ')' is
+            # kept as content, so content containing ')' is truncated.
+            option = raw_option.strip("[]' ").split(')')
+            letter = option[0].lower().strip()
+            # The content is lower-cased first, so the trailing question has to
+            # be matched in lower case as well.
+            content = (
+                option[1]
+                .lower()
+                .strip('.')
+                .replace('. which option is correct?', '')
+                .replace('. which one is correct?', '')
+                .strip()
+            )
+            options.update({letter: content})
+        return options
+
+
+# ==== TheoremQA ====
+
+
+def compare_two_numbers(p, gt):
+    """Compare a predicted number ``p`` with the ground truth ``gt``.
+
+    Args:
+        p: Prediction. Only real ``int``/``float`` values can match; ``bool``,
+            ``str``, ``list``, ``tuple``, ``complex`` and ``dict`` never match.
+        gt: Ground truth. A ``float`` is compared through ``within_eps``; any
+            other value is compared with ``round(p)`` for equality.
+
+    Returns:
+        ``True`` when the prediction matches the ground truth.
+
+    Raises:
+        ValueError: If ``p`` is of a type not listed above.
+    """
+    # bool is a subclass of int, so it has to be rejected before the numeric
+    # check; otherwise True/False would be compared as 1/0.
+    if isinstance(p, bool):
+        return False
+    if isinstance(p, (int, float)):
+        pass
+    elif isinstance(p, (list, str, tuple, complex, dict)):
+        return False
+    else:
+        raise ValueError(p)
+
+    if isinstance(gt, float):
+        return within_eps(pred=p, gt=gt)
+    try:
+        return round(p) == gt
+    except (ValueError, OverflowError):
+        # round() rejects NaN and infinities; such a prediction cannot match.
+        return False
+
+
+def compare_two_list(pred, gt):
+    """Compare a predicted list of numbers with the ground truth list.
+
+    ``pred`` must be a ``list`` of the same length as ``gt`` that contains only
+    ``int``/``float`` items; otherwise ``False`` is returned. Both lists are
+    sorted and compared element by element with ``compare_two_numbers``.
+    """
+    if not isinstance(pred, list):
+        return False
+    if len(pred) != len(gt):
+        return False
+    if any(not isinstance(item, (int, float)) for item in pred):
+        return False
+
+    pairs = zip(sorted(pred), sorted(gt))
+    # A list (not a generator) keeps every pair evaluated, as before.
+    return all([compare_two_numbers(p, g) for p, g in pairs])
+
+
+def within_eps(pred: float, gt: float):
+    """Return True when ``pred`` lies within 4% of ``gt`` (bounds included)."""
+    tolerance = abs(gt) * 0.04
+    lower, upper = gt - tolerance, gt + tolerance
+    return bool(pred >= lower and pred <= upper)
+
+
+def parse_number_list(s: str):
+    """Parse ``s`` as a Python literal and return the resulting object.
+
+    The parsing is done by ``ast.literal_eval``; its exceptions propagate and
+    the result is not checked to actually be a list.
+    """
+    return ast.literal_eval(s)
+
+
+def is_number(string):
+    """Tell whether ``string`` is a plain decimal number.
+
+    Accepts an optional sign, digits that may use ``,`` as thousands separator
+    and an optional fractional part.
+    """
+    number_re = r'^[-+]?(\d{1,3}(,\d{3})*|(\d+))(\.\d+)?$'
+    return re.match(number_re, string) is not None
+
+
+def is_scientific_number(string):
+    """Tell whether ``string`` is a number in lower-case ``e`` notation.
+
+    The exponent may carry a ``-`` sign but not a ``+`` sign.
+    """
+    scientific_re = r'^[-+]?\d+(\.\d+)?e[-]?\d+$'
+    return re.match(scientific_re, string) is not None
+
+
+def contain_num_and_str(string):
+    """Tell whether ``string`` has at least one ASCII letter and one digit."""
+    has_letter = re.search(r'[a-zA-Z]', string) is not None
+    has_digit = re.search(r'[0-9]', string) is not None
+    return has_letter and has_digit
+
+
+class TheoremqaTask(Task):
+    """TheoremQA task answered with a number, a list of numbers, a boolean or
+    an option letter.
+    """
+
+    task_name = 'reasoning'
+
+    def __init__(self, id: str, prompt: str, reference: str, **kwargs):
+        """Store the task data and prefix the prompt with the answer format.
+
+        Args:
+            id: Task identifier.
+            prompt: Question text; stripped and prefixed with an instruction.
+            reference: Reference answer, stored as given.
+            **kwargs: ``answer_type`` selects the comparison used by
+                ``success``; everything is also forwarded to ``Task.__init__``.
+        """
+        super().__init__(**kwargs)
+        self._id = id
+        self._prompt = (
+            'Answer the following question with a number, a list of numbers or True or False. '
+            + prompt.strip()
+        )
+        self._reference = reference
+        self._answer_type = kwargs.get('answer_type')
+
+    def extract_answer(self, solution: str) -> Optional[Any]:
+        """Extract the answer from the given solution.
+
+        The text is normalised (special tokens, boolean keywords, units,
+        powers of ten, option letters) and then evaluated as a Python
+        expression.
+
+        Returns:
+            The evaluated value after common type conversions, or ``None``
+            when the normalised text cannot be evaluated.
+        """
+        # Local import: only needed for the `math.pow(...)` text produced by
+        # the `10^` rewrite below, which is later evaluated.
+        import math
+
+        prediction = solution
+        # Following the preprocessing steps from TheoremQA
+        # https://github.com/wenhuchen/TheoremQA/blob/123e36beaaa97c01f28a582f13c4f77a6822c199/predict_accuracy.py#L170
+
+        # Preprocessing the string [Stage 1]
+        if not isinstance(prediction, str):
+            prediction = str(prediction) if prediction is not None else '0'
+
+        # Replace special tokens
+        if '=' in prediction:
+            prediction = prediction.split('=')[-1].strip()
+        if '≈' in prediction:
+            prediction = prediction.split('≈')[-1].strip()
+        if '`' in prediction:
+            prediction = prediction.replace('`', '')
+        if '$' in prediction:
+            prediction = prediction.replace('$', '')
+        if '°' in prediction:
+            prediction = prediction.replace('°', '')
+
+        # Detect the boolean keyword in the generation
+        if prediction in ['true', 'yes', 'false', 'no']:
+            if prediction == 'true' or prediction == 'yes':
+                prediction = 'True'
+            else:
+                prediction = 'False'
+        if 'True' in prediction or 'False' in prediction:
+            prediction = 'True' if 'True' in prediction else 'False'
+
+        # Detect the approximation keyword
+        if 'approximately' in prediction:
+            prediction = prediction.replace('approximately', '').strip()
+        if ' or ' in prediction:
+            prediction = prediction.split(' or ')[0]
+
+        # Drop the units before and after the number
+        if re.match(r'[-+]?(?:[\d,]*\.*\d+) [^0-9 ]+$', prediction):
+            prediction = re.search(
+                r'([-+]?(?:[\d,]*\.*\d+)) [^0-9 ]+$', prediction
+            ).group(1)
+        if re.match(r'[^0-9 ]+ [-+]?(?:[\d,]*\.*\d+)$', prediction):
+            prediction = re.search(
+                r'[^0-9 ]+ ([-+]?(?:[\d,]*\.*\d+))$', prediction
+            ).group(1)
+        if re.match(r'[-+]?(?:[\d,]*\.*\d+)[^\d]{1,2}$', prediction):
+            prediction = re.search(
+                r'([-+]?(?:[\d,]*\.*\d+))[^\d]{1,2}$', prediction
+            ).group(1)
+        if re.match(r'[^-+\d]{1,2}(?:[\d,]*\.*\d+)$', prediction):
+            prediction = re.search(
+                r'[^-+\d]{1,2}((?:[\d,]*\.*\d+))$', prediction
+            ).group(1)
+
+        # Preprocessing the number [Stage 1]
+        if '10^' in prediction:
+            prediction = re.sub(r'10\^(-?\d+)', r'math.pow(10, \1)', prediction)
+        if ' x ' in prediction:
+            prediction = prediction.replace(' x ', '*')
+        if ' × ' in prediction:
+            prediction = prediction.replace(' × ', '*')
+        if is_number(prediction):
+            prediction = prediction.replace(',', '')
+
+        # Preprocessing the option [Stage 3]
+        # Checked in order a, b, c, d; a later letter overrides an earlier one
+        # only if it still matches the rewritten text.
+        for letter in 'abcd':
+            if (
+                f'{letter})' in prediction
+                or f'{letter} )' in prediction
+                or prediction.lower().strip() == letter
+            ):
+                prediction = f'({letter})'
+
+        # Quote the option so that evaluating it yields the string '(x)'
+        option_match = re.search(r'\([a-d]\)', prediction)
+        if option_match:
+            prediction = '"' + option_match.group(0) + '"'
+
+        # If the prediction is empty, use dummy '0'
+        if not prediction:
+            prediction = '0'
+
+        # Converting the string answer to a number/list/bool/option
+        # NOTE(review): `prediction` is derived from model output and eval()
+        # executes arbitrary expressions - confirm this only runs in a
+        # sandboxed evaluation environment.
+        # `math` is added to a copy of the module globals so that the
+        # `math.pow(10, n)` text generated above can be evaluated.
+        try:
+            prediction = eval(prediction, {**globals(), 'math': math})
+        except Exception:
+            LOGGER.warning(
+                f'[TASK] Failed to convert the answer: {prediction}\n{traceback.format_exc()}'
+            )
+            return None  # failed to convert the answer
+
+        # Performing common type conversion
+        if isinstance(prediction, (set, tuple)):
+            prediction = list(prediction)
+            # `prediction and ...` guards the empty tuple/set case, where
+            # indexing the first element would raise IndexError.
+            if prediction and isinstance(prediction[0], complex):
+                prediction = [tmp.real for tmp in prediction]
+            elif prediction and isinstance(prediction[0], Rational):
+                prediction = [float(tmp) for tmp in prediction]
+        elif isinstance(prediction, np.ndarray):
+            prediction = prediction.tolist()
+        else:
+            if isinstance(prediction, complex):
+                prediction = prediction.real
+            elif isinstance(prediction, Rational):
+                prediction = float(prediction)
+
+        return prediction
+
+    def success(self, solution: str) -> bool:
+        """This checks whether the given solution can complete the current task.
+
+        Both the solution and the reference are passed through
+        ``extract_answer`` and compared according to the task's answer type.
+        An unknown answer type, or a prediction that is not a ``str``, ``int``,
+        ``float`` or ``list``, counts as a failure.
+        """
+        # Follow the implementation from TheoremQA
+        # https://github.com/wenhuchen/TheoremQA/blob/123e36beaaa97c01f28a582f13c4f77a6822c199/predict_accuracy.py#L301C9-L317C1
+        prediction = self.extract_answer(solution)
+        LOGGER.info(f'TheoremQA Parsed Prediction: {prediction}')
+        answer_type = self._answer_type
+        gt = self.extract_answer(self.reference)
+
+        # Default to "incorrect" so that an answer type matching none of the
+        # branches below cannot leave the result unbound.
+        cur_correct = 0
+        if isinstance(prediction, (str, int, float, list)):
+            # Comparing prediction against the reference
+            if answer_type in ['bool', 'option', 'Option']:
+                cur_correct = int(prediction == f'({gt})') or int(prediction == gt)
+            elif answer_type == 'integer':
+                cur_correct = int(compare_two_numbers(prediction, gt))
+            elif answer_type == 'float':
+                cur_correct = int(compare_two_numbers(prediction, gt))
+            elif answer_type in ['list of integer', 'list of float']:
+                cur_correct = int(compare_two_list(prediction, gt))
+        return bool(cur_correct)
--- a/Show More
+++ b/Show More