chore(deps-dev): bump jupyterlab from 4.2.3 to 4.2.4 (#3028 )

Bumps [jupyterlab](https://github.com/jupyterlab/jupyterlab) from 4.2.3 to 4.2.4. - [Release notes](https://github.com/jupyterlab/jupyterlab/releases) - [Changelog](https://github.com/jupyterlab/jupyterlab/blob/@jupyterlab/lsp@4.2.4/CHANGELOG.md) - [Commits](https://github.com/jupyterlab/jupyterlab/compare/@jupyterlab/lsp@4.2.3...@jupyterlab/lsp@4.2.4) --- updated-dependencies: - dependency-name: jupyterlab dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
chore(deps): bump i18next from 23.12.1 to 23.12.2 in /frontend (#3020 )
2026-04-29 03:00:45 -04:00 · 2024-07-20 00:08:24 +08:00 · 2024-07-20 00:08:05 +08:00 · 2024-07-20 00:07:48 +08:00 · 2024-07-20 00:07:28 +08:00 · 2024-07-20 00:07:12 +08:00
172 changed files with 4957 additions and 2937 deletions
--- a/.github/ISSUE_TEMPLATE/bug_template.yml
+++ b/.github/ISSUE_TEMPLATE/bug_template.yml
@@ -12,7 +12,7 @@ body:
      label: Is there an existing issue for the same bug?
      description: Please check if an issue already exists for the bug you encountered.
      options:
-      - label: I have checked the troubleshooting document at https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting
+      - label: I have checked the troubleshooting document at https://docs.all-hands.dev/modules/usage/troubleshooting
        required: true
      - label: I have checked the existing issues.
        required: true
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,5 +1,8 @@
 **What is the problem that this fixes or functionality that this introduces? Does it fix any open issues?**

-**Give a brief summary of what the PR does, explaining any non-trivial design decisions**
+---

+**Give a summary of what the PR does, explaining any non-trivial design decisions**
+
+---
 **Other references**
--- a/.github/workflows/review-pr.yml
+++ b/.github/workflows/review-pr.yml
@@ -12,15 +12,15 @@ jobs:
  dogfood:
    if: contains(github.event.pull_request.labels.*.name, 'review-this')
    runs-on: ubuntu-latest
-    container:
-      image: ghcr.io/opendevin/opendevin
-      volumes:
-        - /var/run/docker.sock:/var/run/docker.sock
-
    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.11'
    - name: install git, github cli
      run: |
-        apt-get install -y git gh
+        sudo apt-get install -y git gh
        git config --global --add safe.directory $PWD

    - name: Checkout Repository
@@ -34,7 +34,9 @@ jobs:

    - name: Write Task File
      run: |
-        echo "Your coworker wants to apply a pull request to this project. Read and review ${{ github.event.pull_request.number }}.diff file. Create a review-${{ github.event.pull_request.number }}.txt and write your concise comments and suggestions there." > task.txt
+        echo "Your coworker wants to apply a pull request to this project." > task.txt
+        echo "Read and review ${{ github.event.pull_request.number }}.diff file. Create a review-${{ github.event.pull_request.number }}.txt and write your concise comments and suggestions there." >> task.txt
+        echo "Do not ask me for confirmation at any point." >> task.txt
        echo "" >> task.txt
        echo "Title" >> task.txt
        echo "${{ github.event.pull_request.title }}" >> task.txt
@@ -53,15 +55,17 @@ jobs:

    - name: Run OpenDevin
      env:
-        LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+        LLM_MODEL: ${{ vars.LLM_MODEL }}
        SANDBOX_BOX_TYPE: ssh
      run: |
        # Append path to launch poetry
        export PATH="/github/home/.local/bin:$PATH"
        # Append path to correctly import package, note: must set pwd at first
        export PYTHONPATH=$(pwd):$PYTHONPATH
-        WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE poetry run python ./opendevin/core/main.py -i 50 -f task.txt -d $GITHUB_WORKSPACE
+        export WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE
+        export WORKSPACE_BASE=$GITHUB_WORKSPACE
+        echo -e "/exit\n" | poetry run python opendevin/core/main.py -i 50 -f task.txt
        rm task.txt

    - name: Check if review file is non-empty
--- a/.github/workflows/run-runtime-tests.yml
+++ b/.github/workflows/run-runtime-tests.yml
@@ -0,0 +1,67 @@
+# Runs tests/unit/test_runtime.py in a dedicated job and uploads coverage to Codecov.
+name: Run Runtime Tests
+
+# One active run per workflow and ref; newer runs cancel older ones, except on main.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
+on:
+  push:
+    branches:
+      - main
+    paths-ignore:
+      - '**/*.md'
+      - 'frontend/**'
+      - 'docs/**'
+      - 'evaluation/**'
+  pull_request:
+
+# Workflow-level env is inherited by every job, so it is declared only once here.
+env:
+  PERSIST_SANDBOX: "false"
+
+jobs:
+  test-for-runtime:
+    name: Test for Runtime
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Free Disk Space (Ubuntu)
+        # NOTE(review): @main is a moving ref; consider pinning to a release tag or commit SHA
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # when set to "true" but frees about 6 GB
+          tool-cache: true
+
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          swap-storage: true
+
+      # Poetry is installed before setup-python so that cache: "poetry" can find it.
+      - name: Install poetry via pipx
+        run: pipx install poetry
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: "poetry"
+
+      - name: Install Python dependencies using Poetry
+        run: make install-python-dependencies
+
+      - name: Run runtime tests
+        run: |
+          TEST_IN_CI=true poetry run pytest --cov=agenthub --cov=opendevin --cov-report=xml -s ./tests/unit/test_runtime.py
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v4
+        env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
--- a/.github/workflows/run-unit-tests.yml
+++ b/.github/workflows/run-unit-tests.yml
@@ -86,7 +86,31 @@ jobs:
          brew services start colima
          brew install docker
          colima delete
-          colima start  --network-address --arch x86_64 --cpu=1 --memory=1
+          # Attempt to start Colima
+          ATTEMPT_LIMIT=3
+
+          start_colima() {
+            colima start --network-address --arch x86_64 --cpu=1 --memory=1
+          }
+
+          for ((i=1; i<=ATTEMPT_LIMIT; i++)); do
+            if start_colima; then
+              echo "Colima started successfully."
+              break
+            else
+              echo "Failed to start Colima. Attempt $i/$ATTEMPT_LIMIT."
+              if [ $i -eq $ATTEMPT_LIMIT ]; then
+                colima delete
+              else
+                colima stop
+              fi
+            fi
+          done
+
+          if [ $i -gt $ATTEMPT_LIMIT ]; then
+            echo "Failed to start Colima after $ATTEMPT_LIMIT attempts."
+            exit 1
+          fi

          # For testcontainers to find the Colima socket
          # https://github.com/abiosoft/colima/blob/main/docs/FAQ.md#cannot-connect-to-the-docker-daemon-at-unixvarrundockersock-is-the-docker-daemon-running
@@ -96,7 +120,7 @@ jobs:
        run: make build

      - name: Run Tests
-        run: poetry run pytest --forked --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox"
+        run: poetry run pytest --forked --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox and not test_runtime"

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
@@ -130,7 +154,7 @@ jobs:
        run: make build

      - name: Run Tests
-        run: poetry run pytest --forked --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox"
+        run: poetry run pytest --forked --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox and not test_runtime"

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,13 +1,13 @@
 # Contributing

-Thanks for your interest in contributing to OpenDevin! We welcome and appreciate contributions. 
+Thanks for your interest in contributing to OpenDevin! We welcome and appreciate contributions.

 ## How Can I Contribute?

 There are many ways that you can contribute:

 1. **Download and use** OpenDevin, and send [issues](https://github.com/OpenDevin/OpenDevin/issues) when you encounter something that isn't working or a feature that you'd like to see.
-2. **Send feedback** after each session by [clicking the thumbs-up thumbs-down buttons](https://opendevin.github.io/OpenDevin/modules/usage/feedback), so we can see where things are working and failing, and also build an open dataset for training code agents.
+2. **Send feedback** after each session by [clicking the thumbs-up thumbs-down buttons](https://docs.all-hands.dev/modules/usage/feedback), so we can see where things are working and failing, and also build an open dataset for training code agents.
 3. **Improve the Codebase** by sending PRs (see details below). In particular, we have some [good first issue](https://github.com/OpenDevin/OpenDevin/labels/good%20first%20issue) issues that may be ones to start on.

 ## Understanding OpenDevin's CodeBase
@@ -83,7 +83,7 @@ git push origin my_branch
   - Set `base repository` to `OpenDevin/OpenDevin`
   - Set `base` to `main`
   - Click `Create pull request`
-  
+
 The PR should appear in [OpenDevin PRs](https://github.com/OpenDevin/OpenDevin/pulls).

 Then the OpenDevin team will review your code.
@@ -114,4 +114,3 @@ You may also check out previous PRs in the [PR list](https://github.com/OpenDevi
 ### 2. Pull Request description
 - If your PR is small (such as a typo fix), you can go brief.
 - If it contains a lot of changes, it's better to write more details.
-
--- a/Development.md
+++ b/Development.md
@@ -39,18 +39,18 @@ make build
 OpenDevin supports a diverse array of Language Models (LMs) through the powerful [litellm](https://docs.litellm.ai) library. By default, we've chosen the mighty GPT-4 from OpenAI as our go-to model, but the world is your oyster! You can unleash the potential of Anthropic's suave Claude, the enigmatic Llama, or any other LM that piques your interest.

 To configure the LM of your choice, run:
-       
+
   ```bash
   make setup-config
   ```
-   
+
   This command will prompt you to enter the LLM API key, model name, and other variables ensuring that OpenDevin is tailored to your specific needs. Note that the model name will apply only when you run headless. If you use the UI, please set the model in the UI.
-   
+
   Note: If you have previously run OpenDevin using the docker command, you may have already set some environmental variables in your terminal. The final configurations are set from highest to lowest priority:
   Environment variables > config.toml variables > default variables

 **Note on Alternative Models:**
-Some alternative models may prove more challenging to tame than others. Fear not, brave adventurer! We shall soon unveil LLM-specific documentation to guide you on your quest. 
+Some alternative models may prove more challenging to tame than others. Fear not, brave adventurer! We shall soon unveil LLM-specific documentation to guide you on your quest.
 And if you've already mastered the art of wielding a model other than OpenAI's GPT, we encourage you to share your setup instructions with us by creating instructions and adding it [to our documentation](https://github.com/OpenDevin/OpenDevin/tree/main/docs/modules/usage/llms).

 For a full list of the LM providers and models available, please consult the [litellm documentation](https://docs.litellm.ai/docs/providers).
@@ -84,10 +84,11 @@ make help
 ```

 ### 8. Testing
+To run tests, refer to the following:
 #### Unit tests

 ```bash
-poetry run pytest ./tests/unit/test_sandbox.py
+poetry run pytest ./tests/unit/test_*.py
 ```

 #### Integration tests
--- a/ISSUE_TRIAGE.md
+++ b/ISSUE_TRIAGE.md
@@ -0,0 +1,25 @@
+# Issue Triage
+These are the procedures and guidelines on how issues are triaged in this repo by the maintainers.
+
+## General
+* Most issues must be tagged with **enhancement** or **bug**
+* Issues may be tagged with the area they relate to (**backend**, **frontend**, **agent quality**, etc.)
+
+## Severity
+* **Low**: Minor issues, single user report
+* **Medium**: Affecting multiple users
+* **Critical**: Affecting all users or potential security issues
+
+## Effort
+* Issues may be estimated with effort required (**small effort**, **medium effort**, **large effort**)
+
+## Difficulty
+* Issues with low implementation difficulty may be tagged with **good first issue**
+
+## Not Enough Information
+* User is asked to provide more information (logs, how to reproduce, etc.) when the issue is not clear
+* If an issue is unclear and the author does not provide more information or respond to a request, the issue may be closed as **not planned** (usually after a week)
+
+## Multiple Requests/Fixes in One Issue
+* These issues will be narrowed down to one request/fix so the issue is more easily tracked and fixed
+* Issues may be broken down into multiple issues if required
--- a/Makefile
+++ b/Makefile
@@ -162,11 +162,8 @@ install-frontend-dependencies:
 	@echo "$(YELLOW)Setting up frontend environment...$(RESET)"
 	@echo "$(YELLOW)Detect Node.js version...$(RESET)"
 	@cd frontend && node ./scripts/detect-node-version.js
-	@cd frontend && \
-		echo "$(BLUE)Installing frontend dependencies with npm...$(RESET)" && \
-		npm install && \
-		echo "$(BLUE)Running make-i18n with npm...$(RESET)" && \
-		npm run make-i18n
+	@echo "$(BLUE)Installing frontend dependencies with npm...$(RESET)"
+	@cd frontend && npm install
 	@echo "$(GREEN)Frontend dependencies installed successfully.$(RESET)"

 install-pre-commit-hooks:
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@
 <div align="center">
  <img src="./docs/static/img/logo.png" alt="Logo" width="200" height="200">
  <h1 align="center">OpenDevin: Code Less, Make More</h1>
-  <a href="https://opendevin.github.io/OpenDevin/modules/usage/intro"><img src="https://img.shields.io/badge/Documentation-OpenDevin-blue?logo=googledocs&logoColor=white&style=for-the-badge" alt="Check out the documentation"></a>
+  <a href="https://docs.all-hands.dev/modules/usage/intro"><img src="https://img.shields.io/badge/Documentation-OpenDevin-blue?logo=googledocs&logoColor=white&style=for-the-badge" alt="Check out the documentation"></a>
  <a href="https://huggingface.co/spaces/OpenDevin/evaluation"><img src="https://img.shields.io/badge/Evaluation-Benchmark%20on%20HF%20Space-green?style=for-the-badge" alt="Evaluation Benchmark"></a>
 </div>
 <hr>
@@ -45,7 +45,7 @@ OpenDevin agents collaborate with human developers to write code, fix bugs, and
 ![App screenshot](./docs/static/img/screenshot.png)

 ## ⚡ Getting Started
-OpenDevin works best with the most recent version of Docker, `26.0.0`.
+OpenDevin works best with Docker version 26.0.0+ (Docker Desktop 4.31.0+).
 You must be using Linux, Mac OS, or WSL on Windows.

 To start OpenDevin in a docker container, run the following commands in your terminal:
@@ -71,7 +71,7 @@ docker run -it \
 > By default, this command pulls the `latest` tag, which represents the most recent release of OpenDevin. You have other options as well:
 > - For a specific release version, use `ghcr.io/opendevin/opendevin:<OpenDevin_version>` (replace <OpenDevin_version> with the desired version number).
 > - For the most up-to-date development version, use `ghcr.io/opendevin/opendevin:main`. This version may be **(unstable!)** and is recommended for testing or development purposes only.
-> 
+>
 > Choose the tag that best suits your needs based on stability requirements and desired features.

 You'll find OpenDevin running at [http://localhost:3000](http://localhost:3000) with access to `./workspace`. To have OpenDevin operate on your code, place it in `./workspace`.
@@ -82,12 +82,12 @@ the `Settings` button (gear icon) in the UI. If the required `Model` does not ex

 For the development workflow, see [Development.md](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md).

-Are you having trouble? Check out our [Troubleshooting Guide](https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting).
+Are you having trouble? Check out our [Troubleshooting Guide](https://docs.all-hands.dev/modules/usage/troubleshooting).

 ## 🚀 Documentation

 To learn more about the project, and for tips on using OpenDevin,
-**check out our [documentation](https://opendevin.github.io/OpenDevin/modules/usage/intro)**.
+**check out our [documentation](https://docs.all-hands.dev/modules/usage/intro)**.

 There you'll find resources on how to use different LLM providers (like ollama and Anthropic's Claude),
 troubleshooting resources, and advanced configuration options.
--- a/agenthub/browsing_agent/browsing_agent.py
+++ b/agenthub/browsing_agent/browsing_agent.py
@@ -99,8 +99,7 @@ class BrowsingAgent(Agent):
        self,
        llm: LLM,
    ) -> None:
-        """
-        Initializes a new instance of the BrowsingAgent class.
+        """Initializes a new instance of the BrowsingAgent class.

        Parameters:
        - llm (LLM): The llm to be used by this agent
@@ -120,16 +119,13 @@ class BrowsingAgent(Agent):
        self.reset()

    def reset(self) -> None:
-        """
-        Resets the Browsing Agent.
-        """
+        """Resets the Browsing Agent."""
        super().reset()
        self.cost_accumulator = 0
        self.error_accumulator = 0

    def step(self, state: State) -> Action:
-        """
-        Performs one step using the Browsing Agent.
+        """Performs one step using the Browsing Agent.
        This includes gathering information on previous steps and prompting the model to make a browsing command to execute.

        Parameters:
--- a/agenthub/browsing_agent/prompt.py
+++ b/agenthub/browsing_agent/prompt.py
@@ -75,7 +75,8 @@ class PromptElement:
    Prompt elements are used to build the prompt. Use flags to control which
    prompt elements are visible. We use class attributes as a convenient way
    to implement static prompts, but feel free to override them with instance
-    attributes or @property decorator."""
+    attributes or @property decorator.
+    """

    _prompt = ''
    _abstract_ex = ''
@@ -200,11 +201,10 @@ def fit_tokens(
    model_name : str, optional
        The name of the model used when tokenizing.

-    Returns
+    Returns:
    -------
    str : the prompt after shrinking.
    """
-
    if max_prompt_chars is None:
        return shrinkable.prompt

@@ -579,8 +579,8 @@ the form is not visible yet or some fields are disabled. I need to replan.
 def diff(previous, new):
    """Return a string showing the difference between original and new.

-    If the difference is above diff_threshold, return the diff string."""
-
+    If the difference is above diff_threshold, return the diff string.
+    """
    if previous == new:
        return 'Identical', []

--- a/agenthub/browsing_agent/response_parser.py
+++ b/agenthub/browsing_agent/response_parser.py
@@ -37,9 +37,8 @@ class BrowsingResponseParser(ResponseParser):


 class BrowsingActionParserMessage(ActionParser):
-    """
-    Parser action:
-        - BrowseInteractiveAction(browser_actions) - unexpected response format, message back to user
+    """Parser action:
+    - BrowseInteractiveAction(browser_actions) - unexpected response format, message back to user
    """

    def __init__(
@@ -60,9 +59,8 @@ class BrowsingActionParserMessage(ActionParser):


 class BrowsingActionParserBrowseInteractive(ActionParser):
-    """
-    Parser action:
-        - BrowseInteractiveAction(browser_actions) - handle send message to user function call in BrowserGym
+    """Parser action:
+    - BrowseInteractiveAction(browser_actions) - handle send message to user function call in BrowserGym
    """

    def __init__(
--- a/agenthub/browsing_agent/utils.py
+++ b/agenthub/browsing_agent/utils.py
@@ -7,7 +7,6 @@ import yaml

 def yaml_parser(message):
    """Parse a yaml message for the retry function."""
-
    # saves gpt-3.5 from some yaml parsing errors
    message = re.sub(r':\s*\n(?=\S|\n)', ': ', message)

@@ -47,7 +46,6 @@ def _compress_chunks(text, identifier, skip_list, split_regex='\n\n+'):

 def compress_string(text):
    """Compress a string by replacing redundant paragraphs and lines with identifiers."""
-
    # Perform paragraph-level compression
    def_dict, compressed_text = _compress_chunks(
        text, identifier='§', skip_list=[], split_regex='\n\n+'
@@ -79,12 +77,12 @@ def extract_html_tags(text, keys):
    keys : list of str
        The HTML tags to extract the content from.

-    Returns
+    Returns:
    -------
    dict
        A dictionary mapping each key to a list of subset in `text` that match the key.

-    Notes
+    Notes:
    -----
    All text and keys will be converted to lowercase before matching.

@@ -126,7 +124,7 @@ def parse_html_tags(text, keys=(), optional_keys=(), merge_multiple=False):
    optional_keys : list of str
        The HTML tags to extract the content from, but are optional.

-    Returns
+    Returns:
    -------
    dict
        A dictionary mapping each key to subset of `text` that match the key.
--- a/agenthub/codeact_agent/action_parser.py
+++ b/agenthub/codeact_agent/action_parser.py
@@ -12,13 +12,12 @@ from opendevin.events.action import (


 class CodeActResponseParser(ResponseParser):
-    """
-    Parser action:
-        - CmdRunAction(command) - bash command to run
-        - IPythonRunCellAction(code) - IPython code to run
-        - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
-        - MessageAction(content) - Message action to run (e.g. ask for clarification)
-        - AgentFinishAction() - end the interaction
+    """Parser action:
+    - CmdRunAction(command) - bash command to run
+    - IPythonRunCellAction(code) - IPython code to run
+    - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
+    - MessageAction(content) - Message action to run (e.g. ask for clarification)
+    - AgentFinishAction() - end the interaction
    """

    def __init__(self):
@@ -53,9 +52,8 @@ class CodeActResponseParser(ResponseParser):


 class CodeActActionParserFinish(ActionParser):
-    """
-    Parser action:
-        - AgentFinishAction() - end the interaction
+    """Parser action:
+    - AgentFinishAction() - end the interaction
    """

    def __init__(
@@ -76,10 +74,9 @@ class CodeActActionParserFinish(ActionParser):


 class CodeActActionParserCmdRun(ActionParser):
-    """
-    Parser action:
-        - CmdRunAction(command) - bash command to run
-        - AgentFinishAction() - end the interaction
+    """Parser action:
+    - CmdRunAction(command) - bash command to run
+    - AgentFinishAction() - end the interaction
    """

    def __init__(
@@ -106,9 +103,8 @@ class CodeActActionParserCmdRun(ActionParser):


 class CodeActActionParserIPythonRunCell(ActionParser):
-    """
-    Parser action:
-        - IPythonRunCellAction(code) - IPython code to run
+    """Parser action:
+    - IPythonRunCellAction(code) - IPython code to run
    """

    def __init__(
@@ -137,9 +133,8 @@ class CodeActActionParserIPythonRunCell(ActionParser):


 class CodeActActionParserAgentDelegate(ActionParser):
-    """
-    Parser action:
-        - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
+    """Parser action:
+    - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
    """

    def __init__(
@@ -164,9 +159,8 @@ class CodeActActionParserAgentDelegate(ActionParser):


 class CodeActActionParserMessage(ActionParser):
-    """
-    Parser action:
-        - MessageAction(content) - Message action to run (e.g. ask for clarification)
+    """Parser action:
+    - MessageAction(content) - Message action to run (e.g. ask for clarification)
    """

    def __init__(
--- a/agenthub/codeact_agent/codeact_agent.py
+++ b/agenthub/codeact_agent/codeact_agent.py
@@ -8,7 +8,6 @@ from agenthub.codeact_agent.prompt import (
 )
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import config
 from opendevin.events.action import (
    Action,
    AgentDelegateAction,
@@ -22,6 +21,7 @@ from opendevin.events.observation import (
    CmdOutputObservation,
    IPythonRunCellObservation,
 )
+from opendevin.events.observation.observation import Observation
 from opendevin.events.serialization.event import truncate_content
 from opendevin.llm.llm import LLM
 from opendevin.runtime.plugins import (
@@ -34,62 +34,6 @@ from opendevin.runtime.tools import RuntimeTool
 ENABLE_GITHUB = True


-def action_to_str(action: Action) -> str:
-    if isinstance(action, CmdRunAction):
-        return f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
-    elif isinstance(action, IPythonRunCellAction):
-        return f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
-    elif isinstance(action, AgentDelegateAction):
-        return f'{action.thought}\n<execute_browse>\n{action.inputs["task"]}\n</execute_browse>'
-    elif isinstance(action, MessageAction):
-        return action.content
-    return ''
-
-
-def get_action_message(action: Action) -> dict[str, str] | None:
-    if (
-        isinstance(action, AgentDelegateAction)
-        or isinstance(action, CmdRunAction)
-        or isinstance(action, IPythonRunCellAction)
-        or isinstance(action, MessageAction)
-    ):
-        return {
-            'role': 'user' if action.source == 'user' else 'assistant',
-            'content': action_to_str(action),
-        }
-    return None
-
-
-def get_observation_message(obs) -> dict[str, str] | None:
-    max_message_chars = config.get_llm_config_from_agent(
-        'CodeActAgent'
-    ).max_message_chars
-    if isinstance(obs, CmdOutputObservation):
-        content = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
-        content += (
-            f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
-        )
-        return {'role': 'user', 'content': content}
-    elif isinstance(obs, IPythonRunCellObservation):
-        content = 'OBSERVATION:\n' + obs.content
-        # replace base64 images with a placeholder
-        splitted = content.split('\n')
-        for i, line in enumerate(splitted):
-            if '![image](data:image/png;base64,' in line:
-                splitted[i] = (
-                    '![image](data:image/png;base64, ...) already displayed to user'
-                )
-        content = '\n'.join(splitted)
-        content = truncate_content(content, max_message_chars)
-        return {'role': 'user', 'content': content}
-    elif isinstance(obs, AgentDelegateObservation):
-        content = 'OBSERVATION:\n' + truncate_content(
-            str(obs.outputs), max_message_chars
-        )
-        return {'role': 'user', 'content': content}
-    return None
-
-
 # FIXME: We can tweak these two settings to create MicroAgents specialized toward different area
 def get_system_message() -> str:
    if ENABLE_GITHUB:
@@ -158,8 +102,7 @@ class CodeActAgent(Agent):
        self,
        llm: LLM,
    ) -> None:
-        """
-        Initializes a new instance of the CodeActAgent class.
+        """Initializes a new instance of the CodeActAgent class.

        Parameters:
        - llm (LLM): The llm to be used by this agent
@@ -167,15 +110,67 @@ class CodeActAgent(Agent):
        super().__init__(llm)
        self.reset()

+    def action_to_str(self, action: Action) -> str:
+        """Render an action as message text; '' for unsupported action types."""
+        # Guard clauses: each supported type returns at once, anything else yields ''.
+        if isinstance(action, CmdRunAction):
+            return f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
+        if isinstance(action, IPythonRunCellAction):
+            return f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
+        if isinstance(action, AgentDelegateAction):
+            return f'{action.thought}\n<execute_browse>\n{action.inputs["task"]}\n</execute_browse>'
+        if isinstance(action, MessageAction):
+            return action.content
+        return ''
+
+    def get_action_message(self, action: Action) -> dict[str, str] | None:
+        """Build the chat message for an action; None for unsupported action types.
+
+        The role is 'user' when action.source == 'user', otherwise 'assistant'.
+        """
+        supported = (
+            AgentDelegateAction, CmdRunAction, IPythonRunCellAction, MessageAction
+        )
+        if not isinstance(action, supported):
+            return None
+        role = 'user' if action.source == 'user' else 'assistant'
+        return {'role': role, 'content': self.action_to_str(action)}
+
+    def get_observation_message(self, obs: Observation) -> dict[str, str] | None:
+        """Build the 'user' chat message for an observation.
+
+        Parameters:
+        - obs (Observation): the observation to convert
+
+        Returns:
+        - dict with 'role' (always 'user') and 'content' keys
+        - None for unsupported observation types
+        """
+        limit = self.llm.config.max_message_chars
+        if isinstance(obs, CmdOutputObservation):
+            # only the command output is truncated, not the header or status line
+            text = 'OBSERVATION:\n' + truncate_content(obs.content, limit)
+            text += f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
+        elif isinstance(obs, IPythonRunCellObservation):
+            # replace base64 images with a placeholder
+            marker = '![image](data:image/png;base64,'
+            placeholder = '![image](data:image/png;base64, ...) already displayed to user'
+            rows = ('OBSERVATION:\n' + obs.content).split('\n')
+            rows = [placeholder if marker in row else row for row in rows]
+            # here the header is part of the text handed to truncate_content
+            text = truncate_content('\n'.join(rows), limit)
+        elif isinstance(obs, AgentDelegateObservation):
+            text = 'OBSERVATION:\n' + truncate_content(str(obs.outputs), limit)
+        else:
+            return None
+        return {'role': 'user', 'content': text}
+
    def reset(self) -> None:
-        """
-        Resets the CodeAct Agent.
-        """
+        """Resets the CodeAct Agent."""
        super().reset()

    def step(self, state: State) -> Action:
-        """
-        Performs one step using the CodeAct Agent.
+        """Performs one step using the CodeAct Agent.
        This includes gathering info on previous steps and prompting the model to make a command to execute.

        Parameters:
@@ -188,7 +183,6 @@ class CodeActAgent(Agent):
        - MessageAction(content) - Message action to run (e.g. ask for clarification)
        - AgentFinishAction() - end the interaction
        """
-
        # if we're done, go back
        latest_user_message = state.history.get_last_user_message()
        if latest_user_message and latest_user_message.strip() == '/exit':
@@ -216,11 +210,12 @@ class CodeActAgent(Agent):

        for event in state.history.get_events():
            # create a regular message from an event
-            message = (
-                get_action_message(event)
-                if isinstance(event, Action)
-                else get_observation_message(event)
-            )
+            if isinstance(event, Action):
+                message = self.get_action_message(event)
+            elif isinstance(event, Observation):
+                message = self.get_observation_message(event)
+            else:
+                raise ValueError(f'Unknown event type: {type(event)}')

            # add regular message
            if message:
--- a/agenthub/codeact_swe_agent/action_parser.py
+++ b/agenthub/codeact_swe_agent/action_parser.py
@@ -11,9 +11,8 @@ from opendevin.events.action import (


 class CodeActSWEActionParserFinish(ActionParser):
-    """
-    Parser action:
-        - AgentFinishAction() - end the interaction
+    """Parser action:
+    - AgentFinishAction() - end the interaction
    """

    def __init__(
@@ -34,10 +33,9 @@ class CodeActSWEActionParserFinish(ActionParser):


 class CodeActSWEActionParserCmdRun(ActionParser):
-    """
-    Parser action:
-        - CmdRunAction(command) - bash command to run
-        - AgentFinishAction() - end the interaction
+    """Parser action:
+    - CmdRunAction(command) - bash command to run
+    - AgentFinishAction() - end the interaction
    """

    def __init__(
@@ -64,9 +62,8 @@ class CodeActSWEActionParserCmdRun(ActionParser):


 class CodeActSWEActionParserIPythonRunCell(ActionParser):
-    """
-    Parser action:
-        - IPythonRunCellAction(code) - IPython code to run
+    """Parser action:
+    - IPythonRunCellAction(code) - IPython code to run
    """

    def __init__(
@@ -95,9 +92,8 @@ class CodeActSWEActionParserIPythonRunCell(ActionParser):


 class CodeActSWEActionParserMessage(ActionParser):
-    """
-    Parser action:
-        - MessageAction(content) - Message action to run (e.g. ask for clarification)
+    """Parser action:
+    - MessageAction(content) - Message action to run (e.g. ask for clarification)
    """

    def __init__(
--- a/agenthub/codeact_swe_agent/codeact_swe_agent.py
+++ b/agenthub/codeact_swe_agent/codeact_swe_agent.py
@@ -7,7 +7,6 @@ from agenthub.codeact_swe_agent.prompt import (
 from agenthub.codeact_swe_agent.response_parser import CodeActSWEResponseParser
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import config
 from opendevin.events.action import (
    Action,
    AgentFinishAction,
@@ -19,6 +18,7 @@ from opendevin.events.observation import (
    CmdOutputObservation,
    IPythonRunCellObservation,
 )
+from opendevin.events.observation.observation import Observation
 from opendevin.events.serialization.event import truncate_content
 from opendevin.llm.llm import LLM
 from opendevin.runtime.plugins import (
@@ -29,54 +29,6 @@ from opendevin.runtime.plugins import (
 from opendevin.runtime.tools import RuntimeTool


-def action_to_str(action: Action) -> str:
-    if isinstance(action, CmdRunAction):
-        return f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
-    elif isinstance(action, IPythonRunCellAction):
-        return f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
-    elif isinstance(action, MessageAction):
-        return action.content
-    return ''
-
-
-def get_action_message(action: Action) -> dict[str, str] | None:
-    if (
-        isinstance(action, CmdRunAction)
-        or isinstance(action, IPythonRunCellAction)
-        or isinstance(action, MessageAction)
-    ):
-        return {
-            'role': 'user' if action.source == 'user' else 'assistant',
-            'content': action_to_str(action),
-        }
-    return None
-
-
-def get_observation_message(obs) -> dict[str, str] | None:
-    max_message_chars = config.get_llm_config_from_agent(
-        'CodeActSWEAgent'
-    ).max_message_chars
-    if isinstance(obs, CmdOutputObservation):
-        content = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
-        content += (
-            f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
-        )
-        return {'role': 'user', 'content': content}
-    elif isinstance(obs, IPythonRunCellObservation):
-        content = 'OBSERVATION:\n' + obs.content
-        # replace base64 images with a placeholder
-        splitted = content.split('\n')
-        for i, line in enumerate(splitted):
-            if '![image](data:image/png;base64,' in line:
-                splitted[i] = (
-                    '![image](data:image/png;base64, ...) already displayed to user'
-                )
-        content = '\n'.join(splitted)
-        content = truncate_content(content, max_message_chars)
-        return {'role': 'user', 'content': content}
-    return None
-
-
 def get_system_message() -> str:
    return f'{SYSTEM_PREFIX}\n\n{COMMAND_DOCS}\n\n{SYSTEM_SUFFIX}'

@@ -113,8 +65,7 @@ class CodeActSWEAgent(Agent):
        self,
        llm: LLM,
    ) -> None:
-        """
-        Initializes a new instance of the CodeActAgent class.
+        """Initializes a new instance of the CodeActSWEAgent class.

        Parameters:
        - llm (LLM): The llm to be used by this agent
@@ -122,15 +73,59 @@ class CodeActSWEAgent(Agent):
        super().__init__(llm)
        self.reset()

+    def action_to_str(self, action: Action) -> str:
+        if isinstance(action, CmdRunAction):
+            return (
+                f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
+            )
+        elif isinstance(action, IPythonRunCellAction):
+            return f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
+        elif isinstance(action, MessageAction):
+            return action.content
+        return ''
+
+    def get_action_message(self, action: Action) -> dict[str, str] | None:
+        if (
+            isinstance(action, CmdRunAction)
+            or isinstance(action, IPythonRunCellAction)
+            or isinstance(action, MessageAction)
+        ):
+            return {
+                'role': 'user' if action.source == 'user' else 'assistant',
+                'content': self.action_to_str(action),
+            }
+        return None
+
+    def get_observation_message(self, obs: Observation) -> dict[str, str] | None:
+        max_message_chars = self.llm.config.max_message_chars
+        if isinstance(obs, CmdOutputObservation):
+            content = 'OBSERVATION:\n' + truncate_content(
+                obs.content, max_message_chars
+            )
+            content += (
+                f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
+            )
+            return {'role': 'user', 'content': content}
+        elif isinstance(obs, IPythonRunCellObservation):
+            content = 'OBSERVATION:\n' + obs.content
+            # replace base64 images with a placeholder
+            splitted = content.split('\n')
+            for i, line in enumerate(splitted):
+                if '![image](data:image/png;base64,' in line:
+                    splitted[i] = (
+                        '![image](data:image/png;base64, ...) already displayed to user'
+                    )
+            content = '\n'.join(splitted)
+            content = truncate_content(content, max_message_chars)
+            return {'role': 'user', 'content': content}
+        return None
+
    def reset(self) -> None:
-        """
-        Resets the CodeAct Agent.
-        """
+        """Resets the CodeAct Agent."""
        super().reset()

    def step(self, state: State) -> Action:
-        """
-        Performs one step using the CodeAct Agent.
+        """Performs one step using the CodeAct Agent.
        This includes gathering info on previous steps and prompting the model to make a command to execute.

        Parameters:
@@ -142,7 +137,6 @@ class CodeActSWEAgent(Agent):
        - MessageAction(content) - Message action to run (e.g. ask for clarification)
        - AgentFinishAction() - end the interaction
        """
-
        # if we're done, go back
        latest_user_message = state.history.get_last_user_message()
        if latest_user_message and latest_user_message.strip() == '/exit':
@@ -170,11 +164,12 @@ class CodeActSWEAgent(Agent):

        for event in state.history.get_events():
            # create a regular message from an event
-            message = (
-                get_action_message(event)
-                if isinstance(event, Action)
-                else get_observation_message(event)
-            )
+            if isinstance(event, Action):
+                message = self.get_action_message(event)
+            elif isinstance(event, Observation):
+                message = self.get_observation_message(event)
+            else:
+                raise ValueError(f'Unknown event type: {type(event)}')

            # add regular message
            if message:
--- a/agenthub/codeact_swe_agent/response_parser.py
+++ b/agenthub/codeact_swe_agent/response_parser.py
@@ -9,12 +9,11 @@ from opendevin.events.action import Action


 class CodeActSWEResponseParser(ResponseParser):
-    """
-    Parser action:
-        - CmdRunAction(command) - bash command to run
-        - IPythonRunCellAction(code) - IPython code to run
-        - MessageAction(content) - Message action to run (e.g. ask for clarification)
-        - AgentFinishAction() - end the interaction
+    """Parser action:
+    - CmdRunAction(command) - bash command to run
+    - IPythonRunCellAction(code) - IPython code to run
+    - MessageAction(content) - Message action to run (e.g. ask for clarification)
+    - AgentFinishAction() - end the interaction
    """

    def __init__(self):
--- a/agenthub/delegator_agent/agent.py
+++ b/agenthub/delegator_agent/agent.py
@@ -14,8 +14,7 @@ class DelegatorAgent(Agent):
    current_delegate: str = ''

    def __init__(self, llm: LLM):
-        """
-        Initialize the Delegator Agent with an LLM
+        """Initialize the Delegator Agent with an LLM

        Parameters:
        - llm (LLM): The llm to be used by this agent
@@ -23,8 +22,7 @@ class DelegatorAgent(Agent):
        super().__init__(llm)

    def step(self, state: State) -> Action:
-        """
-        Checks to see if current step is completed, returns AgentFinishAction if True.
+        """Checks to see if current step is completed, returns AgentFinishAction if True.
        Otherwise, delegates the task to the next agent in the pipeline.

        Parameters:
--- a/agenthub/micro/agent.py
+++ b/agenthub/micro/agent.py
@@ -2,7 +2,6 @@ from jinja2 import BaseLoader, Environment

 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import config
 from opendevin.core.utils import json
 from opendevin.events.action import Action
 from opendevin.events.serialization.action import action_from_dict
@@ -23,40 +22,37 @@ def parse_response(orig_response: str) -> Action:


 def to_json(obj, **kwargs):
-    """
-    Serialize an object to str format
-    """
+    """Serialize an object to str format"""
    return json.dumps(obj, **kwargs)


-def history_to_json(history: ShortTermHistory, max_events=20, **kwargs):
-    """
-    Serialize and simplify history to str format
-    """
-    # TODO: get agent specific llm config
-    llm_config = config.get_llm_config()
-    max_message_chars = llm_config.max_message_chars
-
-    processed_history = []
-    event_count = 0
-
-    for event in history.get_events(reverse=True):
-        if event_count >= max_events:
-            break
-        processed_history.append(event_to_memory(event, max_message_chars))
-        event_count += 1
-
-    # history is in reverse order, let's fix it
-    processed_history.reverse()
-
-    return json.dumps(processed_history, **kwargs)
-
-
 class MicroAgent(Agent):
    VERSION = '1.0'
    prompt = ''
    agent_definition: dict = {}

+    def history_to_json(
+        self, history: ShortTermHistory, max_events: int = 20, **kwargs
+    ):
+        """
+        Serialize and simplify history to str format
+        """
+        processed_history = []
+        event_count = 0
+
+        for event in history.get_events(reverse=True):
+            if event_count >= max_events:
+                break
+            processed_history.append(
+                event_to_memory(event, self.llm.config.max_message_chars)
+            )
+            event_count += 1
+
+        # history is in reverse order, let's fix it
+        processed_history.reverse()
+
+        return json.dumps(processed_history, **kwargs)
+
    def __init__(self, llm: LLM):
        super().__init__(llm)
        if 'name' not in self.agent_definition:
@@ -70,7 +66,7 @@ class MicroAgent(Agent):
            state=state,
            instructions=instructions,
            to_json=to_json,
-            history_to_json=history_to_json,
+            history_to_json=self.history_to_json,
            delegates=self.delegates,
            latest_user_message=state.get_current_user_intent(),
        )
--- a/agenthub/monologue_agent/agent.py
+++ b/agenthub/monologue_agent/agent.py
@@ -47,8 +47,7 @@ class MonologueAgent(Agent):
    response_parser = MonologueResponseParser()

    def __init__(self, llm: LLM):
-        """
-        Initializes the Monologue Agent with an llm.
+        """Initializes the Monologue Agent with an llm.

        Parameters:
        - llm (LLM): The llm to be used by this agent
@@ -56,8 +55,7 @@ class MonologueAgent(Agent):
        super().__init__(llm)

    def _initialize(self, task: str):
-        """
-        Utilizes the INITIAL_THOUGHTS list to give the agent a context for its capabilities
+        """Utilizes the INITIAL_THOUGHTS list to give the agent a context for its capabilities
        and how to navigate the WORKSPACE_MOUNT_PATH_IN_SANDBOX in `config` (e.g., /workspace by default).
        Short circuited to return when already initialized.
        Will execute again when called after reset.
@@ -68,7 +66,6 @@ class MonologueAgent(Agent):
        Raises:
        - AgentNoInstructionError: If task is not provided
        """
-
        if self._initialized:
            return

@@ -86,10 +83,7 @@ class MonologueAgent(Agent):
        self._add_initial_thoughts(task)
        self._initialized = True

-    def _add_initial_thoughts(self, task):
-        max_message_chars = config.get_llm_config_from_agent(
-            'MonologueAgent'
-        ).max_message_chars
+    def _add_initial_thoughts(self, task: str):
        previous_action = ''
        for thought in INITIAL_THOUGHTS:
            thought = thought.replace('$TASK', task)
@@ -106,7 +100,7 @@ class MonologueAgent(Agent):
                        content=thought, url='', screenshot=''
                    )
                self.initial_thoughts.append(
-                    event_to_memory(observation, max_message_chars)
+                    event_to_memory(observation, self.llm.config.max_message_chars)
                )
                previous_action = ''
            else:
@@ -130,11 +124,12 @@ class MonologueAgent(Agent):
                    previous_action = ActionType.BROWSE
                else:
                    action = MessageAction(thought)
-                self.initial_thoughts.append(event_to_memory(action, max_message_chars))
+                self.initial_thoughts.append(
+                    event_to_memory(action, self.llm.config.max_message_chars)
+                )

    def step(self, state: State) -> Action:
-        """
-        Modifies the current state by adding the most recent actions and observations, then prompts the model to think about it's next action to take using monologue, memory, and hint.
+        """Modifies the current state by adding the most recent actions and observations, then prompts the model to think about its next action to take using monologue, memory, and hint.

        Parameters:
        - state (State): The current state based on previous steps taken
@@ -142,9 +137,6 @@ class MonologueAgent(Agent):
        Returns:
        - Action: The next action to take based on LLM response
        """
-        max_message_chars = config.get_llm_config_from_agent(
-            'MonologueAgent'
-        ).max_message_chars
        goal = state.get_current_user_intent()
        self._initialize(goal)

@@ -152,7 +144,9 @@ class MonologueAgent(Agent):

        # add the events from state.history
        for event in state.history.get_events():
-            recent_events.append(event_to_memory(event, max_message_chars))
+            recent_events.append(
+                event_to_memory(event, self.llm.config.max_message_chars)
+            )

        # add the last messages to long term memory
        if self.memory is not None:
@@ -162,10 +156,12 @@ class MonologueAgent(Agent):
            # this should still work
            # we will need to do this differently: find out if there really is an action or an observation in this step
            if last_action:
-                self.memory.add_event(event_to_memory(last_action, max_message_chars))
+                self.memory.add_event(
+                    event_to_memory(last_action, self.llm.config.max_message_chars)
+                )
            if last_observation:
                self.memory.add_event(
-                    event_to_memory(last_observation, max_message_chars)
+                    event_to_memory(last_observation, self.llm.config.max_message_chars)
                )

        # the action prompt with initial thoughts and recent events
--- a/agenthub/monologue_agent/response_parser.py
+++ b/agenthub/monologue_agent/response_parser.py
@@ -19,8 +19,7 @@ class MonologueResponseParser(ResponseParser):
        return response['choices'][0]['message']['content']

    def parse_action(self, action_str: str) -> Action:
-        """
-        Parses a string to find an action within it
+        """Parses a string to find an action within it

        Parameters:
        - response (str): The string to be parsed
--- a/agenthub/monologue_agent/utils/prompts.py
+++ b/agenthub/monologue_agent/utils/prompts.py
@@ -120,8 +120,7 @@ INITIAL_THOUGHTS = [


 def get_summarize_monologue_prompt(thoughts: list[dict]):
-    """
-    Gets the prompt for summarizing the monologue
+    """Gets the prompt for summarizing the monologue

    Returns:
    - str: A formatted string with the current monologue within the prompt
@@ -136,8 +135,7 @@ def get_request_action_prompt(
    thoughts: list[dict],
    recent_events: list[dict],
 ):
-    """
-    Gets the action prompt formatted with appropriate values.
+    """Gets the action prompt formatted with appropriate values.

    Parameters:
    - task (str): The current task the agent is trying to accomplish
@@ -146,7 +144,6 @@ def get_request_action_prompt(
    Returns:
    - str: Formatted prompt string with hint, task, monologue, and background commands included
    """
-
    hint = ''
    if len(recent_events) > 0:
        latest_event = recent_events[-1]
@@ -179,8 +176,7 @@ def get_request_action_prompt(


 def parse_action_response(orig_response: str) -> Action:
-    """
-    Parses a string to find an action within it
+    """Parses a string to find an action within it

    Parameters:
    - response (str): The string to be parsed
@@ -199,8 +195,7 @@ def parse_action_response(orig_response: str) -> Action:


 def parse_summary_response(response: str) -> list[dict]:
-    """
-    Parses a summary of the monologue
+    """Parses a summary of the monologue

    Parameters:
    - response (str): The response string to be parsed
--- a/agenthub/planner_agent/agent.py
+++ b/agenthub/planner_agent/agent.py
@@ -18,8 +18,7 @@ class PlannerAgent(Agent):
    response_parser = MonologueResponseParser()

    def __init__(self, llm: LLM):
-        """
-        Initialize the Planner Agent with an LLM
+        """Initialize the Planner Agent with an LLM

        Parameters:
        - llm (LLM): The llm to be used by this agent
@@ -27,8 +26,7 @@ class PlannerAgent(Agent):
        super().__init__(llm)

    def step(self, state: State) -> Action:
-        """
-        Checks to see if current step is completed, returns AgentFinishAction if True.
+        """Checks to see if current step is completed, returns AgentFinishAction if True.
        Otherwise, creates a plan prompt and sends to model for inference, returning the result as the next action.

        Parameters:
@@ -38,14 +36,13 @@ class PlannerAgent(Agent):
        - AgentFinishAction: If the last state was 'completed', 'verified', or 'abandoned'
        - Action: The next action to take based on llm response
        """
-
        if state.root_task.state in [
            'completed',
            'verified',
            'abandoned',
        ]:
            return AgentFinishAction()
-        prompt = get_prompt(state)
+        prompt = get_prompt(state, self.llm.config.max_message_chars)
        messages = [{'content': prompt, 'role': 'user'}]
        resp = self.llm.completion(messages=messages)
        return self.response_parser.parse(resp)
--- a/agenthub/planner_agent/prompt.py
+++ b/agenthub/planner_agent/prompt.py
@@ -1,5 +1,4 @@
 from opendevin.controller.state.state import State
-from opendevin.core.config import config
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.schema import ActionType
 from opendevin.core.utils import json
@@ -101,7 +100,6 @@ What is your next thought or action? Again, you must reply with JSON, and only w

 def get_hint(latest_action_id: str) -> str:
    """Returns action type hint based on given action_id"""
-
    hints = {
        '': "You haven't taken any actions yet. Start by using `ls` to check out what files you're working with.",
        ActionType.RUN: 'You should think about the command you just ran, what output it gave, and how that affects your plan.',
@@ -117,9 +115,9 @@ def get_hint(latest_action_id: str) -> str:
    return hints.get(latest_action_id, '')


-def get_prompt(state: State) -> str:
-    """
-    Gets the prompt for the planner agent.
+def get_prompt(state: State, max_message_chars: int) -> str:
+    """Gets the prompt for the planner agent.
+
    Formatted with the most recent action-observation pairs, current task, and hint based on last action

    Parameters:
@@ -128,10 +126,6 @@ def get_prompt(state: State) -> str:
    Returns:
    - str: The formatted string prompt with historical values
    """
-    max_message_chars = config.get_llm_config_from_agent(
-        'PlannerAgent'
-    ).max_message_chars
-
    # the plan
    plan_str = json.dumps(state.root_task.to_dict(), indent=2)

@@ -180,10 +174,10 @@ def get_prompt(state: State) -> str:


 def parse_response(response: str) -> Action:
-    """
-    Parses the model output to find a valid action to take
+    """Parses the model output to find a valid action to take
    Parameters:
    - response (str): A response from the model that potentially contains an Action.
+
    Returns:
    - Action: A valid next action to perform from model output
    """
--- a/config.template.toml
+++ b/config.template.toml
@@ -25,9 +25,6 @@ workspace_base = "./workspace"
 # Disable color in terminal output
 #disable_color = false

-# Enable auto linting after editing
-#enable_auto_lint = false
-
 # Enable saving and restoring the session when run from CLI
 #enable_cli_session = false

@@ -76,8 +73,6 @@ persist_sandbox = false
 # SSH port for the sandbox
 #ssh_port = 63710

-# Use host network
-#use_host_network = false

 # Name of the default agent
 #default_agent = "CodeActAgent"
@@ -197,6 +192,12 @@ llm_config = 'gpt3'
 # Container image to use for the sandbox
 #container_image = "ghcr.io/opendevin/sandbox:main"

+# Use host network
+#use_host_network = false
+
+# Enable auto linting after editing
+#enable_auto_lint = false
+
 #################################### Eval ####################################
 # Configuration for the evaluation, please refer to the specific evaluation
 # plugin for the available options
--- a/containers/app/entrypoint.sh
+++ b/containers/app/entrypoint.sh
@@ -22,7 +22,9 @@ if [[ "$SANDBOX_USER_ID" -eq 0 ]]; then
  echo "Running OpenDevin as root"
  export RUN_AS_DEVIN=false
  mkdir -p /root/.cache/ms-playwright/
-  mv /home/opendevin/.cache/ms-playwright/ /root/.cache/
+  if [ -d "/home/opendevin/.cache/ms-playwright/" ]; then
+    mv /home/opendevin/.cache/ms-playwright/ /root/.cache/
+  fi
  "$@"
 else
  echo "Setting up enduser with id $SANDBOX_USER_ID"
@@ -52,7 +54,9 @@ else

  mkdir -p /home/enduser/.cache/huggingface/hub/
  mkdir -p /home/enduser/.cache/ms-playwright/
-  mv /home/opendevin/.cache/ms-playwright/ /home/enduser/.cache/
+  if [ -d "/home/opendevin/.cache/ms-playwright/" ]; then
+    mv /home/opendevin/.cache/ms-playwright/ /home/enduser/.cache/
+  fi

  usermod -aG $DOCKER_SOCKET_GID enduser
  echo "Running as enduser"
--- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/intro.mdx
+++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/intro.mdx
@@ -93,7 +93,7 @@ Si vous souhaitez utiliser la version **(instable !)** la plus récente, vous po

 Pour le workflow de développement, consultez [Development.md](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md).

-Avez-vous des problèmes ? Consultez notre [Guide de dépannage](https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting).
+Avez-vous des problèmes ? Consultez notre [Guide de dépannage](https://docs.all-hands.dev/modules/usage/troubleshooting).

 :::warning
 OpenDevin est actuellement en cours de développement, mais vous pouvez déjà exécuter la version alpha pour voir le système de bout en bout en action.
--- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/troubleshooting/troubleshooting.md
+++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/troubleshooting/troubleshooting.md
@@ -25,7 +25,7 @@ Si vous utilisez Windows et que vous rencontrez des problèmes, consultez notre
 ### Symptômes

 ```bash
-Erreur lors de la création du contrôleur. Veuillez vérifier que Docker est en cours d'exécution et visitez `https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting` pour plus d'informations sur le débogage.
+Erreur lors de la création du contrôleur. Veuillez vérifier que Docker est en cours d'exécution et visitez `https://docs.all-hands.dev/modules/usage/troubleshooting` pour plus d'informations sur le débogage.
 ```

 ```bash
--- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/intro.mdx
+++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/intro.mdx
@@ -93,7 +93,7 @@ OpenDevin 只会访问这个工作区文件夹。它在一个安全的 docker

 有关开发工作流程，请参阅 [Development.md](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md)。

-遇到问题了吗？查看我们的 [故障排除指南](https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting)。
+遇到问题了吗？查看我们的 [故障排除指南](https://docs.all-hands.dev/modules/usage/troubleshooting)。

 :::warning
 OpenDevin 目前正在开发中，但你已经可以运行 alpha 版本来查看端到端系统的运作情况。
--- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/troubleshooting/troubleshooting.md
+++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/troubleshooting/troubleshooting.md
@@ -23,7 +23,7 @@ sidebar_position: 5
 ### 症状

 ```bash
-创建控制器时出错。请检查 Docker 是否正在运行，并访问 `https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting` 获取更多调试信息。
+创建控制器时出错。请检查 Docker 是否正在运行，并访问 `https://docs.all-hands.dev/modules/usage/troubleshooting` 获取更多调试信息。
 ```

 ```bash
--- a/docs/modules/usage/openshift-example.md
+++ b/docs/modules/usage/openshift-example.md
@@ -0,0 +1,302 @@
+---
+sidebar_position: 6
+---
+
+# 💿 How to use OpenDevin in OpenShift/K8S
+
+There are several ways to run OpenDevin on OpenShift/K8S; this guide walks through one example:
+1. As a cluster admin, create PVs that map the workspace_base data and the Docker socket into the pod through the worker node.
+2. Create PVCs so that those PVs can be mounted into the pod.
+3. Create a pod that contains two containers: the OpenDevin container and the Sandbox container.
+
+## Steps for the example above
+
+> Note: Make sure you are logged in to the cluster first with the proper account for each step. PV creation requires cluster administrator privileges!
+
+> Make sure you have read/write permissions on the hostPath used below (i.e. /tmp/workspace)
+
+1. Create the PV:
+A cluster admin can use the sample YAML files below to create the PVs.
+- workspace-pv.yaml
+
+```yaml
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: workspace-pv
+spec:
+  capacity:
+    storage: 2Gi
+  accessModes:
+    - ReadWriteOnce
+  persistentVolumeReclaimPolicy: Retain
+  hostPath:
+    path: /tmp/workspace
+```
+
+```bash
+# apply yaml file
+$ oc create -f workspace-pv.yaml
+persistentvolume/workspace-pv created
+
+# review:
+$ oc get pv
+NAME                                       CAPACITY   ACCESS MODES   RECLAIM POLICY   STATUS      CLAIM                STORAGECLASS     REASON   AGE
+workspace-pv                               2Gi        RWO            Retain           Available                                                  7m23s
+```
+
+- docker-pv.yaml
+
+```yaml
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: docker-pv
+spec:
+  capacity:
+    storage: 2Gi
+  accessModes:
+    - ReadWriteOnce
+  persistentVolumeReclaimPolicy: Retain
+  hostPath:
+    path: /var/run/docker.sock
+```
+
+```bash
+# apply yaml file
+$ oc create -f docker-pv.yaml
+persistentvolume/docker-pv created
+
+# review:
+$ oc get pv
+NAME                                       CAPACITY   ACCESS MODES   RECLAIM POLICY   STATUS      CLAIM                STORAGECLASS     REASON   AGE
+docker-pv                                  2Gi        RWO            Retain           Available                                                  6m55s
+workspace-pv                               2Gi        RWO            Retain           Available                                                  7m23s
+```
+
+2. Create the PVC:
+Sample PVC yaml file below:
+
+- workspace-pvc.yaml
+
+```yaml
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: workspace-pvc
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 1Gi
+```
+
+```bash
+# create the pvc
+$ oc create -f workspace-pvc.yaml
+persistentvolumeclaim/workspace-pvc created
+
+# review
+$ oc get pvc
+NAME            STATUS    VOLUME   CAPACITY   ACCESS MODES   STORAGECLASS     AGE
+workspace-pvc   Pending                                      hcloud-volumes   4s
+
+$ oc get events
+LAST SEEN   TYPE     REASON                 OBJECT                                MESSAGE
+8s          Normal   WaitForFirstConsumer   persistentvolumeclaim/workspace-pvc   waiting for first consumer to be created before binding
+```
+
+- docker-pvc.yaml
+
+```yaml
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: docker-pvc
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 1Gi
+```
+
+```bash
+# create pvc
+$ oc create -f docker-pvc.yaml
+persistentvolumeclaim/docker-pvc created
+
+# review
+$ oc get pvc
+NAME            STATUS    VOLUME   CAPACITY   ACCESS MODES   STORAGECLASS     AGE
+docker-pvc      Pending                                      hcloud-volumes   4s
+workspace-pvc   Pending                                      hcloud-volumes   2m53s
+
+$ oc get events
+LAST SEEN   TYPE     REASON                 OBJECT                                MESSAGE
+10s         Normal   WaitForFirstConsumer   persistentvolumeclaim/docker-pvc      waiting for first consumer to be created before binding
+10s         Normal   WaitForFirstConsumer   persistentvolumeclaim/workspace-pvc   waiting for first consumer to be created before binding
+```
+
+3. Create the POD yaml file:
+Sample POD yaml file below:
+
+- pod.yaml
+
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+  name: opendevin-app-2024
+  labels:
+    app: opendevin-app-2024
+spec:
+  containers:
+  - name: opendevin-app-2024
+    image: ghcr.io/opendevin/opendevin:0.7.1
+    env:
+    - name: SANDBOX_USER_ID
+      value: "1000"
+    - name: SANDBOX_BOX_TYPE
+      value: 'local'
+    - name: WORKSPACE_MOUNT_PATH
+      value: "/opt/workspace_base"
+    volumeMounts:
+    - name: workspace-volume
+      mountPath: /opt/workspace_base
+    - name: docker-sock
+      mountPath: /var/run/docker.sock
+    ports:
+    - containerPort: 3000
+  - name: opendevin-sandbox-2024
+    image: ghcr.io/opendevin/sandbox:main
+    ports:
+    - containerPort: 51963
+    command: ["/usr/sbin/sshd", "-D", "-p 51963", "-o", "PermitRootLogin=yes"]
+  volumes:
+  - name: workspace-volume
+    persistentVolumeClaim:
+      claimName: workspace-pvc
+  - name: docker-sock
+    persistentVolumeClaim:
+      claimName: docker-pvc
+```
+
+```bash
+# create the pod
+$ oc create -f pod.yaml
+W0716 11:22:07.776271  107626 warnings.go:70] would violate PodSecurity "restricted:v1.24": allowPrivilegeEscalation != false (containers "opendevin-app-2024", "opendevin-sandbox-2024" must set securityContext.allowPrivilegeEscalation=false), unrestricted capabilities (containers "opendevin-app-2024", "opendevin-sandbox-2024" must set securityContext.capabilities.drop=["ALL"]), runAsNonRoot != true (pod or containers "opendevin-app-2024", "opendevin-sandbox-2024" must set securityContext.runAsNonRoot=true), seccompProfile (pod or containers "opendevin-app-2024", "opendevin-sandbox-2024" must set securityContext.seccompProfile.type to "RuntimeDefault" or "Localhost")
+pod/opendevin-app-2024 created
+
+# Above warning can be ignored for now as we will not modify SCC restrictions.
+
+# review
+$ oc get pods
+NAME                 READY   STATUS    RESTARTS   AGE
+opendevin-app-2024   0/2     Pending   0          5s
+
+$ oc get pods
+NAME                 READY   STATUS              RESTARTS   AGE
+opendevin-app-2024   0/2     ContainerCreating   0          15s
+
+$ oc get events
+LAST SEEN   TYPE     REASON                   OBJECT                                MESSAGE
+38s         Normal   WaitForFirstConsumer     persistentvolumeclaim/docker-pvc      waiting for first consumer to be created before binding
+23s         Normal   ExternalProvisioning     persistentvolumeclaim/docker-pvc      waiting for a volume to be created, either by external provisioner "csi.hetzner.cloud" or manually created by system administrator
+27s         Normal   Provisioning             persistentvolumeclaim/docker-pvc      External provisioner is provisioning volume for claim "opendevin/docker-pvc"
+17s         Normal   ProvisioningSucceeded    persistentvolumeclaim/docker-pvc      Successfully provisioned volume pvc-2b1d223a-1c8f-4990-8e3d-68061a9ae252
+16s         Normal   Scheduled                pod/opendevin-app-2024                Successfully assigned opendevin/opendevin-app-2024 to worker1.hub.internal.blakane.com
+9s          Normal   SuccessfulAttachVolume   pod/opendevin-app-2024                AttachVolume.Attach succeeded for volume "pvc-2b1d223a-1c8f-4990-8e3d-68061a9ae252"
+9s          Normal   SuccessfulAttachVolume   pod/opendevin-app-2024                AttachVolume.Attach succeeded for volume "pvc-31f15b25-faad-4665-a25f-201a530379af"
+6s          Normal   AddedInterface           pod/opendevin-app-2024                Add eth0 [10.128.2.48/23] from openshift-sdn
+6s          Normal   Pulled                   pod/opendevin-app-2024                Container image "ghcr.io/opendevin/opendevin:0.7.1" already present on machine
+6s          Normal   Created                  pod/opendevin-app-2024                Created container opendevin-app-2024
+6s          Normal   Started                  pod/opendevin-app-2024                Started container opendevin-app-2024
+6s          Normal   Pulled                   pod/opendevin-app-2024                Container image "ghcr.io/opendevin/sandbox:main" already present on machine
+5s          Normal   Created                  pod/opendevin-app-2024                Created container opendevin-sandbox-2024
+5s          Normal   Started                  pod/opendevin-app-2024                Started container opendevin-sandbox-2024
+83s         Normal   WaitForFirstConsumer     persistentvolumeclaim/workspace-pvc   waiting for first consumer to be created before binding
+27s         Normal   Provisioning             persistentvolumeclaim/workspace-pvc   External provisioner is provisioning volume for claim "opendevin/workspace-pvc"
+17s         Normal   ProvisioningSucceeded    persistentvolumeclaim/workspace-pvc   Successfully provisioned volume pvc-31f15b25-faad-4665-a25f-201a530379af
+
+$ oc get pods
+NAME                 READY   STATUS    RESTARTS   AGE
+opendevin-app-2024   2/2     Running   0          23s
+
+$ oc get pvc
+NAME            STATUS   VOLUME                                     CAPACITY   ACCESS MODES   STORAGECLASS     AGE
+docker-pvc      Bound    pvc-2b1d223a-1c8f-4990-8e3d-68061a9ae252   10Gi       RWO            hcloud-volumes   10m
+workspace-pvc   Bound    pvc-31f15b25-faad-4665-a25f-201a530379af   10Gi       RWO            hcloud-volumes   13m
+
+```
+
+4. Create a NodePort service.
+Sample service creation command below:
+
+```bash
+# create the service of type NodePort
+$ oc create svc nodeport  opendevin-app-2024  --tcp=3000:3000
+service/opendevin-app-2024 created
+
+# review
+
+$ oc get svc
+NAME                 TYPE       CLUSTER-IP      EXTERNAL-IP   PORT(S)          AGE
+opendevin-app-2024   NodePort   172.30.225.42   <none>        3000:30495/TCP   4s
+
+$ oc describe svc opendevin-app-2024
+Name:                     opendevin-app-2024
+Namespace:                opendevin
+Labels:                   app=opendevin-app-2024
+Annotations:              <none>
+Selector:                 app=opendevin-app-2024
+Type:                     NodePort
+IP Family Policy:         SingleStack
+IP Families:              IPv4
+IP:                       172.30.225.42
+IPs:                      172.30.225.42
+Port:                     3000-3000  3000/TCP
+TargetPort:               3000/TCP
+NodePort:                 3000-3000  30495/TCP
+Endpoints:                10.128.2.48:3000
+Session Affinity:         None
+External Traffic Policy:  Cluster
+Events:                   <none>
+```
+
+5. Connect to OpenDevin UI, configure the Agent, then test:
+
+![image](https://github.com/user-attachments/assets/12f94804-a0c7-4744-b873-e003c9caf40e)
+
+
+## Challenges
+Some of the challenges that still need to be addressed:
+
+1. Install GIT into the container:
+   This can be resolved by building a custom image that includes Git and using that image for the pod deployment.
+
+Example below: "to be tested!"
+
+```dockerfile
+FROM ghcr.io/opendevin/opendevin:0.7.1
+
+# Install Git
+RUN apt-get update && apt-get install -y git
+
+# Ensure /opt/workspace_base is writable
+RUN mkdir -p /opt/workspace_base && chown -R 1000:1000 /opt/workspace_base
+
+# Verify Git installation
+RUN git --version
+```
+   
+2. Mount a shared development directory "i.e. one hosted in EC2 instance" to the POD:
+   This can also be done by sharing the development directory with the worker node through file-sharing software (e.g. NFS), then creating a PV and PVC as described above to access that directory.
+
+3. Not all agents work yet. Only CoderAgent has been tested (with an OpenAI API key), and it produced results.
+   
+
+## Discuss
+
+For other issues or questions join the [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA) or [Discord](https://discord.gg/ESHStjSjD4) and ask!
--- a/docs/modules/usage/troubleshooting/troubleshooting.md
+++ b/docs/modules/usage/troubleshooting/troubleshooting.md
@@ -34,7 +34,7 @@ If you're running on Windows and having trouble, check out our [guide for Window
 **Symptoms**

 ```bash
-Error creating controller. Please check Docker is running and visit `https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting` for more debugging information.
+Error creating controller. Please check Docker is running and visit `https://docs.all-hands.dev/modules/usage/troubleshooting` for more debugging information.
 ```

 ```bash
--- a/evaluation/EDA/run_infer.py
+++ b/evaluation/EDA/run_infer.py
@@ -62,7 +62,7 @@ def process_instance(
    reset_logger: bool = True,
 ):
    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
+    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    eval_output_dir = metadata.eval_output_dir
    if reset_logger:
--- a/evaluation/TUTORIAL.md
+++ b/evaluation/TUTORIAL.md
@@ -33,13 +33,15 @@ workspace_mount_path = "/path/to/your/workspace"

 ssh_hostname = "localhost"

+run_as_devin = false
+
+[sandbox]
 # SWEBench eval specific - but you can tweak it to your needs
 use_host_network = false
-run_as_devin = false
 # linting python after editing helps LLM fix indentations
 enable_auto_lint = true

-[sandbox]
+
 box_type = "ssh"
 timeout = 120

--- a/evaluation/agent_bench/README.md
+++ b/evaluation/agent_bench/README.md
@@ -20,12 +20,12 @@ workspace_mount_path = "/path/to/workspace"

 ssh_hostname = "localhost"

-use_host_network = false
 # AgentBench specific
 run_as_devin = true
-enable_auto_lint = true

 [sandbox]
+use_host_network = false
+enable_auto_lint = true
 box_type = "ssh"
 timeout = 120

--- a/evaluation/agent_bench/run_infer.py
+++ b/evaluation/agent_bench/run_infer.py
@@ -37,7 +37,7 @@ def process_instance(
    reset_logger: bool = True,
 ):
    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
+    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))

    inst_id = instance.instance_id
    question = instance.description
--- a/evaluation/biocoder/biocoder_env_box.py
+++ b/evaluation/biocoder/biocoder_env_box.py
@@ -217,7 +217,7 @@ class BiocoderSSHBox(DockerSSHBox):
            config.workspace_mount_path = workspace_base

            # linting python after editing helps LLM fix indentations
-            config.enable_auto_lint = True
+            config.sandbox.enable_auto_lint = True

            # create folder for transferring files back/forth
            biocoder_cache_folder = 'biocoder_cache'
@@ -268,7 +268,7 @@ class BiocoderSSHBox(DockerSSHBox):
                f.write(json.dumps(testcase_json, indent=4))

            # linting python after editing helps LLM fix indentations
-            config.enable_auto_lint = True
+            config.sandbox.enable_auto_lint = True

            sandbox = cls(
                container_image=BIOCODER_BENCH_CONTAINER_IMAGE,
--- a/evaluation/biocoder/run_infer.py
+++ b/evaluation/biocoder/run_infer.py
@@ -87,7 +87,7 @@ def process_instance(
    reset_logger: bool = True,
 ):
    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
+    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
    instance = BiocoderData(**instance)
    print(instance)
    workspace_dir_name = (
--- a/evaluation/bird/README.md
+++ b/evaluation/bird/README.md
@@ -18,6 +18,8 @@ Add the following configurations:
 max_iterations = 100
 cache_dir = "/tmp/cache"
 ssh_hostname = "localhost"
+
+[sandbox]
 enable_auto_lint = true

 # TODO: Change these to the model you want to evaluate
--- a/evaluation/bird/run_infer.py
+++ b/evaluation/bird/run_infer.py
@@ -66,9 +66,7 @@ AGENT_CLS_TO_INST_SUFFIX = {


 def execute_sql(db_path, gen_sql, gold_sql):
-    """
-    Execute the generated SQL and the ground truth SQL and compare the results.
-    """
+    """Execute the generated SQL and the ground truth SQL and compare the results."""
    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        cursor.execute(gen_sql)
@@ -128,7 +126,7 @@ def process_instance(
    reset_logger: bool = True,
 ):
    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
+    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
    workspace_mount_path = os.path.join(
        config.workspace_mount_path, 'bird_eval_workspace'
    )
@@ -255,18 +253,14 @@ def process_instance(


 def load_bird():
-    """
-    Main function to handle the flow of downloading, processing, and loading the bird dataset.
-    """
+    """Main function to handle the flow of downloading, processing, and loading the bird dataset."""
    raw_dataset_path = download_bird()
    bird_dataset = process_bird(raw_dataset_path)
    return bird_dataset


 def download_bird():
-    """
-    Downloads and extracts the bird dataset from a specified URL into a local directory.
-    """
+    """Downloads and extracts the bird dataset from a specified URL into a local directory."""
    dataset_path = os.path.join(config.workspace_base, 'evaluation_bird')
    devset_path = os.path.join(dataset_path, 'dev')
    if not os.path.exists(dataset_path):
@@ -292,9 +286,7 @@ def download_bird():


 def process_bird(dataset_path):
-    """
-    Processes the raw bird dataset into a structured format and saves it as JSON.
-    """
+    """Processes the raw bird dataset into a structured format and saves it as JSON."""
    processed_path = os.path.join(dataset_path, 'processed_dev.json')
    if not os.path.exists(processed_path):
        logger.info(f'{processed_path} folder does not exist, starting processing...')
@@ -325,9 +317,7 @@ def process_bird(dataset_path):


 def extract_create_table_prompt(db_path, limit_value=0):
-    """
-    Generates a SQL prompt with CREATE TABLE statements and sample data from the database.
-    """
+    """Generates a SQL prompt with CREATE TABLE statements and sample data from the database."""
    table_query = "SELECT * FROM sqlite_master WHERE type='table';"
    tables = sqlite3.connect(db_path).cursor().execute(table_query).fetchall()
    prompt = ''
@@ -367,9 +357,7 @@ def extract_create_table_prompt(db_path, limit_value=0):


 def create_prompt(e, database_path):
-    """
-    Create a prompt for the given example
-    """
+    """Create a prompt for the given example"""
    db_id = e['db_id']
    db_path = pathlib.Path(database_path) / db_id / f'{db_id}.sqlite'

--- a/evaluation/browsing_delegation/README.md
+++ b/evaluation/browsing_delegation/README.md
@@ -0,0 +1,51 @@
+# Browsing Delegation Evaluation
+
+Some of OpenDevin's agents support the agent delegation action; for example, CodeActAgent can delegate browsing tasks to BrowsingAgent.
+
+This evaluation tests whether CodeActAgent can correctly delegate instructions from the WebArena and MiniWob benchmarks to the BrowsingAgent.
+If so, the browsing performance upper-bound of CodeActAgent will be the performance of BrowsingAgent.
+
+
+## Setup Environment
+
+Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
+
+## Configure OpenDevin and your LLM
+
+Create a `config.toml` file if it does not exist at the root of the workspace.
+
+Add the following configurations:
+
+```toml
+# TODO: Change these to the model you want to evaluate
+[llm.eval_gpt4_1106_preview_llm]
+model = "gpt-4-1106-preview"
+api_key = "XXX"
+temperature = 0.0
+
+[llm.eval_some_openai_compatible_model_llm]
+model = "openai/MODEL_NAME"
+base_url = "https://OPENAI_COMPATIBLE_URL/v1"
+api_key = "XXX"
+temperature = 0.0
+```
+
+## Run Inference
+
+```bash
+./evaluation/browsing_delegation/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
+# e.g., ./evaluation/browsing_delegation/scripts/run_infer.sh llm.eval_gpt4_1106_preview_llm HEAD CodeActAgent 300
+```
+
+where `model_config` is mandatory, while `agent` and `eval_limit` are optional.
+
+`model_config`, e.g. `llm.eval_gpt4_1106_preview_llm`, is the config group name for your
+LLM settings, as defined in your `config.toml`.
+
+`git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
+like to evaluate. It could also be a release tag like `0.6.2`.
+
+`agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
+to `CodeActAgent`.
+
+`eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances.
--- a/evaluation/browsing_delegation/run_infer.py
+++ b/evaluation/browsing_delegation/run_infer.py
@@ -0,0 +1,164 @@
+import asyncio
+import logging
+import os
+import re
+
+import nltk
+import pandas as pd
+from datasets import load_dataset
+
+from evaluation.utils.shared import (
+    EvalMetadata,
+    make_metadata,
+    prepare_dataset,
+    run_evaluation,
+)
+from opendevin.controller.agent import Agent
+from opendevin.controller.state.state import State
+from opendevin.core.config import config, get_llm_config_arg, parse_arguments
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import run_agent_controller
+from opendevin.llm.llm import LLM
+
+# Only CodeActAgent can delegate to BrowsingAgent
+SUPPORTED_AGENT_CLS = {'CodeActAgent'}
+
+
+def process_instance(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+    reset_logger: bool = True,
+):
+    # Create the agent
+    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
+    env_id = instance.instance_id
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        # Set up logger
+        log_file = os.path.join(
+            metadata.eval_output_dir, 'logs', f'instance_{env_id}.log'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        # add back the console handler to print ONE line
+        logger.addHandler(get_console_handler())
+        logger.info(
+            f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setFormatter(
+            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        )
+        logger.addHandler(file_handler)
+    else:
+        logger.info(f'Starting evaluation for instance {env_id}.')
+
+    instruction = (
+        f'You can delegate browsing tasks to a browser agent. '
+        f"For example, for query 'Who is the president of the United States?', you can delegate the task to a browser agent via <execute_browse> Who is the president of the United States? </execute_browse>.\n"
+        f'Now, solve the following query: "{instance.instruction}"\n'
+        f'NOTE: You should copy the "query" as is into the <execute_browse> tag. DO NOT change ANYTHING in the query.'
+    )
+
+    state: State | None = asyncio.run(
+        run_agent_controller(
+            agent,
+            instruction,
+            max_iterations=metadata.max_iterations,
+            sid=env_id,
+        )
+    )
+
+    # ======= Attempt to evaluate the agent's environment impact =======
+
+    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+
+    if state is None:
+        raise ValueError('State should not be None.')
+
+    metrics = state.metrics.get() if state.metrics else None
+    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+    # for compatibility with the existing output format, we can remake the pairs here
+    # remove when it becomes unnecessary
+    histories = state.history.compatibility_for_eval_history_pairs()
+
+    # find the last delegate action
+    last_delegate_action = None
+    result = {}
+    for action, _ in histories:
+        if action['action'] == 'delegate':
+            last_delegate_action = action
+            instruction_for_delegate = action['args']['inputs']['task']
+            # parse `browse_actions` from `instruction_for_delegate`
+            # task = f'{thought}. I should start with: {browse_actions}'
+            instruction_for_delegate = re.search(
+                r'I should start with: (.*)', instruction_for_delegate
+            ).group(1)
+
+            # calculate the edit distance between the instance.instruction and the instruction_for_delegate
+            edit_distance = nltk.edit_distance(
+                instance.instruction, instruction_for_delegate
+            )
+            is_exact_match = (
+                instance.instruction.strip() == instruction_for_delegate.strip()
+            )
+            result['edit_distance'] = edit_distance
+            result['is_exact_match'] = is_exact_match
+
+    # Save the output
+    output = {
+        'instance_id': env_id,
+        'instruction': instruction,
+        'metadata': metadata.model_dump(),
+        'history': histories,
+        'metrics': metrics,
+        'error': state.last_error if state and state.last_error else None,
+        'test_result': {
+            'query': instance.instruction,
+            'action': last_delegate_action,
+            'result': result,
+        },
+    }
+
+    return output
+
+
+if __name__ == '__main__':
+    args = parse_arguments()
+
+    dataset = load_dataset('OpenDevin/eval-browsing-instructions')
+    dataset = dataset['train'].to_pandas()
+    assert dataset.columns.tolist() == ['instance_id', 'instruction']
+    id_column = 'instance_id'
+    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+    logger.info(f'Config for evaluation: {config}')
+
+    metadata = make_metadata(
+        llm_config,
+        'browsing_delegation',
+        args.agent_cls,
+        args.max_iterations,
+        args.eval_note,
+        args.eval_output_dir,
+    )
+    if metadata.agent_class not in SUPPORTED_AGENT_CLS:
+        raise ValueError(
+            f'Agent class {metadata.agent_class} not supported with AgentDelegation.'
+        )
+
+    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
+    instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
+    run_evaluation(
+        instances,
+        metadata,
+        output_file,
+        args.eval_num_workers,
+        process_instance,
+        id_column,
+    )
--- a/evaluation/browsing_delegation/scripts/run_infer.sh
+++ b/evaluation/browsing_delegation/scripts/run_infer.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
+MODEL_CONFIG=$1
+COMMIT_HASH=$2
+AGENT=$3
+EVAL_LIMIT=$4
+NUM_WORKERS=$5
+
+if [ -z "$NUM_WORKERS" ]; then
+  NUM_WORKERS=1
+  echo "Number of workers not specified, use default $NUM_WORKERS"
+fi
+checkout_eval_branch
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+get_agent_version
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+EVAL_NOTE="$AGENT_VERSION"
+
+COMMAND="poetry run python evaluation/browsing_delegation/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --max-iterations 1 \
+  --max-chars 10000000 \
+  --eval-num-workers $NUM_WORKERS \
+  --eval-note $EVAL_NOTE"
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+# Run the command
+eval $COMMAND
--- a/evaluation/gaia/run_infer.py
+++ b/evaluation/gaia/run_infer.py
@@ -48,7 +48,7 @@ def process_instance(
    reset_logger: bool = True,
 ):
    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
+    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
    # create process-specific workspace dir
    # we will create a workspace directory for EACH process
    # so that different agent don't interfere with each other.
--- a/evaluation/gaia/scorer.py
+++ b/evaluation/gaia/scorer.py
@@ -80,14 +80,14 @@ def question_scorer(


 def normalize_str(input_str, remove_punct=True) -> str:
-    """
-    Normalize a string by:
+    """Normalize a string by:
    - Removing all white spaces
    - Optionally removing punctuation (if remove_punct is True)
    - Converting to lowercase
    Parameters:
    - input_str: str, the string to normalize
    - remove_punct: bool, whether to remove punctuation (default: True)
+
    Returns:
    - str, the normalized string
    """
--- a/evaluation/gorilla/utils.py
+++ b/evaluation/gorilla/utils.py
@@ -10,7 +10,6 @@ from ast_eval_th import ast_eval_th
 # This function is modified from Gorilla's APIBench implementations (https://github.com/ShishirPatil/gorilla/blob/main/eval/get_llm_responses.py).
 def encode_question(question, api_name):
    """Encode multiple prompt instructions into a single string."""
-
    prompts = []
    if api_name == 'torch':
        api_name = 'torchhub'
--- a/evaluation/gpqa/README.md
+++ b/evaluation/gpqa/README.md
@@ -36,6 +36,8 @@ Add the following configurations:
 max_iterations = 100
 cache_dir = "/tmp/cache"
 ssh_hostname = "localhost"
+
+[sandbox]
 enable_auto_lint = true

 # TODO: Change these to the model you want to evaluate
--- a/evaluation/gpqa/run_infer.py
+++ b/evaluation/gpqa/run_infer.py
@@ -1,5 +1,4 @@
-"""
-Overview:
+"""Overview:
 This code implements the evaluation of agents on the GPQA Benchmark with Open Book setting.
 - The benchmark consists of 448 high-quality and extremely difficult multiple-choice questions in the domains of biology, physics, and chemistry. The questions are intentionally designed to be "Google-proof," meaning that even highly skilled non-expert validators achieve only 34% accuracy despite unrestricted access to the web.
 - Even experts in the corresponding domains achieve only 65% accuracy.
@@ -54,8 +53,7 @@ AGENT_CLS_TO_INST_SUFFIX = {


 def parse_final_answer(final_answer: str) -> str:
-    """
-    Parse the final answer from the final message generated by the agent
+    """Parse the final answer from the final message generated by the agent
    to extract the final answer. The final answer is usually enclosed in the format:
    <<FINAL_ANSWER||
    <insert correct answer here>
@@ -71,15 +69,12 @@ def parse_final_answer(final_answer: str) -> str:


 def compare_answers(predicted_answer, ground_truth):
-    """
-    Compare the predicted answer with the ground truth answer
-    """
+    """Compare the predicted answer with the ground truth answer"""
    return predicted_answer == ground_truth


 def get_test_result(model_output, ground_truth):
-    """
-    Implements the evaluation logic for GPQA
+    """Implements the evaluation logic for GPQA
    Checks if the output of a given instance is correct (as per the ground truth)
    """
    # parse the final answer from model output
@@ -92,8 +87,7 @@ def get_test_result(model_output, ground_truth):


 def convert_instance_dict(instance):
-    """
-    Used for preprocessing the hf dataset into a format that can be used by the agent.
+    """Used for preprocessing the hf dataset into a format that can be used by the agent.
    Reads and extracts relevant information from the dataset instance.
    """
    out_instance_dict = {}
@@ -126,7 +120,7 @@ def process_instance(
    reset_logger: bool = True,
 ):
    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
+    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
    old_workspace_mount_path = config.workspace_mount_path
    old_workspace_base = config.workspace_base
    try:
--- a/evaluation/humanevalfix/README.md
+++ b/evaluation/humanevalfix/README.md
@@ -18,6 +18,8 @@ Add the following configurations:
 max_iterations = 100
 cache_dir = "/tmp/cache"
 ssh_hostname = "localhost"
+
+[sandbox]
 enable_auto_lint = true

 # TODO: Change these to the model you want to evaluate
--- a/evaluation/humanevalfix/run_infer.py
+++ b/evaluation/humanevalfix/run_infer.py
@@ -1,5 +1,4 @@
-"""
-Implements evaluation of agents on HumanEvalFix from the HumanEvalPack benchmark introduced in
+"""Implements evaluation of agents on HumanEvalFix from the HumanEvalPack benchmark introduced in
 "OctoPack: Instruction Tuning Code Large Language Models" (https://arxiv.org/abs/2308.07124).
 Please see https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/tasks/humanevalpack.py
 for the reference implementation used in the paper.
@@ -109,7 +108,7 @@ def process_instance(
    reset_logger: bool = True,
 ):
    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
+    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
    old_workspace_mount_path = config.workspace_mount_path
    old_workspace_base = config.workspace_base

--- a/evaluation/logic_reasoning/README.md
+++ b/evaluation/logic_reasoning/README.md
@@ -13,6 +13,8 @@ Add the following configurations:
 max_iterations = 100
 cache_dir = "/tmp/cache"
 ssh_hostname = "localhost"
+
+[sandbox]
 enable_auto_lint = true

 # TODO: Change these to the model you want to evaluate
--- a/evaluation/logic_reasoning/run_infer.py
+++ b/evaluation/logic_reasoning/run_infer.py
@@ -103,7 +103,7 @@ def process_instance(
    reset_logger: bool = True,
 ):
    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
+    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
    old_workspace_mount_path = config.workspace_mount_path
    old_workspace_base = config.workspace_base

--- a/evaluation/miniwob/run_infer.py
+++ b/evaluation/miniwob/run_infer.py
@@ -41,7 +41,7 @@ def process_instance(
    reset_logger: bool = True,
 ):
    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
+    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
    env_id = instance.id
    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
--- a/evaluation/mint/tasks/codegen.py
+++ b/evaluation/mint/tasks/codegen.py
@@ -74,7 +74,6 @@ class HumanEvalTask(CodeGenTask):
        Modified from:
        https://github.com/bigcode-project/bigcode-evaluation-harness/blob/d61afde130005ecc65cf800ad8eca790a9bc2115/lm_eval/tasks/humaneval.py#L56
        """
-
        # STOP_WORDS = ["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif"]
        # # Remove the last block of the code containing stop_words for HumanEval
        # string_list = re.split("(%s)" % "|".join(STOP_WORDS), solution)
--- a/evaluation/mint/utils.py
+++ b/evaluation/mint/utils.py
@@ -79,14 +79,12 @@ def check_correctness(
    timeout: float = 10,
    completion_id: Optional[int] = None,
 ) -> Dict:
-    """
-    Evaluates the functional correctness of a completion by running the test
+    """Evaluates the functional correctness of a completion by running the test
    suite provided in the problem.

    :param completion_id: an optional completion ID so we can match
        the results later even if execution finishes asynchronously.
    """
-
    manager = multiprocessing.Manager()
    result = manager.list()

@@ -181,18 +179,16 @@ def chdir(root):


 def reliability_guard(maximum_memory_bytes: Optional[int] = None):
-    """
-    This disables various destructive functions and prevents the generated code
+    """This disables various destructive functions and prevents the generated code
    from interfering with the test (e.g. fork bomb, killing other processes,
    removing filesystem files, etc.)

-    WARNING
+    Warning:
    This function is NOT a security sandbox. Untrusted code, including, model-
    generated code, should not be blindly executed outside of one. See the
    Codex paper for more information about OpenAI's code sandbox, and proceed
    with caution.
    """
-
    if maximum_memory_bytes is not None:
        import resource

--- a/evaluation/ml_bench/README.md
+++ b/evaluation/ml_bench/README.md
@@ -25,10 +25,13 @@ Add the following configurations:
 max_iterations = 100
 cache_dir = "/tmp/cache"
 ssh_hostname = "localhost"
-enable_auto_lint = true
 run_as_devin = false
 sandbox_container_image = "public.ecr.aws/i5g0m1f6/ml-bench" # Use the latest image from the ML-Bench repository

+[sandbox]
+enable_auto_lint = true
+
+
 # TODO: Change these to the model you want to evaluate
 [llm.eval_gpt4_1106_preview]
 model = "gpt-4-1106-preview"
--- a/evaluation/ml_bench/run_infer.py
+++ b/evaluation/ml_bench/run_infer.py
@@ -1,5 +1,4 @@
-"""
-Implements evaluation of agents on ML-Bench, a benchmark for assessing the effectiveness of
+"""Implements evaluation of agents on ML-Bench, a benchmark for assessing the effectiveness of
 Large Language Models (LLMs) in leveraging existing functions in open-source libraries for
 machine learning tasks. The benchmark is introduced in the paper "ML-Bench: Evaluating Large
 Language Models for Code Generation in Repository-Level Machine Learning Tasks"
@@ -68,7 +67,7 @@ ID2CONDA = {


 def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = True):
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
+    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
    old_workspace_mount_path = config.workspace_mount_path
    old_workspace_base = config.workspace_base
    try:
--- a/evaluation/regression/cases/hello-world/test_hello_world.py
+++ b/evaluation/regression/cases/hello-world/test_hello_world.py
@@ -6,9 +6,7 @@ from conftest import agents

@pytest.mark.parametrize('agent', agents())
 def test_hello_world(task_file, run_test_case, agent):
-    """
-    Test case for the "Hello, World!" Bash script using different agents.
-    """
+    """Test case for the "Hello, World!" Bash script using different agents."""
    # Run the test case for the specified agent
    workspace_dir = run_test_case(agent, 'hello-world')

@@ -16,7 +14,7 @@ def test_hello_world(task_file, run_test_case, agent):
    assert os.path.exists(workspace_dir)
    assert os.path.isfile(os.path.join(workspace_dir, 'hello_world.sh'))

-   # Execute the hello_world.sh script
+    # Execute the hello_world.sh script
    os.chdir(workspace_dir)
    output = os.popen('bash hello_world.sh').read()
    assert output == 'Hello, World!\n'
--- a/evaluation/swe_bench/README.md
+++ b/evaluation/swe_bench/README.md
@@ -50,11 +50,13 @@ ssh_hostname = "localhost"
 box_type = "ssh"
 timeout = 120

+run_as_devin = false
+max_budget_per_task = 4 # 4 USD
+
+[sandbox]
 # SWEBench eval specific
 use_host_network = false
-run_as_devin = false
 enable_auto_lint = true
-max_budget_per_task = 4 # 4 USD

 # TODO: Change these to the model you want to evaluate
 [llm.eval_gpt4_1106_preview_llm]
@@ -75,6 +77,7 @@ Make sure your Docker daemon is running, and you have pulled the `eval-swe-bench
 docker image. Then run this python script:

 ```bash
+# export USE_INSTANCE_IMAGE=true # if you want to test support for instance-level docker images
 poetry run python evaluation/swe_bench/swe_env_box.py
 ```

@@ -85,7 +88,7 @@ If you see an error, please make sure your `config.toml` contains all
 ## Run Inference on SWE-Bench Instances

 ```bash
-./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
+./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers]
 # e.g., ./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview_llm HEAD CodeActAgent 300
 ```

@@ -104,7 +107,20 @@ to `CodeActAgent`.
 default, the script evaluates the entire SWE-bench_Lite test set (300 issues). Note:
 in order to use `eval_limit`, you must also set `agent`.

+`max_iter`, e.g. `20`, is the maximum number of iterations for the agent to run. By
+default, it is set to 30.
+
+`num_workers`, e.g. `3`, is the number of parallel workers to run the evaluation. By
+default, it is set to 1.
+
+There are also two optional environment variables you can set.
+```bash
+export USE_HINT_TEXT=true # if you want to use hint text in the evaluation. Ignore this if you are not sure.
+export USE_INSTANCE_IMAGE=true # if you want to use instance-level docker images
+```
+
 Let's say you'd like to run 10 instances using `eval_gpt4_1106_preview_llm` and CodeActAgent,
+
 then your command would be:

 ```bash
@@ -189,17 +205,6 @@ streamlit run 0_📊_OpenDevin_Benchmark.py --server.port 8501 --server.address

 Then you can access the SWE-Bench trajectory visualizer at `localhost:8501`.

-
-
-## View Result Summary
-
-If you just want to know the resolve rate, and/or a summary of what tests pass and what don't, you could run
-
-```bash
-poetry run python ./evaluation/swe_bench/scripts/summarise_results.py <path_to_report_json_file>
-# e.g. poetry run python ./evaluation/swe_bench/scripts/summarise_results.py ./evaluation/evaluation_outputs/outputs/swe_bench_lite/CodeActSWEAgent/gpt-4o-2024-05-13_maxiter_50_N_v1.5-no-hint/report.json
-```
-
 ## Submit your evaluation results

 You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -27,6 +27,7 @@ from opendevin.core.main import run_agent_controller
 from opendevin.llm.llm import LLM

 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false') == 'true'
+USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false') == 'true'

 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
@@ -123,37 +124,45 @@ def get_test_result(instance, sandbox, workspace_dir_name):
    else:
        test_result['metadata']['5_reformat_instance_json_success'] = True

-    # Get the instance report
-    err_code, output = sandbox.execute(
-        (
-            'cd /swe_util/OD-SWE-bench '
-            '&& export PYTHONPATH=$(pwd):$PYTHONPATH '
-            '&& conda run -n swe-bench-eval python swebench/metrics/get_instance_report.py --swe_bench_task /workspace/instance.json --log_path /workspace/$SWE_INSTANCE_ID.log'
-        )
-    )
-    if err_code != 0:
-        logger.error(f'Error getting instance report: {output}')
+    if USE_INSTANCE_IMAGE:
+        # instance report is not supported in instance image mode
        test_result['metadata']['6_get_instance_report_success'] = False
-        test_result['metadata']['6_get_instance_report_error'] = output
-    else:
-        test_result['metadata']['6_get_instance_report_success'] = True
-        test_result['result_raw'] = output
+        test_result['metadata']['6_get_instance_report_error'] = (
+            'Instance report is not supported in instance image mode.'
+        )

-        # try to parse output
-        for line in output.strip().split('\n'):
-            line = line.strip('-')
-            try:
-                key, value = line.split(':')
-            except ValueError:
-                # skip this line
-                print(f'Error parsing result line: {line}')
-                continue
-            value = value.strip()
-            try:
-                value = int(value)
-            except ValueError:
-                pass
-            test_result['result'][key.strip()] = value
+    else:
+        # Get the instance report
+        err_code, output = sandbox.execute(
+            (
+                'cd /swe_util/OD-SWE-bench '
+                '&& export PYTHONPATH=$(pwd):$PYTHONPATH '
+                '&& conda run -n swe-bench-eval python swebench/metrics/get_instance_report.py --swe_bench_task /workspace/instance.json --log_path /workspace/$SWE_INSTANCE_ID.log'
+            )
+        )
+        if err_code != 0:
+            logger.error(f'Error getting instance report: {output}')
+            test_result['metadata']['6_get_instance_report_success'] = False
+            test_result['metadata']['6_get_instance_report_error'] = output
+        else:
+            test_result['metadata']['6_get_instance_report_success'] = True
+            test_result['result_raw'] = output
+
+            # try to parse output
+            for line in output.strip().split('\n'):
+                line = line.strip('-')
+                try:
+                    key, value = line.split(':')
+                except ValueError:
+                    # skip this line
+                    print(f'Error parsing result line: {line}')
+                    continue
+                value = value.strip()
+                try:
+                    value = int(value)
+                except ValueError:
+                    pass
+                test_result['result'][key.strip()] = value
    return test_result


@@ -163,7 +172,7 @@ def process_instance(
    reset_logger: bool = True,
 ):
    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
+    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))

    workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
    # create process-specific workspace dir
@@ -189,6 +198,7 @@ def process_instance(
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
+        os.makedirs(os.path.dirname(log_file), exist_ok=True)
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
@@ -205,6 +215,7 @@ def process_instance(
        workspace_dir_name,
        workspace_mount_path=workspace_mount_path,
        sandbox_plugins=agenthub.Agent.get_cls(metadata.agent_class).sandbox_plugins,
+        use_instance_image=USE_INSTANCE_IMAGE,
    )

    # Prepare instruction
--- a/evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh
+++ b/evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh
@@ -43,5 +43,7 @@ echo "Image file: $IMAGE_FILE"
 grep "$PATTERN" "$IMAGE_FILE" | while IFS= read -r image; do
    echo "Pulling $NAMESPACE/$image into $image"
    docker pull $NAMESPACE/$image
-    docker tag $NAMESPACE/$image $image
+    # replace _s_ with __ in the image name
+    renamed_image=$(echo "$image" | sed 's/_s_/__/g')
+    docker tag $NAMESPACE/$image $renamed_image
 done
--- a/evaluation/swe_bench/scripts/run_infer.sh
+++ b/evaluation/swe_bench/scripts/run_infer.sh
@@ -26,6 +26,14 @@ if [ -z "$MAX_ITER" ]; then
  MAX_ITER=30
 fi

+if [ -z "$USE_INSTANCE_IMAGE" ]; then
+  echo "USE_INSTANCE_IMAGE not specified, use default false"
+  USE_INSTANCE_IMAGE=false
+fi
+
+export USE_INSTANCE_IMAGE=$USE_INSTANCE_IMAGE
+echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
+
 get_agent_version

 echo "AGENT: $AGENT"
--- a/evaluation/swe_bench/scripts/setup/compare_patch_filename.py
+++ b/evaluation/swe_bench/scripts/setup/compare_patch_filename.py
@@ -0,0 +1,77 @@
+"""Compare gold patches with OpenDevin-generated patches and check whether
+OpenDevin found the right (set of) files to modify.
+"""
+
+import argparse
+import json
+import re
+
+
+def extract_modified_files(patch):
+    """Return the set of file paths named in the patch's `diff --git` headers.
+
+    A missing (None) or empty patch yields an empty set.
+    """
+    modified_files = set()
+    if not patch:
+        return modified_files
+    file_pattern = re.compile(r'^diff --git a/(.*?) b/')
+
+    for line in patch.split('\n'):
+        match = file_pattern.match(line)
+        if match:
+            modified_files.add(match.group(1))
+
+    return modified_files
+
+
+def process_report(od_output_file):
+    """Report how many instances in an OpenDevin output file edited the gold file.
+
+    Each non-empty line of `od_output_file` is parsed as a JSON object with the
+    keys `instance_id`, `swe_instance` (holding the gold `patch`) and `git_patch`.
+    """
+    succ = 0
+    fail = 0
+    # use a context manager so the file handle is always closed
+    with open(od_output_file) as f:
+        for raw_line in f:
+            if not raw_line.strip():
+                # tolerate blank lines instead of crashing in json.loads
+                continue
+            record = json.loads(raw_line)
+            instance_id = record['instance_id']
+            gold_patch = record['swe_instance']['patch']
+            generated_patch = record['git_patch']
+            gold_modified_files = extract_modified_files(gold_patch)
+            # swe-bench lite only: a gold patch always contains exactly one file
+            assert len(gold_modified_files) == 1
+            generated_modified_files = extract_modified_files(generated_patch)
+
+            # Check if all files in gold_patch are also in generated_patch
+            if gold_modified_files.issubset(generated_modified_files):
+                succ += 1
+            else:
+                fail += 1
+                print(
+                    f'{instance_id}: file mismatch, gold = {gold_modified_files}, generated = {generated_modified_files}'
+                )
+
+    total = succ + fail
+    if total == 0:
+        # nothing to summarise; avoids a ZeroDivisionError on an empty file
+        print('\nSUMMARY: no instances found in the output file')
+        return
+    print(
+        f'\nSUMMARY: {succ} out of {total} instances found correct files to edit, success rate = {succ / float(total)}'
+    )
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--od_output_file', required=True, help='Path to the OD output file'
+    )
+    args = parser.parse_args()
+
+    process_report(args.od_output_file)
--- a/evaluation/swe_bench/scripts/setup/instance_swe_entry.sh
+++ b/evaluation/swe_bench/scripts/setup/instance_swe_entry.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+
+# Per-instance SWE-bench setup, used in instance-image mode.
+# Meant to be sourced as root inside the instance container; it expects
+# SWE_INSTANCE_ID to be set and the instance JSON to be present under
+# $SWEUTIL_DIR/eval_data/instances/.
+
+# set -e
+
+# assert user name is `root`
+if [ "$USER" != "root" ]; then
+    echo "Error: This script is intended to be run by the 'root' user only." >&2
+    exit 1
+fi
+
+source ~/.bashrc
+
+SWEUTIL_DIR=/swe_util
+
+# Create logs directory
+LOG_DIR=/opendevin/logs
+mkdir -p "$LOG_DIR" && chmod 777 "$LOG_DIR"
+
+# FIXME: Cannot read SWE_INSTANCE_ID from the environment variable
+# SWE_INSTANCE_ID=django__django-11099
+if [ -z "$SWE_INSTANCE_ID" ]; then
+    echo "Error: SWE_INSTANCE_ID is not set." >&2
+    exit 1
+fi
+
+# Read swe-bench-instance.json and extract the item matching the instance ID
+item=$(jq --arg INSTANCE_ID "$SWE_INSTANCE_ID" '.[] | select(.instance_id == $INSTANCE_ID)' "$SWEUTIL_DIR/eval_data/instances/swe-bench-instance.json")
+
+if [[ -z "$item" ]]; then
+    echo "No item found for the provided instance ID." >&2
+    exit 1
+fi
+
+WORKSPACE_NAME=$(echo "$item" | jq -r '.repo + "__" + .version | gsub("/"; "__")')
+
+echo "WORKSPACE_NAME: $WORKSPACE_NAME"
+
+SWE_TASK_DIR=/opendevin/swe_tasks
+mkdir -p "$SWE_TASK_DIR"
+# Dump test_patch to $SWE_TASK_DIR/test.patch
+echo "$item" | jq -r '.test_patch' > "$SWE_TASK_DIR/test.patch"
+# Dump patch to $SWE_TASK_DIR/gold.patch
+echo "$item" | jq -r '.patch' > "$SWE_TASK_DIR/gold.patch"
+# Dump the item to $SWE_TASK_DIR/instance.json except for the "test_patch" and "patch" fields
+echo "$item" | jq 'del(.test_patch, .patch)' > "$SWE_TASK_DIR/instance.json"
+
+# Clear the workspace
+rm -rf /workspace/*
+# Copy repo to workspace
+if [ -d "/workspace/$WORKSPACE_NAME" ]; then
+    rm -rf "/workspace/$WORKSPACE_NAME"
+fi
+cp -r /testbed/ "/workspace/$WORKSPACE_NAME/"
+
+# Activate the instance-specific conda environment
+. /opt/miniconda3/etc/profile.d/conda.sh
+conda activate testbed
+
+mkdir -p "$SWE_TASK_DIR/reset_testbed_temp"
+mkdir -p "$SWE_TASK_DIR/reset_testbed_log_dir"
+
+REPO_PATH="/workspace/$WORKSPACE_NAME"
+echo "Repo Path: $REPO_PATH"
+echo "Test Command: $TEST_CMD"
+echo "export REPO_PATH=\"$REPO_PATH\"" >> ~/.bashrc
+# echo "export TEST_CMD=\"$TEST_CMD\"" >> ~/.bashrc
+
+if [[ "$REPO_PATH" == "None" ]]; then
+    echo "Error: Failed to retrieve repository path. Tests may not have passed or output was not as expected." >&2
+    exit 1
+fi
+
+# set +e
--- a/evaluation/swe_bench/scripts/summarise_results.py
+++ b/evaluation/swe_bench/scripts/summarise_results.py
@@ -1,39 +0,0 @@
-import json
-import sys
-
-
-def extract_test_results(json_file_path):
-    passed_instances = set()
-    all_instances = set()
-
-    with open(json_file_path, 'r') as file:
-        report = json.load(file)
-
-        # Add resolved instances
-        for instance_id in report['resolved']:
-            passed_instances.add(instance_id)
-
-        # Add all instances in the report
-        for _, instance_ids in report.items():
-            for instance_id in instance_ids:
-                all_instances.add(instance_id)
-
-    return passed_instances, all_instances
-
-
-if __name__ == '__main__':
-    if len(sys.argv) != 2:
-        print(
-            'Usage: poetry run python summarise_results.py <path_to_report_json_file>'
-        )
-        sys.exit(1)
-    json_file_path = sys.argv[1]
-    passed_instances, all_instances = extract_test_results(json_file_path)
-    succ_rate = len(passed_instances) / len(all_instances)
-    print(
-        f'\nPassed {len(passed_instances)} tests, total {len(all_instances)} tests, resolve rate = {succ_rate:.2%}'
-    )
-    print('PASSED TESTS:')
-    print(sorted(list(passed_instances)))
-    print('FAILED TESTS:')
-    print(sorted(list(all_instances - passed_instances)))
--- a/evaluation/swe_bench/swe_env_box.py
+++ b/evaluation/swe_bench/swe_env_box.py
@@ -1,7 +1,12 @@
+import json
+import os
 import sys
+import tempfile
 import uuid

 from datasets import load_dataset
+from swebench.harness.constants import MAP_REPO_TO_TEST_FRAMEWORK
+from swebench.harness.utils import get_test_directives

 from opendevin.core.config import config
 from opendevin.core.logger import opendevin_logger as logger
@@ -15,6 +20,10 @@ from opendevin.runtime.plugins import (
 SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.2.1'


+def get_image_name_from_instance_id(instance_id: str) -> str:
+    return 'sweb.eval.x86_64.' + instance_id
+
+
 class SWEBenchSSHBox(DockerSSHBox):
    def __init__(
        self,
@@ -26,6 +35,7 @@ class SWEBenchSSHBox(DockerSSHBox):
        skip_workspace_mount: bool = True,
        sandbox_plugins: list[PluginRequirement] = [],  # noqa: B006
        workspace_dir_name: str | None = None,
+        use_instance_image: bool = False,
    ):
        if swe_instance_id is None:
            raise ValueError('swe_instance_id must be provided!')
@@ -39,6 +49,7 @@ class SWEBenchSSHBox(DockerSSHBox):
        ), 'container_image is required for SWEBenchSSHBox!'
        # Need to run as root to use SWEBench container
        sid = f'swe_bench_{swe_instance_id}_' + str(uuid.uuid4())
+        logger.info(f'===Using container image: {container_image}')
        super().__init__(container_image, timeout, sid)
        self.init_plugins(sandbox_plugins)

@@ -54,11 +65,65 @@ class SWEBenchSSHBox(DockerSSHBox):
        logger.info(
            'Initialization of SWEBench may take approximately 10 minutes due to long-running installations, such as those requiring compilation.'
        )
-        exit_code, output = self.execute('source /swe_util/swe_entry.sh', timeout=600)
-        logger.info('exit code: %d', exit_code)
-        logger.info(output)
-        assert exit_code == 0, f'Failed to source swe_entry.sh: {output}'
-        logger.info('Sourced swe_entry.sh successfully')
+        logger.info(f'Use instance image: {use_instance_image}')
+        if use_instance_image:
+            # we directly inject the instance info into the container and the init script
+            script_dir = os.path.dirname(__file__)
+
+            # inject test command
+            test_type = MAP_REPO_TO_TEST_FRAMEWORK[swe_instance['repo']][
+                swe_instance['version']
+            ]
+            swe_instance['test_directives'] = get_test_directives(swe_instance)
+            swe_instance['test_cmd'] = (
+                f"{test_type} {' '.join(swe_instance['test_directives'])}"
+            )
+            exit_code, output = self.execute(
+                f"""echo "export TEST_CMD='{swe_instance["test_cmd"]}'" >> ~/.bashrc"""
+            )
+            # assert exit_code == 0, f'Failed to set TEST_CMD in ~/.bashrc: {output}'
+
+            # inject the instance info
+            self.execute('mkdir -p /swe_util/eval_data/instances')
+            swe_instance_json_name = 'swe-bench-instance.json'
+            with tempfile.TemporaryDirectory() as temp_dir:
+                # Construct the full path for the desired file name within the temporary directory
+                temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
+                # Write to the file with the desired name within the temporary directory
+                with open(temp_file_path, 'w') as f:
+                    if not isinstance(swe_instance, dict):
+                        json.dump([swe_instance.to_dict()], f)
+                    else:
+                        json.dump([swe_instance], f)
+
+                # Copy the file to the desired location
+                self.copy_to(temp_file_path, '/swe_util/eval_data/instances/')
+
+            # inject the init script
+            self.copy_to(
+                str(os.path.join(script_dir, 'scripts/setup/instance_swe_entry.sh')),
+                '/swe_util/',
+            )
+            self.execute('cat ~/.bashrc')
+            self.execute('source ~/.bashrc')
+
+            # Capture the result of sourcing the entry script; the checks below
+            # must not reuse exit_code/output from the TEST_CMD injection above.
+            exit_code, output = self.execute(
+                'source /swe_util/instance_swe_entry.sh', timeout=600
+            )
+            logger.info('exit code: %d', exit_code)
+            logger.info(output)
+            assert exit_code == 0, f'Failed to source instance_swe_entry.sh: {output}'
+            logger.info('Sourced instance_swe_entry.sh successfully')
+        else:
+            exit_code, output = self.execute(
+                'source /swe_util/swe_entry.sh', timeout=600
+            )
+            logger.info('exit code: %d', exit_code)
+            logger.info(output)
+            assert exit_code == 0, f'Failed to source swe_entry.sh: {output}'
+            logger.info('Sourced swe_entry.sh successfully')

    @property
    def volumes(self):
@@ -78,6 +139,7 @@ class SWEBenchSSHBox(DockerSSHBox):
        skip_workspace_mount: bool = True,
        workspace_mount_path: str | None = None,
        sandbox_plugins: list[PluginRequirement] = [],  # noqa: B006
+        use_instance_image: bool = False,
    ) -> 'SWEBenchSSHBox':
        if workspace_dir_name is None:
            workspace_dir_name = f"{instance['repo']}__{instance['version']}".replace(
@@ -94,13 +156,20 @@ class SWEBenchSSHBox(DockerSSHBox):
            config.enable_auto_lint = True
            # Need to run as root to use SWEBench container
            config.run_as_devin = False
+            if use_instance_image:
+                container_image = get_image_name_from_instance_id(
+                    instance['instance_id']
+                )
+            else:
+                container_image = SWE_BENCH_CONTAINER_IMAGE
            sandbox = cls(
-                container_image=SWE_BENCH_CONTAINER_IMAGE,
+                container_image=container_image,
                swe_instance_id=instance['instance_id'],
                swe_instance=instance,
                skip_workspace_mount=skip_workspace_mount,
                sandbox_plugins=sandbox_plugins,
                workspace_dir_name=workspace_dir_name,
+                use_instance_image=use_instance_image,
            )
            logger.info(f"SSH box started for instance {instance['instance_id']}.")

@@ -163,6 +232,8 @@ if __name__ == '__main__':
    # so we don't need to manage file uploading to OpenDevin's repo
    dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
    swe_bench_tests = dataset['test'].to_pandas()
+    USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false') == 'true'
+    logger.info(f'USE_INSTANCE_IMAGE: {USE_INSTANCE_IMAGE}')

    # INSTANCE_ID = 'django__django-11099'
    INSTANCE_ID = 'astropy__astropy-12907'
@@ -172,6 +243,7 @@ if __name__ == '__main__':
    sandbox = SWEBenchSSHBox.get_box_for_instance(
        instance=EXAMPLE_INSTANCE,
        sandbox_plugins=[AgentSkillsRequirement(), JupyterRequirement()],
+        use_instance_image=USE_INSTANCE_IMAGE,
    )

    # PRE TEST
--- a/evaluation/toolqa/run_infer.py
+++ b/evaluation/toolqa/run_infer.py
@@ -35,7 +35,7 @@ AGENT_CLS_TO_INST_SUFFIX = {


 def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = True):
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
+    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
    # create process-specific workspace dir
    # we will create a workspace directory for EACH process
    # so that different agent don't interfere with each other.
@@ -115,7 +115,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
        'correct': correct,
        'answer_id': 'None',
        'model_id': metadata.model_name,
-        'metadata': metadata.model_dump(),
+        'metadata': metadata,
        'history': histories,
        'metrics': metrics,
        'error': state.last_error if state and state.last_error else None,
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -29,6 +29,14 @@ class EvalMetadata(BaseModel):
    data_split: str | None = None
    details: dict[str, Any] | None = None

+    def model_dump_json(self, *args, **kwargs):
+        dumped = super().model_dump_json(*args, **kwargs)
+        dumped_dict = json.loads(dumped)
+        # redact llm_config BEFORE logging so sensitive values never reach the debug log
+        dumped_dict['llm_config'] = self.llm_config.to_safe_dict()
+        logger.debug(f'Dumped metadata: {dumped_dict}')
+        return json.dumps(dumped_dict)
+

 def codeact_user_response(
    state: State,
--- a/evaluation/webarena/run_infer.py
+++ b/evaluation/webarena/run_infer.py
@@ -42,7 +42,7 @@ def process_instance(
    reset_logger: bool = True,
 ):
    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
+    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
    env_id = instance.id
    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
--- a/frontend/.husky/pre-commit
+++ b/frontend/.husky/pre-commit
@@ -1,4 +1,4 @@
 #!/bin/sh
 cd frontend
-npx lint-staged
-npm run test
+lint-staged
+vitest run
--- a/frontend/package-lock.json
+++ b/frontend/package-lock.json
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -8,7 +8,7 @@
  },
  "dependencies": {
    "@monaco-editor/react": "^4.6.0",
-    "@nextui-org/react": "^2.4.2",
+    "@nextui-org/react": "^2.4.3",
    "@react-types/shared": "^3.23.1",
    "@reduxjs/toolkit": "^2.2.6",
    "@vitejs/plugin-react": "^4.3.1",
@@ -16,8 +16,7 @@
    "@xterm/xterm": "^5.4.0",
    "clsx": "^2.1.1",
    "eslint-config-airbnb-typescript": "^18.0.0",
-    "framer-motion": "^11.3.2",
-    "i18next": "^23.11.5",
+    "i18next": "^23.12.2",
    "i18next-browser-languagedetector": "^8.0.0",
    "i18next-http-backend": "^2.5.2",
    "jose": "^5.6.3",
@@ -26,14 +25,13 @@
    "react-dom": "^18.3.1",
    "react-highlight": "^0.15.0",
    "react-hot-toast": "^2.4.1",
-    "react-i18next": "^14.1.2",
+    "react-i18next": "^15.0.0",
    "react-icons": "^5.2.1",
    "react-markdown": "^9.0.1",
    "react-redux": "^9.1.2",
-    "react-router-dom": "^6.24.1",
    "react-syntax-highlighter": "^15.5.0",
    "tailwind-merge": "^2.4.0",
-    "vite": "^5.3.3",
+    "vite": "^5.3.4",
    "web-vitals": "^3.5.2"
  },
  "scripts": {
@@ -65,13 +63,13 @@
    "@testing-library/jest-dom": "^6.4.6",
    "@testing-library/react": "^16.0.0",
    "@testing-library/user-event": "^14.5.2",
-    "@types/node": "^20.14.10",
+    "@types/node": "^20.14.11",
    "@types/react": "^18.3.3",
    "@types/react-dom": "^18.3.0",
    "@types/react-highlight": "^0.12.8",
    "@types/react-syntax-highlighter": "^15.5.13",
-    "@typescript-eslint/eslint-plugin": "^7.16.0",
-    "@typescript-eslint/parser": "^7.16.0",
+    "@typescript-eslint/eslint-plugin": "^7.16.1",
+    "@typescript-eslint/parser": "^7.16.1",
    "@vitest/coverage-v8": "^1.6.0",
    "autoprefixer": "^10.4.19",
    "eslint": "^8.57.0",
@@ -80,15 +78,15 @@
    "eslint-config-prettier": "^9.1.0",
    "eslint-plugin-import": "^2.29.1",
    "eslint-plugin-jsx-a11y": "^6.9.0",
-    "eslint-plugin-prettier": "^5.1.3",
-    "eslint-plugin-react": "^7.34.3",
+    "eslint-plugin-prettier": "^5.2.1",
+    "eslint-plugin-react": "^7.34.4",
    "eslint-plugin-react-hooks": "^4.6.2",
-    "husky": "^9.0.11",
+    "husky": "^9.1.1",
    "jsdom": "^24.1.0",
    "lint-staged": "^15.2.7",
    "postcss": "^8.4.39",
-    "prettier": "^3.3.2",
-    "tailwindcss": "^3.4.4",
+    "prettier": "^3.3.3",
+    "tailwindcss": "^3.4.6",
    "typescript": "^5.5.3",
    "vite-tsconfig-paths": "^4.3.2",
    "vitest": "^1.6.0"
--- a/frontend/src/components/Browser.test.tsx
+++ b/frontend/src/components/Browser.test.tsx
@@ -1,10 +1,11 @@
 import React from "react";
+import { screen } from "@testing-library/react";
 import Browser from "./Browser";
 import { renderWithProviders } from "../../test-utils";

 describe("Browser", () => {
  it("renders a message if no screenshotSrc is provided", () => {
-    const { getByText } = renderWithProviders(<Browser />, {
+    renderWithProviders(<Browser />, {
      preloadedState: {
        browser: {
          url: "https://example.com",
@@ -14,11 +15,11 @@ describe("Browser", () => {
    });

    // i18n empty message key
-    expect(getByText(/BROWSER\$EMPTY_MESSAGE/i)).toBeInTheDocument();
+    expect(screen.getByText("BROWSER$EMPTY_MESSAGE")).toBeInTheDocument();
  });

  it("renders the url and a screenshot", () => {
-    const { getByText, getByAltText } = renderWithProviders(<Browser />, {
+    renderWithProviders(<Browser />, {
      preloadedState: {
        browser: {
          url: "https://example.com",
@@ -28,7 +29,7 @@ describe("Browser", () => {
      },
    });

-    expect(getByText("https://example.com")).toBeInTheDocument();
-    expect(getByAltText(/browser screenshot/i)).toBeInTheDocument();
+    expect(screen.getByText("https://example.com")).toBeInTheDocument();
+    expect(screen.getByAltText(/browser screenshot/i)).toBeInTheDocument();
  });
 });
--- a/frontend/src/components/chat/Chat.test.tsx
+++ b/frontend/src/components/chat/Chat.test.tsx
@@ -10,14 +10,11 @@ const MESSAGES: Message[] = [
  { sender: "assistant", content: "How can I help you today?" },
 ];

-HTMLElement.prototype.scrollTo = vi.fn(() => {});
-
 describe("Chat", () => {
  it("should render chat messages", () => {
    renderWithProviders(<Chat messages={MESSAGES} />);

    const messages = screen.getAllByTestId("message");
-
    expect(messages).toHaveLength(MESSAGES.length);
  });
 });
--- a/frontend/src/components/chat/ChatInput.test.tsx
+++ b/frontend/src/components/chat/ChatInput.test.tsx
@@ -1,6 +1,6 @@
 import React from "react";
 import userEvent from "@testing-library/user-event";
-import { act, render, fireEvent } from "@testing-library/react";
+import { render, screen } from "@testing-library/react";
 import ChatInput from "./ChatInput";

 describe("ChatInput", () => {
@@ -11,109 +11,104 @@ describe("ChatInput", () => {
  const onSendMessage = vi.fn();

  it("should render a textarea", () => {
-    const { getByRole } = render(<ChatInput onSendMessage={onSendMessage} />);
-    const textarea = getByRole("textbox");
-    expect(textarea).toBeInTheDocument();
+    render(<ChatInput onSendMessage={onSendMessage} />);
+    expect(screen.getByRole("textbox")).toBeInTheDocument();
  });

-  it("should be able to be set as disabled", () => {
-    const { getByRole } = render(
-      <ChatInput disabled onSendMessage={onSendMessage} />,
-    );
-    const textarea = getByRole("textbox");
-    const button = getByRole("button");
+  it("should be able to be set as disabled", async () => {
+    const user = userEvent.setup();
+    render(<ChatInput disabled onSendMessage={onSendMessage} />);
+
+    const textarea = screen.getByRole("textbox");
+    const button = screen.getByRole("button");

    expect(textarea).not.toBeDisabled(); // user can still type
    expect(button).toBeDisabled(); // user cannot submit

-    act(() => {
-      userEvent.type(textarea, "Hello, world!{enter}");
-    });
+    await user.type(textarea, "Hello, world!");
+    await user.keyboard("{Enter}");

    expect(onSendMessage).not.toHaveBeenCalled();
  });

  it("should render with a placeholder", () => {
-    const { getByPlaceholderText } = render(
-      <ChatInput onSendMessage={onSendMessage} />,
+    render(<ChatInput onSendMessage={onSendMessage} />);
+
+    const textarea = screen.getByPlaceholderText(
+      /CHAT_INTERFACE\$INPUT_PLACEHOLDER/i,
    );
-    const textarea = getByPlaceholderText(/CHAT_INTERFACE\$INPUT_PLACEHOLDER/i);
    expect(textarea).toBeInTheDocument();
  });

  it("should render a send button", () => {
-    const { getByRole } = render(<ChatInput onSendMessage={onSendMessage} />);
-    const button = getByRole("button");
-    expect(button).toBeInTheDocument();
+    render(<ChatInput onSendMessage={onSendMessage} />);
+    expect(screen.getByRole("button")).toBeInTheDocument();
  });

  it("should call sendChatMessage with the input when the send button is clicked", async () => {
-    const { getByRole } = render(<ChatInput onSendMessage={onSendMessage} />);
-    const textarea = getByRole("textbox");
-    const button = getByRole("button");
+    const user = userEvent.setup();
+    render(<ChatInput onSendMessage={onSendMessage} />);

-    fireEvent.change(textarea, { target: { value: "Hello, world!" } });
+    const textarea = screen.getByRole("textbox");
+    const button = screen.getByRole("button");

-    await act(async () => {
-      await userEvent.click(button);
-    });
+    await user.type(textarea, "Hello, world!");
+    await user.click(button);

    expect(onSendMessage).toHaveBeenCalledWith("Hello, world!");
-
-    // Additionally, check if the callback is called exactly once
+    // Additionally, check if it was called exactly once
    expect(onSendMessage).toHaveBeenCalledTimes(1);
  });

-  it("should be able to send a message when the enter key is pressed", () => {
-    const { getByRole } = render(<ChatInput onSendMessage={onSendMessage} />);
-    const textarea = getByRole("textbox");
+  it("should be able to send a message when the enter key is pressed", async () => {
+    const user = userEvent.setup();
+    render(<ChatInput onSendMessage={onSendMessage} />);
+    const textarea = screen.getByRole("textbox");

-    fireEvent.change(textarea, { target: { value: "Hello, world!" } });
-    fireEvent.keyDown(textarea, { key: "Enter", code: "Enter", charCode: 13 });
+    await user.type(textarea, "Hello, world!");
+    await user.keyboard("{Enter}");

    expect(onSendMessage).toHaveBeenCalledWith("Hello, world!");
  });

-  it("should NOT send a message when shift + enter is pressed", () => {
-    const { getByRole } = render(<ChatInput onSendMessage={onSendMessage} />);
-    const textarea = getByRole("textbox");
+  it("should NOT send a message when shift + enter is pressed", async () => {
+    const user = userEvent.setup();
+    render(<ChatInput onSendMessage={onSendMessage} />);
+    const textarea = screen.getByRole("textbox");

-    act(() => {
-      userEvent.type(textarea, "Hello, world!{shift}{enter}");
-    });
+    await user.type(textarea, "Hello, world!");
+    await user.keyboard("{Shift>}{Enter}{/Shift}"); // hold Shift, press Enter, release Shift

    expect(onSendMessage).not.toHaveBeenCalled();
  });

-  it("should NOT send an empty message", () => {
-    const { getByRole } = render(<ChatInput onSendMessage={onSendMessage} />);
-    const textarea = getByRole("textbox");
-    const button = getByRole("button");
+  it("should NOT send an empty message", async () => {
+    const user = userEvent.setup();
+    render(<ChatInput onSendMessage={onSendMessage} />);
+    const textarea = screen.getByRole("textbox");
+    const button = screen.getByRole("button");

-    act(() => {
-      userEvent.type(textarea, " {enter}"); // Only whitespace
-    });
+    await user.type(textarea, " ");

+    // with enter key
+    await user.keyboard("{Enter}");
    expect(onSendMessage).not.toHaveBeenCalled();

-    act(() => {
-      userEvent.click(button);
-    });
-
+    // with button click
+    await user.click(button);
    expect(onSendMessage).not.toHaveBeenCalled();
  });

  it("should clear the input message after sending a message", async () => {
-    const { getByRole } = render(<ChatInput onSendMessage={onSendMessage} />);
-    const textarea = getByRole("textbox");
-    const button = getByRole("button");
-
-    fireEvent.change(textarea, { target: { value: "Hello, world!" } });
+    const user = userEvent.setup();
+    render(<ChatInput onSendMessage={onSendMessage} />);
+    const textarea = screen.getByRole("textbox");
+    const button = screen.getByRole("button");

+    await user.type(textarea, "Hello, world!");
    expect(textarea).toHaveValue("Hello, world!");

-    fireEvent.click(button);
-
+    await user.click(button);
    expect(textarea).toHaveValue("");
  });

--- a/frontend/src/components/chat/ChatInterface.test.tsx
+++ b/frontend/src/components/chat/ChatInterface.test.tsx
@@ -1,31 +1,29 @@
 import React from "react";
-import { screen, act, fireEvent } from "@testing-library/react";
+import { screen, act } from "@testing-library/react";
 import { describe, expect, it } from "vitest";
 import userEvent from "@testing-library/user-event";
 import { renderWithProviders } from "test-utils";
-import { useTranslation } from "react-i18next";
 import ChatInterface from "./ChatInterface";
 import Session from "#/services/session";
 import ActionType from "#/types/ActionType";
 import { addAssistantMessage } from "#/state/chatSlice";
 import AgentState from "#/types/AgentState";
-import { I18nKey } from "#/i18n/declaration";
-
-// avoid typing side-effect
-vi.mock("#/hooks/useTyping", () => ({
-  useTyping: vi.fn((text: string) => text),
-}));
-
-const sessionSpy = vi.spyOn(Session, "send");
-vi.spyOn(Session, "isConnected").mockImplementation(() => true);

 // This is for the scrollview ref in Chat.tsx
 // TODO: Move this into test setup
-HTMLElement.prototype.scrollTo = vi.fn(() => {});
+HTMLElement.prototype.scrollTo = vi.fn().mockImplementation(() => {});

 describe("ChatInterface", () => {
+  const sessionSendSpy = vi.spyOn(Session, "send");
+  vi.spyOn(Session, "isConnected").mockReturnValue(true);
+
+  const userMessageEvent = {
+    action: ActionType.MESSAGE,
+    args: { content: "my message" },
+  };
+
  afterEach(() => {
-    sessionSpy.mockClear();
+    sessionSendSpy.mockClear();
  });

  it("should render empty message list and input", () => {
@@ -33,20 +31,6 @@ describe("ChatInterface", () => {
    expect(screen.queryAllByTestId("message")).toHaveLength(0);
  });

-  it("should render the new message the user has typed", () => {
-    renderWithProviders(<ChatInterface />, {
-      preloadedState: {
-        agent: {
-          curAgentState: AgentState.INIT,
-        },
-      },
-    });
-
-    const input = screen.getByRole("textbox");
-    fireEvent.change(input, { target: { value: "my message" } });
-    expect(input).toHaveValue("my message");
-  });
-
  it("should render user and assistant messages", () => {
    const { store } = renderWithProviders(<ChatInterface />, {
      preloadedState: {
@@ -60,6 +44,7 @@ describe("ChatInterface", () => {
    expect(screen.getByText("Hello")).toBeInTheDocument();

    act(() => {
+      // simulate assistant response
      store.dispatch(addAssistantMessage("Hello to you!"));
    });

@@ -67,7 +52,8 @@ describe("ChatInterface", () => {
    expect(screen.getByText("Hello to you!")).toBeInTheDocument();
  });

-  it("should send a start event to the Session", () => {
+  it("should send the user message as an event to the Session when the agent state is INIT", async () => {
+    const user = userEvent.setup();
    renderWithProviders(<ChatInterface />, {
      preloadedState: {
        agent: {
@@ -77,17 +63,16 @@ describe("ChatInterface", () => {
    });

    const input = screen.getByRole("textbox");
-    fireEvent.change(input, { target: { value: "my message" } });
-    fireEvent.keyDown(input, { key: "Enter", code: "Enter", charCode: 13 });
+    await user.type(input, "my message");
+    await user.keyboard("{Enter}");

-    const event = {
-      action: ActionType.MESSAGE,
-      args: { content: "my message" },
-    };
-    expect(sessionSpy).toHaveBeenCalledWith(JSON.stringify(event));
+    expect(sessionSendSpy).toHaveBeenCalledWith(
+      JSON.stringify(userMessageEvent),
+    );
  });

-  it("should send a user message event to the Session", async () => {
+  it("should send the user message as an event to the Session when the agent state is AWAITING_USER_INPUT", async () => {
+    const user = userEvent.setup();
    renderWithProviders(<ChatInterface />, {
      preloadedState: {
        agent: {
@@ -97,16 +82,16 @@ describe("ChatInterface", () => {
    });

    const input = screen.getByRole("textbox");
-    await userEvent.type(input, "my message{enter}");
+    await user.type(input, "my message");
+    await user.keyboard("{Enter}");

-    const event = {
-      action: ActionType.MESSAGE,
-      args: { content: "my message" },
-    };
-    expect(sessionSpy).toHaveBeenCalledWith(JSON.stringify(event));
+    expect(sessionSendSpy).toHaveBeenCalledWith(
+      JSON.stringify(userMessageEvent),
+    );
  });

-  it("should disable the user input if agent is not initialized", () => {
+  it("should disable the user input if agent is not initialized", async () => {
+    const user = userEvent.setup();
    renderWithProviders(<ChatInterface />, {
      preloadedState: {
        agent: {
@@ -115,12 +100,16 @@ describe("ChatInterface", () => {
      },
    });

-    const { t } = useTranslation();
-
+    const input = screen.getByRole("textbox");
+    await user.type(input, "my message");
+    await user.keyboard("{Enter}");
    const submitButton = screen.getByLabelText(
-      t(I18nKey.CHAT_INTERFACE$TOOLTIP_SEND_MESSAGE),
+      "CHAT_INTERFACE$TOOLTIP_SEND_MESSAGE",
    );

    expect(submitButton).toBeDisabled();
+    expect(sessionSendSpy).not.toHaveBeenCalled();
  });
+
+  it.todo("test scroll-related behaviour");
 });
--- a/frontend/src/components/chat/ChatInterface.tsx
+++ b/frontend/src/components/chat/ChatInterface.tsx
@@ -1,4 +1,3 @@
-// frontend/src/components/chat/ChatInterface.tsx
 import React, { useRef } from "react";
 import { useDispatch, useSelector } from "react-redux";
 import { IoMdChatbubbles } from "react-icons/io";
@@ -16,11 +15,7 @@ import { sendChatMessage } from "#/services/chatService";
 import { addUserMessage, addAssistantMessage } from "#/state/chatSlice";
 import { I18nKey } from "#/i18n/declaration";
 import { useScrollToBottom } from "#/hooks/useScrollToBottom";
-import { Feedback } from "#/services/feedbackService";
 import FeedbackModal from "../modals/feedback/FeedbackModal";
-import { removeApiKey } from "#/utils/utils";
-import Session from "#/services/session";
-import { getToken } from "#/services/auth";

 interface ScrollButtonProps {
  onClick: () => void;
@@ -55,15 +50,9 @@ function ChatInterface() {
  const { messages } = useSelector((state: RootState) => state.chat);
  const { curAgentState } = useSelector((state: RootState) => state.agent);

-  const feedbackVersion = "1.0";
-  const [feedback, setFeedback] = React.useState<Feedback>({
-    email: "",
-    feedback: "positive",
-    permissions: "private",
-    trajectory: [],
-    token: "",
-    version: feedbackVersion,
-  });
+  const [feedbackPolarity, setFeedbackPolarity] = React.useState<
+    "positive" | "negative"
+  >("positive");
  const [feedbackShared, setFeedbackShared] = React.useState(0);

  const {
@@ -73,13 +62,8 @@ function ChatInterface() {
  } = useDisclosure();

  const shareFeedback = async (polarity: "positive" | "negative") => {
-    setFeedback((prev) => ({
-      ...prev,
-      feedback: polarity,
-      trajectory: removeApiKey(Session._history),
-      token: getToken(),
-    }));
    onFeedbackModalOpen();
+    setFeedbackPolarity(polarity);
  };

  const handleSendMessage = (content: string) => {
@@ -87,14 +71,6 @@ function ChatInterface() {
    sendChatMessage(content);
  };

-  const handleEmailChange = (key: string) => {
-    setFeedback({ ...feedback, email: key } as Feedback);
-  };
-
-  const handlePermissionsChange = (permissions: "public" | "private") => {
-    setFeedback({ ...feedback, permissions } as Feedback);
-  };
-
  const { t } = useTranslation();
  const handleSendContinueMsg = () => {
    handleSendMessage(t(I18nKey.CHAT_INTERFACE$INPUT_CONTINUE_MESSAGE));
@@ -176,9 +152,7 @@ function ChatInterface() {
        onSendMessage={handleSendMessage}
      />
      <FeedbackModal
-        feedback={feedback}
-        handleEmailChange={handleEmailChange}
-        handlePermissionsChange={handlePermissionsChange}
+        polarity={feedbackPolarity}
        isOpen={feedbackModalIsOpen}
        onOpenChange={onFeedbackModalOpenChange}
        onSendFeedback={() => setFeedbackShared(messages.length)}
--- a/frontend/src/components/chat/ChatMessage.test.tsx
+++ b/frontend/src/components/chat/ChatMessage.test.tsx
@@ -1,12 +1,9 @@
-import { render, screen } from "@testing-library/react";
-import { describe, it, expect } from "vitest";
+import { fireEvent, render, screen, within } from "@testing-library/react";
+import { describe, it, expect, vi } from "vitest";
 import React from "react";
+import userEvent from "@testing-library/user-event";
 import ChatMessage from "./ChatMessage";
-
-// avoid typing side-effect
-vi.mock("#/hooks/useTyping", () => ({
-  useTyping: vi.fn((text: string) => text),
-}));
+import toast from "#/utils/toast";

 describe("Message", () => {
  it("should render a user message", () => {
@@ -49,4 +46,114 @@ describe("Message", () => {
    expect(screen.getByText("log")).toBeInTheDocument();
    expect(screen.getByText("'Hello'")).toBeInTheDocument();
  });
+
+  describe("copy to clipboard", () => {
+    const toastInfoSpy = vi.spyOn(toast, "info");
+    const toastErrorSpy = vi.spyOn(toast, "error");
+
+    it("should copy any message to clipboard", async () => {
+      const user = userEvent.setup();
+      render(
+        <ChatMessage
+          message={{ sender: "user", content: "Hello" }}
+          isLastMessage={false}
+        />,
+      );
+
+      const message = screen.getByTestId("message");
+      let copyButton = within(message).queryByTestId("copy-button");
+      expect(copyButton).not.toBeInTheDocument();
+
+      // `fireEvent` on purpose: `userEvent.hover()` seems to interfere with the later click
+      fireEvent.mouseEnter(message);
+
+      copyButton = within(message).getByTestId("copy-button");
+      await user.click(copyButton);
+
+      // `.resolves` is asynchronous; without `await` the test may end before it is checked
+      await expect(navigator.clipboard.readText()).resolves.toBe("Hello");
+      expect(toastInfoSpy).toHaveBeenCalled();
+    });
+
+    it("should show an error message when the message cannot be copied", async () => {
+      const user = userEvent.setup();
+      render(
+        <ChatMessage
+          message={{ sender: "user", content: "Hello" }}
+          isLastMessage={false}
+        />,
+      );
+
+      const message = screen.getByTestId("message");
+      fireEvent.mouseEnter(message);
+
+      const copyButton = within(message).getByTestId("copy-button");
+      const clipboardSpy = vi
+        .spyOn(navigator.clipboard, "writeText")
+        .mockRejectedValue(new Error("Failed to copy"));
+
+      await user.click(copyButton);
+
+      expect(clipboardSpy).toHaveBeenCalled();
+      expect(toastErrorSpy).toHaveBeenCalled();
+    });
+  });
+
+  describe("confirmation buttons", () => {
+    const expectButtonsNotToBeRendered = () => {
+      expect(
+        screen.queryByTestId("action-confirm-button"),
+      ).not.toBeInTheDocument();
+      expect(
+        screen.queryByTestId("action-reject-button"),
+      ).not.toBeInTheDocument();
+    };
+
+    it("should display confirmation buttons for the last assistant message", () => {
+      // it should not render buttons if the message is not the last one
+      const { rerender } = render(
+        <ChatMessage
+          message={{ sender: "assistant", content: "Are you sure?" }}
+          isLastMessage={false}
+          awaitingUserConfirmation
+        />,
+      );
+      expectButtonsNotToBeRendered();
+
+      // it should not render buttons if the message is not from the assistant
+      rerender(
+        <ChatMessage
+          message={{ sender: "user", content: "Yes" }}
+          isLastMessage
+          awaitingUserConfirmation
+        />,
+      );
+      expectButtonsNotToBeRendered();
+
+      // it should not render buttons if the message is not awaiting user confirmation
+      rerender(
+        <ChatMessage
+          message={{ sender: "assistant", content: "Are you sure?" }}
+          isLastMessage
+          awaitingUserConfirmation={false}
+        />,
+      );
+      expectButtonsNotToBeRendered();
+
+      // it should render buttons if all conditions are met
+      rerender(
+        <ChatMessage
+          message={{ sender: "assistant", content: "Are you sure?" }}
+          isLastMessage
+          awaitingUserConfirmation
+        />,
+      );
+
+      const confirmButton = screen.getByTestId("action-confirm-button");
+      const rejectButton = screen.getByTestId("action-reject-button");
+
+      expect(confirmButton).toBeInTheDocument();
+      expect(rejectButton).toBeInTheDocument();
+    });
+  });
 });
--- a/frontend/src/components/chat/ChatMessage.tsx
+++ b/frontend/src/components/chat/ChatMessage.tsx
@@ -3,14 +3,10 @@ import Markdown from "react-markdown";
 import { FaClipboard, FaClipboardCheck } from "react-icons/fa";
 import { twMerge } from "tailwind-merge";
 import { useTranslation } from "react-i18next";
-import { Tooltip } from "@nextui-org/react";
-import AgentState from "#/types/AgentState";
 import { code } from "../markdown/code";
 import toast from "#/utils/toast";
 import { I18nKey } from "#/i18n/declaration";
-import ConfirmIcon from "#/assets/confirm";
-import RejectIcon from "#/assets/reject";
-import { changeAgentState } from "#/services/agentStateService";
+import ConfirmationButtons from "./ConfirmationButtons";

 interface MessageProps {
  message: Message;
@@ -23,32 +19,43 @@ function ChatMessage({
  isLastMessage,
  awaitingUserConfirmation,
 }: MessageProps) {
+  const { t } = useTranslation();
+
  const [isCopy, setIsCopy] = useState(false);
  const [isHovering, setIsHovering] = useState(false);

+  // Flip `isCopy` back to false 1.5 s after it becomes true; the cleanup
+  // cancels a timer that is still pending when the effect is torn down.
+  React.useEffect(() => {
+    if (!isCopy) {
+      return undefined; // no timer was scheduled, nothing to clean up
+    }
+    const resetTimer = setTimeout(() => {
+      setIsCopy(false);
+    }, 1500);
+    return () => {
+      clearTimeout(resetTimer);
+    };
+  }, [isCopy]);
+
  const className = twMerge(
    "markdown-body",
    "p-3 text-white max-w-[90%] overflow-y-auto rounded-lg relative",
    message.sender === "user" ? "bg-neutral-700 self-end" : "bg-neutral-500",
  );

-  const { t } = useTranslation();
-  const copyToClipboard = () => {
-    navigator.clipboard
-      .writeText(message.content)
-      .then(() => {
-        setIsCopy(true);
-        setTimeout(() => {
-          setIsCopy(false);
-        }, 1500);
-        toast.info(t(I18nKey.CHAT_INTERFACE$CHAT_MESSAGE_COPIED));
-      })
-      .catch(() => {
-        toast.error(
-          "copy-error",
-          t(I18nKey.CHAT_INTERFACE$CHAT_MESSAGE_COPY_FAILED),
-        );
-      });
+  // Write the message text to the clipboard, then report success or failure via toast.
+  const copyToClipboard = async () => {
+    try {
+      await navigator.clipboard.writeText(message.content);
+      setIsCopy(true);
+      toast.info(t(I18nKey.CHAT_INTERFACE$CHAT_MESSAGE_COPIED));
+    } catch {
+      toast.error(
+        "copy-error",
+        t(I18nKey.CHAT_INTERFACE$CHAT_MESSAGE_COPY_FAILED),
+      );
+    }
+  };

  return (
@@ -60,6 +67,7 @@ function ChatMessage({
    >
      {isHovering && (
        <button
+          data-testid="copy-button"
          onClick={copyToClipboard}
          className="absolute top-1 right-1 p-1 bg-neutral-600 rounded hover:bg-neutral-700"
          aria-label={t(I18nKey.CHAT_INTERFACE$TOOLTIP_COPY_MESSAGE)}
@@ -71,43 +79,7 @@ function ChatMessage({
      <Markdown components={{ code }}>{message.content}</Markdown>
      {isLastMessage &&
        message.sender === "assistant" &&
-        awaitingUserConfirmation && (
-          <div className="flex justify-between items-center pt-4">
-            <p>{t(I18nKey.CHAT_INTERFACE$USER_ASK_CONFIRMATION)}</p>
-            <div className="flex items-center gap-3">
-              <Tooltip
-                content={t(I18nKey.CHAT_INTERFACE$USER_CONFIRMED)}
-                closeDelay={100}
-              >
-                <button
-                  type="button"
-                  aria-label="Confirm action"
-                  className="bg-neutral-700 rounded-full p-1 hover:bg-neutral-800"
-                  onClick={() => {
-                    changeAgentState(AgentState.USER_CONFIRMED);
-                  }}
-                >
-                  <ConfirmIcon />
-                </button>
-              </Tooltip>
-              <Tooltip
-                content={t(I18nKey.CHAT_INTERFACE$USER_REJECTED)}
-                closeDelay={100}
-              >
-                <button
-                  type="button"
-                  aria-label="Reject action"
-                  className="bg-neutral-700 rounded-full p-1 hover:bg-neutral-800"
-                  onClick={() => {
-                    changeAgentState(AgentState.USER_REJECTED);
-                  }}
-                >
-                  <RejectIcon />
-                </button>
-              </Tooltip>
-            </div>
-          </div>
-        )}
+        awaitingUserConfirmation && <ConfirmationButtons />}
    </div>
  );
 }
--- a/frontend/src/components/chat/ConfirmationButtons.test.tsx
+++ b/frontend/src/components/chat/ConfirmationButtons.test.tsx
@@ -0,0 +1,29 @@
+import { describe, expect, it, vi } from "vitest";
+import { userEvent } from "@testing-library/user-event";
+import React from "react";
+import { render, screen } from "@testing-library/react";
+import ConfirmationButtons from "./ConfirmationButtons";
+import AgentState from "#/types/AgentState";
+import { changeAgentState } from "#/services/agentStateService";
+
+// `vi.mock` calls are hoisted to the top of the module, so the mock is
+// declared at module scope where it actually takes effect.
+vi.mock("#/services/agentStateService", () => ({
+  changeAgentState: vi.fn(),
+}));
+
+describe("ConfirmationButtons", () => {
+  it("should change agent state appropriately on button click", async () => {
+    const user = userEvent.setup();
+    render(<ConfirmationButtons />);
+
+    const confirmButton = screen.getByTestId("action-confirm-button");
+    const rejectButton = screen.getByTestId("action-reject-button");
+
+    await user.click(confirmButton);
+    expect(changeAgentState).toHaveBeenCalledWith(AgentState.USER_CONFIRMED);
+
+    await user.click(rejectButton);
+    expect(changeAgentState).toHaveBeenCalledWith(AgentState.USER_REJECTED);
+  });
+});
--- a/frontend/src/components/chat/ConfirmationButtons.tsx
+++ b/frontend/src/components/chat/ConfirmationButtons.tsx
@@ -0,0 +1,61 @@
+import { Tooltip } from "@nextui-org/react";
+import { useTranslation } from "react-i18next";
+import React from "react";
+import ConfirmIcon from "#/assets/confirm";
+import RejectIcon from "#/assets/reject";
+import { I18nKey } from "#/i18n/declaration";
+import AgentState from "#/types/AgentState";
+import { changeAgentState } from "#/services/agentStateService";
+
+/** Props for a single confirm/reject icon button. */
+interface ActionTooltipProps {
+  /** Which action the button represents; selects tooltip, label, icon and test id. */
+  type: "confirm" | "reject";
+  /** Invoked when the button is clicked. */
+  onClick: () => void;
+}
+
+/** Icon button wrapped in a tooltip, rendered for either the confirm or the reject action. */
+function ActionTooltip({ type, onClick }: ActionTooltipProps) {
+  const { t } = useTranslation();
+  const isConfirm = type === "confirm";
+
+  const tooltipText = isConfirm
+    ? t(I18nKey.CHAT_INTERFACE$USER_CONFIRMED)
+    : t(I18nKey.CHAT_INTERFACE$USER_REJECTED);
+  const ariaLabel = isConfirm ? "Confirm action" : "Reject action";
+  const icon = isConfirm ? <ConfirmIcon /> : <RejectIcon />;
+
+  return (
+    <Tooltip content={tooltipText} closeDelay={100}>
+      <button
+        data-testid={`action-${type}-button`}
+        type="button"
+        aria-label={ariaLabel}
+        className="bg-neutral-700 rounded-full p-1 hover:bg-neutral-800"
+        onClick={onClick}
+      >
+        {icon}
+      </button>
+    </Tooltip>
+  );
+}
+
+/** Confirmation prompt plus confirm/reject buttons that call `changeAgentState` with USER_CONFIRMED / USER_REJECTED. */
+function ConfirmationButtons() {
+  const { t } = useTranslation();
+  const confirmAction = () => changeAgentState(AgentState.USER_CONFIRMED);
+  const rejectAction = () => changeAgentState(AgentState.USER_REJECTED);
+
+  return (
+    <div className="flex justify-between items-center pt-4">
+      <p>{t(I18nKey.CHAT_INTERFACE$USER_ASK_CONFIRMATION)}</p>
+      <div className="flex items-center gap-3">
+        <ActionTooltip type="confirm" onClick={confirmAction} />
+        <ActionTooltip type="reject" onClick={rejectAction} />
+      </div>
+    </div>
+  );
+}
+
+export default ConfirmationButtons;
--- a/frontend/src/components/file-explorer/ExplorerTree.test.tsx
+++ b/frontend/src/components/file-explorer/ExplorerTree.test.tsx
@@ -1,4 +1,5 @@
 import React from "react";
+import { screen } from "@testing-library/react";
 import { renderWithProviders } from "test-utils";
 import ExplorerTree from "./ExplorerTree";

@@ -10,20 +11,18 @@ describe("ExplorerTree", () => {
  });

  it("should render the explorer", () => {
-    const { getByText } = renderWithProviders(
-      <ExplorerTree files={FILES} defaultOpen />,
-    );
+    renderWithProviders(<ExplorerTree files={FILES} defaultOpen />);

-    expect(getByText("file-1-1.ts")).toBeInTheDocument();
-    expect(getByText("folder-1-2")).toBeInTheDocument();
+    expect(screen.getByText("file-1-1.ts")).toBeInTheDocument();
+    expect(screen.getByText("folder-1-2")).toBeInTheDocument();
    // TODO: make sure children render
  });

  it("should render the explorer given the defaultExpanded prop", () => {
-    const { queryByText } = renderWithProviders(<ExplorerTree files={FILES} />);
+    renderWithProviders(<ExplorerTree files={FILES} />);

-    expect(queryByText("file-1-1.ts")).toBeInTheDocument();
-    expect(queryByText("folder-1-2")).toBeInTheDocument();
+    expect(screen.queryByText("file-1-1.ts")).toBeInTheDocument();
+    expect(screen.queryByText("folder-1-2")).toBeInTheDocument();
    // TODO: make sure children don't render
  });

--- a/frontend/src/components/file-explorer/FileExplorer.test.tsx
+++ b/frontend/src/components/file-explorer/FileExplorer.test.tsx
@@ -1,5 +1,5 @@
 import React from "react";
-import { waitFor, act } from "@testing-library/react";
+import { screen } from "@testing-library/react";
 import userEvent from "@testing-library/user-event";
 import { renderWithProviders } from "test-utils";
 import { describe, it, expect, vi, Mock } from "vitest";
@@ -24,112 +24,79 @@ vi.mock("../../services/fileService", async () => ({
  uploadFiles: vi.fn(),
 }));

+const renderFileExplorerWithRunningAgentState = () =>
+  renderWithProviders(<FileExplorer />, {
+    preloadedState: {
+      agent: {
+        curAgentState: AgentState.RUNNING,
+      },
+    },
+  });
+
 describe("FileExplorer", () => {
  afterEach(() => {
    vi.clearAllMocks();
  });

  it("should get the workspace directory", async () => {
-    const { getByText } = renderWithProviders(<FileExplorer />, {
-      preloadedState: {
-        agent: {
-          curAgentState: AgentState.RUNNING,
-        },
-      },
-    });
+    renderFileExplorerWithRunningAgentState();

-    await waitFor(() => {
-      expect(getByText("folder1")).toBeInTheDocument();
-      expect(getByText("file1.ts")).toBeInTheDocument();
-    });
+    expect(await screen.findByText("folder1")).toBeInTheDocument();
+    expect(await screen.findByText("file1.ts")).toBeInTheDocument();
    expect(listFiles).toHaveBeenCalledTimes(1); // once for root
  });

  it.todo("should render an empty workspace");

  it("should refetch the workspace when clicking the refresh button", async () => {
-    const { getByText, getByTestId } = renderWithProviders(<FileExplorer />, {
-      preloadedState: {
-        agent: {
-          curAgentState: AgentState.RUNNING,
-        },
-      },
-    });
-    await waitFor(() => {
-      expect(getByText("folder1")).toBeInTheDocument();
-      expect(getByText("file1.ts")).toBeInTheDocument();
-    });
+    const user = userEvent.setup();
+    renderFileExplorerWithRunningAgentState();
+
+    expect(await screen.findByText("folder1")).toBeInTheDocument();
+    expect(await screen.findByText("file1.ts")).toBeInTheDocument();
    expect(listFiles).toHaveBeenCalledTimes(1); // once for root

-    await act(async () => {
-      await userEvent.click(getByTestId("refresh"));
-    });
+    const refreshButton = screen.getByTestId("refresh");
+    await user.click(refreshButton);

-    await waitFor(() => {
-      expect(listFiles).toHaveBeenCalledTimes(2); // once for root, once for refresh button
-    });
+    expect(listFiles).toHaveBeenCalledTimes(2); // once for root, once for refresh button
  });

-  it("should toggle the explorer visibility when clicking the close button", async () => {
-    const { getByTestId, getByText, queryByText } = renderWithProviders(
-      <FileExplorer />,
-      {
-        preloadedState: {
-          agent: {
-            curAgentState: AgentState.RUNNING,
-          },
-        },
-      },
-    );
+  it("should toggle the explorer visibility when clicking the toggle button", async () => {
+    const user = userEvent.setup();
+    renderFileExplorerWithRunningAgentState();

-    await waitFor(() => {
-      expect(getByText("folder1")).toBeInTheDocument();
-    });
+    const folder1 = await screen.findByText("folder1");
+    expect(folder1).toBeInTheDocument();

-    await act(async () => {
-      await userEvent.click(getByTestId("toggle"));
-    });
+    const toggleButton = screen.getByTestId("toggle");
+    await user.click(toggleButton);

-    expect(queryByText("folder1")).toBeInTheDocument();
-    expect(queryByText("folder1")).not.toBeVisible();
+    expect(folder1).toBeInTheDocument();
+    expect(folder1).not.toBeVisible();
  });

  it("should upload files", async () => {
-    // TODO: Improve this test by passing expected argument to `uploadFiles`
-    const { findByTestId } = renderWithProviders(<FileExplorer />, {
-      preloadedState: {
-        agent: {
-          curAgentState: AgentState.RUNNING,
-        },
-      },
-    });
+    const user = userEvent.setup();
+    renderFileExplorerWithRunningAgentState();

    const file = new File([""], "file-name");
-    const file2 = new File([""], "file-name-2");
-
-    const uploadFileInput = await findByTestId("file-input");
-
-    await act(async () => {
-      await userEvent.upload(uploadFileInput, file);
-    });
+    const uploadFileInput = await screen.findByTestId("file-input");
+    await user.upload(uploadFileInput, file);

+    // TODO: Improve this test by passing expected argument to `uploadFiles`
    expect(uploadFiles).toHaveBeenCalledOnce();
    expect(listFiles).toHaveBeenCalled();

-    const uploadDirInput = await findByTestId("file-input");
+    const file2 = new File([""], "file-name-2");
+    const uploadDirInput = await screen.findByTestId("file-input");
+    await user.upload(uploadDirInput, [file, file2]);

-    // The 'await' keyword is required here to avoid a warning during test runs
-    await act(async () => {
-      await userEvent.upload(uploadDirInput, [file, file2]);
-    });
-
-    await waitFor(() => {
-      expect(uploadFiles).toHaveBeenCalledTimes(2);
-      expect(listFiles).toHaveBeenCalled();
-    });
+    expect(uploadFiles).toHaveBeenCalledTimes(2);
+    expect(listFiles).toHaveBeenCalled();
  });

-  it.skip("should upload files when dragging them to the explorer", () => {
+  it.todo("should upload files when dragging them to the explorer", () => {
    // It will require too much work to mock drag logic, especially for our case
    // https://github.com/testing-library/user-event/issues/440#issuecomment-685010755
    // TODO: should be tested in an e2e environment such as Cypress/Playwright
@@ -137,20 +104,20 @@ describe("FileExplorer", () => {

  it.todo("should download a file");

-  it.todo("should display an error toast if file upload fails", async () => {
+  it("should display an error toast if file upload fails", async () => {
    (uploadFiles as Mock).mockRejectedValue(new Error());
+    const user = userEvent.setup();
+    renderFileExplorerWithRunningAgentState();

-    const { getByTestId } = renderWithProviders(<FileExplorer />);
-
-    const uploadFileInput = getByTestId("file-input");
+    const uploadFileInput = await screen.findByTestId("file-input");
    const file = new File([""], "test");

-    await act(async () => {
-      await userEvent.upload(uploadFileInput, file);
-    });
+    await user.upload(uploadFileInput, file);

    expect(uploadFiles).rejects.toThrow();
-    // TODO: figure out why spy isn't called to pass test
-    expect(toastSpy).toHaveBeenCalledWith("ws", "Error uploading file");
+    expect(toastSpy).toHaveBeenCalledWith(
+      expect.stringContaining("upload-error"),
+      expect.any(String),
+    );
  });
 });
--- a/frontend/src/components/file-explorer/TreeNode.test.tsx
+++ b/frontend/src/components/file-explorer/TreeNode.test.tsx
@@ -1,5 +1,5 @@
 import React from "react";
-import { waitFor, act } from "@testing-library/react";
+import { screen } from "@testing-library/react";
 import userEvent from "@testing-library/user-event";
 import { renderWithProviders } from "test-utils";
 import TreeNode from "./TreeNode";
@@ -25,106 +25,93 @@ describe("TreeNode", () => {
  });

  it("should render a file if property has no children", () => {
-    const { getByText } = renderWithProviders(
-      <TreeNode path="/file.ts" defaultOpen />,
-    );
-
-    expect(getByText("file.ts")).toBeInTheDocument();
+    renderWithProviders(<TreeNode path="/file.ts" defaultOpen />);
+    expect(screen.getByText("file.ts")).toBeInTheDocument();
  });

  it("should render a folder if it's in a subdir", async () => {
-    const { findByText } = renderWithProviders(
-      <TreeNode path="/folder1/" defaultOpen />,
-    );
+    renderWithProviders(<TreeNode path="/folder1/" defaultOpen />);
    expect(listFiles).toHaveBeenCalledWith("/folder1/");

-    expect(await findByText("folder1")).toBeInTheDocument();
-    expect(await findByText("file2.ts")).toBeInTheDocument();
+    expect(await screen.findByText("folder1")).toBeInTheDocument();
+    expect(await screen.findByText("file2.ts")).toBeInTheDocument();
  });

  it("should close a folder when clicking on it", async () => {
-    const { findByText, queryByText } = renderWithProviders(
-      <TreeNode path="/folder1/" defaultOpen />,
-    );
+    const user = userEvent.setup();
+    renderWithProviders(<TreeNode path="/folder1/" defaultOpen />);

-    expect(await findByText("folder1")).toBeInTheDocument();
-    expect(await findByText("file2.ts")).toBeInTheDocument();
+    const folder1 = await screen.findByText("folder1");
+    const file2 = await screen.findByText("file2.ts");

-    await act(async () => {
-      await userEvent.click(await findByText("folder1"));
-    });
+    expect(folder1).toBeInTheDocument();
+    expect(file2).toBeInTheDocument();

-    expect(await findByText("folder1")).toBeInTheDocument();
-    expect(queryByText("file2.ts")).not.toBeInTheDocument();
+    await user.click(folder1);
+
+    expect(folder1).toBeInTheDocument();
+    expect(screen.queryByText("file2.ts")).not.toBeInTheDocument();
  });

  it("should open a folder when clicking on it", async () => {
-    const { getByText, findByText, queryByText } = renderWithProviders(
-      <TreeNode path="/folder1/" />,
-    );
+    const user = userEvent.setup();
+    renderWithProviders(<TreeNode path="/folder1/" />);

-    expect(await findByText("folder1")).toBeInTheDocument();
-    expect(queryByText("file2.ts")).not.toBeInTheDocument();
+    const folder1 = await screen.findByText("folder1");

-    await act(async () => {
-      await userEvent.click(getByText("folder1"));
-    });
+    expect(folder1).toBeInTheDocument();
+    expect(screen.queryByText("file2.ts")).not.toBeInTheDocument();
+
+    await user.click(folder1);
    expect(listFiles).toHaveBeenCalledWith("/folder1/");

-    expect(await findByText("folder1")).toBeInTheDocument();
-    expect(await findByText("file2.ts")).toBeInTheDocument();
+    expect(folder1).toBeInTheDocument();
+    expect(await screen.findByText("file2.ts")).toBeInTheDocument();
  });

-  it("should call a fn and return the full path of a file when clicking on it", async () => {
-    const { getByText } = renderWithProviders(
-      <TreeNode path="/folder1/file2.ts" defaultOpen />,
-    );
+  it("should call `selectFile` and return the full path of a file when clicking on a file", async () => {
+    const user = userEvent.setup();
+    renderWithProviders(<TreeNode path="/folder1/file2.ts" defaultOpen />);

-    await act(async () => {
-      await userEvent.click(getByText("file2.ts"));
-    });
+    const file2 = screen.getByText("file2.ts");
+    await user.click(file2);

-    await waitFor(() => {
-      expect(selectFile).toHaveBeenCalledWith("/folder1/file2.ts");
-    });
+    expect(selectFile).toHaveBeenCalledWith("/folder1/file2.ts");
  });

-  it("should render the explorer given the defaultOpen prop", async () => {
-    const { getByText, findByText, queryByText } = renderWithProviders(
-      <TreeNode path="/" defaultOpen />,
-    );
+  it("should render the full explorer given the defaultOpen prop", async () => {
+    const user = userEvent.setup();
+    renderWithProviders(<TreeNode path="/" defaultOpen />);

    expect(listFiles).toHaveBeenCalledWith("/");

-    expect(await findByText("file1.ts")).toBeInTheDocument();
-    expect(await findByText("folder1")).toBeInTheDocument();
-    expect(queryByText("file2.ts")).not.toBeInTheDocument();
+    const file1 = await screen.findByText("file1.ts");
+    const folder1 = await screen.findByText("folder1");

-    await act(async () => {
-      await userEvent.click(getByText("folder1"));
-    });
+    expect(file1).toBeInTheDocument();
+    expect(folder1).toBeInTheDocument();
+    expect(screen.queryByText("file2.ts")).not.toBeInTheDocument();

+    await user.click(folder1);
    expect(listFiles).toHaveBeenCalledWith("folder1/");

-    expect(await findByText("file1.ts")).toBeInTheDocument();
-    expect(await findByText("folder1")).toBeInTheDocument();
-    expect(await findByText("file2.ts")).toBeInTheDocument();
+    expect(file1).toBeInTheDocument();
+    expect(folder1).toBeInTheDocument();
+    expect(await screen.findByText("file2.ts")).toBeInTheDocument();
  });

  it("should render all children as collapsed when defaultOpen is false", async () => {
-    const { findByText, getByText, queryByText } = renderWithProviders(
-      <TreeNode path="/folder1/" />,
-    );
+    renderWithProviders(<TreeNode path="/folder1/" defaultOpen={false} />);

-    expect(await findByText("folder1")).toBeInTheDocument();
-    expect(queryByText("file2.ts")).not.toBeInTheDocument();
+    const folder1 = await screen.findByText("folder1");

-    await act(async () => {
-      await userEvent.click(getByText("folder1"));
-    });
+    expect(folder1).toBeInTheDocument();
+    expect(screen.queryByText("file2.ts")).not.toBeInTheDocument();
+
+    await userEvent.click(folder1);
    expect(listFiles).toHaveBeenCalledWith("/folder1/");

-    expect(await findByText("folder1")).toBeInTheDocument();
-    expect(await findByText("file2.ts")).toBeInTheDocument();
+    expect(folder1).toBeInTheDocument();
+    expect(await screen.findByText("file2.ts")).toBeInTheDocument();
  });
 });
--- a/frontend/src/components/modals/base-modal/BaseModal.tsx
+++ b/frontend/src/components/modals/base-modal/BaseModal.tsx
@@ -17,6 +17,7 @@ interface BaseModalProps {
  subtitle?: string;
  actions?: Action[];
  children?: React.ReactNode;
+  testID?: string;
 }

 function BaseModal({
@@ -27,9 +28,11 @@ function BaseModal({
  subtitle = undefined,
  actions = [],
  children = null,
+  testID,
 }: BaseModalProps) {
  return (
    <Modal
+      data-testid={testID}
      isOpen={isOpen}
      onOpenChange={onOpenChange}
      title={title}
--- a/frontend/src/components/modals/feedback/FeedbackForm.tsx
+++ b/frontend/src/components/modals/feedback/FeedbackForm.tsx
@@ -1,63 +0,0 @@
-import { Input, Select, SelectItem } from "@nextui-org/react";
-import React from "react";
-import { useTranslation } from "react-i18next";
-import { I18nKey } from "../../../i18n/declaration";
-import { Feedback } from "#/services/feedbackService";
-
-interface FeedbackFormProps {
-  feedback: Feedback;
-
-  onEmailChange: (email: string) => void;
-  onPermissionsChange: (permissions: "public" | "private") => void;
-}
-
-function FeedbackForm({
-  feedback,
-  onEmailChange,
-  onPermissionsChange,
-}: FeedbackFormProps) {
-  const { t } = useTranslation();
-
-  const isEmailValid = (email: string) => {
-    // Regular expression to validate email format
-    const emailRegex = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;
-    return emailRegex.test(email);
-  };
-
-  return (
-    <>
-      <Input
-        label="Email"
-        aria-label="email"
-        data-testid="email"
-        placeholder={t(I18nKey.FEEDBACK$EMAIL_PLACEHOLDER)}
-        type="text"
-        value={feedback.email || ""}
-        onChange={(e) => {
-          onEmailChange(e.target.value);
-        }}
-      />
-      <Select
-        label="Sharing settings"
-        aria-label="permissions"
-        data-testid="permissions"
-        value={feedback.permissions}
-        onChange={(e) => {
-          onPermissionsChange(e.target.value as "public" | "private");
-        }}
-      >
-        <SelectItem key="public" value="public">
-          Public
-        </SelectItem>
-        <SelectItem key="private" value="private">
-          Private
-        </SelectItem>
-      </Select>
-      {isEmailValid(feedback.email) ? null : (
-        <p className="text-red-500">Invalid email format</p>
-      )}
-    </>
-  );
-}
-
-export default FeedbackForm;
--- a/frontend/src/components/modals/feedback/FeedbackModal.test.tsx
+++ b/frontend/src/components/modals/feedback/FeedbackModal.test.tsx
@@ -0,0 +1,194 @@
+import { render, screen, within } from "@testing-library/react";
+import { Mock, describe } from "vitest";
+import React from "react";
+import userEvent from "@testing-library/user-event";
+import toast from "react-hot-toast";
+import FeedbackModal from "./FeedbackModal";
+import { sendFeedback } from "#/services/feedbackService";
+
+describe("FeedbackModal", () => {
+  Storage.prototype.setItem = vi.fn();
+  Storage.prototype.getItem = vi.fn();
+
+  vi.mock("#/services/feedbackService", () => ({
+    sendFeedback: vi.fn(),
+  }));
+
+  vi.mock("#/services/auth", () => ({
+    getToken: vi.fn().mockReturnValue("some-token"),
+  }));
+  // Mock the Session singleton; its history includes an LLM API key that must be stripped before the feedback is sent
+  vi.mock("#/services/session", () => ({
+    default: {
+      _history: [
+        { args: { LLM_API_KEY: "DANGER-key-should-not-be-here" } },
+        { content: "Hello" },
+      ],
+    },
+  }));
+
+  afterEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it("should render the feedback modal when open", () => {
+    const { rerender } = render(
+      <FeedbackModal
+        polarity="positive"
+        isOpen={false}
+        onOpenChange={vi.fn}
+        onSendFeedback={vi.fn}
+      />,
+    );
+    expect(screen.queryByTestId("feedback-modal")).not.toBeInTheDocument();
+
+    rerender(
+      <FeedbackModal
+        polarity="positive"
+        isOpen
+        onOpenChange={vi.fn}
+        onSendFeedback={vi.fn}
+      />,
+    );
+    expect(screen.getByTestId("feedback-modal")).toBeInTheDocument();
+  });
+
+  it("should display an error if the email is invalid when submitting", async () => {
+    const user = userEvent.setup();
+    render(
+      <FeedbackModal
+        polarity="positive"
+        isOpen
+        onOpenChange={vi.fn}
+        onSendFeedback={vi.fn}
+      />,
+    );
+
+    const submitButton = screen.getByRole("button", {
+      name: "FEEDBACK$SHARE_LABEL",
+    });
+
+    await user.click(submitButton);
+
+    expect(screen.getByTestId("invalid-email-message")).toBeInTheDocument();
+    expect(sendFeedback).not.toHaveBeenCalled();
+  });
+
+  it("should call sendFeedback with the correct data when the share button is clicked", async () => {
+    const user = userEvent.setup();
+    render(
+      <FeedbackModal
+        polarity="negative"
+        isOpen
+        onOpenChange={vi.fn}
+        onSendFeedback={vi.fn}
+      />,
+    );
+
+    const submitButton = screen.getByRole("button", {
+      name: "FEEDBACK$SHARE_LABEL",
+    });
+
+    const email = "example@example.com";
+    const emailInput = screen.getByTestId("email-input");
+    await user.type(emailInput, email);
+
+    // Switch the sharing setting from its initial unchecked state to "public"
+    const permissionsGroup = screen.getByTestId("permissions-group");
+    const publicOption = within(permissionsGroup).getByRole("radio", {
+      name: "FEEDBACK$PUBLIC_LABEL",
+    });
+    expect(publicOption).not.toBeChecked();
+    await user.click(publicOption);
+    expect(publicOption).toBeChecked();
+
+    await user.click(submitButton);
+
+    expect(
+      screen.queryByTestId("invalid-email-message"),
+    ).not.toBeInTheDocument();
+
+    expect(sendFeedback).toHaveBeenCalledWith({
+      email,
+      permissions: "public",
+      feedback: "negative",
+      trajectory: [{ args: {} }, { content: "Hello" }], // api key should be removed
+      token: "some-token",
+      version: "1.0",
+    });
+  });
+
+  it("should store the user's email in local storage for later use", async () => {
+    const email = "example@example.com";
+
+    const user = userEvent.setup();
+    const { rerender } = render(
+      <FeedbackModal
+        polarity="negative"
+        isOpen
+        onOpenChange={vi.fn}
+        onSendFeedback={vi.fn}
+      />,
+    );
+
+    expect(localStorage.getItem).toHaveBeenCalledWith("feedback-email");
+    const emailInput = screen.getByTestId("email-input");
+    expect(emailInput).toHaveValue("");
+
+    await user.type(emailInput, email);
+    expect(emailInput).toHaveValue(email);
+
+    const submitButton = screen.getByRole("button", {
+      name: "FEEDBACK$SHARE_LABEL",
+    });
+    await user.click(submitButton);
+
+    expect(localStorage.setItem).toHaveBeenCalledWith("feedback-email", email);
+
+    rerender(
+      <FeedbackModal
+        polarity="positive"
+        isOpen
+        onOpenChange={vi.fn}
+        onSendFeedback={vi.fn}
+      />,
+    );
+
+    const emailInputAfterClose = screen.getByTestId("email-input");
+    expect(emailInputAfterClose).toHaveValue(email);
+  });
+
+  // TODO: `react-hot-toast` is not mocked in this file, so `toast` is not a spy; mock its default export before enabling this test
+  it.skip("should display a success toast when the feedback is shared successfully", async () => {
+    (sendFeedback as Mock).mockResolvedValue({
+      statusCode: 200,
+      body: {
+        message: "Feedback shared",
+        feedback_id: "some-id",
+        password: "some-password",
+      },
+    });
+
+    const user = userEvent.setup();
+    render(
+      <FeedbackModal
+        polarity="negative"
+        isOpen
+        onOpenChange={vi.fn}
+        onSendFeedback={vi.fn}
+      />,
+    );
+
+    const submitButton = screen.getByRole("button", {
+      name: "FEEDBACK$SHARE_LABEL",
+    });
+
+    const email = "example@example.com";
+    const emailInput = screen.getByTestId("email-input");
+    await user.type(emailInput, email);
+
+    await user.click(submitButton);
+
+    expect(toast).toHaveBeenCalled();
+  });
+});
--- a/frontend/src/components/modals/feedback/FeedbackModal.tsx
+++ b/frontend/src/components/modals/feedback/FeedbackModal.tsx
@@ -1,57 +1,127 @@
 import React from "react";
 import { useTranslation } from "react-i18next";
+import { Input, Radio, RadioGroup } from "@nextui-org/react";
+import hotToast from "react-hot-toast";
 import { I18nKey } from "#/i18n/declaration";
 import BaseModal from "../base-modal/BaseModal";
 import { Feedback, sendFeedback } from "#/services/feedbackService";
-import FeedbackForm from "./FeedbackForm";
 import toast from "#/utils/toast";
+import { getToken } from "#/services/auth";
+import Session from "#/services/session";
+import { removeApiKey } from "#/utils/utils";

-const VIEWER_PAGE = "https://od-feedback.vercel.app/show";
+const isEmailValid = (email: string) => {
+  // Regular expression to validate email format
+  const emailRegex = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;
+  return emailRegex.test(email);
+};
+
+const VIEWER_PAGE = "https://www.all-hands.dev/share-opendevin";
+const FEEDBACK_VERSION = "1.0";

 interface FeedbackModalProps {
-  feedback: Feedback;
-  handleEmailChange: (key: string) => void;
-  handlePermissionsChange: (permissions: "public" | "private") => void;
+  polarity: "positive" | "negative";
  isOpen: boolean;
  onOpenChange: (isOpen: boolean) => void;
  onSendFeedback: () => void;
 }

 function FeedbackModal({
-  feedback,
-  handleEmailChange,
-  handlePermissionsChange,
+  polarity,
  isOpen,
  onOpenChange,
  onSendFeedback,
 }: FeedbackModalProps) {
  const { t } = useTranslation();

-  const handleSendFeedback = () => {
+  const [email, setEmail] = React.useState("");
+  const [permissions, setPermissions] = React.useState<"public" | "private">(
+    "private",
+  );
+
+  React.useEffect(() => {
+    // check if email is stored in local storage
+    const storedEmail = localStorage.getItem("feedback-email");
+    if (storedEmail) setEmail(storedEmail);
+  }, []);
+
+  const handleEmailChange = (newEmail: string) => {
+    setEmail(newEmail);
+  };
+
+  const copiedToClipboardToast = () => {
+    hotToast("Password copied to clipboard", {
+      icon: "📋",
+      position: "bottom-right",
+    });
+  };
+
+  const onPressToast = (password: string) => {
+    navigator.clipboard.writeText(password);
+    copiedToClipboardToast();
+  };
+
+  const shareFeedbackToast = (
+    message: string,
+    link: string,
+    password: string,
+  ) => {
+    hotToast(
+      <div className="flex flex-col gap-1">
+        <span>{message}</span>
+        <a
+          data-testid="toast-share-url"
+          className="text-blue-500 underline"
+          onClick={() => onPressToast(password)}
+          href={link}
+          target="_blank"
+          rel="noreferrer"
+        >
+          Go to shared feedback
+        </a>
+        <span onClick={() => onPressToast(password)} className="cursor-pointer">
+          Password: {password} <span className="text-gray-500">(copy)</span>
+        </span>
+      </div>,
+      { duration: 5000 },
+    );
+  };
+
+  const handleSendFeedback = async () => {
    onSendFeedback();
-    sendFeedback(feedback)
-      .then((response) => {
-        if (response.statusCode === 200) {
-          const { message, feedback_id: feedbackId, password } = response.body;
-          const toastMessage = `${message}\nFeedback link: ${VIEWER_PAGE}?feedback_id=${feedbackId}\nPassword: ${password}`;
-          toast.info(toastMessage);
-        } else {
-          toast.error(
-            "share-error",
-            `Failed to share, please contact the developers: ${response.body.message}`,
-          );
-        }
-      })
-      .catch((error) => {
+    const feedback: Feedback = {
+      version: FEEDBACK_VERSION,
+      feedback: polarity,
+      email,
+      permissions,
+      token: getToken(),
+      trajectory: removeApiKey(Session._history),
+    };
+
+    try {
+      const response = await sendFeedback(feedback);
+      localStorage.setItem("feedback-email", email); // store email in local storage
+      if (response.statusCode === 200) {
+        const { message, feedback_id: feedbackId, password } = response.body;
+        const link = `${VIEWER_PAGE}?share_id=${feedbackId}&password=${password}`;
+        shareFeedbackToast(message, link, password);
+      } else {
        toast.error(
          "share-error",
-          `Failed to share, please contact the developers: ${error}`,
+          `Failed to share, please contact the developers: ${response.body.message}`,
        );
-      });
+      }
+    } catch (error) {
+      toast.error(
+        "share-error",
+        `Failed to share, please contact the developers: ${error}`,
+      );
+    }
  };

  return (
    <BaseModal
+      testID="feedback-modal"
      isOpen={isOpen}
      title={t(I18nKey.FEEDBACK$MODAL_TITLE)}
      onOpenChange={onOpenChange}
@@ -61,6 +131,7 @@ function FeedbackModal({
          label: t(I18nKey.FEEDBACK$SHARE_LABEL),
          className: "bg-primary rounded-lg",
          action: handleSendFeedback,
+          isDisabled: !isEmailValid(email),
          closeAfterAction: true,
        },
        {
@@ -72,11 +143,33 @@ function FeedbackModal({
      ]}
    >
      <p>{t(I18nKey.FEEDBACK$MODAL_CONTENT)}</p>
-      <FeedbackForm
-        feedback={feedback}
-        onEmailChange={handleEmailChange}
-        onPermissionsChange={handlePermissionsChange}
+
+      <Input
+        label="Email"
+        aria-label="email"
+        data-testid="email-input"
+        placeholder={t(I18nKey.FEEDBACK$EMAIL_PLACEHOLDER)}
+        type="text"
+        value={email}
+        onChange={(e) => {
+          handleEmailChange(e.target.value);
+        }}
      />
+      {!isEmailValid(email) && (
+        <p data-testid="invalid-email-message" className="text-red-500">
+          Invalid email format
+        </p>
+      )}
+      <RadioGroup
+        data-testid="permissions-group"
+        label="Sharing settings"
+        orientation="horizontal"
+        value={permissions}
+        onValueChange={(value) => setPermissions(value as "public" | "private")}
+      >
+        <Radio value="private">{t(I18nKey.FEEDBACK$PRIVATE_LABEL)}</Radio>
+        <Radio value="public">{t(I18nKey.FEEDBACK$PUBLIC_LABEL)}</Radio>
+      </RadioGroup>
    </BaseModal>
  );
 }
--- a/frontend/src/components/modals/settings/SettingsModal.test.tsx
+++ b/frontend/src/components/modals/settings/SettingsModal.test.tsx
@@ -1,4 +1,4 @@
-import { screen, act, waitFor } from "@testing-library/react";
+import { act, screen, waitFor } from "@testing-library/react";
 import userEvent from "@testing-library/user-event";
 import i18next from "i18next";
 import React from "react";
@@ -70,19 +70,15 @@ describe("SettingsModal", () => {
  });

  it("should close the modal when the close button is clicked", async () => {
+    const user = userEvent.setup();
    const onOpenChange = vi.fn();
-    await act(async () =>
-      renderWithProviders(<SettingsModal isOpen onOpenChange={onOpenChange} />),
-    );
+    renderWithProviders(<SettingsModal isOpen onOpenChange={onOpenChange} />);

    const cancelButton = screen.getByRole("button", {
      name: /MODAL_CLOSE_BUTTON_LABEL/i, // i18n key
    });

-    await act(async () => {
-      await userEvent.click(cancelButton);
-    });
-
+    await user.click(cancelButton);
    expect(onOpenChange).toHaveBeenCalledWith(false);
  });

@@ -113,11 +109,10 @@ describe("SettingsModal", () => {
    };

    it("should save the settings", async () => {
+      const user = userEvent.setup();
      const onOpenChangeMock = vi.fn();
-      await act(async () =>
-        renderWithProviders(
-          <SettingsModal isOpen onOpenChange={onOpenChangeMock} />,
-        ),
+      renderWithProviders(
+        <SettingsModal isOpen onOpenChange={onOpenChangeMock} />,
      );

      // Use the helper function to assert models were fetched
@@ -126,19 +121,11 @@ describe("SettingsModal", () => {
      const saveButton = screen.getByRole("button", { name: /save/i });
      const modelInput = screen.getByRole("combobox", { name: "model" });

-      await act(async () => {
-        await userEvent.click(modelInput);
-      });
-
+      await user.click(modelInput);
      const model3 = screen.getByText("model3");

-      await act(async () => {
-        await userEvent.click(model3);
-      });
-
-      await act(async () => {
-        await userEvent.click(saveButton);
-      });
+      await user.click(model3);
+      await user.click(saveButton);

      expect(saveSettings).toHaveBeenCalledWith({
        ...initialSettings,
@@ -147,6 +134,7 @@ describe("SettingsModal", () => {
    });

    it("should reinitialize agent", async () => {
+      const user = userEvent.setup();
      const onOpenChangeMock = vi.fn();
      await act(async () =>
        renderWithProviders(
@@ -157,24 +145,17 @@ describe("SettingsModal", () => {
      const saveButton = screen.getByRole("button", { name: /save/i });
      const modelInput = screen.getByRole("combobox", { name: "model" });

-      await act(async () => {
-        await userEvent.click(modelInput);
-      });
-
+      await user.click(modelInput);
      const model3 = screen.getByText("model3");

-      await act(async () => {
-        await userEvent.click(model3);
-      });
-
-      await act(async () => {
-        await userEvent.click(saveButton);
-      });
+      await user.click(model3);
+      await user.click(saveButton);

      expect(startNewSessionSpy).toHaveBeenCalled();
    });

    it("should display a toast for every change", async () => {
+      const user = userEvent.setup();
      const onOpenChangeMock = vi.fn();
      await act(async () =>
        renderWithProviders(
@@ -185,24 +166,17 @@ describe("SettingsModal", () => {
      const saveButton = screen.getByRole("button", { name: /save/i });
      const modelInput = screen.getByRole("combobox", { name: "model" });

-      await act(async () => {
-        await userEvent.click(modelInput);
-      });
-
+      await user.click(modelInput);
      const model3 = screen.getByText("model3");

-      await act(async () => {
-        await userEvent.click(model3);
-      });
-
-      await act(async () => {
-        await userEvent.click(saveButton);
-      });
+      await user.click(model3);
+      await user.click(saveButton);

      expect(toastSpy).toHaveBeenCalledTimes(3);
    });

    it("should change the language", async () => {
+      const user = userEvent.setup();
      const onOpenChangeMock = vi.fn();
      await act(async () =>
        renderWithProviders(
@@ -213,24 +187,17 @@ describe("SettingsModal", () => {
      const saveButton = screen.getByRole("button", { name: /save/i });
      const languageInput = screen.getByRole("combobox", { name: "language" });

-      await act(async () => {
-        await userEvent.click(languageInput);
-      });
-
+      await user.click(languageInput);
      const spanish = screen.getByText("Español");

-      await act(async () => {
-        await userEvent.click(spanish);
-      });
-
-      await act(async () => {
-        await userEvent.click(saveButton);
-      });
+      await user.click(spanish);
+      await user.click(saveButton);

      expect(i18nSpy).toHaveBeenCalledWith("es");
    });

    it("should close the modal", async () => {
+      const user = userEvent.setup();
      const onOpenChangeMock = vi.fn();
      await act(async () =>
        renderWithProviders(
@@ -245,25 +212,18 @@ describe("SettingsModal", () => {
      const saveButton = screen.getByRole("button", { name: /save/i });
      const modelInput = screen.getByRole("combobox", { name: "model" });

-      await act(async () => {
-        await userEvent.click(modelInput);
-      });
-
+      await user.click(modelInput);
      const model3 = screen.getByText("model3");

-      await act(async () => {
-        await userEvent.click(model3);
-      });
-
-      await act(async () => {
-        await userEvent.click(saveButton);
-      });
+      await user.click(model3);
+      await user.click(saveButton);

      expect(onOpenChangeMock).toHaveBeenCalledWith(false);
    });
  });

  it("should reset settings to defaults when the 'reset to defaults' button is clicked", async () => {
+    const user = userEvent.setup();
    const onOpenChangeMock = vi.fn();
    await act(async () =>
      renderWithProviders(
@@ -276,18 +236,12 @@ describe("SettingsModal", () => {
    });
    const agentInput = screen.getByRole("combobox", { name: "agent" });

-    await act(async () => {
-      await userEvent.click(agentInput);
-    });
+    await user.click(agentInput);
    const agent3 = screen.getByText("agent3");
-    await act(async () => {
-      await userEvent.click(agent3);
-    });
+    await user.click(agent3);
    expect(agentInput).toHaveValue("agent3");

-    await act(async () => {
-      await userEvent.click(resetButton);
-    });
+    await user.click(resetButton);
    expect(getDefaultSettings).toHaveBeenCalled();

    expect(agentInput).toHaveValue("CodeActAgent"); // Agent value is reset to default from getDefaultSettings()
--- a/frontend/src/components/terminal/Terminal.test.tsx
+++ b/frontend/src/components/terminal/Terminal.test.tsx
@@ -9,24 +9,19 @@ global.ResizeObserver = vi.fn().mockImplementation(() => ({
  disconnect: vi.fn(),
 }));

-const openMock = vi.fn();
-const writeMock = vi.fn();
-const writelnMock = vi.fn();
-const disposeMock = vi.fn();
-const onKeyMock = vi.fn();
-const attachCustomKeyEventHandlerMock = vi.fn();
+const mockTerminal = {
+  open: vi.fn(),
+  write: vi.fn(),
+  writeln: vi.fn(),
+  dispose: vi.fn(),
+  onKey: vi.fn(),
+  attachCustomKeyEventHandler: vi.fn(),
+  loadAddon: vi.fn(),
+};

 vi.mock("@xterm/xterm", async (importOriginal) => ({
  ...(await importOriginal<typeof import("@xterm/xterm")>()),
-  Terminal: vi.fn(() => ({
-    open: openMock,
-    write: writeMock,
-    writeln: writelnMock,
-    dispose: disposeMock,
-    onKey: onKeyMock,
-    attachCustomKeyEventHandler: attachCustomKeyEventHandlerMock,
-    loadAddon: vi.fn(),
-  })),
+  Terminal: vi.fn().mockImplementation(() => mockTerminal),
 }));

 const renderTerminal = (commands: Command[] = []) =>
@@ -47,9 +42,9 @@ describe("Terminal", () => {
    renderTerminal();

    expect(screen.getByText("Terminal")).toBeInTheDocument();
-    expect(openMock).toHaveBeenCalledTimes(1);
+    expect(mockTerminal.open).toHaveBeenCalledTimes(1);

-    expect(writeMock).toHaveBeenCalledWith("$ ");
+    expect(mockTerminal.write).toHaveBeenCalledWith("$ ");
  });

  it("should load commands to the terminal", () => {
@@ -58,8 +53,8 @@ describe("Terminal", () => {
      { type: "output", content: "OUTPUT" },
    ]);

-    expect(writelnMock).toHaveBeenNthCalledWith(1, "INPUT");
-    expect(writelnMock).toHaveBeenNthCalledWith(2, "OUTPUT");
+    expect(mockTerminal.writeln).toHaveBeenNthCalledWith(1, "INPUT");
+    expect(mockTerminal.writeln).toHaveBeenNthCalledWith(2, "OUTPUT");
  });

  it("should write commands to the terminal", () => {
@@ -70,14 +65,14 @@ describe("Terminal", () => {
      store.dispatch(appendOutput("Hello"));
    });

-    expect(writelnMock).toHaveBeenNthCalledWith(1, "echo Hello");
-    expect(writelnMock).toHaveBeenNthCalledWith(2, "Hello");
+    expect(mockTerminal.writeln).toHaveBeenNthCalledWith(1, "echo Hello");
+    expect(mockTerminal.writeln).toHaveBeenNthCalledWith(2, "Hello");

    act(() => {
      store.dispatch(appendInput("echo World"));
    });

-    expect(writelnMock).toHaveBeenNthCalledWith(3, "echo World");
+    expect(mockTerminal.writeln).toHaveBeenNthCalledWith(3, "echo World");
  });

  it("should load and write commands to the terminal", () => {
@@ -86,14 +81,14 @@ describe("Terminal", () => {
      { type: "output", content: "Hello" },
    ]);

-    expect(writelnMock).toHaveBeenNthCalledWith(1, "echo Hello");
-    expect(writelnMock).toHaveBeenNthCalledWith(2, "Hello");
+    expect(mockTerminal.writeln).toHaveBeenNthCalledWith(1, "echo Hello");
+    expect(mockTerminal.writeln).toHaveBeenNthCalledWith(2, "Hello");

    act(() => {
      store.dispatch(appendInput("echo Hello"));
    });

-    expect(writelnMock).toHaveBeenNthCalledWith(3, "echo Hello");
+    expect(mockTerminal.writeln).toHaveBeenNthCalledWith(3, "echo Hello");
  });

  it("should end the line with a dollar sign after writing a command", () => {
@@ -103,18 +98,18 @@ describe("Terminal", () => {
      store.dispatch(appendInput("echo Hello"));
    });

-    expect(writelnMock).toHaveBeenCalledWith("echo Hello");
-    expect(writeMock).toHaveBeenCalledWith("$ ");
+    expect(mockTerminal.writeln).toHaveBeenCalledWith("echo Hello");
+    expect(mockTerminal.write).toHaveBeenCalledWith("$ ");
  });

  // This test fails because it expects `mockTerminal.dispose` to have been called before the component is unmounted.
  it.skip("should dispose the terminal on unmount", () => {
    const { unmount } = renderWithProviders(<Terminal />);

-    expect(disposeMock).not.toHaveBeenCalled();
+    expect(mockTerminal.dispose).not.toHaveBeenCalled();

    unmount();

-    expect(disposeMock).toHaveBeenCalledTimes(1);
+    expect(mockTerminal.dispose).toHaveBeenCalledTimes(1);
  });
 });
--- a/frontend/src/hooks/useTyping.test.ts
+++ b/frontend/src/hooks/useTyping.test.ts
@@ -1,41 +0,0 @@
-import { act, renderHook } from "@testing-library/react";
-import { describe, it, vi } from "vitest";
-import { useTyping } from "./useTyping";
-
-vi.useFakeTimers();
-
-describe("useTyping", () => {
-  it("should 'type' a given message", () => {
-    const text = "Hello, World!";
-    const typingSpeed = 10;
-
-    const { result } = renderHook(() => useTyping(text));
-    expect(result.current).toBe("H");
-
-    act(() => {
-      vi.advanceTimersByTime(typingSpeed);
-    });
-
-    expect(result.current).toBe("He");
-
-    act(() => {
-      vi.advanceTimersByTime(typingSpeed);
-    });
-
-    expect(result.current).toBe("Hel");
-
-    for (let i = 3; i < text.length; i += 1) {
-      act(() => {
-        vi.advanceTimersByTime(typingSpeed);
-      });
-    }
-
-    expect(result.current).toBe("Hello, World!");
-
-    act(() => {
-      vi.advanceTimersByTime(typingSpeed);
-    });
-
-    expect(result.current).toBe("Hello, World!");
-  });
-});
--- a/frontend/src/hooks/useTyping.ts
+++ b/frontend/src/hooks/useTyping.ts
@@ -1,23 +0,0 @@
-import React from "react";
-
-export const useTyping = (text: string) => {
-  const [message, setMessage] = React.useState(text[0]);
-
-  const advance = () =>
-    setTimeout(() => {
-      if (message.length < text.length) {
-        setMessage(text.slice(0, message.length + 1));
-      }
-    }, 10);
-
-  React.useEffect(() => {
-    const timeout = advance();
-
-    return () => {
-      clearTimeout(timeout);
-    };
-    // eslint-disable-next-line react-hooks/exhaustive-deps
-  }, [message]);
-
-  return message;
-};
--- a/frontend/src/i18n/translation.json
+++ b/frontend/src/i18n/translation.json
@@ -732,5 +732,17 @@
    "zh-CN": "计划未创建",
    "zh-TW": "未創建任何計劃。",
    "de": "Kein Plan erstellt."
+  },
+  "FEEDBACK$PUBLIC_LABEL": {
+    "en": "Public",
+    "zh-CN": "公开",
+    "zh-TW": "公開",
+    "de": "Öffentlich"
+  },
+  "FEEDBACK$PRIVATE_LABEL": {
+    "en": "Private",
+    "zh-CN": "私有",
+    "zh-TW": "私有",
+    "de": "Privat"
  }
 }
--- a/frontend/src/index.tsx
+++ b/frontend/src/index.tsx
@@ -4,19 +4,11 @@ import ReactDOM from "react-dom/client";
 import "./index.css";
 import { Provider } from "react-redux";
 import { NextUIProvider } from "@nextui-org/react";
-import { createBrowserRouter, RouterProvider } from "react-router-dom";
 import App from "./App";
 import reportWebVitals from "./reportWebVitals";
 import store from "#/store";
 import "#/i18n";

-const router = createBrowserRouter([
-  {
-    path: "/",
-    element: <App />,
-  },
-]);
-
 const root = ReactDOM.createRoot(
  document.getElementById("root") as HTMLElement,
 );
@@ -24,7 +16,7 @@ root.render(
  <React.StrictMode>
    <Provider store={store}>
      <NextUIProvider>
-        <RouterProvider router={router} />
+        <App />
      </NextUIProvider>
    </Provider>
  </React.StrictMode>,
--- a/frontend/src/services/auth.test.ts
+++ b/frontend/src/services/auth.test.ts
@@ -10,9 +10,10 @@ describe("Auth Service", () => {
  });

  describe("getToken", () => {
-    it("should fetch and return a token", async () => {
+    it("should fetch and return a token", () => {
      (Storage.prototype.getItem as Mock).mockReturnValue("newToken");
-      const data = await getToken();
+
+      const data = getToken();
      expect(localStorage.getItem).toHaveBeenCalledWith("token"); // Used to set Authorization header
      expect(data).toEqual("newToken");
    });
--- a/frontend/src/services/session.test.ts
+++ b/frontend/src/services/session.test.ts
@@ -1,17 +1,14 @@
 import { describe, expect, it, vi } from "vitest";
-
 import ActionType from "#/types/ActionType";
 import { Settings, saveSettings } from "./settings";
 import Session from "./session";

 const sendSpy = vi.spyOn(Session, "send");
-const setupSpy = vi
-  /* eslint-disable-next-line @typescript-eslint/no-explicit-any */
-  .spyOn(Session as any, "_setupSocket")
-  .mockImplementation(() => {
-    /* eslint-disable-next-line @typescript-eslint/dot-notation */
-    Session["_initializeAgent"](); // use key syntax to fix complaint about private fn
-  });
+// @ts-expect-error - spying on private function
+const setupSpy = vi.spyOn(Session, "_setupSocket").mockImplementation(() => {
+  // @ts-expect-error - calling a private function
+  Session._initializeAgent();
+});

 describe("startNewSession", () => {
  it("Should start a new session with the current settings", () => {
--- a/opendevin/controller/action_parser.py
+++ b/opendevin/controller/action_parser.py
@@ -4,8 +4,7 @@ from opendevin.events.action import Action


 class ResponseParser(ABC):
-    """
-    This abstract base class is a general interface for an response parser dedicated to
+    """This abstract base class is a general interface for a response parser dedicated to
    parsing the action from the response from the LLM.
    """

@@ -17,8 +16,7 @@ class ResponseParser(ABC):

    @abstractmethod
    def parse(self, response: str) -> Action:
-        """
-        Parses the action from the response from the LLM.
+        """Parses the action from the response from the LLM.

        Parameters:
        - response (str): The response from the LLM.
@@ -30,8 +28,7 @@ class ResponseParser(ABC):

    @abstractmethod
    def parse_response(self, response) -> str:
-        """
-        Parses the action from the response from the LLM.
+        """Parses the action from the response from the LLM.

        Parameters:
        - response (str): The response from the LLM.
@@ -43,8 +40,7 @@ class ResponseParser(ABC):

    @abstractmethod
    def parse_action(self, action_str: str) -> Action:
-        """
-        Parses the action from the response from the LLM.
+        """Parses the action from the response from the LLM.

        Parameters:
        - action_str (str): The response from the LLM.
@@ -56,21 +52,16 @@ class ResponseParser(ABC):


 class ActionParser(ABC):
-    """
-    This abstract base class is a general interface for an action parser dedicated to
+    """This abstract base class is a general interface for an action parser dedicated to
    parsing the action from the action str from the LLM.
    """

    @abstractmethod
    def check_condition(self, action_str: str) -> bool:
-        """
-        Check if the action string can be parsed by this parser.
-        """
+        """Check if the action string can be parsed by this parser."""
        pass

    @abstractmethod
    def parse(self, action_str: str) -> Action:
-        """
-        Parses the action from the action string from the LLM response.
-        """
+        """Parses the action from the action string from the LLM response."""
        pass
--- a/opendevin/controller/agent.py
+++ b/opendevin/controller/agent.py
@@ -35,8 +35,7 @@ class Agent(ABC):

    @property
    def complete(self) -> bool:
-        """
-        Indicates whether the current instruction execution is complete.
+        """Indicates whether the current instruction execution is complete.

        Returns:
        - complete (bool): True if execution is complete; False otherwise.
@@ -45,15 +44,13 @@ class Agent(ABC):

    @abstractmethod
    def step(self, state: 'State') -> 'Action':
-        """
-        Starts the execution of the assigned instruction. This method should
+        """Starts the execution of the assigned instruction. This method should
        be implemented by subclasses to define the specific execution logic.
        """
        pass

    def reset(self) -> None:
-        """
-        Resets the agent's execution status and clears the history. This method can be used
+        """Resets the agent's execution status and clears the history. This method can be used
        to prepare the agent for restarting the instruction or cleaning up before destruction.

        """
@@ -66,8 +63,7 @@ class Agent(ABC):

    @classmethod
    def register(cls, name: str, agent_cls: Type['Agent']):
-        """
-        Registers an agent class in the registry.
+        """Registers an agent class in the registry.

        Parameters:
        - name (str): The name to register the class under.
@@ -82,8 +78,7 @@ class Agent(ABC):

    @classmethod
    def get_cls(cls, name: str) -> Type['Agent']:
-        """
-        Retrieves an agent class from the registry.
+        """Retrieves an agent class from the registry.

        Parameters:
        - name (str): The name of the class to retrieve
@@ -100,8 +95,7 @@ class Agent(ABC):

    @classmethod
    def list_agents(cls) -> list[str]:
-        """
-        Retrieves the list of all agent names from the registry.
+        """Retrieves the list of all agent names from the registry.

        Raises:
        - AgentNotRegisteredError: If no agent is registered
--- a/Show More
+++ b/Show More