fix ssh_box error parsing

Enabled LLM logs by default (#1819)
Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com>
2026-04-29 03:00:45 -04:00 · 2024-05-16 23:15:26 +08:00 · 2024-05-16 10:35:18 +00:00 · 2024-05-16 10:26:30 +00:00 · 2024-05-16 18:06:06 +08:00 · 2024-05-16 10:04:05 +00:00
327 changed files with 15370 additions and 5871 deletions
@@ -0,0 +1,14 @@
+codecov:
+  notify:
+    wait_for_ci: true # hold Codecov notifications until CI has finished
+
+coverage:
+  status:
+    patch:
+      default:
+        threshold: 10% # allow patch coverage to be lower than project coverage by at most 10%
+    project:
+      default:
+        threshold: 5% # allow project coverage to drop at most 5%
+
+comment: false # do not post Codecov comments on pull requests
@@ -66,4 +66,4 @@ body:
    id: additional-context
    attributes:
      label: Logs, Errors, Screenshots, and Additional Context
-      description: Please add any additional context about the problem here.
+      description: LLM logs will be stored in the `logs/llm/default` folder. Please add any additional context about the problem here.
@@ -12,6 +12,7 @@ jobs:
  build:
    name: Build Docusaurus
    runs-on: ubuntu-latest
+    if: github.repository == 'OpenDevin/OpenDevin'
    steps:
      - uses: actions/checkout@v4
        with:
@@ -42,7 +43,7 @@ jobs:
  deploy:
    name: Deploy to GitHub Pages
    needs: build
-    if: github.ref == 'refs/heads/main'
+    if: github.ref == 'refs/heads/main' && github.repository == 'OpenDevin/OpenDevin'
    # Grant GITHUB_TOKEN the permissions required to make a Pages deployment
    permissions:
      pages: write # to deploy to Pages
@@ -28,7 +28,7 @@ jobs:

    strategy:
      matrix:
-        image: ["app", "evaluation", "sandbox"]
+        image: ["app", "sandbox"]

    steps:
      - name: checkout
@@ -14,6 +14,8 @@ jobs:
  test-on-macos:
    name: Test on macOS
    runs-on: macos-13
+    env:
+      INSTALL_DOCKER: '0'  # Set to '0' to skip Docker installation
    strategy:
      matrix:
        python-version: ["3.11"]
@@ -34,6 +36,7 @@ jobs:
        run: poetry install

      - name: Install & Start Docker
+        if: env.INSTALL_DOCKER == '1'
        run: |
          brew install colima docker
          colima start
@@ -46,7 +49,7 @@ jobs:
        run: make build

      - name: Run Tests
-        run: poetry run pytest --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit
+        run: poetry run pytest --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox"

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
@@ -55,6 +58,8 @@ jobs:
  test-on-linux:
    name: Test on Linux
    runs-on: ubuntu-latest
+    env:
+      INSTALL_DOCKER: '0'  # Set to '0' to skip Docker installation
    strategy:
      matrix:
        python-version: ["3.11"]
@@ -78,7 +83,37 @@ jobs:
        run: make build

      - name: Run Tests
-        run: poetry run pytest --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit
+        run: poetry run pytest --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox"
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v4
+        env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+
+  test-for-sandbox:
+    name: Test for Sandbox
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install poetry via pipx
+        run: pipx install poetry
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+          cache: 'poetry'
+
+      - name: Install Python dependencies using Poetry
+        run: poetry install
+
+      - name: Build Environment
+        run: make build
+
+      - name: Run Integration Test for Sandbox
+        run: |
+          poetry run pytest --cov=agenthub --cov=opendevin --cov-report=xml -s ./tests/unit/test_sandbox.py

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
@@ -87,6 +122,6 @@ jobs:
  test_matrix_success:
    name: All Mac/Linux Tests Passed
    runs-on: ubuntu-latest
-    needs: [test-on-macos, test-on-linux]
+    needs: [test-on-macos, test-on-linux, test-for-sandbox]
    steps:
    - run: echo Done!
@@ -196,8 +196,14 @@ logs
 # agent
 .envrc
 /workspace
+/_test_workspace
 /debug
 cache

 # configuration
 config.toml
+evaluation/swe_bench/eval_workspace
+evaluation/outputs
+evaluation/evaluation_outputs
+test_results*
+/_test_files_tmp/
@@ -7,7 +7,7 @@ BACKEND_PORT = 3000
 BACKEND_HOST = "127.0.0.1:$(BACKEND_PORT)"
 FRONTEND_PORT = 3001
 DEFAULT_WORKSPACE_DIR = "./workspace"
-DEFAULT_MODEL = "gpt-3.5-turbo-1106"
+DEFAULT_MODEL = "gpt-3.5-turbo"
 CONFIG_FILE = config.toml
 PRECOMMIT_CONFIG_PATH = "./dev_config/python/.pre-commit-config.yaml"

@@ -22,7 +22,9 @@ RESET=$(shell tput -Txterm sgr0)
 build:
 	@echo "$(GREEN)Building project...$(RESET)"
 	@$(MAKE) -s check-dependencies
+ifeq ($(INSTALL_DOCKER),)
 	@$(MAKE) -s pull-docker-image
+endif
 	@$(MAKE) -s install-python-dependencies
 	@$(MAKE) -s install-frontend-dependencies
 	@$(MAKE) -s install-precommit-hooks
@@ -35,7 +37,9 @@ check-dependencies:
 	@$(MAKE) -s check-python
 	@$(MAKE) -s check-npm
 	@$(MAKE) -s check-nodejs
+ifeq ($(INSTALL_DOCKER),)
 	@$(MAKE) -s check-docker
+endif
 	@$(MAKE) -s check-poetry
 	@echo "$(GREEN)Dependencies checked successfully.$(RESET)"

@@ -44,7 +48,11 @@ check-system:
 	@if [ "$(shell uname)" = "Darwin" ]; then \
 		echo "$(BLUE)macOS detected.$(RESET)"; \
 	elif [ "$(shell uname)" = "Linux" ]; then \
-		echo "$(BLUE)Linux detected.$(RESET)"; \
+		if [ -f "/etc/manjaro-release" ]; then \
+			echo "$(BLUE)Manjaro Linux detected.$(RESET)"; \
+		else \
+			echo "$(BLUE)Linux detected.$(RESET)"; \
+		fi; \
 	elif [ "$$(uname -r | grep -i microsoft)" ]; then \
 		echo "$(BLUE)Windows Subsystem for Linux detected.$(RESET)"; \
 	else \
@@ -127,8 +135,14 @@ install-python-dependencies:
 		export HNSWLIB_NO_NATIVE=1; \
 		poetry run pip install chroma-hnswlib; \
 	fi
-	@poetry install --without evaluation
-	@poetry run playwright install --with-deps chromium
+	@poetry install
+	@if [ -f "/etc/manjaro-release" ]; then \
+		echo "$(BLUE)Detected Manjaro Linux. Installing Playwright dependencies...$(RESET)"; \
+		poetry run pip install playwright; \
+		poetry run playwright install chromium; \
+	else \
+		poetry run playwright install --with-deps chromium; \
+	fi
 	@echo "$(GREEN)Python dependencies installed successfully.$(RESET)"

 install-frontend-dependencies:
@@ -205,15 +219,24 @@ setup-config:
 	@echo "$(GREEN)Config.toml setup completed.$(RESET)"

 setup-config-prompts:
-	@read -p "Enter your LLM Model name, used for running without UI. Set the model in the UI after you start the app. (see https://docs.litellm.ai/docs/providers for full list) [default: $(DEFAULT_MODEL)]: " llm_model; \
+	@echo "[core]" > $(CONFIG_FILE).tmp
+
+	@read -p "Enter your workspace directory [default: $(DEFAULT_WORKSPACE_DIR)]: " workspace_dir; \
+	 workspace_dir=$${workspace_dir:-$(DEFAULT_WORKSPACE_DIR)}; \
+	 echo "workspace_base=\"$$workspace_dir\"" >> $(CONFIG_FILE).tmp
+
+	@echo "" >> $(CONFIG_FILE).tmp
+
+	@echo "[llm]" >> $(CONFIG_FILE).tmp
+	@read -p "Enter your LLM model name, used for running without UI. Set the model in the UI after you start the app. (see https://docs.litellm.ai/docs/providers for full list) [default: $(DEFAULT_MODEL)]: " llm_model; \
 	 llm_model=$${llm_model:-$(DEFAULT_MODEL)}; \
-	 echo "LLM_MODEL=\"$$llm_model\"" > $(CONFIG_FILE).tmp
+	 echo "model=\"$$llm_model\"" >> $(CONFIG_FILE).tmp

-	@read -p "Enter your LLM API key: " llm_api_key; \
-	 echo "LLM_API_KEY=\"$$llm_api_key\"" >> $(CONFIG_FILE).tmp
+	@read -p "Enter your LLM api key: " llm_api_key; \
+	 echo "api_key=\"$$llm_api_key\"" >> $(CONFIG_FILE).tmp

-	@read -p "Enter your LLM Base URL [mostly used for local LLMs, leave blank if not needed - example: http://localhost:5001/v1/]: " llm_base_url; \
-	 if [[ ! -z "$$llm_base_url" ]]; then echo "LLM_BASE_URL=\"$$llm_base_url\"" >> $(CONFIG_FILE).tmp; fi
+	@read -p "Enter your LLM base URL [mostly used for local LLMs, leave blank if not needed - example: http://localhost:5001/v1/]: " llm_base_url; \
+	 if [[ ! -z "$$llm_base_url" ]]; then echo "base_url=\"$$llm_base_url\"" >> $(CONFIG_FILE).tmp; fi

 	@echo "Enter your LLM Embedding Model"; \
 		echo "Choices are:"; \
@@ -227,22 +250,19 @@ setup-config-prompts:
 		echo "    - stable-code"; \
 		echo "  - Leave blank to default to 'BAAI/bge-small-en-v1.5' via huggingface"; \
 		read -p "> " llm_embedding_model; \
-		echo "LLM_EMBEDDING_MODEL=\"$$llm_embedding_model\"" >> $(CONFIG_FILE).tmp; \
+		echo "embedding_model=\"$$llm_embedding_model\"" >> $(CONFIG_FILE).tmp; \
 		if [ "$$llm_embedding_model" = "llama2" ] || [ "$$llm_embedding_model" = "mxbai-embed-large" ] || [ "$$llm_embedding_model" = "nomic-embed-text" ] || [ "$$llm_embedding_model" = "all-minilm" ] || [ "$$llm_embedding_model" = "stable-code" ]; then \
-			read -p "Enter the local model URL for the embedding model (will set LLM_EMBEDDING_BASE_URL): " llm_embedding_base_url; \
-				echo "LLM_EMBEDDING_BASE_URL=\"$$llm_embedding_base_url\"" >> $(CONFIG_FILE).tmp; \
+			read -p "Enter the local model URL for the embedding model (will set llm.embedding_base_url): " llm_embedding_base_url; \
+				echo "embedding_base_url=\"$$llm_embedding_base_url\"" >> $(CONFIG_FILE).tmp; \
 		elif [ "$$llm_embedding_model" = "azureopenai" ]; then \
-			read -p "Enter the Azure endpoint URL (will overwrite LLM_BASE_URL): " llm_base_url; \
-				echo "LLM_BASE_URL=\"$$llm_base_url\"" >> $(CONFIG_FILE).tmp; \
+			read -p "Enter the Azure endpoint URL (will overwrite llm.base_url): " llm_base_url; \
+				echo "base_url=\"$$llm_base_url\"" >> $(CONFIG_FILE).tmp; \
 			read -p "Enter the Azure LLM Embedding Deployment Name: " llm_embedding_deployment_name; \
-				echo "LLM_EMBEDDING_DEPLOYMENT_NAME=\"$$llm_embedding_deployment_name\"" >> $(CONFIG_FILE).tmp; \
+				echo "embedding_deployment_name=\"$$llm_embedding_deployment_name\"" >> $(CONFIG_FILE).tmp; \
 			read -p "Enter the Azure API Version: " llm_api_version; \
-				echo "LLM_API_VERSION=\"$$llm_api_version\"" >> $(CONFIG_FILE).tmp; \
+				echo "api_version=\"$$llm_api_version\"" >> $(CONFIG_FILE).tmp; \
 		fi

-	@read -p "Enter your workspace directory [default: $(DEFAULT_WORKSPACE_DIR)]: " workspace_dir; \
-	 workspace_dir=$${workspace_dir:-$(DEFAULT_WORKSPACE_DIR)}; \
-	 echo "WORKSPACE_BASE=\"$$workspace_dir\"" >> $(CONFIG_FILE).tmp

 # Clean up all caches
 clean:
@@ -18,14 +18,16 @@
 -->

 <div align="center">
-  <a href="https://github.com/OpenDevin/OpenDevin/graphs/contributors"><img src="https://img.shields.io/github/contributors/opendevin/opendevin?style=for-the-badge" alt="Contributors"></a>
-  <a href="https://github.com/OpenDevin/OpenDevin/network/members"><img src="https://img.shields.io/github/forks/opendevin/opendevin?style=for-the-badge" alt="Forks"></a>
-  <a href="https://github.com/OpenDevin/OpenDevin/stargazers"><img src="https://img.shields.io/github/stars/opendevin/opendevin?style=for-the-badge" alt="Stargazers"></a>
-  <a href="https://github.com/OpenDevin/OpenDevin/issues"><img src="https://img.shields.io/github/issues/opendevin/opendevin?style=for-the-badge" alt="Issues"></a>
-  <a href="https://github.com/OpenDevin/OpenDevin/blob/main/LICENSE"><img src="https://img.shields.io/github/license/opendevin/opendevin?style=for-the-badge" alt="MIT License"></a>
-  </br>
+  <a href="https://github.com/OpenDevin/OpenDevin/graphs/contributors"><img src="https://img.shields.io/github/contributors/opendevin/opendevin?style=for-the-badge&color=blue" alt="Contributors"></a>
+  <a href="https://github.com/OpenDevin/OpenDevin/network/members"><img src="https://img.shields.io/github/forks/opendevin/opendevin?style=for-the-badge&color=blue" alt="Forks"></a>
+  <a href="https://github.com/OpenDevin/OpenDevin/stargazers"><img src="https://img.shields.io/github/stars/opendevin/opendevin?style=for-the-badge&color=blue" alt="Stargazers"></a>
+  <a href="https://github.com/OpenDevin/OpenDevin/issues"><img src="https://img.shields.io/github/issues/opendevin/opendevin?style=for-the-badge&color=blue" alt="Issues"></a>
+  <a href="https://github.com/OpenDevin/OpenDevin/blob/main/LICENSE"><img src="https://img.shields.io/github/license/opendevin/opendevin?style=for-the-badge&color=blue" alt="MIT License"></a>
+  <br/>
  <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2i1iqdag6-bVmvamiPA9EZUu7oCO6KhA"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
  <a href="https://discord.gg/ESHStjSjD4"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community"></a>
+  <br/>
+  <a href="https://xwang.dev/blog/2024/opendevin-codeact-1.0-swebench/"><img src="https://img.shields.io/badge/SWE--bench%20Lite-21.0%25-green?style=for-the-badge" alt="SWE-bench Lite score"></a>
 </div>

 <!-- PROJECT LOGO -->
@@ -34,18 +36,13 @@
  <h1 align="center">OpenDevin: Code Less, Make More</h1>
  <a href="https://opendevin.github.io/OpenDevin/"><img src="https://img.shields.io/badge/Documentation-OpenDevin-blue?logo=googledocs&logoColor=white&style=for-the-badge" alt="Check out the documentation"></a>
 </div>
+<hr>

-## 🎯 Mission
+Welcome to OpenDevin, a platform for autonomous software engineers, powered by AI and LLMs.

-Welcome to OpenDevin, an open-source project aiming to replicate Devin, an autonomous AI software engineer who is capable of executing complex engineering tasks and collaborating actively with users on software development projects. This project aspires to replicate, enhance, and innovate upon Devin through the power of the open-source community.
+OpenDevin agents collaborate with human developers to write code, fix bugs, and ship features.

-To learn more and to use OpenDevin, check out our [documentation](https://opendevin.github.io/OpenDevin/).
-
-<p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
-    <a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
-        ↑ Back to Top ↑
-    </a>
-</p>
+![App screenshot](./docs/static/img/screenshot.png)

 ## ⚡ Quick Start
 You can run OpenDevin with Docker. It works best with the most recent
@@ -56,64 +53,62 @@ version of Docker, `26.0.0`.
 export WORKSPACE_BASE=$(pwd)/workspace;

 docker run \
+    -it \
    --pull=always \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -v $WORKSPACE_BASE:/opt/workspace_base \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -p 3000:3000 \
-    --add-host host.docker.internal=host-gateway \
+    --add-host host.docker.internal:host-gateway \
    ghcr.io/opendevin/opendevin:0.5
 ```

-For troubleshooting and advanced configuration, see
-[the full documentation](https://opendevin.github.io/OpenDevin/).
+## 🚀 Documentation
+
+To learn more about the project, and for tips on using OpenDevin,
+**check out our [documentation](https://opendevin.github.io/OpenDevin/)**.
+
+There you'll find resources on how to use different LLM providers (like Ollama and Anthropic's Claude),
+troubleshooting resources, and advanced configuration options.

 ## 🤝 How to Contribute

-OpenDevin is a community-driven project, and we welcome contributions from everyone. Whether you're a developer, a researcher, or simply enthusiastic about advancing the field of software engineering with AI, there are many ways to get involved:
+OpenDevin is a community-driven project, and we welcome contributions from everyone.
+Whether you're a developer, a researcher, or simply enthusiastic about advancing the field of
+software engineering with AI, there are many ways to get involved:

- **Code Contributions:** Help us develop the core functionalities, frontend interface, or sandboxing solutions.
+- **Code Contributions:** Help us develop new agents, core functionality, the frontend and other interfaces, or sandboxing solutions.
 - **Research and Evaluation:** Contribute to our understanding of LLMs in software engineering, participate in evaluating the models, or suggest improvements.
 - **Feedback and Testing:** Use the OpenDevin toolset, report bugs, suggest features, or provide feedback on usability.

-For details, please check [this document](./CONTRIBUTING.md).
-
-<p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
-    <a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
-        ↑ Back to Top ↑
-    </a>
-</p>
+For details, please check [CONTRIBUTING.md](./CONTRIBUTING.md).

 ## 🤖 Join Our Community

-Now we have both Slack workspace for the collaboration on building OpenDevin and Discord server for discussion about anything related, e.g., this project, LLM, agent, etc.
+Whether you're a developer, a researcher, or simply enthusiastic about OpenDevin, we'd love to have you in our community.
+Let's make software engineering better together!

- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2ggtwn3k5-PvAA2LUmqGHVZ~XzGq~ILw)
- [Discord server](https://discord.gg/ESHStjSjD4)
+- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2ggtwn3k5-PvAA2LUmqGHVZ~XzGq~ILw) - Here we talk about research, architecture, and future development.
+- [Discord server](https://discord.gg/ESHStjSjD4) - This is a community-run server for general discussion, questions, and feedback.

-If you would love to contribute, feel free to join our community (note that now there is no need to fill in the [form](https://forms.gle/758d5p6Ve8r2nxxq6)). Let's simplify software engineering together!
-
-🐚 **Code less, make more with OpenDevin.**
-
-[![Star History Chart](https://api.star-history.com/svg?repos=OpenDevin/OpenDevin&type=Date)](https://star-history.com/#OpenDevin/OpenDevin&Date)
-
-<p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
-    <a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
-        ↑ Back to Top ↑
+## 📈 Progress
+<p align="center">
+    <a href="https://www.swebench.com/lite.html">
+        <img src="/docs/static/img/results.png" alt="SWE-Bench Lite Score" width="500" height="auto">
    </a>
 </p>

+<p align="center">
+  <a href="https://star-history.com/#OpenDevin/OpenDevin&Date">
+    <img src="https://api.star-history.com/svg?repos=OpenDevin/OpenDevin&type=Date" width="500" alt="Star History Chart">
+  </a>
+</p>
+
 ## 📜 License

 Distributed under the MIT License. See [`LICENSE`](./LICENSE) for more information.

-<p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
-    <a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
-        ↑ Back to Top ↑
-    </a>
-</p>
-
 [contributors-shield]: https://img.shields.io/github/contributors/opendevin/opendevin?style=for-the-badge
 [contributors-url]: https://github.com/OpenDevin/OpenDevin/graphs/contributors
 [forks-shield]: https://img.shields.io/github/forks/opendevin/opendevin?style=for-the-badge
@@ -16,14 +16,18 @@ Every agent also has a `self.llm` which it can use to interact with the LLM conf
 See the [LiteLLM docs for `self.llm.completion`](https://docs.litellm.ai/docs/completion).

 ## State
+
 The `state` contains:
-* A history of actions taken by the agent, as well as any observations (e.g. file content, command output) from those actions
-* A list of actions/observations that have happened since the most recent step
-* A [`plan`](https://github.com/OpenDevin/OpenDevin/blob/main/opendevin/plan.py), which contains the main goal
-  * The agent can add and modify subtasks through the `AddTaskAction` and `ModifyTaskAction`
+
+- A history of actions taken by the agent, as well as any observations (e.g. file content, command output) from those actions
+- A list of actions/observations that have happened since the most recent step
+- A [`root_task`](https://github.com/OpenDevin/OpenDevin/blob/main/opendevin/controller/state/task.py), which contains a plan of action
+  - The agent can add and modify subtasks through the `AddTaskAction` and `ModifyTaskAction`

 ## Actions
+
 Here is a list of available Actions, which can be returned by `agent.step()`:
+
 - [`CmdRunAction`](../opendevin/action/bash.py) - Runs a command inside a sandboxed terminal
 - [`CmdKillAction`](../opendevin/action/bash.py) - Kills a background command
 - [`IPythonRunCellAction`](../opendevin/action/bash.py) - Executes a block of Python code interactively (in a Jupyter notebook) and receives a `CmdOutputObservation`. Requires the `jupyter` [plugin](../opendevin/sandbox/plugins) to be set up.
@@ -35,40 +39,50 @@ Here is a list of available Actions, which can be returned by `agent.step()`:
 - [`ModifyTaskAction`](../opendevin/action/tasks.py) - Changes the state of a subtask
 - [`AgentThinkAction`](../opendevin/action/agent.py) - A no-op that allows the agent to add plaintext to the history (as well as the chat log)
 - [`AgentTalkAction`](../opendevin/action/agent.py) - A no-op that allows the agent to add plaintext to the history and talk to the user.
+- [`AgentFinishAction`](../opendevin/action/agent.py) - Stops the control loop, allowing the user/delegator agent to enter a new task
+- [`AgentRejectAction`](../opendevin/action/agent.py) - Stops the control loop because the agent rejects the current task, allowing the user/delegator agent to enter a new task
 - [`AgentFinishAction`](../opendevin/action/agent.py) - Stops the control loop, allowing the user to enter a new task
+- [`MessageAction`](../opendevin/action/message.py) - Represents a message from an agent or the user

 You can use `action.to_dict()` and `action_from_dict` to serialize and deserialize actions.

 ## Observations
+
 There are also several types of Observations. These are typically available in the step following the corresponding Action.
 But they may also appear as a result of asynchronous events (e.g. a message from the user, logs from a command running
 in the background).

 Here is a list of available Observations:
+
 - [`CmdOutputObservation`](../opendevin/observation/run.py)
 - [`BrowserOutputObservation`](../opendevin/observation/browse.py)
 - [`FileReadObservation`](../opendevin/observation/files.py)
 - [`FileWriteObservation`](../opendevin/observation/files.py)
- [`UserMessageObservation`](../opendevin/observation/)
 - [`AgentRecallObservation`](../opendevin/observation/recall.py)
- [`AgentErrorObservation`](../opendevin/observation/error.py)
+- [`ErrorObservation`](../opendevin/observation/error.py)
+- [`SuccessObservation`](../opendevin/observation/success.py)

 You can use `observation.to_dict()` and `observation_from_dict` to serialize and deserialize observations.

 ## Interface
+
 Every agent must implement the following methods:

 ### `step`
+
 ```
 def step(self, state: "State") -> "Action"
 ```
+
 `step` moves the agent forward one step towards its goal. This probably means
 sending a prompt to the LLM, then parsing the response into an `Action`.

 ### `search_memory`
+
 ```
-def search_memory(self, query: str) -> List[str]:
+def search_memory(self, query: str) -> list[str]:
 ```
+
 `search_memory` should return a list of events that match the query. This will be used
 for the `recall` action.

@@ -1,14 +1,13 @@
-from typing import List
-
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
 from opendevin.events.action import (
    Action,
-    AgentThinkAction,
    FileReadAction,
    FileWriteAction,
+    MessageAction,
 )
 from opendevin.events.observation import Observation
+from opendevin.events.serialization.event import event_to_memory
 from opendevin.llm.llm import LLM

 from .parser import parse_command
@@ -32,16 +31,16 @@ class SWEAgent(Agent):
        super().__init__(llm)
        self.memory_window = 4
        self.max_retries = 2
-        self.running_memory: List[str] = []
+        self.running_memory: list[str] = []
        self.cur_file: str = ''
        self.cur_line: int = 0

    def _remember(self, action: Action, observation: Observation) -> None:
        """Agent has a limited memory of the few steps implemented as a queue"""
-        memory = MEMORY_FORMAT(action.to_memory(), observation.to_memory())
+        memory = MEMORY_FORMAT(event_to_memory(action), event_to_memory(observation))
        self.running_memory.append(memory)

-    def _think_act(self, messages: List[dict]) -> tuple[Action, str]:
+    def _think_act(self, messages: list[dict]) -> tuple[Action, str]:
        resp = self.llm.completion(
            messages=messages,
            temperature=0.05,
@@ -71,7 +70,8 @@ class SWEAgent(Agent):
        for prev_action, obs in state.updated_info:
            self._remember(prev_action, obs)

-        prompt = STEP_PROMPT(state.plan.main_goal, self.cur_file, self.cur_line)
+        goal = state.get_current_user_intent()
+        prompt = STEP_PROMPT(goal, self.cur_file, self.cur_line)

        msgs = [
            {'content': SYSTEM_MESSAGE, 'role': 'system'},
@@ -93,13 +93,13 @@ class SWEAgent(Agent):
            action, thought = self._think_act(messages=msgs)

        if not action:
-            action = AgentThinkAction(thought)
+            action = MessageAction(thought)

        self._update(action)
        self.latest_action = action
        return action

-    def search_memory(self, query: str) -> List[str]:
+    def search_memory(self, query: str) -> list[str]:
        return [item for item in self.running_memory if query in item]

    def reset(self) -> None:
@@ -2,21 +2,21 @@ import re

 from opendevin.events.action import (
    Action,
-    AgentEchoAction,
    AgentFinishAction,
-    AgentThinkAction,
    BrowseURLAction,
    CmdRunAction,
    FileReadAction,
    FileWriteAction,
+    MessageAction,
 )

 from .prompts import COMMAND_USAGE, CUSTOM_DOCS

 # commands: exit, read, write, browse, kill, search_file, search_dir

-no_open_file_error = AgentEchoAction(
-    'You are not currently in a file. You can use the read command to open a file and then use goto to navigate through it.')
+no_open_file_error = MessageAction(
+    'You are not currently in a file. You can use the read command to open a file and then use goto to navigate through it.'
+)


 def invalid_error(cmd, docs):
@@ -33,7 +33,9 @@ Try again using this format:
 """


-def get_action_from_string(command_string: str, path: str, line: int, thoughts: str = '') -> Action | None:
+def get_action_from_string(
+    command_string: str, path: str, line: int, thoughts: str = ''
+) -> Action | None:
    """
    Parses the command string to find which command the agent wants to run
    Converts the command into a proper Action and returns
@@ -46,7 +48,7 @@ def get_action_from_string(command_string: str, path: str, line: int, thoughts:
        return AgentFinishAction()

    elif 'think' == cmd:
-        return AgentThinkAction(' '.join(args))
+        return MessageAction(' '.join(args))

    elif 'scroll_up' == cmd:
        if not path:
@@ -68,7 +70,7 @@ def get_action_from_string(command_string: str, path: str, line: int, thoughts:
            end = start + 100
            return FileReadAction(path, start, end, thoughts)
        else:
-            return AgentEchoAction(invalid_error(command_string, 'goto'))
+            return MessageAction(invalid_error(command_string, 'goto'))

    elif 'edit' == cmd:
        if not path:
@@ -83,7 +85,7 @@ def get_action_from_string(command_string: str, path: str, line: int, thoughts:
                change = change[1:-1]
            return FileWriteAction(path, change, start, end, thoughts)
        else:
-            return AgentEchoAction(invalid_error(command_string, 'edit'))
+            return MessageAction(invalid_error(command_string, 'edit'))

    elif 'read' == cmd:
        rex = r'^read\s+(\S+)(?:\s+(\d+))?(?:\s+(-?\d+))?$'
@@ -98,7 +100,7 @@ def get_action_from_string(command_string: str, path: str, line: int, thoughts:

            return FileReadAction(file, start, end, thoughts)
        else:
-            return AgentEchoAction(invalid_error(command_string, 'read'))
+            return MessageAction(invalid_error(command_string, 'read'))

    elif 'write' == cmd:
        rex = r'^write\s+(\S+)\s+(.*?)\s*(\d+)?\s*(-?\d+)?$'
@@ -118,7 +120,7 @@ def get_action_from_string(command_string: str, path: str, line: int, thoughts:

            return FileWriteAction(file, content, start, end, thoughts)
        else:
-            return AgentEchoAction(invalid_error(command_string, 'write'))
+            return MessageAction(invalid_error(command_string, 'write'))

    elif 'browse' == cmd:
        return BrowseURLAction(args[0].strip())
@@ -129,13 +131,15 @@ def get_action_from_string(command_string: str, path: str, line: int, thoughts:
        if valid:
            return CmdRunAction(command_string)
        else:
-            return AgentEchoAction(f'Invalid command structure for\n ```\n{command_string}\n```.\nTry again using this format:\n{CUSTOM_DOCS}')
+            return MessageAction(
+                f'Invalid command structure for\n ```\n{command_string}\n```.\nTry again using this format:\n{CUSTOM_DOCS}'
+            )
    else:
        # check bash command
        obs = str(CmdRunAction(f'type {cmd}'))
        if obs.split(':')[-1].strip() == 'not found':
            # echo not found error for llm
-            return AgentEchoAction(content=obs)
+            return MessageAction(content=obs)
        else:
            # run valid command
            return CmdRunAction(command_string)
@@ -157,8 +161,7 @@ def parse_command(input_str: str, path: str, line: int):
        command_str = parts[1].strip()
        ind = 2 if len(parts) > 2 else 1
        accompanying_text = ''.join(parts[:-ind]).strip()
-        action = get_action_from_string(
-            command_str, path, line, accompanying_text)
+        action = get_action_from_string(command_str, path, line, accompanying_text)
        if action:
            return action, accompanying_text
    return None, input_str  # used for retry
@@ -1,4 +1,3 @@
-
 DEFAULT_COMMANDS_DICT = {
    'exit': 'Executed when task is complete',
    'read <file_name> [<start_line>] [<end_line>]': "Shows a given file's contents starting from <start_line> up to <end_line>. Default: start_line = 0, end_line = -1. By default the whole file will be read.",
@@ -6,12 +5,12 @@ DEFAULT_COMMANDS_DICT = {
    'browse <url>': 'Returns the text version of any url, this can be useful to look up documentation or finding issues on github',
    'scroll_up': 'Takes no arguments. This will scroll up and show you the 100 lines above your current lines',
    'scroll_down': 'Takes no arguments. This will scroll down and show you the 100 lines below your current lines',
-    'edit <start_line> <end_line> <changes>': 'This will modify lines in the currently open file. use start_line and end_line to designate which lines to change and then write the multiline changes',
+    'edit <start_line> <end_line> <changes>': 'This will modify lines in the currently open file. use start_line and end_line to designate which lines to change and then write the multiline changes. Set end_line to -1 to denote the end of the file',
    'goto <line_num>': 'This will take you directly to a line and show you the 100 lines below it.',
    '<bash_command> <args>': 'You can use any bash command you need (cd, ls, rm, grep, dir, mv, wget, git, zip, etc.) with their arguments included',
    'pip install <package>': 'You can use this to import python packages. Make sure you include the correct package name when using this command.',
    'ls': 'Use the ls command to view all the files in your current directory, this is a good starting point.',
-    'NOT ALLOWED': 'You cannot use interactive commands like python or node'
+    'NOT ALLOWED': 'You cannot use interactive commands like python or node',
 }

 COMMAND_USAGE = {
@@ -25,8 +24,7 @@ COMMAND_USAGE = {
    'browse': 'Args:\n<url>\nUsage:\n```\nbrowse https://github.com/OpenDevin/OpenDevin\n```\nThis will fetch the Text elements from the given url and show them to you.',
 }

-DEFAULT_COMMANDS = '\n'.join(
-    [k + ' - ' + v for k, v in DEFAULT_COMMANDS_DICT.items()])
+DEFAULT_COMMANDS = '\n'.join([k + ' - ' + v for k, v in DEFAULT_COMMANDS_DICT.items()])

 # from opendevin.parse_commands import parse_command_file
 # USE parse_command_file(filepath) to get the custom commands
@@ -126,7 +124,8 @@ You have access to a variety of tools and commands that you can use to help you
 """.strip()


-def NO_ACTION(latest): return f"""
+def NO_ACTION(latest):
+    return f"""
 You did not include any action to take in your most recent output:

 ===== Output ======
@@ -154,7 +153,8 @@ def file_info(file: str, line: int):
    """


-def STEP_PROMPT(task, file, line_num): return f"""
+def STEP_PROMPT(task, file, line_num):
+    return f"""
 {RESPONSE_FORMAT}
 You are currently trying to complete this task:
 {task}
@@ -185,7 +185,8 @@ def unpack_dict(data: dict, restrict: list[str] = []):
    return '\n'.join(lines)


-def MEMORY_FORMAT(act, obs): return f"""
+def MEMORY_FORMAT(act, obs):
+    return f"""
 Previous Action:
 {unpack_dict(act, ["content"])}

@@ -1,23 +1,27 @@
 import re
-from typing import List, Mapping

-from agenthub.codeact_agent.prompt import EXAMPLES, SYSTEM_MESSAGE
+from agenthub.codeact_agent.prompt import (
+    COMMAND_DOCS,
+    EXAMPLES,
+    GITHUB_MESSAGE,
+    SYSTEM_PREFIX,
+    SYSTEM_SUFFIX,
+)
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
+from opendevin.core.logger import opendevin_logger as logger
 from opendevin.events.action import (
    Action,
-    AgentEchoAction,
    AgentFinishAction,
-    AgentTalkAction,
+    BrowseInteractiveAction,
    CmdRunAction,
    IPythonRunCellAction,
-    NullAction,
+    MessageAction,
 )
 from opendevin.events.observation import (
-    AgentMessageObservation,
+    BrowserOutputObservation,
    CmdOutputObservation,
    IPythonRunCellObservation,
-    UserMessageObservation,
 )
 from opendevin.llm.llm import LLM
 from opendevin.runtime.plugins import (
@@ -26,16 +30,18 @@ from opendevin.runtime.plugins import (
    SWEAgentCommandsRequirement,
 )

+ENABLE_GITHUB = True  # when True, GITHUB_MESSAGE is included in CodeActAgent.system_message
+

 def parse_response(response) -> str:
    action = response.choices[0].message.content
-    for lang in ['bash', 'ipython']:
+    for lang in ['bash', 'ipython', 'browse']:
        if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
            action += f'</execute_{lang}>'
    return action


-def truncate_observation(observation: str, max_chars: int = 5000) -> str:
+def truncate_observation(observation: str, max_chars: int = 10_000) -> str:
    """
    Truncate the middle of the observation if it is too long.
    """
@@ -49,7 +55,39 @@ def truncate_observation(observation: str, max_chars: int = 5000) -> str:
    )


+def swe_agent_edit_hack(bash_command: str) -> str:
+    """
+    Hack to handle the SWE-agent edit command by rewriting its `end_of_edit`-terminated form into a heredoc. The vanilla edit command will hang the SSHBox.
+
+    REPLACE THIS:
+    edit 683:693
+            try:
+                return list(urlsplit(url))
+            except ValueError:
+                raise ValidationError(self.error_messages['invalid'], code='invalid')
+    end_of_edit
+
+    WITH THIS:
+    edit 683:693 <<EOF
+            try:
+                return list(urlsplit(url))
+            except ValueError:
+                raise ValidationError(self.error_messages['invalid'], code='invalid')
+    EOF
+    """
+    if 'edit' in bash_command:
+        # Rewrite every `edit <start>:<end> ... end_of_edit` into `edit <start>:<end> <<EOF ... EOF`.
+        # The body is matched non-greedily, so each edit command in the string is rewritten on its own.
+        bash_command = re.sub(
+            r'edit\s(\d+):(\d+)([\s\S]*?)end_of_edit',
+            r'edit \1:\2 <<EOF\3EOF',
+            bash_command,
+        )
+    return bash_command
+
+
 class CodeActAgent(Agent):
+    VERSION = '1.3'  # NOTE(review): this precedes the string literal below, so that string is no longer picked up as the class docstring; consider moving VERSION after it
    """
    The Code Act Agent is a minimalist agent.
    The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
@@ -86,22 +124,15 @@ class CodeActAgent(Agent):

    """

-    sandbox_plugins: List[PluginRequirement] = [
+    sandbox_plugins: list[PluginRequirement] = [
        JupyterRequirement(),
        SWEAgentCommandsRequirement(),
    ]
-    SUPPORTED_ACTIONS = (
-        CmdRunAction,
-        IPythonRunCellAction,
-        AgentEchoAction,
-        AgentTalkAction,
-        NullAction,
-    )
-    SUPPORTED_OBSERVATIONS = (
-        AgentMessageObservation,
-        UserMessageObservation,
-        CmdOutputObservation,
-        IPythonRunCellObservation,
+
+    system_message: str = (  # composed once, when the class body is executed
+        f'{SYSTEM_PREFIX}\n{GITHUB_MESSAGE}\n\n{COMMAND_DOCS}\n\n{SYSTEM_SUFFIX}'
+        if ENABLE_GITHUB  # module-level flag: include the GitHub token instructions
+        else f'{SYSTEM_PREFIX}\n\n{COMMAND_DOCS}\n\n{SYSTEM_SUFFIX}'
     )

    def __init__(
@@ -115,7 +146,21 @@ class CodeActAgent(Agent):
        - llm (LLM): The llm to be used by this agent
        """
        super().__init__(llm)
-        self.messages: List[Mapping[str, str]] = []
+        self.reset()
+
+    def reset(self) -> None:
+        """
+        Resets the CodeAct Agent: restores the initial conversation (system prompt plus the example-interaction user turn) and zeroes the accumulated cost.
+        """
+        super().reset()
+        self.messages: list[dict[str, str]] = [
+            {'role': 'system', 'content': self.system_message},
+            {
+                'role': 'user',
+                'content': f"Here is an example of how you can interact with the environment for task solving:\n{EXAMPLES}\n\nNOW, LET'S START!",
+            },
+        ]
+        self.cost_accumulator = 0  # running total that log_cost adds to

    def step(self, state: State) -> Action:
        """
@@ -128,76 +173,72 @@ class CodeActAgent(Agent):
        Returns:
        - CmdRunAction(command) - bash command to run
        - IPythonRunCellAction(code) - IPython code to run
-        - AgentTalkAction(content) - Talk action to run (e.g. ask for clarification)
+        - BrowseInteractiveAction(browsergym_command) - BrowserGym commands to run
+        - MessageAction(content) - Message action to run (e.g. ask for clarification)
        - AgentFinishAction() - end the interaction
        """

-        if len(self.messages) == 0:
-            assert state.plan.main_goal, 'Expecting instruction to be set'
-            self.messages = [
-                {'role': 'system', 'content': SYSTEM_MESSAGE},
-                {
-                    'role': 'user',
-                    'content': (
-                        f'Here is an example of how you can interact with the environment for task solving:\n{EXAMPLES}\n\n'
-                        f"NOW, LET'S START!\n\n{state.plan.main_goal}"
-                    ),
-                },
-            ]
        updated_info = state.updated_info
        if updated_info:
            for prev_action, obs in updated_info:
-                assert isinstance(
-                    prev_action, self.SUPPORTED_ACTIONS
-                ), f'{prev_action.__class__} is not supported (supported: {self.SUPPORTED_ACTIONS})'
-                # prev_action is already added to self.messages when returned
-
-                # handle observations
-                assert isinstance(
-                    obs, self.SUPPORTED_OBSERVATIONS
-                ), f'{obs.__class__} is not supported (supported: {self.SUPPORTED_OBSERVATIONS})'
-                if isinstance(obs, (AgentMessageObservation, UserMessageObservation)):
-                    self.messages.append({'role': 'user', 'content': obs.content})
-
-                    # User wants to exit
-                    if obs.content.strip() == '/exit':
+                if (
+                    isinstance(prev_action, MessageAction)
+                    and prev_action.source == 'user'
+                ):
+                    self.messages.append(
+                        {'role': 'user', 'content': prev_action.content}
+                    )
+                    if prev_action.content.strip() == '/exit':
+                        # User wants to exit
                        return AgentFinishAction()
-                elif isinstance(obs, CmdOutputObservation):
+
+                if isinstance(obs, CmdOutputObservation):
                    content = 'OBSERVATION:\n' + truncate_observation(obs.content)
                    content += f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]]'
                    self.messages.append({'role': 'user', 'content': content})
-
                elif isinstance(obs, IPythonRunCellObservation):
                    content = 'OBSERVATION:\n' + obs.content
                    # replace base64 images with a placeholder
-                    splited = content.split('\n')
-                    for i, line in enumerate(splited):
+                    splitted = content.split('\n')
+                    for i, line in enumerate(splitted):
                        if '![image](data:image/png;base64,' in line:
-                            splited[i] = (
+                            splitted[i] = (
                                '![image](data:image/png;base64, ...) already displayed to user'
                            )
-                    content = '\n'.join(splited)
+                    content = '\n'.join(splitted)
                    content = truncate_observation(content)
                    self.messages.append({'role': 'user', 'content': content})
-                else:
-                    raise NotImplementedError(
-                        f'Unknown observation type: {obs.__class__}'
-                    )
+                elif isinstance(obs, BrowserOutputObservation):
+                    content = 'OBSERVATION:\n' + truncate_observation(obs.content)
+                    self.messages.append({'role': 'user', 'content': content})
+
+        latest_user_message = [m for m in self.messages if m['role'] == 'user'][-1]
+        if latest_user_message:
+            latest_user_message['content'] += (
+                f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task.'
+            )

        response = self.llm.completion(
            messages=self.messages,
            stop=[
                '</execute_ipython>',
                '</execute_bash>',
+                '</execute_browse>',
            ],
            temperature=0.0,
        )
+
+        self.log_cost(response)
+
        action_str: str = parse_response(response)
        state.num_of_chars += sum(
            len(message['content']) for message in self.messages
        ) + len(action_str)
        self.messages.append({'role': 'assistant', 'content': action_str})

+        if finish_command := re.search(r'<finish>.*</finish>', action_str, re.DOTALL):
+            thought = action_str.replace(finish_command.group(0), '').strip()
+            return AgentFinishAction(thought=thought)
        if bash_command := re.search(
            r'<execute_bash>(.*)</execute_bash>', action_str, re.DOTALL
        ):
@@ -205,6 +246,8 @@ class CodeActAgent(Agent):
            thought = action_str.replace(bash_command.group(0), '').strip()
            # a command was found
            command_group = bash_command.group(1).strip()
+            command_group = swe_agent_edit_hack(command_group)
+
            if command_group.strip() == 'exit':
                return AgentFinishAction()
            return CmdRunAction(command=command_group, thought=thought)
@@ -215,10 +258,31 @@ class CodeActAgent(Agent):
            code_group = python_code.group(1).strip()
            thought = action_str.replace(python_code.group(0), '').strip()
            return IPythonRunCellAction(code=code_group, thought=thought)
+        elif browse_command := re.search(
+            r'<execute_browse>(.*)</execute_browse>', action_str, re.DOTALL
+        ):
+            # BrowserGym actions were found
+            browse_actions = browse_command.group(1).strip()
+            thought = action_str.replace(browse_command.group(0), '').strip()
+            return BrowseInteractiveAction(
+                browser_actions=browse_actions, thought=thought
+            )
        else:
            # We assume the LLM is GOOD enough that when it returns pure natural language
            # it want to talk to the user
-            return AgentTalkAction(content=action_str)
+            return MessageAction(content=action_str, wait_for_response=True)

-    def search_memory(self, query: str) -> List[str]:
+    def search_memory(self, query: str) -> list[str]:
        raise NotImplementedError('Implement this abstract method')
+
+    def log_cost(self, response):  # log the cost of one LLM response and the running total
+        try:
+            cur_cost = self.llm.completion_cost(response)
+        except Exception:  # any failure while computing the cost is counted as zero cost
+            cur_cost = 0
+        self.cost_accumulator += cur_cost
+        logger.info(
+            'Cost: %.2f USD | Accumulated Cost: %.2f USD',
+            cur_cost,
+            self.cost_accumulator,
+        )
@@ -18,31 +18,41 @@ search_file <search_term> [<file>] - searches for search_term in file. If file i
 find_file <file_name> [<dir>] - finds all files with the given name in dir. If dir is not provided, searches in the current directory
 edit <start_line>:<end_line>
 <replacement_text>
-end_of_edit - replaces lines <start_line> through <end_line> (inclusive) with the given text in the open file. The replacement text is terminated by a line with only end_of_edit on it. All of the <replacement text> will be entered, so make sure your indentation is formatted properly. Python files will be checked for syntax errors after the edit. If the system detects a syntax error, the edit will not be executed. Simply try to edit the file again, but make sure to read the error message and modify the edit command you issue accordingly. Issuing the same command a second time will just lead to the same error message again.
+end_of_edit - replaces lines <start_line> through <end_line> (inclusive) with the given text in the open file. The replacement text is terminated by a line with only end_of_edit on it. All of the <replacement text> will be entered, so make sure your indentation is formatted properly. Python files will be checked for syntax errors after the edit. If the system detects a syntax error, the edit will not be executed. Simply try to edit the file again, but make sure to read the error message and modify the edit command you issue accordingly. Issuing the same command a second time will just lead to the same error message again. Remember, the file must be open before editing.
 """

-_COMMAND_DOCS = (
+COMMAND_DOCS = (
    '\nApart from the standard bash commands, you can also use the following special commands in <execute_bash> environment:\n'
    f'{_SWEAGENT_BASH_DOCS}'
    "Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION. If you'd like to add the line '        print(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run."
 )

-SYSTEM_MESSAGE = f"""A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+SYSTEM_PREFIX = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
 The assistant can interact with an interactive Python (Jupyter Notebook) environment and receive the corresponding output when needed. The code should be enclosed using "<execute_ipython>" tag, for example:
 <execute_ipython>
 print("Hello World!")
 </execute_ipython>
 The assistant can execute bash commands on behalf of the user by wrapping them with <execute_bash> and </execute_bash>.
 For example, you can list the files in the current directory by <execute_bash> ls </execute_bash>.
+The assistant can browse the Internet with commands on behalf of the user by wrapping them with <execute_browse> and </execute_browse>.
+For example, you can browse a given URL by <execute_browse> goto("<URL>") </execute_browse>.
 The assistant should attempt fewer things at a time instead of putting too much commands OR code in one "execute" block.
 The assistant can install Python packages through bash by <execute_bash> pip install [package needed] </execute_bash> and should always import packages and define variables before starting to use them.
 The assistant should stop <execute> and provide an answer when they have already obtained the answer from the execution result.
+If the assistant encounters an import error in IPython for a newly installed package, they should try to restart the kernel and import the package again. IPython kernel can be re-started by:
+<execute_ipython>
+import IPython
+IPython.Application.instance().kernel.do_shutdown(True)  # Restart the kernel
+</execute_ipython>"""

-{_COMMAND_DOCS}
+GITHUB_MESSAGE = """To do any activities on GitHub, you should use the token in the $GITHUB_TOKEN environment variable.
+For instance, to push a local branch `my_branch` to the github repo `owner/repo`, you can use the following command:
+<execute_bash> git push https://$GITHUB_TOKEN@github.com/owner/repo.git my_branch </execute_bash>
+If you require access to GitHub but $GITHUB_TOKEN is not set, ask the user to set it for you."""

-The assistant's response should be concise, but do express their thoughts.
-Try to include one of <execute_ipython> or <execute_bash> in each of your responses, unless it is a direct answer to a question OR a message to the user.
-IMPORTANT: Whenever possible, execute the code for the user using <execute_ipython> or <execute_bash> instead of providing it.
+SYSTEM_SUFFIX = """The assistant's response should be concise.
+You should include <execute_ipython> or <execute_bash> or <execute_browse> in every one of your responses, unless you are finished with the task or need more input or action from the user in order to proceed.
+IMPORTANT: Whenever possible, execute the code for the user using <execute_ipython> or <execute_bash> or <execute_browse> instead of providing it.
 """

 EXAMPLES = """
@@ -146,6 +156,21 @@ Press CTRL+C to quit
 ASSISTANT:
 The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!

+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+<execute_browse>
+goto("http://127.0.0.1:5000")
+</execute_browse>
+
+USER:
+Observation:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
 USER: Now kill the server, make it display the numbers in a table format.

 ASSISTANT:
@@ -179,12 +204,11 @@ USER:
 11:    app.run(port=5000)

 ASSISTANT:
-I should edit the file to display the numbers in a table format. Let me do that for you:
-
+I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
 <execute_bash>
-edit 8:8 <<EOF
+edit 8:8
    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
-EOF
+end_of_edit
 </execute_bash>

 USER:
@@ -223,4 +247,5 @@ INVALID_INPUT_MESSAGE = (
    "I don't understand your input. \n"
    'If you want to execute a bash command, please use <execute_bash> YOUR_COMMAND_HERE </execute_bash>.\n'
    'If you want to execute a block of Python code, please use <execute_ipython> YOUR_COMMAND_HERE </execute_ipython>.\n'
+    'If you want to browse the Internet, please use <execute_browse> YOUR_COMMAND_HERE </execute_browse>.\n'
 )
@@ -1,5 +1,3 @@
-from typing import List
-
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
 from opendevin.events.action import Action, AgentDelegateAction, AgentFinishAction
@@ -38,48 +36,50 @@ class DelegatorAgent(Agent):
        """
        if self.current_delegate == '':
            self.current_delegate = 'study'
+            task = state.get_current_user_intent()
            return AgentDelegateAction(
-                agent='StudyRepoForTaskAgent', inputs={'task': state.plan.main_goal}
+                agent='StudyRepoForTaskAgent', inputs={'task': task}
            )

-        lastObservation = state.history[-1][1]
-        if not isinstance(lastObservation, AgentDelegateObservation):
+        last_observation = state.history[-1][1]
+        if not isinstance(last_observation, AgentDelegateObservation):
            raise Exception('Last observation is not an AgentDelegateObservation')

+        goal = state.get_current_user_intent()
        if self.current_delegate == 'study':
            self.current_delegate = 'coder'
            return AgentDelegateAction(
-                agent='Coder',
+                agent='CoderAgent',
                inputs={
-                    'task': state.plan.main_goal,
-                    'summary': lastObservation.outputs['summary'],
+                    'task': goal,
+                    'summary': last_observation.outputs['summary'],
                },
            )
        elif self.current_delegate == 'coder':
            self.current_delegate = 'verifier'
            return AgentDelegateAction(
-                agent='Verifier',
+                agent='VerifierAgent',
                inputs={
-                    'task': state.plan.main_goal,
+                    'task': goal,
                },
            )
        elif self.current_delegate == 'verifier':
            if (
-                'completed' in lastObservation.outputs
-                and lastObservation.outputs['completed']
+                'completed' in last_observation.outputs
+                and last_observation.outputs['completed']
            ):
                return AgentFinishAction()
            else:
                self.current_delegate = 'coder'
                return AgentDelegateAction(
-                    agent='Coder',
+                    agent='CoderAgent',
                    inputs={
-                        'task': state.plan.main_goal,
-                        'summary': lastObservation.outputs['summary'],
+                        'task': goal,
+                        'summary': last_observation.outputs['summary'],
                    },
                )
        else:
            raise Exception('Invalid delegate state')

-    def search_memory(self, query: str) -> List[str]:
+    def search_memory(self, query: str) -> list[str]:
        return []
@@ -1,5 +1,5 @@
 import time
-from typing import List, TypedDict
+from typing import TypedDict

 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
@@ -8,11 +8,12 @@ from opendevin.events.action import (
    AddTaskAction,
    AgentFinishAction,
    AgentRecallAction,
-    AgentThinkAction,
+    AgentRejectAction,
    BrowseURLAction,
    CmdRunAction,
    FileReadAction,
    FileWriteAction,
+    MessageAction,
    ModifyTaskAction,
 )
 from opendevin.events.observation import (
@@ -23,6 +24,7 @@ from opendevin.events.observation import (
    NullObservation,
    Observation,
 )
+from opendevin.events.serialization.event import event_to_dict
 from opendevin.llm.llm import LLM

 """
@@ -34,7 +36,7 @@ FIXME: There are a few problems this surfaced
 """

 ActionObs = TypedDict(
-    'ActionObs', {'action': Action, 'observations': List[Observation]}
+    'ActionObs', {'action': Action, 'observations': list[Observation]}
 )

 BACKGROUND_CMD = 'echo "This is in the background" && sleep .1 && echo "This too"'
@@ -48,7 +50,7 @@ class DummyAgent(Agent):

    def __init__(self, llm: LLM):
        super().__init__(llm)
-        self.steps: List[ActionObs] = [
+        self.steps: list[ActionObs] = [
            {
                'action': AddTaskAction(parent='0', goal='check the current directory'),
                'observations': [NullObservation('')],
@@ -58,11 +60,11 @@ class DummyAgent(Agent):
                'observations': [NullObservation('')],
            },
            {
-                'action': ModifyTaskAction(id='0.0', state='in_progress'),
+                'action': ModifyTaskAction(task_id='0.0', state='in_progress'),
                'observations': [NullObservation('')],
            },
            {
-                'action': AgentThinkAction(thought='Time to get started!'),
+                'action': MessageAction('Time to get started!'),
                'observations': [NullObservation('')],
            },
            {
@@ -95,7 +97,7 @@ class DummyAgent(Agent):
                'action': CmdRunAction(command=BACKGROUND_CMD, background=True),
                'observations': [
                    CmdOutputObservation(
-                        'Background command started. To stop it, send a `kill` action with id 42',
+                        'Background command started. To stop it, send a `kill` action with command_id 42',
                        command_id='42',  # type: ignore[arg-type]
                        command=BACKGROUND_CMD,
                    ),
@@ -123,6 +125,10 @@ class DummyAgent(Agent):
                'action': AgentFinishAction(),
                'observations': [],
            },
+            {
+                'action': AgentRejectAction(),
+                'observations': [],
+            },
        ]

    def step(self, state: State) -> Action:
@@ -133,8 +139,8 @@ class DummyAgent(Agent):
                expected_observations = prev_step['observations']
                hist_start = len(state.history) - len(expected_observations)
                for i in range(len(expected_observations)):
-                    hist_obs = state.history[hist_start + i][1].to_dict()
-                    expected_obs = expected_observations[i].to_dict()
+                    hist_obs = event_to_dict(state.history[hist_start + i][1])
+                    expected_obs = event_to_dict(expected_observations[i])
                    if (
                        'command_id' in hist_obs['extras']
                        and hist_obs['extras']['command_id'] != -1
@@ -155,5 +161,5 @@ class DummyAgent(Agent):
                    ), f'Expected observation {expected_obs}, got {hist_obs}'
        return self.steps[state.iteration]['action']

-    def search_memory(self, query: str) -> List[str]:
+    def search_memory(self, query: str) -> list[str]:
        return ['I am a computer.']
@@ -1,2 +1,2 @@
 * `kill` - kills a background command
-  * `id` - the ID of the background command to kill
+  * `command_id` - the ID of the background command to kill
@@ -0,0 +1,3 @@
+* `message` - make a plan, set a goal, record your thoughts, or ask for more input from the user. Arguments:
+  * `content` - the thought to record
+  * `wait_for_response` - set to `true` to wait for the user to respond before proceeding
@@ -0,0 +1,2 @@
+* `reject` - reject the task. Arguments:
+  * `outputs` - a dictionary representing the outputs of your task, if any
@@ -1,2 +0,0 @@
-* `think` - make a plan, set a goal, or record your thoughts. Arguments:
-  * `thought` - the thought to record
@@ -1,13 +1,11 @@
-import copy
-import json
-from typing import Dict, List
-
 from jinja2 import BaseLoader, Environment

 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.exceptions import LLMOutputError
-from opendevin.events.action import Action, action_from_dict
+from opendevin.core.utils import json
+from opendevin.events.action import Action
+from opendevin.events.serialization.action import action_from_dict
+from opendevin.events.serialization.event import event_to_memory
 from opendevin.llm.llm import LLM

 from .instructions import instructions
@@ -15,71 +13,35 @@ from .registry import all_microagents


 def parse_response(orig_response: str) -> Action:
-    json_start = orig_response.find('{')
-    json_end = orig_response.rfind('}') + 1
-    response = orig_response[json_start:json_end]
-    try:
-        action_dict = json.loads(response)
-    except json.JSONDecodeError as e:
-        raise LLMOutputError(
-            'Invalid JSON in response. Please make sure the response is a valid JSON object'
-        ) from e
-    action = action_from_dict(action_dict)
-    return action
+    # attempt to load the JSON dict from the response
+    action_dict = json.loads(orig_response)

-
-def my_encoder(obj):
-    """
-    Encodes objects as dictionaries
-
-    Parameters:
-    - obj (Object): An object that will be converted
-
-    Returns:
-    - dict: If the object can be converted it is returned in dict format
-    """
-    if hasattr(obj, 'to_dict'):
-        return obj.to_dict()
-
-
-def _remove_fields(obj, fields: set[str]):
-    """
-    Remove fields from an object
-
-    Parameters:
-    - obj (Object): The object to remove fields from
-    - fields (set[str]): A set of field names to remove from the object
-    """
-    if isinstance(obj, dict):
-        for field in fields:
-            if field in obj:
-                del obj[field]
-        for _, value in obj.items():
-            _remove_fields(value, fields)
-    elif isinstance(obj, list) or isinstance(obj, tuple):
-        for item in obj:
-            _remove_fields(item, fields)
-    elif hasattr(obj, '__dataclass_fields__'):
-        for field in fields:
-            if field in obj.__dataclass_fields__:
-                setattr(obj, field, None)
-        for value in obj.__dict__.values():
-            _remove_fields(value, fields)
+    # load the action from the dict
+    return action_from_dict(action_dict)


 def to_json(obj, **kwargs):
    """
    Serialize an object to str format
    """
-    # Remove things like screenshots that shouldn't be in a prompt
-    sanitized_obj = copy.deepcopy(obj)
-    _remove_fields(sanitized_obj, {'screenshot'})
-    return json.dumps(sanitized_obj, default=my_encoder, **kwargs)
+    return json.dumps(obj, **kwargs)
+
+
+def history_to_json(obj, **kwargs):
+    """
+    Serialize and simplify history to str format. `obj` is normally a list of
+    (action, observation) pairs; any other value is serialized unchanged.
+    """
+    if not isinstance(obj, list):
+        return json.dumps(obj, **kwargs)  # previously fell through and returned None
+    # process history, make it simpler: pass each event through event_to_memory
+    processed_history = [(event_to_memory(a), event_to_memory(o)) for a, o in obj]
+    return json.dumps(processed_history, **kwargs)


 class MicroAgent(Agent):
    prompt = ''
-    agent_definition: Dict = {}
+    agent_definition: dict = {}

    def __init__(self, llm: LLM):
        super().__init__(llm)
@@ -90,11 +52,14 @@ class MicroAgent(Agent):
        del self.delegates[self.agent_definition['name']]

    def step(self, state: State) -> Action:
+        latest_user_message = state.get_current_user_intent()
        prompt = self.prompt_template.render(
            state=state,
            instructions=instructions,
            to_json=to_json,
+            history_to_json=history_to_json,
            delegates=self.delegates,
+            latest_user_message=latest_user_message,
        )
        messages = [{'content': prompt, 'role': 'user'}]
        resp = self.llm.completion(messages=messages)
@@ -103,5 +68,5 @@ class MicroAgent(Agent):
        action = parse_response(action_resp)
        return action

-    def search_memory(self, query: str) -> List[str]:
+    def search_memory(self, query: str) -> list[str]:
        return []
@@ -2,7 +2,7 @@
 You are a software engineer. You've inherited an existing codebase, which you
 need to modify to complete this task:

-{{ state.plan.main_goal }}
+{{ latest_user_message }}

 {% if state.inputs.summary %}
 Here's a summary of the codebase, as it relates to this task:
@@ -14,14 +14,14 @@ Here's a summary of the codebase, as it relates to this task:
 {{ instructions.actions.run }}
 {{ instructions.actions.write }}
 {{ instructions.actions.read }}
-{{ instructions.actions.think }}
+{{ instructions.actions.message }}
 {{ instructions.actions.finish }}

 Do NOT finish until you have completed the tasks.

 ## History
 {{ instructions.history_truncated }}
-{{ to_json(state.history[-10:]) }}
+{{ history_to_json(state.history[-10:]) }}

 ## Format
 {{ instructions.format.action }}
@@ -1,6 +1,5 @@
 name: CommitWriterAgent
 description: "Write a git commit message for files in the git staging area"
-generates: Action
 inputs: {}
 outputs:
  answer: string
@@ -12,16 +12,19 @@ changes. The commit message should include:
 - Optionally, a detailed description if the changes are complex or need further explanation.

 You should find the diff using `git diff --cached`, compile a commit message,
-and call the `finish` action with `outputs.answer` set to the answer.
+and call the `finish` action with `outputs.answer` set to the answer. If the current
+repo is not a valid git repo, or there is no diff in the staging area, please call
+the `reject` action with `outputs.answer` set to the reason.

 ## History
 {{ instructions.history_truncated }}
-{{ to_json(state.history[-10:]) }}
+{{ history_to_json(state.history[-10:]) }}

 If the last item in the history is an error, you should try to fix it.

 ## Available Actions
 {{ instructions.actions.run }}
+{{ instructions.actions.reject }}
 {{ instructions.actions.finish }}

 ## Format
@@ -1,19 +1,21 @@
 import os
-from typing import Dict

-instructions: Dict = {}
+instructions: dict = {}

 base_dir = os.path.dirname(os.path.abspath(__file__)) + '/_instructions'
 for root, dirs, files in os.walk(base_dir):
    if len(files) == 0:
        continue
-    rel_base = os.path.relpath(root, base_dir)
-    keys = rel_base.split('/')
-    obj = instructions
-    for key in keys:
-        if key not in obj:
-            obj[key] = {}
-        obj = obj[key]
+    if root == base_dir:
+        obj = instructions
+    else:
+        rel_base = os.path.relpath(root, base_dir)
+        keys = rel_base.split('/')
+        obj = instructions
+        for key in keys:
+            if key not in obj:
+                obj[key] = {}
+            obj = obj[key]
    for file in files:
        without_ext = os.path.splitext(file)[0]
        with open(os.path.join(root, file), 'r') as f:
@@ -1,6 +1,6 @@
 # Task
 You are in charge of accomplishing the following task:
-{{ state.plan.main_goal }}
+{{ latest_user_message }}

 In order to accomplish this goal, you must delegate tasks to one or more agents, who
 can do the actual work. A description of each agent is provided below. You MUST
@@ -17,7 +17,7 @@ provide the correct inputs for the delegate you select.

 ## History
 {{ instructions.history_truncated }}
-{{ to_json(state.history[-10:]) }}
+{{ history_to_json(state.history[-10:]) }}

 ## Available Actions
 {{ instructions.actions.delegate }}
@@ -1,6 +1,5 @@
 name: MathAgent
 description: "Solves simple and complex math problems using python"
-generates: Action
 container: python:3.12.3-bookworm
 inputs:
  task: string
@@ -1,7 +1,7 @@
 # Task
 You are a brilliant mathematician and programmer. You've been given the following problem to solve:

-{{ state.plan.main_goal }}
+{{ latest_user_message }}

 Please write a python script that solves this problem, and prints the answer to stdout.
 ONLY print the answer to stdout, nothing else.
@@ -10,7 +10,7 @@ and call the `finish` action with `outputs.answer` set to the answer.

 ## History
 {{ instructions.history_truncated }}
-{{ to_json(state.history[-10:]) }}
+{{ history_to_json(state.history[-10:]) }}

 If the last item in the history is an error, you should try to fix it.

@@ -1,6 +1,5 @@
 name: PostgresAgent
 description: Writes and maintains PostgreSQL migrations
-generates: Action
 inputs:
  task: string
 outputs: {}
@@ -2,7 +2,7 @@
 You are a database engineer. You are working on an existing Postgres project, and have been given
 the following task:

-{{ state.plan.main_goal }}
+{{ latest_user_message }}

 You must:
 * Investigate the existing migrations to understand the current schema
@@ -11,14 +11,14 @@ You must:

 ## Actions
 You may take any of the following actions:
-{{ instructions.actions.think }}
+{{ instructions.actions.message }}
 {{ instructions.actions.read }}
 {{ instructions.actions.write }}
 {{ instructions.actions.run }}

 ## History
 {{ instructions.history_truncated }}
-{{ to_json(state.history[-10:]) }}
+{{ history_to_json(state.history[-10:]) }}

 ## Format
 {{ instructions.format.action }}
@@ -10,7 +10,7 @@ of the codebase, including:
 ## Available Actions
 {{ instructions.actions.run }}
 {{ instructions.actions.read }}
-{{ instructions.actions.think }}
+{{ instructions.actions.message }}
 {{ instructions.actions.finish }}

 You should ONLY `run` commands that have no side-effects, like `ls` and `grep`.
@@ -20,7 +20,7 @@ When you're done, put your summary into the output of the `finish` action.

 ## History
 {{ instructions.history_truncated }}
-{{ to_json(state.history[-10:]) }}
+{{ history_to_json(state.history[-10:]) }}

 ## Format
 {{ instructions.format.action }}
@@ -3,12 +3,12 @@ You are a software engineer. You've inherited an existing codebase, which you're
 learning about for the first time. You need to study the codebase to find all
 the information needed to complete this task:

-{{ state.plan.main_goal }}
+{{ latest_user_message }}

 ## Available Actions
 {{ instructions.actions.run }}
 {{ instructions.actions.read }}
-{{ instructions.actions.think }}
+{{ instructions.actions.message }}
 {{ instructions.actions.finish }}

 You must ONLY `run` commands that have no side-effects, like `ls` and `grep`.
@@ -19,7 +19,7 @@ When you're done, put your summary in `outputs.summary` in the `finish` action.

 ## History
 {{ instructions.history_truncated }}
-{{ to_json(state.history[-10:]) }}
+{{ history_to_json(state.history[-10:]) }}

 ## Format
 {{ instructions.format.action }}
@@ -0,0 +1,5 @@
+name: TypoFixerAgent
+description: Fixes typos in files in the current working directory
+inputs: {}
+outputs:
+  summary: string
@@ -0,0 +1,46 @@
+# Task
+You are a proofreader tasked with fixing typos in the files in your current working directory. Your goal is to:
+1. Scan the files for typos
+2. Overwrite the files with the typos fixed
+3. Provide a summary of the typos fixed
+
+## Available Actions
+{{ instructions.actions.read }}
+{{ instructions.actions.write }}
+{{ instructions.actions.run }}
+{{ instructions.actions.message }}
+{{ instructions.actions.finish }}
+
+To complete this task:
+1. Use the `read` action to read the contents of the files in your current working directory. Make sure to provide the file path in the format `'./file_name.ext'`.
+2. Use the `message` action to analyze the contents and identify typos.
+3. Use the `write` action to create new versions of the files with the typos fixed.
+  - Overwrite the original files with the corrected content. Make sure to provide the file path in the format `'./file_name.ext'`.
+4. Use the `message` action to generate a summary of the typos fixed, including the original and fixed versions of each typo, and the file(s) they were found in.
+5. Use the `finish` action to return the summary in the `outputs.summary` field.
+
+Do NOT finish until you have fixed all the typos and generated a summary.
+
+## History
+{{ instructions.history_truncated }}
+{{ history_to_json(state.history[-5:]) }}
+
+## Format
+{{ instructions.format.action }}
+
+For example, if you want to use the read action to read the contents of a file named example.txt, your response should look like this:
+{
+  "action": "read",
+  "args": {
+    "path": "./example.txt"
+  }
+}
+
+Similarly, if you want to use the write action to write content to a file named output.txt, your response should look like this:
+{
+  "action": "write",
+  "args": {
+    "path": "./output.txt",
+    "content": "This is the content to be written to the file."
+  }
+}
@@ -2,14 +2,14 @@
 You are a quality assurance engineer. Another engineer has made changes to the
 codebase which are supposed to solve this task:

-{{ state.plan.main_goal }}
+{{ latest_user_message }}

 Your goal is to verify that the changes are correct and bug-free.

 ## Available Actions
 {{ instructions.actions.run }}
 {{ instructions.actions.read }}
-{{ instructions.actions.think }}
+{{ instructions.actions.message }}
 {{ instructions.actions.finish }}

 You must ONLY `run` commands that have no side-effects, like `ls`, `grep`, and test scripts.
@@ -21,7 +21,7 @@ explaining what the problem is.

 ## History
 {{ instructions.history_truncated }}
-{{ to_json(state.history[-10:]) }}
+{{ history_to_json(state.history[-10:]) }}

 ## Format
 {{ instructions.format.action }}
@@ -1,22 +1,17 @@
-from typing import List
-
 import agenthub.monologue_agent.utils.prompts as prompts
-from agenthub.monologue_agent.utils.monologue import Monologue
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core import config
+from opendevin.core.config import config
 from opendevin.core.exceptions import AgentNoInstructionError
 from opendevin.core.schema import ActionType
-from opendevin.core.schema.config import ConfigType
 from opendevin.events.action import (
    Action,
    AgentRecallAction,
-    AgentThinkAction,
    BrowseURLAction,
    CmdRunAction,
    FileReadAction,
    FileWriteAction,
-    GitHubPushAction,
+    MessageAction,
    NullAction,
 )
 from opendevin.events.observation import (
@@ -27,12 +22,15 @@ from opendevin.events.observation import (
    NullObservation,
    Observation,
 )
+from opendevin.events.serialization.event import event_to_memory
 from opendevin.llm.llm import LLM
+from opendevin.memory.condenser import MemoryCondenser
+from opendevin.memory.history import ShortTermHistory

-if config.get(ConfigType.AGENT_MEMORY_ENABLED):
-    from agenthub.monologue_agent.utils.memory import LongTermMemory
+if config.agent.memory_enabled:
+    from opendevin.memory.memory import LongTermMemory

-MAX_MONOLOGUE_LENGTH = 20000
+MAX_TOKEN_COUNT_PADDING = 512
 MAX_OUTPUT_LENGTH = 5000

 INITIAL_THOUGHTS = [
@@ -71,10 +69,6 @@ INITIAL_THOUGHTS = [
    'BROWSE google.com',
    '<form><input type="text"></input><button type="submit"></button></form>',
    'I can browse the web too!',
-    'If I have done some work and I want to push it to github, I can do that also!',
-    "Let's do it.",
-    'PUSH owner/repo branch',
-    'The repo was successfully pushed to https://github.com/owner/repo/branch',
    'And once I have completed my task, I can use the finish action to stop working.',
    "But I should only use the finish action when I'm absolutely certain that I've completed my task and have tested my work.",
    'Very cool. Now to accomplish my task.',
@@ -93,8 +87,9 @@ class MonologueAgent(Agent):
    """

    _initialized = False
-    monologue: Monologue
+    monologue: ShortTermHistory
    memory: 'LongTermMemory | None'
+    memory_condenser: MemoryCondenser

    def __init__(self, llm: LLM):
        """
@@ -105,7 +100,7 @@ class MonologueAgent(Agent):
        """
        super().__init__(llm)

-    def _add_event(self, event: dict):
+    def _add_event(self, event_dict: dict):
        """
        Adds a new event to the agent's monologue and memory.
        Monologue automatically condenses when it gets too large.
@@ -114,22 +109,34 @@ class MonologueAgent(Agent):
        - event_dict (dict): The event that will be added to monologue and memory
        """

-        if 'extras' in event and 'screenshot' in event['extras']:
-            del event['extras']['screenshot']
        if (
-            'args' in event
-            and 'output' in event['args']
-            and len(event['args']['output']) > MAX_OUTPUT_LENGTH
+            'args' in event_dict
+            and 'output' in event_dict['args']
+            and len(event_dict['args']['output']) > MAX_OUTPUT_LENGTH
        ):
-            event['args']['output'] = (
-                event['args']['output'][:MAX_OUTPUT_LENGTH] + '...'
+            event_dict['args']['output'] = (
+                event_dict['args']['output'][:MAX_OUTPUT_LENGTH] + '...'
            )

-        self.monologue.add_event(event)
+        self.monologue.add_event(event_dict)
        if self.memory is not None:
-            self.memory.add_event(event)
-        if self.monologue.get_total_length() > MAX_MONOLOGUE_LENGTH:
-            self.monologue.condense(self.llm)
+            self.memory.add_event(event_dict)
+
+        # Test monologue token length
+        prompt = prompts.get_request_action_prompt(
+            '',
+            self.monologue.get_events(),
+            [],
+        )
+        messages = [{'content': prompt, 'role': 'user'}]
+        token_count = self.llm.get_token_count(messages)
+
+        if token_count + MAX_TOKEN_COUNT_PADDING > self.llm.max_input_tokens:
+            prompt = prompts.get_summarize_monologue_prompt(self.monologue.events)
+            summary_response = self.memory_condenser.condense(
+                summarize_prompt=prompt, llm=self.llm
+            )
+            self.monologue.events = prompts.parse_summary_response(summary_response)

    def _initialize(self, task: str):
        """
@@ -151,12 +158,14 @@ class MonologueAgent(Agent):
        if task is None or task == '':
            raise AgentNoInstructionError()

-        self.monologue = Monologue()
-        if config.get(ConfigType.AGENT_MEMORY_ENABLED):
+        self.monologue = ShortTermHistory()
+        if config.agent.memory_enabled:
            self.memory = LongTermMemory()
        else:
            self.memory = None

+        self.memory_condenser = MemoryCondenser()
+
        self._add_initial_thoughts(task)
        self._initialized = True

@@ -178,7 +187,7 @@ class MonologueAgent(Agent):
                    observation = BrowserOutputObservation(
                        content=thought, url='', screenshot=''
                    )
-                self._add_event(observation.to_memory())
+                self._add_event(event_to_memory(observation))
                previous_action = ''
            else:
                action: Action = NullAction()
@@ -203,14 +212,9 @@ class MonologueAgent(Agent):
                    url = thought.split('BROWSE ')[1]
                    action = BrowseURLAction(url=url)
                    previous_action = ActionType.BROWSE
-                elif thought.startswith('PUSH'):
-                    owner_repo, branch = thought.split('PUSH ')[1].split(' ')
-                    owner, repo = owner_repo.split('/')
-                    action = GitHubPushAction(owner=owner, repo=repo, branch=branch)
-                    previous_action = ActionType.PUSH
                else:
-                    action = AgentThinkAction(thought=thought)
-                self._add_event(action.to_memory())
+                    action = MessageAction(thought)
+                self._add_event(event_to_memory(action))

    def step(self, state: State) -> Action:
        """
@@ -222,16 +226,18 @@ class MonologueAgent(Agent):
        Returns:
        - Action: The next action to take based on LLM response
        """
-        self._initialize(state.plan.main_goal)
+
+        goal = state.get_current_user_intent()
+        self._initialize(goal)
        for prev_action, obs in state.updated_info:
-            self._add_event(prev_action.to_memory())
-            self._add_event(obs.to_memory())
+            self._add_event(event_to_memory(prev_action))
+            self._add_event(event_to_memory(obs))

        state.updated_info = []

        prompt = prompts.get_request_action_prompt(
-            state.plan.main_goal,
-            self.monologue.get_thoughts(),
+            goal,
+            self.monologue.get_events(),
            state.background_commands_obs,
        )
        messages = [{'content': prompt, 'role': 'user'}]
@@ -242,7 +248,7 @@ class MonologueAgent(Agent):
        self.latest_action = action
        return action

-    def search_memory(self, query: str) -> List[str]:
+    def search_memory(self, query: str) -> list[str]:
        """
        Uses VectorIndexRetriever to find related memories within the long term memory.
        Uses search to produce top 10 results.
@@ -251,7 +257,7 @@ class MonologueAgent(Agent):
        - query (str): The query that we want to find related memories for

        Returns:
-        - List[str]: A list of top 10 text results that matched the query
+        - list[str]: A list of top 10 text results that matched the query
        """
        if self.memory is None:
            return []
@@ -1,38 +0,0 @@
-import json
-
-from json_repair import repair_json
-
-
-def my_encoder(obj):
-    """
-    Encodes objects as dictionaries
-
-    Parameters:
-    - obj (Object): An object that will be converted
-
-    Returns:
-    - dict: If the object can be converted it is returned in dict format
-    """
-    if hasattr(obj, 'to_dict'):
-        return obj.to_dict()
-
-
-def dumps(obj, **kwargs):
-    """
-    Serialize an object to str format
-    """
-
-    return json.dumps(obj, default=my_encoder, **kwargs)
-
-
-def loads(s, **kwargs):
-    """
-    Create a JSON object from str
-    """
-    json_start = s.find('{')
-    json_end = s.rfind('}') + 1
-    if json_start == -1 or json_end == -1:
-        raise ValueError('Invalid response: no JSON found')
-    s = s[json_start:json_end]
-    s = repair_json(s)
-    return json.loads(s, **kwargs)
@@ -1,79 +0,0 @@
-import agenthub.monologue_agent.utils.json as json
-import agenthub.monologue_agent.utils.prompts as prompts
-from opendevin.core.exceptions import AgentEventTypeError
-from opendevin.core.logger import opendevin_logger as logger
-from opendevin.llm.llm import LLM
-
-
-class Monologue:
-    """
-    The monologue is a representation for the agent's internal monologue where it can think.
-    The agent has the capability of using this monologue for whatever it wants.
-    """
-
-    def __init__(self):
-        """
-        Initialize the empty list of thoughts
-        """
-        self.thoughts = []
-
-    def add_event(self, t: dict):
-        """
-        Adds an event to memory if it is a valid event.
-
-        Parameters:
-        - t (dict): The thought that we want to add to memory
-
-        Raises:
-        - AgentEventTypeError: If t is not a dict
-        """
-        if not isinstance(t, dict):
-            raise AgentEventTypeError()
-        self.thoughts.append(t)
-
-    def get_thoughts(self):
-        """
-        Get the current thoughts of the agent.
-
-        Returns:
-        - List: The list of thoughts that the agent has.
-        """
-        return self.thoughts
-
-    def get_total_length(self):
-        """
-        Gives the total number of characters in all thoughts
-
-        Returns:
-        - Int: Total number of chars in thoughts.
-        """
-        total_length = 0
-        for t in self.thoughts:
-            try:
-                total_length += len(json.dumps(t))
-            except TypeError as e:
-                logger.error('Error serializing thought: %s', str(e), exc_info=False)
-        return total_length
-
-    def condense(self, llm: LLM):
-        """
-        Attempts to condense the monologue by using the llm
-
-        Parameters:
-        - llm (LLM): llm to be used for summarization
-
-        Raises:
-        - Exception: the same exception as it got from the llm or processing the response
-        """
-
-        try:
-            prompt = prompts.get_summarize_monologue_prompt(self.thoughts)
-            messages = [{'content': prompt, 'role': 'user'}]
-            resp = llm.completion(messages=messages)
-            summary_resp = resp['choices'][0]['message']['content']
-            self.thoughts = prompts.parse_summary_response(summary_resp)
-        except Exception as e:
-            logger.error('Error condensing thoughts: %s', str(e), exc_info=False)
-
-            # TODO If the llm fails with ContextWindowExceededError, we can try to condense the monologue chunk by chunk
-            raise
@@ -1,19 +1,12 @@
-import re
-from json import JSONDecodeError
-from typing import List
-
-from opendevin.core import config
-from opendevin.core.exceptions import LLMOutputError
-from opendevin.core.schema.config import ConfigType
+from opendevin.core.config import config
+from opendevin.core.utils import json
 from opendevin.events.action import (
    Action,
-    action_from_dict,
 )
 from opendevin.events.observation import (
    CmdOutputObservation,
 )
-
-from . import json
+from opendevin.events.serialization.action import action_from_dict

 ACTION_PROMPT = """
 You're a thoughtful robot. Your main task is this:
@@ -42,7 +35,7 @@ Here are the possible actions:
  * `command` - the command to run
  * `background` - if true, run the command in the background, so that other commands can be run concurrently. Useful for e.g. starting a server. You won't be able to see the logs. You don't need to end the command with `&`, just set this to true.
 * `kill` - kills a background command
-  * `id` - the ID of the background command to kill
+  * `command_id` - the ID of the background command to kill
 * `browse` - opens a web page. Arguments:
  * `url` - the URL to open
 * `push` - Push a branch from the current repo to github:
@@ -51,15 +44,16 @@ Here are the possible actions:
  * `branch` - the name of the branch to push
 * `recall` - recalls a past memory. Arguments:
  * `query` - the query to search for
-* `think` - make a plan, set a goal, or record your thoughts. Arguments:
-  * `thought` - the thought to record
+* `message` - make a plan, set a goal, record your thoughts, or ask for more input from the user. Arguments:
+  * `content` - the message to record
+  * `wait_for_response` - set to `true` to wait for the user to respond before proceeding
 * `finish` - if you're absolutely certain that you've completed your task and have tested your work, use the finish action to stop working.

 %(background_commands)s

-You MUST take time to think in between read, write, run, browse, push, and recall actions.
+You MUST take time to think in between read, write, run, browse, push, and recall actions--do this with the `message` action.
 You should never act twice in a row without thinking. But if your last several
-actions are all "think" actions, you should consider taking a different action.
+actions are all `message` actions, you should consider taking a different action.

 Notes:
 * you are logged in as %(user)s, but sudo will always work without a password.
@@ -68,7 +62,7 @@ Notes:
 * don't run interactive commands, or commands that don't return (e.g. `node server.js`). You may run commands in the background (e.g. `node server.js &`)
 * don't run interactive text editors (e.g. `nano` or 'vim'), instead use the 'write' or 'read' action.
 * don't run gui applications (e.g. software IDEs (like vs code or codium), web browsers (like firefox or chromium), or other complex software packages). Use non-interactive cli applications, or special actions instead.
-* whenever an action fails, always `think` about why it may have happened before acting again.
+* whenever an action fails, always send a `message` about why it may have happened before acting again.

 What is your next single thought or action? Again, you must reply with JSON, and only with JSON. You must respond with exactly one 'action' object.

@@ -99,7 +93,7 @@ You can also use the same action and args from the source monologue.
 """


-def get_summarize_monologue_prompt(thoughts: List[dict]):
+def get_summarize_monologue_prompt(thoughts: list[dict]):
    """
    Gets the prompt for summarizing the monologue

@@ -113,16 +107,16 @@ def get_summarize_monologue_prompt(thoughts: List[dict]):

 def get_request_action_prompt(
    task: str,
-    thoughts: List[dict],
-    background_commands_obs: List[CmdOutputObservation] = [],
+    thoughts: list[dict],
+    background_commands_obs: list[CmdOutputObservation] = [],
 ):
    """
    Gets the action prompt formatted with appropriate values.

    Parameters:
    - task (str): The current task the agent is trying to accomplish
-    - thoughts (List[dict]): The agent's current thoughts
-    - background_commands_obs (List[CmdOutputObservation]): List of all observed background commands running
+    - thoughts (list[dict]): The agent's current thoughts
+    - background_commands_obs (list[CmdOutputObservation]): list of all observed background commands running

    Returns:
    - str: Formatted prompt string with hint, task, monologue, and background included
@@ -132,8 +126,8 @@ def get_request_action_prompt(
    if len(thoughts) > 0:
        latest_thought = thoughts[-1]
        if 'action' in latest_thought:
-            if latest_thought['action'] == 'think':
-                if latest_thought['args']['thought'].startswith('OK so my task is'):
+            if latest_thought['action'] == 'message':
+                if latest_thought['args']['content'].startswith('OK so my task is'):
                    hint = "You're just getting started! What should you do first?"
                else:
                    hint = "You've been thinking a lot lately. Maybe it's time to take action?"
@@ -147,9 +141,9 @@ def get_request_action_prompt(
            bg_commands_message += (
                f'\n`{command_obs.command_id}`: {command_obs.command}'
            )
-        bg_commands_message += '\nYou can end any process by sending a `kill` action with the numerical `id` above.'
+        bg_commands_message += '\nYou can end any process by sending a `kill` action with the numerical `command_id` above.'

-    user = 'opendevin' if config.get(ConfigType.RUN_AS_DEVIN) else 'root'
+    user = 'opendevin' if config.run_as_devin else 'root'

    return ACTION_PROMPT % {
        'task': task,
@@ -157,14 +151,12 @@ def get_request_action_prompt(
        'background_commands': bg_commands_message,
        'hint': hint,
        'user': user,
-        'timeout': config.get(ConfigType.SANDBOX_TIMEOUT),
-        'WORKSPACE_MOUNT_PATH_IN_SANDBOX': config.get(
-            ConfigType.WORKSPACE_MOUNT_PATH_IN_SANDBOX
-        ),
+        'timeout': config.sandbox_timeout,
+        'WORKSPACE_MOUNT_PATH_IN_SANDBOX': config.workspace_mount_path_in_sandbox,
    }


-def parse_action_response(response: str) -> Action:
+def parse_action_response(orig_response: str) -> Action:
    """
    Parses a string to find an action within it

@@ -174,39 +166,17 @@ def parse_action_response(response: str) -> Action:
    Returns:
    - Action: The action that was found in the response string
    """
-    try:
-        action_dict = json.loads(response)
-    except JSONDecodeError:
-        # Find response-looking json in the output and use the more promising one. Helps with weak llms
-        response_json_matches = re.finditer(
-            r"""{\s*\"action\":\s?\"(\w+)\"(?:,?|,\s*\"args\":\s?{((?:.|\s)*?)})\s*}""",
-            response,
-        )  # Find all response-looking strings
+    # attempt to load the JSON dict from the response
+    action_dict = json.loads(orig_response)

-        def rank(match):
-            return (
-                len(match[2]) if match[1] == 'think' else 130
-            )  # Crudely rank multiple responses by length
-
-        try:
-            action_dict = json.loads(
-                max(response_json_matches, key=rank)[0]
-            )  # Use the highest ranked response
-        except (ValueError, JSONDecodeError):
-            raise LLMOutputError(
-                'Invalid JSON, the response must be well-formed JSON as specified in the prompt.'
-            )
-    except (ValueError, TypeError):
-        raise LLMOutputError(
-            'Invalid JSON, the response must be well-formed JSON as specified in the prompt.'
-        )
    if 'content' in action_dict:
        # The LLM gets confused here. Might as well be robust
        action_dict['contents'] = action_dict.pop('content')
+
    return action_from_dict(action_dict)


-def parse_summary_response(response: str) -> List[dict]:
+def parse_summary_response(response: str) -> list[dict]:
    """
    Parses a summary of the monologue

@@ -214,7 +184,7 @@ def parse_summary_response(response: str) -> List[dict]:
    - response (str): The response string to be parsed

    Returns:
-    - List[dict]: The list of summaries output by the model
+    - list[dict]: The list of summaries output by the model
    """
    parsed = json.loads(response)
    return parsed['new_monologue']
@@ -1,5 +1,3 @@
-from typing import List
-
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
 from opendevin.events.action import Action, AgentFinishAction
@@ -36,9 +34,13 @@ class PlannerAgent(Agent):
        - Action: The next action to take based on llm response
        """

-        if state.plan.task.state in ['completed', 'verified', 'abandoned']:
+        if state.root_task.state in [
+            'completed',
+            'verified',
+            'abandoned',
+        ]:
            return AgentFinishAction()
-        prompt = get_prompt(state.plan, state.history)
+        prompt = get_prompt(state)
        messages = [{'content': prompt, 'role': 'user'}]
        resp = self.llm.completion(messages=messages)
        action_resp = resp['choices'][0]['message']['content']
@@ -46,5 +48,5 @@ class PlannerAgent(Agent):
        action = parse_response(action_resp)
        return action

-    def search_memory(self, query: str) -> List[str]:
+    def search_memory(self, query: str) -> list[str]:
        return []
@@ -1,43 +1,16 @@
-import json
-from typing import Dict, List, Tuple, Type
-
-from opendevin.controller.state.plan import Plan
+from opendevin.controller.state.state import State
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.schema import ActionType
+from opendevin.core.utils import json
 from opendevin.events.action import (
    Action,
-    AddTaskAction,
-    AgentFinishAction,
-    AgentRecallAction,
-    AgentSummarizeAction,
-    AgentThinkAction,
-    BrowseURLAction,
-    CmdKillAction,
-    CmdRunAction,
-    FileReadAction,
-    FileWriteAction,
-    ModifyTaskAction,
    NullAction,
-    action_from_dict,
 )
 from opendevin.events.observation import (
    NullObservation,
-    Observation,
 )
-
-ACTION_TYPE_TO_CLASS: Dict[str, Type[Action]] = {
-    ActionType.RUN: CmdRunAction,
-    ActionType.KILL: CmdKillAction,
-    ActionType.BROWSE: BrowseURLAction,
-    ActionType.READ: FileReadAction,
-    ActionType.WRITE: FileWriteAction,
-    ActionType.RECALL: AgentRecallAction,
-    ActionType.THINK: AgentThinkAction,
-    ActionType.SUMMARIZE: AgentSummarizeAction,
-    ActionType.FINISH: AgentFinishAction,
-    ActionType.ADD_TASK: AddTaskAction,
-    ActionType.MODIFY_TASK: ModifyTaskAction,
-}
+from opendevin.events.serialization.action import action_from_dict
+from opendevin.events.serialization.event import event_to_memory

 HISTORY_SIZE = 10

@@ -106,23 +79,24 @@ It must be an object, and it must contain two fields:
  * `command` - the command to run
  * `background` - if true, run the command in the background, so that other commands can be run concurrently. Useful for e.g. starting a server. You won't be able to see the logs. You don't need to end the command with `&`, just set this to true.
 * `kill` - kills a background command
-  * `id` - the ID of the background command to kill
+  * `command_id` - the ID of the background command to kill
 * `browse` - opens a web page. Arguments:
  * `url` - the URL to open
-* `think` - make a plan, set a goal, or record your thoughts. Arguments:
-  * `thought` - the thought to record
+* `message` - make a plan, set a goal, record your thoughts, or ask for more input from the user. Arguments:
+  * `content` - the message to record
+  * `wait_for_response` - set to `true` to wait for the user to respond before proceeding
 * `add_task` - add a task to your plan. Arguments:
-  * `parent` - the ID of the parent task
+  * `parent` - the ID of the parent task (leave empty if it should go at the top level)
  * `goal` - the goal of the task
  * `subtasks` - a list of subtasks, each of which is a map with a `goal` key.
 * `modify_task` - close a task. Arguments:
-  * `id` - the ID of the task to close
+  * `task_id` - the ID of the task to close
  * `state` - set to 'in_progress' to start the task, 'completed' to finish it, 'verified' to assert that it was successful, 'abandoned' to give up on it permanently, or `open` to stop working on it for now.
 * `finish` - if ALL of your tasks and subtasks have been verified or abandoned, and you're absolutely certain that you've completed your task and have tested your work, use the finish action to stop working.

-You MUST take time to think in between read, write, run, browse, and recall actions.
+You MUST take time to think in between read, write, run, browse, and recall actions--do this with the `message` action.
 You should never act twice in a row without thinking. But if your last several
-actions are all `think` actions, you should consider taking a different action.
+actions are all `message` actions, you should consider taking a different action.

 What is your next thought or action? Again, you must reply with JSON, and only with JSON.

@@ -139,7 +113,7 @@ def get_hint(latest_action_id: str) -> str:
        ActionType.READ: 'You should think about the file you just read, what you learned from it, and how that affects your plan.',
        ActionType.WRITE: 'You just changed a file. You should think about how it affects your plan.',
        ActionType.BROWSE: 'You should think about the page you just visited, and what you learned from it.',
-        ActionType.THINK: "Look at your last thought in the history above. What does it suggest? Don't think anymore--take action.",
+        ActionType.MESSAGE: "Look at your last thought in the history above. What does it suggest? Don't think anymore--take action.",
        ActionType.RECALL: 'You should think about the information you just recalled, and how it should affect your plan.',
        ActionType.ADD_TASK: 'You should think about the next action to take.',
        ActionType.MODIFY_TASK: 'You should think about the next action to take.',
@@ -149,47 +123,42 @@ def get_hint(latest_action_id: str) -> str:
    return hints.get(latest_action_id, '')


-def get_prompt(plan: Plan, history: List[Tuple[Action, Observation]]) -> str:
+def get_prompt(state: State) -> str:
    """
    Gets the prompt for the planner agent.
    Formatted with the most recent action-observation pairs, current task, and hint based on last action

    Parameters:
-    - plan (Plan): The original plan outlined by the user with LLM defined tasks
-    - history (List[Tuple[Action, Observation]]): List of corresponding action-observation pairs
+    - state (State): The state of the current agent

    Returns:
    - str: The formatted string prompt with historical values
    """

-    plan_str = json.dumps(plan.task.to_dict(), indent=2)
-    sub_history = history[-HISTORY_SIZE:]
+    plan_str = json.dumps(state.root_task.to_dict(), indent=2)
+    sub_history = state.history[-HISTORY_SIZE:]
    history_dicts = []
    latest_action: Action = NullAction()
    for action, observation in sub_history:
        if not isinstance(action, NullAction):
-            history_dicts.append(action.to_memory())
+            history_dicts.append(event_to_memory(action))
            latest_action = action
        if not isinstance(observation, NullObservation):
-            observation_dict = observation.to_memory()
-            if (
-                'extras' in observation_dict
-                and 'screenshot' in observation_dict['extras']
-            ):
-                del observation_dict['extras']['screenshot']
+            observation_dict = event_to_memory(observation)
            history_dicts.append(observation_dict)
    history_str = json.dumps(history_dicts, indent=2)
-    current_task = plan.get_current_task()
+    current_task = state.root_task.get_current_task()
    if current_task is not None:
        plan_status = f"You're currently working on this task:\n{current_task.goal}."
        if len(current_task.subtasks) == 0:
            plan_status += "\nIf it's not achievable AND verifiable with a SINGLE action, you MUST break it down into subtasks NOW."
    else:
        plan_status = "You're not currently working on any tasks. Your next action MUST be to mark a task as in_progress."
-    hint = get_hint(latest_action.to_dict()['action'])
+    hint = get_hint(event_to_memory(latest_action).get('action', ''))
    logger.info('HINT:\n' + hint, extra={'msg_type': 'INFO'})
+    task = state.get_current_user_intent()
    return prompt % {
-        'task': plan.main_goal,
+        'task': task,
        'plan': plan_str,
        'history': history_str,
        'hint': hint,
@@ -207,9 +176,6 @@ def parse_response(response: str) -> Action:
    Returns:
    - Action: A valid next action to perform from model output
    """
-    json_start = response.find('{')
-    json_end = response.rfind('}') + 1
-    response = response[json_start:json_end]
    action_dict = json.loads(response)
    if 'contents' in action_dict:
        # The LLM gets confused here. Might as well be robust
@@ -8,6 +8,4 @@ by the `ghcr.yml` workflow.
 ```
 docker build -f containers/app/Dockerfile -t opendevin .
 docker build -f containers/sandbox/Dockerfile -t sandbox .
-docker build -f containers/evaluation/Dockerfile -t evaluation evaluation/SWE-bench/
-
 ```
@@ -45,6 +45,7 @@ RUN apt-get update -y \
    && apt-get install -y curl ssh sudo

 RUN sed -i 's/^UID_MIN.*/UID_MIN 499/' /etc/login.defs # Default is 1000, but OSX is often 501
+RUN sed -i 's/^UID_MAX.*/UID_MAX 1000000/' /etc/login.defs # Default is 60000, but we've seen up to 200000

 RUN groupadd app
 RUN useradd -l -m -u $OPENDEVIN_USER_ID -s /bin/bash opendevin && \
@@ -73,4 +74,8 @@ COPY --chown=opendevin:app --chmod=770 --from=frontend-builder /app/dist ./front
 COPY --chown=opendevin:app --chmod=770 ./containers/app/entrypoint.sh /app/entrypoint.sh

 USER root
-CMD ["/app/entrypoint.sh"]
+
+WORKDIR /app
+
+ENTRYPOINT ["/app/entrypoint.sh"]
+CMD ["uvicorn", "opendevin.server.listen:app", "--host", "0.0.0.0", "--port", "3000"]
@@ -1,5 +1,13 @@
 #!/bin/bash
-# check user is root
+set -eo pipefail
+
+echo "Starting OpenDevin..."
+if [[ $NO_SETUP == "true" ]]; then
+  echo "Skipping setup, running as $(whoami)"
+  "$@"
+  exit 0
+fi
+
 if [ "$(id -u)" -ne 0 ]; then
  echo "The OpenDevin entrypoint.sh must run as root"
  exit 1
@@ -11,30 +19,37 @@ if [ -z "$SANDBOX_USER_ID" ]; then
 fi

 if [[ "$SANDBOX_USER_ID" -eq 0 ]]; then
-  echo "SANDBOX_USER_ID cannot be 0. Please run with a different user id."
-  exit 1
-fi
-
-# change uid of opendevin user to match the host user
-# but the group id is not changed, so the user can still access everything under /app
-if ! useradd -l -m -u $SANDBOX_USER_ID -s /bin/bash enduser; then
-  echo "Failed to create user enduser with id $SANDBOX_USER_ID. Moving opendevin user."
-  incremented_id=$(($SANDBOX_USER_ID + 1))
-  usermod -u $incremented_id opendevin
+  echo "Running OpenDevin as root"
+  export RUN_AS_DEVIN=false
+  mkdir -p /root/.cache/ms-playwright/
+  mv /home/opendevin/.cache/ms-playwright/ /root/.cache/
+  "$@"
+else
+  echo "Setting up enduser with id $SANDBOX_USER_ID"
  if ! useradd -l -m -u $SANDBOX_USER_ID -s /bin/bash enduser; then
-    echo "Failed to create user enduser with id $SANDBOX_USER_ID for a second time. Exiting."
-    exit 1
+    echo "Failed to create user enduser with id $SANDBOX_USER_ID. Moving opendevin user."
+    incremented_id=$(($SANDBOX_USER_ID + 1))
+    usermod -u $incremented_id opendevin
+    if ! useradd -l -m -u $SANDBOX_USER_ID -s /bin/bash enduser; then
+      echo "Failed to create user enduser with id $SANDBOX_USER_ID for a second time. Exiting."
+      exit 1
+    fi
  fi
+  usermod -aG app enduser
+  # get the group id of /var/run/docker.sock so enduser can be added to that group
+  DOCKER_SOCKET_GID=$(stat -c '%g' /var/run/docker.sock)
+  echo "Docker socket group id: $DOCKER_SOCKET_GID"
+  if getent group $DOCKER_SOCKET_GID; then
+    echo "Group with id $DOCKER_SOCKET_GID already exists"
+  else
+    echo "Creating group with id $DOCKER_SOCKET_GID"
+    groupadd -g $DOCKER_SOCKET_GID docker
+  fi
+
+  mkdir -p /home/enduser/.cache/ms-playwright/
+  mv /home/opendevin/.cache/ms-playwright/ /home/enduser/.cache/
+
+  usermod -aG $DOCKER_SOCKET_GID enduser
+  echo "Running as enduser"
+  su enduser /bin/bash -c "$*"
 fi
-
-usermod -aG app enduser
-mkdir -p /home/enduser/.cache/ms-playwright/
-mv /home/opendevin/.cache/ms-playwright/ /home/enduser/.cache/
-
-# get the user group of /var/run/docker.sock and set opendevin to that group
-DOCKER_SOCKET_GID=$(stat -c '%g' /var/run/docker.sock)
-echo "Docker socket group id: $DOCKER_SOCKET_GID"
-usermod -aG $DOCKER_SOCKET_GID enduser
-
-# switch to the user and start the server
-su enduser -c "cd /app && uvicorn opendevin.server.listen:app --host 0.0.0.0 --port 3000"
@@ -1,41 +0,0 @@
-FROM ubuntu:20.04
-
-# https://github.com/princeton-nlp/SWE-bench/issues/15#issuecomment-1815392192
-RUN apt-get update && \
-    apt-get install -y bash gcc git jq wget && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN git config --global user.email "swebench@pnlp.org"
-RUN git config --global user.name "swebench"
-
-RUN apt update && apt install -y build-essential
-
-# Create new user
-RUN useradd -ms /bin/bash swe-bench
-USER swe-bench
-WORKDIR /home/swe-bench
-
-# Setup Conda
-ENV PATH="/home/swe-bench/miniconda3/bin:${PATH}"
-ARG PATH="/home/swe-bench/miniconda3/bin:${PATH}"
-RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-`uname -m`.sh -O miniconda.sh \
-    && mkdir ~/.conda \
-    && bash miniconda.sh -b \
-    && rm -f miniconda.sh
-RUN conda --version
-
-# Setup SWE-Bench Env
-COPY environment.yml .
-RUN conda env create -f environment.yml
-
-# Add commands
-COPY ./commands.sh .
-RUN . ./commands.sh
-
-# Some missing packages
-RUN pip install datasets python-dotenv gitpython
-
-RUN conda init bash
-
-CMD ["/bin/bash"]
@@ -1,4 +0,0 @@
-DOCKER_REGISTRY=ghcr.io
-DOCKER_ORG=opendevin
-DOCKER_IMAGE=eval-swe-bench
-DOCKER_BASE_DIR=evaluation/SWE-bench
@@ -34,7 +34,7 @@ Now we have both Slack workspace for the collaboration on building OpenDevin and
 - [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2ggtwn3k5-PvAA2LUmqGHVZ~XzGq~ILw)
 - [Discord server](https://discord.gg/ESHStjSjD4)

-If you would love to contribute, feel free to join our community (note that now there is no need to fill in the [form](https://forms.gle/758d5p6Ve8r2nxxq6)). Let's simplify software engineering together!
+If you would love to contribute, feel free to join our community. Let's simplify software engineering together!

 🐚 **Code less, make more with OpenDevin.**

@@ -8,12 +8,13 @@ sidebar_position: 3

 ### Description

-This agent implements the CodeAct idea ([paper](https://arxiv.org/abs/2402.13463), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a unified **code** action space for both *simplicity* and *performance* (see paper for more details).
+This agent implements the CodeAct idea ([paper](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a unified **code** action space for both _simplicity_ and _performance_ (see paper for more details).

 The conceptual idea is illustrated below. At each turn, the agent can:

 1. **Converse**: Communicate with humans in natural language to ask for clarification, confirmation, etc.
 2. **CodeAct**: Choose to perform the task by executing code
+
 - Execute any valid Linux `bash` command
 - Execute any valid `Python` code with [an interactive Python interpreter](https://ipython.org/). This is simulated through `bash` command, see plugin system below for more details.

@@ -22,6 +23,7 @@ The conceptual idea is illustrated below. At each turn, the agent can:
 ### Plugin System

 To make the CodeAct agent more powerful with only access to `bash` action space, CodeAct agent leverages OpenDevin&#x27;s plugin system:
+
 - [Jupyter plugin](https://github.com/OpenDevin/OpenDevin/tree/main/opendevin/runtime/plugins/jupyter): for IPython execution via bash command
 - [SWE-agent tool plugin](https://github.com/OpenDevin/OpenDevin/tree/main/opendevin/runtime/plugins/swe_agent_commands): Powerful bash command line tools for software development tasks introduced by [swe-agent](https://github.com/princeton-nlp/swe-agent).

@@ -29,8 +31,7 @@ To make the CodeAct agent more powerful with only access to `bash` action space,

 https://github.com/OpenDevin/OpenDevin/assets/38853559/f592a192-e86c-4f48-ad31-d69282d5f6ac

-*Example of CodeActAgent with `gpt-4-turbo-2024-04-09` performing a data science task (linear regression)*
-
+_Example of CodeActAgent with `gpt-4-turbo-2024-04-09` performing a data science task (linear regression)_

 ### Actions

@@ -50,18 +51,17 @@ https://github.com/OpenDevin/OpenDevin/assets/38853559/f592a192-e86c-4f48-ad31-d

 ### Methods

-| Method          | Description                                                                                                                                                                                                                                             |
-| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `__init__`      | Initializes an agent with `llm` and a list of messages `List[Mapping[str, str]]`                                                                                                                                                                        |
+| Method          | Description                                                                                                                                     |
+| --------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
+| `__init__`      | Initializes an agent with `llm` and a list of messages `list[Mapping[str, str]]`                                                                |
 | `step`          | Performs one step using the CodeAct Agent. This includes gathering info on previous steps and prompting the model to make a command to execute. |
-| `search_memory` | Not yet implemented                                                                                                                                                                                                                                     |
+| `search_memory` | Not yet implemented                                                                                                                             |

 ### Work-in-progress &amp; Next step

 [] Support web-browsing
 [] Complete the workflow for CodeAct agent to submit Github PRs

-
 ## Monologue Agent

 ### Description
@@ -60,7 +60,7 @@ Explore the codebase of OpenDevin on [GitHub](https://github.com/OpenDevin/OpenD

 The easiest way to run OpenDevin is inside a Docker container.

-To start the app, run these commands, replacing `$(pwd)/workspace` with the path to the code you want OpenDevin to work with.
+To start the app, run these commands, replacing `$(pwd)/workspace` with the directory you want OpenDevin to work with.

 ```
 # Your OpenAI API key, or any other LLM API key
@@ -79,13 +79,16 @@ OpenDevin runs bash commands within a Docker sandbox, so it should not affect yo

 ```
 docker run \
+    -it \
+    --pull=always \
    -e LLM_API_KEY \
+    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -v $WORKSPACE_BASE:/opt/workspace_base \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -p 3000:3000 \
-    --add-host host.docker.internal=host-gateway \
-    ghcr.io/opendevin/opendevin:0.4.0
+    --add-host host.docker.internal:host-gateway \
+    ghcr.io/opendevin/opendevin:0.5
 ```

 You'll find opendevin running at [http://localhost:3000](http://localhost:3000).
@@ -44,4 +44,4 @@ are actively working on building better open source models!

 Some LLMs have rate limits and may require retries. OpenDevin will automatically retry requests if it receives a 429 error or API connection error.
 You can set `LLM_NUM_RETRIES`, `LLM_RETRY_MIN_WAIT`, `LLM_RETRY_MAX_WAIT` environment variables to control the number of retries and the time between retries.
-By default, `LLM_NUM_RETRIES` is 5 and `LLM_RETRY_MIN_WAIT`, `LLM_RETRY_MAX_WAIT` are 3 seconds and respectively 60 seconds.
+By default, `LLM_NUM_RETRIES` is 5 and `LLM_RETRY_MIN_WAIT`, `LLM_RETRY_MAX_WAIT` are 3 seconds and 60 seconds respectively.
@@ -1,9 +1,9 @@
 # Local LLM with Ollama

 Ensure that you have the Ollama server up and running.
-For detailed startup instructions, refer to the [here](https://github.com/ollama/ollama)
+For detailed startup instructions, refer to the [Ollama repository](https://github.com/ollama/ollama).

-This guide assumes you've started ollama with `ollama serve`. If you're running ollama differently (e.g. inside docker), the instructions might need to be modified. Please note that if you're running wsl the default ollama configuration blocks requests from docker containers. See [here](#4-configuring-the-ollama-service-wsl).
+This guide assumes you've started ollama with `ollama serve`. If you're running ollama differently (e.g. inside docker), the instructions might need to be modified. Please note that if you're running WSL the default ollama configuration blocks requests from docker containers. See [here](#configuring-the-ollama-service-wsl).

 ## Pull Models

@@ -32,7 +32,7 @@ Use the instructions [here](../intro) to start OpenDevin using Docker.
 But when running `docker run`, you'll need to add a few more arguments:

 ```bash
--add-host host.docker.internal=host-gateway \
+--add-host host.docker.internal:host-gateway \
 -e LLM_API_KEY="ollama" \
 -e LLM_BASE_URL="http://host.docker.internal:11434" \
 ```
@@ -44,7 +44,10 @@ For example:
 export WORKSPACE_BASE=$(pwd)/workspace

 docker run \
-    --add-host host.docker.internal=host-gateway \
+    -it \
+    --pull=always \
+    --add-host host.docker.internal:host-gateway \
+    -e SANDBOX_USER_ID=$(id -u) \
    -e LLM_API_KEY="ollama" \
    -e LLM_BASE_URL="http://host.docker.internal:11434" \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
@@ -84,7 +87,7 @@ And now you're ready to go!

 ## Configuring the ollama service (WSL)

-The default configuration for ollama in wsl only serves localhost. This means you can't reach it from a docker container. eg. it wont work with OpenDevin. First let's test that ollama is running correctly.
+The default configuration for ollama in WSL only serves localhost. This means you can't reach it from a docker container, e.g. it won't work with OpenDevin. First, let's test that ollama is running correctly.

 ```bash
 ollama list # get list of installed models
@@ -93,7 +96,7 @@ curl http://localhost:11434/api/generate -d '{"model":"[NAME]","prompt":"hi"}'
 #ex. curl http://localhost:11434/api/generate -d '{"model":"codellama","prompt":"hi"}' #the tag is optional if there is only one
 ```

-Once that is done test that it allows "outside" requests, like those from inside a docker container.
+Once that is done, test that it allows "outside" requests, like those from inside a docker container.

 ```bash
 docker ps # get list of running docker containers, for most accurate test choose the open devin sandbox container.
@@ -103,7 +106,7 @@ docker exec [CONTAINER ID] curl http://host.docker.internal:11434/api/generate -

 ## Fixing it

-Now let's make it work, edit /etc/systemd/system/ollama.service with sudo privileges. (Path may vary depending on linux flavor)
+Now let's make it work. Edit /etc/systemd/system/ollama.service with sudo privileges. (Path may vary depending on linux flavor)

 ```bash
 sudo vi /etc/systemd/system/ollama.service
@@ -42,6 +42,7 @@ OpenDevin uses a docker container to do its work safely, without potentially bre
 * Run `docker ps` to ensure that docker is running
 * Make sure you don't need `sudo` to run docker [see here](https://www.baeldung.com/linux/docker-run-without-sudo)
 * If you are on a mac, check the [permissions requirements](https://docs.docker.com/desktop/mac/permission-requirements/) and in particular consider enabling the "Allow the default Docker socket to be used" under "Settings > Advanced" in Docker Desktop.
+* If you are on a mac, upgrade your Docker to the latest version under "Check for Updates"

 ## Unable to connect to SSH box
 [GitHub Issue](https://github.com/OpenDevin/OpenDevin/issues/1156)
@@ -89,7 +90,7 @@ See our guide for [local LLMs](llms/localLLMs) for more information.

 - Check your `LLM_BASE_URL`
 - Check that ollama is running OK
- Make sure you're using `--add-host host.docker.internal=host-gateway` when running in docker
+- Make sure you're using `--add-host host.docker.internal:host-gateway` when running in docker

 ## 404 Resource not found
 ### Symptoms
@@ -7,7 +7,7 @@ Please be sure to run all commands inside your WSL terminal.

 ### Failed to create opendevin user

-If you encounter the following error during setup: `Exception: Failed to create opendevin user in sandbox: b'useradd: UID 0 is not unique\n'`
+If you encounter the following error during setup: `Exception: Failed to create opendevin user in sandbox: b'useradd: UID 0 is not unique\n'`.
 You can resolve it by running:
 `    export SANDBOX_USER_ID=1000
   `
@@ -20,7 +20,7 @@ If you face issues running Poetry even after installing it during the build proc

 ### NoneType object has no attribute 'request'

-If you experiencing issues related to networking, such as `NoneType object has no attribute 'request'` when executing `make run`, you may need to configure your WSL2 networking settings. Follow these steps:
+If you are experiencing issues related to networking, such as `NoneType object has no attribute 'request'` when executing `make run`, you may need to configure your WSL2 networking settings. Follow these steps:

 - Open or create the `.wslconfig` file located at `C:\Users\%username%\.wslconfig` on your Windows host machine.
 - Add the following configuration to the `.wslconfig` file:
@@ -8,13 +8,14 @@ export function Code() {
 export WORKSPACE_BASE=$(pwd)/workspace`;

  const dockerCode = `docker run \\
+    -it \\
    --pull=always \\
    -e SANDBOX_USER_ID=$(id -u) \\
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \\
    -v $WORKSPACE_BASE:/opt/workspace_base \\
    -v /var/run/docker.sock:/var/run/docker.sock \\
    -p 3000:3000 \\
-    --add-host host.docker.internal=host-gateway \\
+    --add-host host.docker.internal:host-gateway \\
    ghcr.io/opendevin/opendevin:0.5`;

  return (
@@ -4,76 +4,21 @@ This folder contains code and resources to run experiments and evaluations.

 ## Logistics
 To better organize the evaluation folder, we should follow the rules below:
-  - Each subfolder contains a specific benchmark or experiment. For example, `evaluation/SWE-bench` should contain
+  - Each subfolder contains a specific benchmark or experiment. For example, `evaluation/swe_bench` should contain
 all the preprocessing/evaluation/analysis scripts.
-  - Raw data and experimental records should not be stored within this repo (e.g. Google Drive or Hugging Face Datasets).
+  - Raw data and experimental records should not be stored within this repo.
+    - Model outputs should be stored at [this huggingface space](https://huggingface.co/spaces/OpenDevin/evaluation) for visualization.
  - Important data files of manageable size and analysis scripts (e.g., jupyter notebooks) can be directly uploaded to this repo.

-## Roadmap
+## Supported Benchmarks

- Sanity check. Reproduce Devin's scores on SWE-bench using the released outputs to make sure that our harness pipeline works.
- Open source model support.
-  - Contributors are encouraged to submit their commits to our [forked SEW-bench repo](https://github.com/OpenDevin/SWE-bench).
-  - Ensure compatibility with OpenAI interface for inference.
-  - Serve open source models, prioritizing high concurrency and throughput.
+- SWE-Bench: [`evaluation/swe_bench`](./swe_bench)

-## SWE-bench
- notebooks
-  - `devin_eval_analysis.ipynb`: notebook analyzing devin's outputs
- scripts
-  - `prepare_devin_outputs_for_evaluation.py`: script fetching and converting [devin's output](https://github.com/CognitionAI/devin-swebench-results/tree/main) into the desired json file for evaluation.
-    - usage: `python prepare_devin_outputs_for_evaluation.py <setting>` where setting can be `passed`, `failed` or `all`
- resources
-  - Devin related SWE-bench test subsets
-    - [🤗 OpenDevin/SWE-bench-devin-passed](https://huggingface.co/datasets/OpenDevin/SWE-bench-devin-passed)
-    - [🤗 OpenDevin/SWE-bench-devin-full-filtered](https://huggingface.co/datasets/OpenDevin/SWE-bench-devin-full-filtered)
-  - Devin's outputs processed for evaluations is available on [Huggingface](https://huggingface.co/datasets/OpenDevin/Devin-SWE-bench-output)
-    - get predictions that passed the test: `wget https://huggingface.co/datasets/OpenDevin/Devin-SWE-bench-output/raw/main/devin_swe_passed.json`
-    - get all predictions `wget https://huggingface.co/datasets/OpenDevin/Devin-SWE-bench-output/raw/main/devin_swe_outputs.json`
+### Result Visualization

-See [`SWE-bench/README.md`](./SWE-bench/README.md) for more details on how to run SWE-Bench for evaluation.
+Check [this huggingface space](https://huggingface.co/spaces/OpenDevin/evaluation) for visualization of existing experimental results.

-### Results

-We have refined the original SWE-bench evaluation pipeline to enhance its efficiency and reliability. The updates are as follows:
- Reuse testbeds and Conda environments.
- Additionally try `patch` command for patch application if `git apply` command fails.
+### Upload your results

-#### Results on SWE-bench-devin-passed
-
-[🤗 OpenDevin/SWE-bench-devin-passed](https://huggingface.co/datasets/OpenDevin/SWE-bench-devin-passed)
-
-| Model/Agent            | #instances | #init | #apply | #resolve |
-|------------------------|------------|-------|--------|----------|
-| Gold                   | 79         | 79    | 79     | 79       |
-| Devin                  | 79         | 79    | 76     | 76       |
-
-#init: number of instances where testbeds have been successfully initialized.
-
-In the 3 Devin-failed instances (see below), Devin has made changes to the tests, which are incompatible with the provided test patch and causes failures during patch application. The evaluation adopted by Devin does not seem to align with the original SWE-bench evaluation.
-
-```shell
-django__django-11244
-scikit-learn__scikit-learn-10870
-sphinx-doc__sphinx-9367
-```
-
-#### Results on SWE-bench-devin-failed
-
-| Model/Agent            | #instances | #init | #apply | #resolve |
-|------------------------|------------|-------|--------|----------|
-| Gold                   | 491        | 491   | 491    | 371      |
-| Devin                  | 491        | 491   | 463    | 7        |
-
-Devin **passes** 7 instances on the `SWE-bench-devin-failed` subset. SWE-bench dataset appears to be noisy, evidenced by 120 instances where gold patches do not pass.
-
-We have filtered out the problematic 120 instances, resulting in the creation of the `SWE-bench-devin-full-filtered` subset.
-
-## Results on SWE-bench-devin-full-filtered
-
-[🤗 OpenDevin/SWE-bench-devin-full-filtered](https://huggingface.co/datasets/OpenDevin/SWE-bench-devin-full-filtered)
-
-| Model/Agent            | #instances | #init | #apply | #resolve |
-|------------------------|------------|-------|--------|----------|
-| Gold                   | 450        | 450   | 450    | 450      |
-| Devin                  | 450        | 450   | 426    | 83       |
+You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results to our hosted huggingface repo, following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
@@ -1,80 +0,0 @@
-# SWE-Bench Evaluation
-
-Work in-progress.
-
-**TODOs**:
-
- [ ] Generate `predictions` files given an OpenDevin `Agent` implementation. We could borrow something from [devin's eval-harness implementation](https://github.com/CognitionAI/devin-swebench-results/tree/main/harness), for example, [how to generate `TestSpec`](https://github.com/CognitionAI/devin-swebench-results/blob/main/harness/scripts.py#L150-L160).
- [ ] Make sure the evaluation suite runs on all repos. I only tested on `matplotlib` so far, `scikit-learn` does not work for now (see [this issue](https://github.com/princeton-nlp/SWE-bench/issues/57))).
-
-
-## Run tests for a prediction file inside a docker container
-
-Currently, the docker container should be able to for running SWE-Bench. It was tested on `matplotlib`, but it requires further testing to make sure it works on other repositories. Currently, [it does not work for `scikit-learn`](https://github.com/princeton-nlp/SWE-bench/issues/57)).
-
-### Setup example data
-
-```bash
-cd evaluation/SWE-bench
-./scripts/prepare_devin_swe_bench_data.sh
-
-# Clone the repo
-# This is a fork that fixes some issues that stops matplotlib from running (see https://github.com/princeton-nlp/SWE-bench/pull/56)
-git clone https://github.com/OpenDevin/SWE-bench.git
-
-# Enter the docker container
-./scripts/run_docker_interactive.sh
-```
-
-### Run evaluation
-
-```bash
-#!/bin/bash
-rm -rf data/logs/ data/testbeds/ # (Optional) remove previous outputs
-mkdir -p data/logs
-mkdir -p data/testbeds
-
-python SWE-bench/harness/run_evaluation.py \
-    --predictions_path data/predictions/devin_swe_outputs.json \
-    --swe_bench_tasks data/processed/swe-bench-test.json \
-    --log_dir data/logs \
-    --testbed data/testbeds \
-    --skip_existing \
-    --timeout 900 \
-    --verbose
-```
-
-You will see the command line outputs similar to this (if success):
-
-```log
-swe-bench@2f3a6b9fcab2:/swe-bench$ ./harness/run_evaluation.sh
-/swe-bench/harness/run_evaluation.py:101: SyntaxWarning: assertion is always true, perhaps remove parentheses?
-  assert(temp, datasets.arrow_dataset.Dataset)
-2024-03-20 09:21:18,796 - INFO - Found 1 predictions across 1 model(s) in predictions file
-2024-03-20 09:21:18,796 - INFO - [claude-2/matplotlib__matplotlib/3.6] # of predictions to evaluate: 1 (0 already evaluated)
-2024-03-20 09:21:18,797 - INFO - [Testbed] Creating log directory /swe-bench/data/logs/claude-2
-2024-03-20 09:21:18,797 - INFO - [Testbed] Using conda path /swe-bench/data/testbeds/claude-2/matplotlib__matplotlib/3.6/tmp09wrm708
-2024-03-20 09:21:18,797 - INFO - [Testbed] Using working directory /swe-bench/data/testbeds/claude-2/matplotlib__matplotlib/3.6/tmpfy1qth23 for testbed
-2024-03-20 09:21:18,797 - INFO - [Testbed] Repo matplotlib/matplotlib: 1 versions
-2024-03-20 09:21:18,797 - INFO - [Testbed]      Version 3.6: 1 instances
-2024-03-20 09:21:18,797 - INFO - No conda path provided, creating temporary install in /swe-bench/data/testbeds/claude-2/matplotlib__matplotlib/3.6/tmp09wrm708/miniconda3...
-2024-03-20 09:21:27,482 - INFO - [Testbed] Using conda path /swe-bench/data/testbeds/claude-2/matplotlib__matplotlib/3.6/tmp09wrm708/miniconda3
-2024-03-20 09:21:27,942 - INFO - [Testbed] Setting up testbed for matplotlib__matplotlib__3.6
-2024-03-20 09:21:44,257 - INFO - [Testbed] Cloned matplotlib/matplotlib to /swe-bench/data/testbeds/claude-2/matplotlib__matplotlib/3.6/tmpfy1qth23/matplotlib__matplotlib__3.6
-2024-03-20 09:21:44,415 - INFO - [Testbed] Creating environment matplotlib__matplotlib__3.6; Command: /swe-bench/data/testbeds/claude-2/matplotlib__matplotlib/3.6/tmp09wrm708/miniconda3/bin/conda env create --file /swe-bench/data/testbeds/claude-2/matplotlib__matplotlib/3.6/tmpfy1qth23/environment.yml
-2024-03-20 09:23:39,781 - INFO - [Testbed] Installing pip packages for matplotlib__matplotlib__3.6; Command: . /swe-bench/data/testbeds/claude-2/matplotlib__matplotlib/3.6/tmp09wrm708/miniconda3/bin/activate matplotlib__matplotlib__3.6 && pip install pytest
-/swe-bench/data/testbeds/claude-2/matplotlib__matplotlib/3.6/tmpfy1qth23/matplotlib__matplotlib__3.6: 1 instances
-2024-03-20 09:23:42,309 - INFO - [matplotlib__matplotlib__3.6] [matplotlib__matplotlib-24362] Reset task environment to aca6e9d5e98811ca37c442217914b15e78127c89
-2024-03-20 09:23:42,314 - INFO - [matplotlib__matplotlib__3.6] [matplotlib__matplotlib-24362] Apply patch successful (pred_try)
-2024-03-20 09:23:42,318 - INFO - [matplotlib__matplotlib__3.6] [matplotlib__matplotlib-24362] Revert patch successful (pred_try)
-2024-03-20 09:23:42,318 - INFO - [matplotlib__matplotlib__3.6] [matplotlib__matplotlib-24362] Installing with command: . /swe-bench/data/testbeds/claude-2/matplotlib__matplotlib/3.6/tmp09wrm708/miniconda3/bin/activate matplotlib__matplotlib__3.6 && echo 'activate successful' && python -m pip install -e .
-2024-03-20 09:24:54,966 - INFO - [matplotlib__matplotlib__3.6] [matplotlib__matplotlib-24362] Installation successful
-2024-03-20 09:24:54,970 - INFO - [matplotlib__matplotlib__3.6] [matplotlib__matplotlib-24362] Apply patch successful (test)
-2024-03-20 09:24:54,974 - INFO - [matplotlib__matplotlib__3.6] [matplotlib__matplotlib-24362] Apply patch successful (pred)
-2024-03-20 09:25:04,775 - INFO - [matplotlib__matplotlib__3.6] [matplotlib__matplotlib-24362] Test script run successful
-swe-bench@2f3a6b9fcab2:/swe-bench$
-```
-
-### Interpret Results
-
-Then you may interpret the results under `data/logs`, and interpret it following [this guide](https://github.com/princeton-nlp/SWE-bench/blob/main/tutorials/evaluation.md#-metrics).
@@ -1,155 +0,0 @@
-# @yaml
-# signature: search_dir <search_term> [<dir>]
-# docstring: searches for search_term in all files in dir. If dir is not provided, searches in the current directory
-# arguments:
-#   search_term:
-#     type: string
-#     description: the term to search for
-#     required: true
-#   dir:
-#     type: string
-#     description: the directory to search in (if not provided, searches in the current directory)
-#     required: false
-search_dir() {
-    if [ $# -eq 1 ]; then
-        local search_term="$1"
-        local dir="./"
-    elif [ $# -eq 2 ]; then
-        local search_term="$1"
-        if [ -d "$2" ]; then
-            local dir="$2"
-        else
-            echo "Directory $2 not found"
-            return
-        fi
-    else
-        echo "Usage: search_dir <search_term> [<dir>]"
-        return
-    fi
-    dir=$(realpath "$dir")
-    local matches=$(find "$dir" -type f ! -path '*/.*' -exec grep -nIH "$search_term" {} + | cut -d: -f1 | sort | uniq -c)
-    # if no matches, return
-    if [ -z "$matches" ]; then
-        echo "No matches found for \"$search_term\" in $dir"
-        return
-    fi
-    # Calculate total number of matches
-    local num_matches=$(echo "$matches" | awk '{sum+=$1} END {print sum}')
-    # calculate total number of files matched
-    local num_files=$(echo "$matches" | wc -l | awk '{$1=$1; print $0}')
-    # if num_files is > 100, print an error
-    if [ $num_files -gt 100 ]; then
-        echo "More than $num_files files matched for \"$search_term\" in $dir. Please narrow your search."
-        return
-    fi
-
-    echo "Found $num_matches matches for \"$search_term\" in $dir:"
-    echo "$matches" | awk '{$2=$2; gsub(/^\.+\/+/, "./", $2); print $2 " ("$1" matches)"}'
-    echo "End of matches for \"$search_term\" in $dir"
-}
-
-# @yaml
-# signature: search_file <search_term> [<file>]
-# docstring: searches for search_term in file. If file is not provided, searches in the current open file
-# arguments:
-#   search_term:
-#     type: string
-#     description: the term to search for
-#     required: true
-#   file:
-#     type: string
-#     description: the file to search in (if not provided, searches in the current open file)
-#     required: false
-search_file() {
-    # Check if the first argument is provided
-    if [ -z "$1" ]; then
-        echo "Usage: search_file <search_term> [<file>]"
-        return
-    fi
-    # Check if the second argument is provided
-    if [ -n "$2" ]; then
-        # Check if the provided argument is a valid file
-        if [ -f "$2" ]; then
-            local file="$2"  # Set file if valid
-        else
-            echo "Usage: search_file <search_term> [<file>]"
-            echo "Error: File name $2 not found. Please provide a valid file name."
-            return  # Exit if the file is not valid
-        fi
-    else
-        # Check if a file is open
-        if [ -z "$CURRENT_FILE" ]; then
-            echo "No file open. Use the open command first."
-            return  # Exit if no file is open
-        fi
-        local file="$CURRENT_FILE"  # Set file to the current open file
-    fi
-    local search_term="$1"
-    file=$(realpath "$file")
-    # Use grep to directly get the desired formatted output
-    local matches=$(grep -nH "$search_term" "$file")
-    # Check if no matches were found
-    if [ -z "$matches" ]; then
-        echo "No matches found for \"$search_term\" in $file"
-        return
-    fi
-    # Calculate total number of matches
-    local num_matches=$(echo "$matches" | wc -l | awk '{$1=$1; print $0}')
-
-    # calculate total number of lines matched
-    local num_lines=$(echo "$matches" | cut -d: -f1 | sort | uniq | wc -l | awk '{$1=$1; print $0}')
-    # if num_lines is > 100, print an error
-    if [ $num_lines -gt 100 ]; then
-        echo "More than $num_lines lines matched for \"$search_term\" in $file. Please narrow your search."
-        return
-    fi
-
-    # Print the total number of matches and the matches themselves
-    echo "Found $num_matches matches for \"$search_term\" in $file:"
-    echo "$matches" | cut -d: -f1-2 | sort -u -t: -k2,2n | while IFS=: read -r filename line_number; do
-        echo "Line $line_number:$(sed -n "${line_number}p" "$file")"
-    done
-    echo "End of matches for \"$search_term\" in $file"
-}
-
-# @yaml
-# signature: find_file <file_name> [<dir>]
-# docstring: finds all files with the given name in dir. If dir is not provided, searches in the current directory
-# arguments:
-#   file_name:
-#     type: string
-#     description: the name of the file to search for
-#     required: true
-#   dir:
-#     type: string
-#     description: the directory to search in (if not provided, searches in the current directory)
-#     required: false
-find_file() {
-    if [ $# -eq 1 ]; then
-        local file_name="$1"
-        local dir="./"
-    elif [ $# -eq 2 ]; then
-        local file_name="$1"
-        if [ -d "$2" ]; then
-            local dir="$2"
-        else
-            echo "Directory $2 not found"
-            return
-        fi
-    else
-        echo "Usage: find_file <file_name> [<dir>]"
-        return
-    fi
-
-    dir=$(realpath "$dir")
-    local matches=$(find "$dir" -type f -name "$file_name")
-    # if no matches, return
-    if [ -z "$matches" ]; then
-        echo "No matches found for \"$file_name\" in $dir"
-        return
-    fi
-    # Calculate total number of matches
-    local num_matches=$(echo "$matches" | wc -l | awk '{$1=$1; print $0}')
-    echo "Found $num_matches matches for \"$file_name\" in $dir:"
-    echo "$matches" | awk '{print $0}'
-}
@@ -1,15 +0,0 @@
-# FROM https://github.com/princeton-nlp/SWE-bench/blob/main/environment.yml
-name: swe-bench
-dependencies:
-  - python=3.9
-  - pip
-  - pip:
-    - beautifulsoup4
-    - chardet
-    - ghapi
-    - GitPython
-    - python-dotenv
-    - requests
-    - rich
-    - transformers>=4.34.0
-  - conda-forge::gh
@@ -1,5 +0,0 @@
-from datasets import load_dataset
-
-dataset = load_dataset('princeton-nlp/SWE-bench')
-test = dataset['test'].to_pandas()
-test.to_json('data/processed/swe-bench-test.json', orient='records')
@@ -1,81 +0,0 @@
-'''
-Script used to convert devin's output into the desired json format for evaluation on SWE-bench
-
-Usage:
-    python prepare_devin_outputs_for_evaluation.py <setting>
-    <setting> can be "passed", "failed", "all"
-
-Outputs:
-    two json files under evaluation/SWE-bench/data/
-
-'''
-
-# fetch devin's outputs into a json file for evaluation
-import json
-import os
-import sys
-
-import requests
-from tqdm import tqdm
-
-
-def get_devin_eval_output(setting):
-    repo_url = 'CognitionAI/devin-swebench-results'
-    folder_path = 'output_diffs'
-
-    base_url = 'https://api.github.com/repos/'
-    pass_api_url = f'{base_url}{repo_url}/contents/{folder_path}/pass'
-    failed_api_url = f'{base_url}{repo_url}/contents/{folder_path}/fail'
-
-    pass_files_info = []
-    failed_files_info = []
-
-    def get_files(api_url, subfolder_name, files_info):
-        response = requests.get(api_url)
-        if response.status_code == 200:
-            contents = response.json()
-            for item in tqdm(contents):
-                if item['type'] == 'file':
-                    file_url = f"https://raw.githubusercontent.com/{repo_url}/main/{folder_path}/{subfolder_name}/{item['name']}"
-                    file_content = requests.get(file_url).text
-                    instance_id = item['name'][:-9]
-                    model_name = 'Devin'  # Update with actual model name
-                    files_info.append({
-                        'instance_id': instance_id,
-                        'model_patch': file_content,
-                        'model_name_or_path': model_name,
-                        'pass_or_fail': subfolder_name
-                    })
-
-    if setting == 'passed' or setting == 'all':
-        get_files(pass_api_url, 'pass', pass_files_info)
-    if setting == 'failed' or setting == 'all':
-        get_files(failed_api_url, 'fail', failed_files_info)
-
-    script_dir = os.path.dirname(os.path.abspath(__file__))
-    output_dir = os.path.join(script_dir, '../data/devin/')
-
-    if not os.path.exists(output_dir):
-        os.makedirs(output_dir)
-
-    if setting == 'passed' or setting == 'all':
-        with open(os.path.join(output_dir, 'devin_swe_passed.json'), 'w') as pass_file:
-            json.dump(pass_files_info, pass_file, indent=4)
-
-    if setting == 'failed' or setting == 'all':
-        with open(os.path.join(output_dir, 'devin_swe_failed.json'), 'w') as fail_file:
-            json.dump(failed_files_info, fail_file, indent=4)
-
-    if setting == 'all':
-        merged_output = pass_files_info + failed_files_info
-        with open(os.path.join(output_dir, 'devin_swe_outputs.json'), 'w') as merge_file:
-            json.dump(merged_output, merge_file, indent=4)
-
-
-if __name__ == '__main__':
-    if len(sys.argv) != 2:
-        print('Usage: python script_name.py <setting>')
-        sys.exit(1)
-
-    setting = sys.argv[1]
-    get_devin_eval_output(setting)
@@ -1,10 +0,0 @@
-#!/bin/bash
-
-set -xeo pipefail
-mkdir -p data/processed
-python3 scripts/download_test_data.py
-
-# Download an example output file (FROM claude-2)
-# https://gist.github.com/sorendunn/9f1f1fade59f986b4925b6633f9ff165
-mkdir -p data/predictions
-wget https://huggingface.co/datasets/OpenDevin/Devin-SWE-bench-output/raw/main/devin_swe_outputs.json -O data/predictions/devin_swe_outputs.json
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-DOCKER_IMAGE=ghcr.io/opendevin/eval-swe-bench
-WORK_DIR=`pwd`
-
-docker run \
-    -it \
-    --rm \
-    --user root \
-    --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
-    -v $WORK_DIR:/swe-bench \
-    -w /swe-bench \
-    $DOCKER_IMAGE \
-    /bin/bash -c "usermod -u $(id -u) swe-bench && su swe-bench"
@@ -14,9 +14,9 @@ To run the tests for OpenDevin project, you can use the provided test runner scr
 3. Navigate to the root directory of the project.
 4. Run the test suite using the test runner script with the required arguments:
   ```
-   python evaluation/regression/run_tests.py --OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxx --model=gpt-3.5-turbo-1106
+   python evaluation/regression/run_tests.py --OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxx --model=gpt-3.5-turbo
   ```
-   Replace `sk-xxxxxxxxxxxxxxxxxxxxxx` with your actual OpenAI API key. The default model is `gpt-3.5-turbo-1106`, but you can specify a different model if needed.
+   Replace `sk-xxxxxxxxxxxxxxxxxxxxxx` with your actual OpenAI API key. The default model is `gpt-3.5-turbo`, but you can specify a different model if needed.

 The test runner will discover and execute all the test cases in the `cases/` directory, and display the results of the test suite, including the status of each individual test case and the overall summary.

@@ -19,7 +19,9 @@ def agents():
    """
    agents = []
    for agent in os.listdir(AGENTHUB_DIR):
-        if os.path.isdir(os.path.join(AGENTHUB_DIR, agent)) and agent.endswith('_agent'):
+        if os.path.isdir(os.path.join(AGENTHUB_DIR, agent)) and agent.endswith(
+            '_agent'
+        ):
            agents.append(agent)
    return agents

@@ -74,9 +76,9 @@ def model(request):
        request: The pytest request object.

    Returns:
-        The model name, defaulting to "gpt-3.5-turbo-1106".
+        The model name, defaulting to "gpt-3.5-turbo".
    """
-    return request.config.getoption('model', default='gpt-3.5-turbo-1106')
+    return request.config.getoption('model', default='gpt-3.5-turbo')


@pytest.fixture
@@ -91,6 +93,7 @@ def run_test_case(test_cases_dir, workspace_dir, request):
    Returns:
        A function that runs a test case for a given agent and case.
    """
+
    def _run_test_case(agent, case):
        """Runs a test case for a given agent.

@@ -116,14 +119,32 @@ def run_test_case(test_cases_dir, workspace_dir, request):

        shutil.rmtree(os.path.join(agent_dir, 'workspace'), ignore_errors=True)
        if os.path.isdir(os.path.join(case_dir, 'start')):
-            os.copytree(os.path.join(case_dir, 'start'), os.path.join(agent_dir, 'workspace'))
+            os.copytree(
+                os.path.join(case_dir, 'start'), os.path.join(agent_dir, 'workspace')
+            )
        else:
            os.makedirs(os.path.join(agent_dir, 'workspace'))
        agents_ref = {
            'monologue_agent': 'MonologueAgent',
-            'codeact_agent': 'CodeActAgent'
+            'codeact_agent': 'CodeActAgent',
        }
-        process = subprocess.Popen(['python3', f'{SCRIPT_DIR}/../../opendevin/main.py', '-d', f"{os.path.join(agent_dir, 'workspace')}", '-c', f'{agents_ref[agent]}', '-t', f'{task}', '-m', 'gpt-3.5-turbo-1106'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
+        process = subprocess.Popen(
+            [
+                'python3',
+                f'{SCRIPT_DIR}/../../opendevin/main.py',
+                '-d',
+                f"{os.path.join(agent_dir, 'workspace')}",
+                '-c',
+                f'{agents_ref[agent]}',
+                '-t',
+                f'{task}',
+                '-m',
+                'gpt-3.5-turbo',
+            ],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            universal_newlines=True,
+        )
        stdout, stderr = process.communicate()
        logging.info(f'Stdout: {stdout}')
        logging.error(f'Stderr: {stderr}')
@@ -146,6 +167,6 @@ def pytest_configure(config):
        format='%(asctime)s [%(levelname)s] %(message)s',
        handlers=[
            logging.FileHandler(f"test_results_{now.strftime('%Y%m%d_%H%M%S')}.log"),
-            logging.StreamHandler()
-        ]
+            logging.StreamHandler(),
+        ],
    )
@@ -2,7 +2,7 @@ import argparse

 import pytest

-from opendevin import config
+from opendevin.config import config

 if __name__ == '__main__':
    """Main entry point of the script.
@@ -0,0 +1,39 @@
+# Pre-build Testbed and Env
+
+In the original SWE-Bench implementation, the conda environment for evaluation is typically installed from scratch while evaluating a particular instance. This poses several challenges:
+
+- Efficiency: most of the evaluation time is wasted on downloading packages
+- Stability: setup could fail due to bad internet connectivity
+- Reliability: it is possible that an instance is considered failed not because the agent did badly, but because the environment setup failed.
+
+In the OpenDevin-SWE-Bench fork, we try to pre-build the **testbed** (i.e., code of the repository we want the agent to edit) AND the **conda environment**, so that at evaluation (inference) time, we can directly leverage existing environments for efficient evaluation.
+
+NOTE: We only support SWE-Bench lite for now. But modifying our existing scripts for full SWE-Bench should be quite straightforward.
+
+## How to pre-build your testbed
+
+### Setup Eval Workspace (Util + Data)
+
+Setup your eval workspace by:
+1. Clone OpenDevin SWE-Bench [fork](https://github.com/OpenDevin/OD-SWE-bench.git)
+2. Prepare SWE-Bench data
+
+Run the following command to do the above two steps. The results will be saved to `evaluation/SWE-bench/eval_workspace`.
+
+```bash
+./evaluation/swe_bench/scripts/setup/prepare_swe_utils.sh
+```
+
+### Pre-build Conda Env and Test Bed
+
+```bash
+./evaluation/swe_bench/scripts/setup/swe_env_setup.sh
+```
+
+### Build the pre-build conda env and testbed into ONE docker image
+
+```bash
+pushd evaluation/swe_bench
+docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.0 -f ./scripts/docker/Dockerfile.full.v1.0 .
+docker push ghcr.io/opendevin/eval-swe-bench:full-v1.0
+```
@@ -0,0 +1,256 @@
+# Evaluate Generated Patches
+
+## Evaluate patches generated by OpenDevin
+
+This section explains in detail how `evaluation/swe_bench/scripts/eval_infer.sh` described in [SWE-Bench README](./README.md) works.
+
+Use `scripts/setup/get_agent_report.sh` to evaluate patches generated by an OpenDevin agent. This script is available in the container at `/swe_util/get_agent_report.sh`.
+
+- `output-file` (*required*): specify the path to your patch file inside the container
+- `agent-name` (*required*): your agent name
+- `dataset` (*required*): `swe-bench-test-lite` or `swe-bench-test`
+- `num-processes`: defaults to 15.
+- `experiment-name`: set to `${parent_folder_of_output_file}_${current_folder_of_output_file}` if not given. E.g., `xxx/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v2_cd/output.jsonl` -> `CodeActAgent_gpt-4-1106-preview_maxiter_50_N_v2_cd` as experiment name.
+- `merge-report`: if set, merges the evaluation report into the original output jsonl file and saves it as a `.merged.jsonl` file.
+
+An example to run evaluation on the given example agent output (`./examples/example_agent_output.json`).
+
+```shell
+export MINICONDA3=/swe_util/miniforge3
+export OD_SWE_BENCH=/swe_util/OD-SWE-bench
+export EVAL_DATA_DIR=/swe_util/eval_data
+cd /swe_util && ./get_agent_report.sh --output-file /swe_bench_output/example_agent_output.jsonl \
+--agent-name CodeActAgent \
+--dataset swe-bench-test-lite \
+--experiment-name test_experiment \
+--merge-report
+```
+
+You should get the following report:
+```shell
+- no_generation: 4
+- generated: 26
+- with_logs: 26
+- install_fail: 0
+- reset_failed: 0
+- no_apply: 0
+- applied: 24
+- test_errored: 0
+- test_timeout: 0
+- resolved: 6
+['sphinx-doc__sphinx-8721', 'sympy__sympy-14774', 'django__django-17087', 'sympy__sympy-20590', 'django__django-11583', 'sympy__sympy-21612']
+Report saved at /swe_util/eval_data/eval_logs/test_experiment/test_experiment_swe-bench-test-lite.report.json
+Agent output with report merged created at /swe_bench_output/example_agent_output.merged.jsonl
+```
+
+An additional `fine_grained_report` field will be added to each instance in the `example_agent_output.merged.jsonl`.
+
+```json
+"fine_grained_report": {
+  "gold_tests": {
+    "FAIL_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_default\"]",
+    "PASS_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_enabled\", \"tests/test_ext_viewcode.py::test_linkcode\", \"tests/test_ext_viewcode.py::test_local_source_files\"]"
+  },
+  "generated": true,
+  "with_logs": true,
+  "applied": true,
+  "test_errored": false,
+  "test_timeout": false,
+  "resolved": true,
+  "log_parse": {
+    "tests/test_ext_viewcode.py::test_viewcode_epub_default": "PASSED",
+    "tests/test_ext_viewcode.py::test_viewcode_epub_enabled": "PASSED",
+    "tests/test_ext_viewcode.py::test_linkcode": "PASSED",
+    "tests/test_ext_viewcode.py::test_local_source_files": "PASSED",
+    "tests/test_ext_viewcode.py::test_viewcode": "FAILED"
+  },
+  "eval_report": {
+    "FAIL_TO_PASS": {
+      "success": [
+        "tests/test_ext_viewcode.py::test_viewcode_epub_default"
+      ],
+      "failure": []
+    },
+    "PASS_TO_PASS": {
+      "success": [
+        "tests/test_ext_viewcode.py::test_viewcode_epub_enabled",
+        "tests/test_ext_viewcode.py::test_linkcode",
+        "tests/test_ext_viewcode.py::test_local_source_files"
+      ],
+      "failure": []
+    },
+    "FAIL_TO_FAIL": {
+      "success": [],
+      "failure": []
+    },
+    "PASS_TO_FAIL": {
+      "success": [],
+      "failure": []
+    }
+  }
+}
+```
+
+## If you already have patches not generated by OpenDevin
+
+### Prepare Output Files
+
+Ensure that model outputs are formatted correctly as below:
+```json
+[
+  {
+    "instance_id": "",
+    "model_patch": "",
+    "model_name_or_path": ""
+  },
+  ...
+]
+```
+An example can be found [here](./examples/example_model_output.json).
+
+Agent output should adhere to the OpenDevin format. An example can be found [here](./examples/example_agent_output.json).
+
+### Set Up the Environment
+
+Before evaluating generated patches, you need to set up the Docker environment. Run the following command to instantiate the Docker container and mount the directory to your output files on the host:
+
+```shell
+docker run -it \
+-v DIR_TO_YOUR_PATCH_FILES_ON_HOST:/swe_bench_output \
+ghcr.io/opendevin/eval-swe-bench:full-v1.0 /bin/bash
+```
+
+### Evaluate Model Generated Patches
+
+Use `scripts/get_model_report.sh` to evaluate patches generated by a model. This script is located in the container at `/swe_util/get_model_report.sh`.
+
+- `output-file` (*required*): specify the path to your patch file inside the container
+- `model-name` (*required*): this must match the `model_name_or_path` in your patch file
+- `dataset` (*required*): `swe-bench-test-lite` or `swe-bench-test`
+- `num-processes`: defaults to 15.
+- `experiment-name`: set to `{model-name}__{dataset}` unless specified
+
+An example of running evaluation on the given example model output (`./examples/example_model_output.json`).
+
+```shell
+export MINICONDA3=/swe_util/miniforge3
+export OD_SWE_BENCH=/swe_util/OD-SWE-bench
+export EVAL_DATA_DIR=/swe_util/eval_data
+cd /swe_util && ./get_model_report.sh --output-file /swe_bench_output/example_model_output.json \
+--model-name opendevin \
+--dataset swe-bench-test-lite
+```
+
+You should get the following report:
+```shell
+- no_generation: 4
+- generated: 26
+- with_logs: 26
+- install_fail: 0
+- reset_failed: 0
+- no_apply: 0
+- applied: 24
+- test_errored: 0
+- test_timeout: 0
+- resolved: 6
+['sphinx-doc__sphinx-8721', 'sympy__sympy-14774', 'django__django-17087', 'sympy__sympy-20590', 'django__django-11583', 'sympy__sympy-21612']
+Report saved at /swe_util/eval_data/eval_logs/opendevin__swe-bench-test-lite/example_model_output.report.json
+```
+Note: please ignore the `no_apply` in the report for now.
+
+The script will generate a `{experiment_name}` folder under `$EVAL_DATA_DIR/eval_logs`
+```shell
+├── $EVAL_DATA_DIR/eval_logs/$experiment_name
+│   ├── $experiment_name.json
+│   ├── $experiment_name.report.json
+│   ├── $model_name # eval log dir
+```
+
+### Evaluate Agent Generated Patches
+
+Use `scripts/setup/get_agent_report.sh` to evaluate patches generated by an agent. This script is available in the container at `/swe_util/get_agent_report.sh`.
+
+- `output-file` (*required*): specify the path to your patch file inside the container
+- `agent-name` (*required*): your agent name
+- `dataset` (*required*): `swe-bench-test-lite` or `swe-bench-test`
+- `num-processes`: defaults to 15.
+- `experiment-name`: set to `${parent_folder_of_output_file}_${current_folder_of_output_file}` if not given. E.g., `xxx/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v2_cd/output.jsonl` -> `CodeActAgent_gpt-4-1106-preview_maxiter_50_N_v2_cd` as experiment name.
+- `merge-report`: if set, merges the evaluation report into the original output jsonl file and saves it as a `.merged.jsonl` file.
+
+An example to run evaluation on the given example agent output (`./examples/example_agent_output.json`).
+
+```shell
+export MINICONDA3=/swe_util/miniforge3
+export OD_SWE_BENCH=/swe_util/OD-SWE-bench
+export EVAL_DATA_DIR=/swe_util/eval_data
+cd /swe_util && ./get_agent_report.sh --output-file /swe_bench_output/example_agent_output.jsonl \
+--agent-name CodeActAgent \
+--dataset swe-bench-test-lite \
+--experiment-name test_experiment \
+--merge-report
+```
+
+You should get the following report:
+```shell
+- no_generation: 4
+- generated: 26
+- with_logs: 26
+- install_fail: 0
+- reset_failed: 0
+- no_apply: 0
+- applied: 24
+- test_errored: 0
+- test_timeout: 0
+- resolved: 6
+['sphinx-doc__sphinx-8721', 'sympy__sympy-14774', 'django__django-17087', 'sympy__sympy-20590', 'django__django-11583', 'sympy__sympy-21612']
+Report saved at /swe_util/eval_data/eval_logs/test_experiment/test_experiment_swe-bench-test-lite.report.json
+Agent output with report merged created at /swe_bench_output/example_agent_output.merged.jsonl
+```
+
+An additional `fine_grained_report` field will be added to each instance in the `example_agent_output.merged.jsonl`.
+
+```json
+"fine_grained_report": {
+  "gold_tests": {
+    "FAIL_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_default\"]",
+    "PASS_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_enabled\", \"tests/test_ext_viewcode.py::test_linkcode\", \"tests/test_ext_viewcode.py::test_local_source_files\"]"
+  },
+  "generated": true,
+  "with_logs": true,
+  "applied": true,
+  "test_errored": false,
+  "test_timeout": false,
+  "resolved": true,
+  "log_parse": {
+    "tests/test_ext_viewcode.py::test_viewcode_epub_default": "PASSED",
+    "tests/test_ext_viewcode.py::test_viewcode_epub_enabled": "PASSED",
+    "tests/test_ext_viewcode.py::test_linkcode": "PASSED",
+    "tests/test_ext_viewcode.py::test_local_source_files": "PASSED",
+    "tests/test_ext_viewcode.py::test_viewcode": "FAILED"
+  },
+  "eval_report": {
+    "FAIL_TO_PASS": {
+      "success": [
+        "tests/test_ext_viewcode.py::test_viewcode_epub_default"
+      ],
+      "failure": []
+    },
+    "PASS_TO_PASS": {
+      "success": [
+        "tests/test_ext_viewcode.py::test_viewcode_epub_enabled",
+        "tests/test_ext_viewcode.py::test_linkcode",
+        "tests/test_ext_viewcode.py::test_local_source_files"
+      ],
+      "failure": []
+    },
+    "FAIL_TO_FAIL": {
+      "success": [],
+      "failure": []
+    },
+    "PASS_TO_FAIL": {
+      "success": [],
+      "failure": []
+    }
+  }
+}
+```
@@ -0,0 +1,150 @@
+# SWE-Bench Evaluation with OpenDevin SWE-Bench Docker Image
+
+
+This folder contains the evaluation harness we built on top of the original [SWE-Bench benchmark](https://www.swebench.com/) ([paper](https://arxiv.org/abs/2310.06770)). We created [a fork of SWE-Bench](https://github.com/OpenDevin/OD-SWE-bench.git), mostly built on top of [the original repo](https://github.com/princeton-nlp/SWE-bench), and [containerized](#opendevin-swe-bench-docker-image) it for easy evaluation.
+
+## OpenDevin SWE-Bench Docker Image
+
+In the [OpenDevin-SWE-Bench fork](https://github.com/OpenDevin/OD-SWE-bench.git) (mostly from the [original repo](https://github.com/princeton-nlp/SWE-bench) with some fixes), we try to pre-build the **testbed** (i.e., code of the repository we want the agent to edit) AND the **conda environment**, so that at evaluation (inference) time, we can directly leverage existing environments for efficient evaluation.
+
+**We pack everything you need for SWE-Bench evaluation into one, gigantic, docker image.** To use it:
+
+```bash
+docker pull ghcr.io/opendevin/eval-swe-bench:full-v1.0
+```
+
+The Docker image contains several important directories:
+- `/swe_util/OD-SWE-bench`: root directory for the OD-SWE-bench repository
+- `/swe_util/eval_data`: directory for eval data
+  - `/swe_util/eval_data/eval_logs/`: evaluation logs
+  - `/swe_util/eval_data/eval_temp/`: temporary folder for the evaluation process
+  - `/swe_util/eval_data/instances/`: swe-bench raw instances
+  - `/swe_util/eval_data/outputs/`: model or agent outputs
+  - `/swe_util/eval_data/testbed_logs/`: logs for testbed building
+  - `/swe_util/eval_data/testbeds/`: directory for all testbeds
+- `/swe_util/miniforge3/`: directory for miniforge3
+
+To reproduce how we pack the image, check [this doc](./BUILD_TESTBED_AND_ENV.md).
+
+NOTE: We only support SWE-Bench lite for now. But modifying our existing scripts for full SWE-Bench should be quite straightforward.
+
+## Test if your environment works
+
+```bash
+python3 evaluation/swe_bench/swe_env_box.py
+```
+
+If you get to the interactive shell successfully, it means success!
+
+## Configure your LLM
+
+Create a `config.toml` file at the root of the workspace if it does not already exist.
+
+Add the following configurations:
+
+```toml
+[core]
+max_iterations = 100
+cache_dir = "/tmp/cache"
+sandbox_container_image = "ghcr.io/opendevin/sandbox:latest"
+sandbox_type = "ssh"
+use_host_network = true
+ssh_hostname = "localhost"
+sandbox_timeout = 120
+# eval specific
+run_as_devin = false
+
+# TODO: Change these to the model you want to evaluate
+[eval_gpt4_1106_preview]
+model = "gpt-4-1106-preview"
+api_key = "XXX"
+temperature = 0.0
+
+[eval_some_openai_compatible_model]
+model = "openai/MODEL_NAME"
+base_url = "https://OPENAI_COMPATIBLE_URL/v1"
+api_key = "XXX"
+temperature = 0.0
+```
+
+## Run Inference on SWE-Bench Instances
+
+```bash
+./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview
+```
+
+You can replace `eval_gpt4_1106_preview` with any model you have set up in `config.toml`.
+
+
+## Evaluate Generated Patches
+
+After running the inference described in the previous section, you will obtain an `output.jsonl` (by default it will be saved to `evaluation/evaluation_outputs`). Then you can run this one-line script to evaluate the generated patches and produce a fine-grained report.
+
+If you want to evaluate existing results, you should first run this to clone existing outputs
+
+```bash
+git clone https://huggingface.co/spaces/OpenDevin/evaluation evaluation/evaluation_outputs
+```
+
+Then you can run the following:
+```bash
+# ./evaluation/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL
+# For example:
+./evaluation/swe_bench/scripts/eval_infer.sh evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl
+```
+
+The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.merged.jsonl`.
+
+It will contain an additional field `fine_grained_report` (see example below) compared to the `output.jsonl` from the previous inference stage.
+
+```json
+"fine_grained_report": {
+  "gold_tests": {
+    "FAIL_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_default\"]",
+    "PASS_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_enabled\", \"tests/test_ext_viewcode.py::test_linkcode\", \"tests/test_ext_viewcode.py::test_local_source_files\"]"
+  },
+  "generated": true,
+  "with_logs": true,
+  "applied": true,
+  "test_errored": false,
+  "test_timeout": false,
+  "resolved": true,
+  "log_parse": {
+    "tests/test_ext_viewcode.py::test_viewcode_epub_default": "PASSED",
+    "tests/test_ext_viewcode.py::test_viewcode_epub_enabled": "PASSED",
+    "tests/test_ext_viewcode.py::test_linkcode": "PASSED",
+    "tests/test_ext_viewcode.py::test_local_source_files": "PASSED",
+    "tests/test_ext_viewcode.py::test_viewcode": "FAILED"
+  },
+  "eval_report": {
+    "FAIL_TO_PASS": {
+      "success": [
+        "tests/test_ext_viewcode.py::test_viewcode_epub_default"
+      ],
+      "failure": []
+    },
+    "PASS_TO_PASS": {
+      "success": [
+        "tests/test_ext_viewcode.py::test_viewcode_epub_enabled",
+        "tests/test_ext_viewcode.py::test_linkcode",
+        "tests/test_ext_viewcode.py::test_local_source_files"
+      ],
+      "failure": []
+    },
+    "FAIL_TO_FAIL": {
+      "success": [],
+      "failure": []
+    },
+    "PASS_TO_FAIL": {
+      "success": [],
+      "failure": []
+    }
+  }
+}
+```
+
+Please refer to [EVAL_PATCH.md](./EVAL_PATCH.md) if you want to learn more about how to evaluate patches that are already generated (e.g., not by OpenDevin).
+
+## Submit your evaluation results
+
+You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
@@ -0,0 +1,411 @@
+import asyncio
+import json
+import logging
+import multiprocessing as mp
+import os
+import pathlib
+import subprocess
+import time
+from concurrent.futures import ProcessPoolExecutor
+
+import pandas as pd
+import whatthepatch
+from datasets import load_dataset
+from tqdm import tqdm
+
+from evaluation.swe_bench.swe_env_box import SWEBenchSSHBox
+from opendevin.controller.state.state import State
+from opendevin.core.config import args, config, get_llm_config_arg
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import main
+from opendevin.events.action import MessageAction
+from opendevin.events.serialization.event import event_to_dict
+
+
+def cleanup():
+    print('Cleaning up child processes...')
+    for process in mp.active_children():
+        print(f'Terminating child process: {process.name}')
+        process.terminate()
+        process.join()
+
+
+def codeact_user_response(state: State) -> str:
+    msg = (
+        'Please continue working on the task on whatever approach you think is suitable.\n'
+        'If you think you have modified the code in a way that fixes the issue, please run the following command: <execute_bash> exit </execute_bash>.\n'
+        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
+    )
+    if state.history:
+        user_msgs = [
+            action
+            for action, _ in state.history
+            if isinstance(action, MessageAction) and action.source == 'user'
+        ]
+        if len(user_msgs) >= 2:
+            # let the agent know that it can give up when it has tried 3 times
+            return (
+                msg
+                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
+            )
+    return msg
+
+
+def monologue_user_response(state: State) -> str:
+    raise NotImplementedError('MonologueAgent should never ask for user responses.')
+
+
+# Maps an agent class name to the function that produces the fake "user" reply for it.
+# The keys of this dict also define which agent classes the script accepts
+# (the `__main__` block asserts that the requested agent class is one of them).
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+    'MonologueAgent': monologue_user_response,
+}
+
+# Maps an agent class name to extra text appended to the task instruction.
+# Agent classes without an entry get no suffix.
+AGENT_CLS_TO_INST_SUFFIX = {
+    'CodeActAgent': 'When you think you have fixed the issue through code changes, please run the following command: <execute_bash> exit </execute_bash>.\n'
+}
+
+
+def get_test_result(instance, sandbox, workspace_dir_name):
+    """Run the SWE-bench tests for `instance` inside `sandbox` and collect the outcome.
+
+    Steps, each recording a success flag (and the command output on failure) in
+    `test_result['metadata']`:
+      1. parse `instance.test_patch` to find the files it touches;
+      2. revert those files to `instance["base_commit"]`;
+      3. apply `$SWE_TASK_DIR/test.patch`;
+      4. run `$TEST_CMD`, writing to `/workspace/$SWE_INSTANCE_ID.log`, and read the log back;
+      5. wrap `$SWE_TASK_DIR/instance.json` into a JSON list at `/workspace/instance.json`;
+      6. run `get_instance_report.py` and parse its `key: value` output lines.
+
+    A failing step is logged and recorded, but the following steps still run.
+
+    Args:
+        instance: SWE-bench instance; `instance_id`, `test_patch` and `base_commit` are read.
+        sandbox: object whose `execute(cmd)` returns an `(exit_code, output)` pair.
+        workspace_dir_name: directory under `/workspace` that holds the repository.
+
+    Returns:
+        dict with keys `result` (parsed report values), `metadata` (per-step status) and,
+        when the corresponding step succeeded, `test_output` and `result_raw`.
+    """
+    test_result = {'result': {}, 'metadata': {}}
+    try:
+        test_patch_parsed = whatthepatch.parse_patch(instance.test_patch)
+        # get a list of filepaths that are involved in the patch
+        involved_filepaths = set()
+        for patch in test_patch_parsed:
+            # strip the `a/` and `b/` prefixes of the diff header paths
+            involved_filepaths.add(patch.header.old_path.removeprefix('a/'))
+            involved_filepaths.add(patch.header.new_path.removeprefix('b/'))
+        involved_filepaths = list(involved_filepaths)
+        test_result['metadata']['1_test_patch_parse_success'] = True
+        test_result['metadata']['1_test_involved_filepaths'] = involved_filepaths
+    except Exception as e:
+        logger.error(
+            f'Error parsing test patch for instance {instance.instance_id}: {e}'
+        )
+        test_result['metadata']['1_test_patch_parse_success'] = False
+        test_result['metadata']['1_test_patch_parse_error'] = str(e)
+        test_result['metadata']['1_test_involved_filepaths'] = None
+        # nothing to revert below when the patch could not be parsed
+        involved_filepaths = []
+
+    # Try to revert the changes for involved filepaths
+    # NOTE(review): the exit code of this `cd` is not checked.
+    err_code, output = sandbox.execute(f'cd /workspace/{workspace_dir_name}')
+    # one boolean per involved filepath, in iteration order
+    test_result['metadata']['2_revert_test_involved_filepaths_success'] = []
+    for filepath in involved_filepaths:
+        err_code, output = sandbox.execute(
+            f'git checkout {instance["base_commit"]} -- {filepath}'
+        )
+        if err_code != 0:
+            logger.error(f'Error reverting changes for {filepath}: {output}')
+            test_result['metadata']['2_revert_test_involved_filepaths_success'].append(
+                False
+            )
+        else:
+            test_result['metadata']['2_revert_test_involved_filepaths_success'].append(
+                True
+            )
+
+    # Apply the testcase
+    err_code, output = sandbox.execute('git apply $SWE_TASK_DIR/test.patch')
+    if err_code != 0:
+        logger.error(f'Error applying test patch: {output}')
+        test_result['metadata']['3_apply_test_patch_success'] = False
+        test_result['metadata']['3_apply_test_patch_error'] = output
+    else:
+        test_result['metadata']['3_apply_test_patch_success'] = True
+
+    # Run the test command
+    # stdout and stderr both go to the log file, which is read back in the next step
+    err_code, output = sandbox.execute(
+        '$TEST_CMD > /workspace/$SWE_INSTANCE_ID.log 2>&1'
+    )
+    if err_code != 0:
+        logger.error(f'Error running test command: {output}')
+        test_result['metadata']['4_run_test_command_success'] = False
+        test_result['metadata']['4_run_test_command_error'] = output
+    else:
+        test_result['metadata']['4_run_test_command_success'] = True
+
+    # Get the test output
+    err_code, output = sandbox.execute('cat /workspace/$SWE_INSTANCE_ID.log')
+    if err_code != 0:
+        logger.error(f'Error getting test output: {output}')
+        test_result['metadata']['4_get_test_output_success'] = False
+        test_result['metadata']['4_get_test_output_error'] = output
+    else:
+        test_result['metadata']['4_get_test_output_success'] = True
+        test_result['test_output'] = output
+
+    # Reformat instance.json
+    # $SWE_TASK_DIR/instance.json is a dict {"XXX": "YYY"}, add a [ before and a ] after
+    err_code, output = sandbox.execute(
+        (
+            'cat $SWE_TASK_DIR/instance.json | sed "s/^{/[{/" | sed "s/}$/}]/" > /workspace/instance.json'
+        )
+    )
+    if err_code != 0:
+        logger.error(f'Error creating instance.json: {output}')
+        test_result['metadata']['5_reformat_instance_json_success'] = False
+        test_result['metadata']['5_reformat_instance_json_error'] = output
+    else:
+        test_result['metadata']['5_reformat_instance_json_success'] = True
+
+    # Get the instance report
+    err_code, output = sandbox.execute(
+        (
+            'cd /swe_util/OD-SWE-bench '
+            '&& export PYTHONPATH=$(pwd):$PYTHONPATH '
+            '&& conda run -n swe-bench-eval python swebench/metrics/get_instance_report.py --swe_bench_task /workspace/instance.json --log_path /workspace/$SWE_INSTANCE_ID.log'
+        )
+    )
+    if err_code != 0:
+        logger.error(f'Error getting instance report: {output}')
+        test_result['metadata']['6_get_instance_report_success'] = False
+        test_result['metadata']['6_get_instance_report_error'] = output
+    else:
+        test_result['metadata']['6_get_instance_report_success'] = True
+        test_result['result_raw'] = output
+
+        # try to parse output
+        # each usable line has the form `key: value`, optionally surrounded by dashes
+        for line in output.strip().split('\n'):
+            line = line.strip('-')
+            try:
+                # lines with no colon or more than one colon fail to unpack and are skipped
+                key, value = line.split(':')
+            except ValueError:
+                # skip this line
+                print(f'Error parsing result line: {line}')
+                continue
+            value = value.strip()
+            try:
+                # store integer values as int, keep everything else as a string
+                value = int(value)
+            except ValueError:
+                pass
+            test_result['result'][key.strip()] = value
+    return test_result
+
+
+def process_instance(
+    instance, agent_class, metadata, skip_workspace_mount, reset_logger: bool = True
+):
+    workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
+    # create process-specific workspace dir
+    if not skip_workspace_mount:
+        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
+        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+
+    if reset_logger:
+        # Set up logger
+        log_file = os.path.join(
+            eval_output_dir, 'logs', f'instance_{instance.instance_id}.log'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        # add back the console handler to print ONE line
+        logger.addHandler(get_console_handler())
+        logger.info(
+            f'Starting evaluation for instance {instance.instance_id}.\nLOG:   tail -f {log_file}'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setFormatter(
+            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        )
+        logger.addHandler(file_handler)
+
+    if not skip_workspace_mount:
+        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+
+    workspace_dir_name = f'{instance.repo}__{instance.version}'.replace('/', '__')
+    sandbox = SWEBenchSSHBox.get_box_for_instance(
+        instance,
+        workspace_dir_name,
+        skip_workspace_mount=skip_workspace_mount,
+        workspace_mount_path=workspace_mount_path,
+    )
+
+    # Prepare instruction
+    instruction = (
+        f'Please fix the following issue for the repository in /workspace/{workspace_dir_name}.\n'
+        'Environment has been set up for you to start working. You may assume all necessary tools are installed.\n\n'
+        '# Problem Statement\n'
+        f'{instance.problem_statement}\n\n'
+    )
+    if instance.hints_text:
+        instruction += f'# Hints\n{instance.hints_text}\n\n'
+    instruction += (
+        'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
+        'You should NOT modify any existing test case files. If needed, you can add new test cases in a NEW file to reproduce the issue.\n'
+        'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
+    )
+    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+
+    # Run the agent
+    state: State = asyncio.run(
+        main(
+            instruction,
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
+            sandbox=sandbox,
+        )
+    )
+
+    # Get git patch
+    git_patch = sandbox.get_diff_patch()
+    logger.info(f'Got git diff for instance {instance.instance_id}')
+
+    # ======= Attempt to evaluate the agent's edits =======
+    # Attempt to analyze the test patch to get involved filepaths
+    test_result = get_test_result(instance, sandbox, workspace_dir_name)
+
+    if state is None:
+        raise ValueError('State should not be None.')
+
+    # Save the output
+    output = {
+        'instance_id': instance.instance_id,
+        'swe_instance': instance.to_dict(),
+        'instruction': instruction,
+        'git_patch': git_patch,
+        'metadata': metadata,
+        'history': [
+            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
+        ],
+        'error': state.error if state and state.error else None,
+        'test_result': test_result,
+    }
+
+    # Close the sandbox
+    sandbox.close()
+    return output
+
+
+# Entry point: runs the selected agent on SWE-bench Lite test instances in a process
+# pool and appends one JSON line per finished instance to `output.jsonl`.
+if __name__ == '__main__':
+    # Load the dataset
+    dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
+    swe_bench_tests = dataset['test'].to_pandas()
+
+    # Optionally replace the default LLM config with the one named on the command line
+    if args.llm_config:
+        specified_llm_config = get_llm_config_arg(args.llm_config)
+        if specified_llm_config:
+            config.llm = specified_llm_config
+    logger.info(f'Config for evaluation: {config}')
+
+    # TEST METADATA
+    agent_class = args.agent_cls
+    assert (
+        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
+    ), f'Unsupported agent class: {agent_class}'
+    # keep only the part after the last `/` of the model identifier
+    model_name = config.llm.model.split('/')[-1]
+    max_iterations = args.max_iterations
+    eval_note = ''
+    if args.eval_note is not None:
+        eval_note += '_N_' + args.eval_note
+    # <eval_output_dir>/swe_bench/<agent>/<model>_maxiter_<n>[_N_<note>]
+    eval_output_dir = os.path.join(
+        args.eval_output_dir,
+        'swe_bench',
+        agent_class,
+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
+    )
+
+    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
+        parents=True, exist_ok=True
+    )
+    logger.info(f'Using evaluation output directory: {eval_output_dir}')
+
+    metadata = {
+        'agent_class': agent_class,
+        'model_name': model_name,
+        'max_iterations': max_iterations,
+        'eval_output_dir': eval_output_dir,
+        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        # get the commit id of current repo
+        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+        .decode('utf-8')
+        .strip(),
+    }
+    logger.info(f'Metadata: {metadata}')
+    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
+        json.dump(metadata, f)
+
+    # LIMIT EVALUATION
+    eval_n_limit = args.eval_n_limit
+    if eval_n_limit:
+        swe_bench_tests = swe_bench_tests.head(eval_n_limit)
+        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
+
+    # OUTPUT FILE
+    output_file = os.path.join(eval_output_dir, 'output.jsonl')
+    logger.info(f'Writing evaluation output to {output_file}')
+    # Resume support: instance ids already present in the output file are skipped below
+    finished_instance_ids = set()
+    if os.path.exists(output_file):
+        with open(output_file, 'r') as f:
+            for line in f:
+                data = json.loads(line)
+                finished_instance_ids.add(data['instance_id'])
+        logger.warning(
+            f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
+        )
+    # opened in append mode so earlier results are kept
+    output_fp = open(output_file, 'a')
+
+    logger.info(
+        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
+    )
+
+    # filter out finished instances
+    new_swe_bench_tests = []
+    for idx, instance in swe_bench_tests.iterrows():
+        if instance.instance_id in finished_instance_ids:
+            logger.info(
+                f'Skipping instance {instance.instance_id} as it is already finished.'
+            )
+            continue
+        new_swe_bench_tests.append(instance)
+
+    swe_bench_tests = pd.DataFrame(new_swe_bench_tests)
+    logger.info(
+        f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(swe_bench_tests)}'
+    )
+
+    pbar = tqdm(total=len(swe_bench_tests))
+
+    # Done-callback attached to every submitted future: advances the progress bar and
+    # appends the record returned by process_instance to the output file.
+    def update_progress(future):
+        pbar.update(1)
+        output = future.result()
+        pbar.set_description(f'Instance {output["instance_id"]}')
+        pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
+        logger.info(
+            f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]["result"]}'
+        )
+        output_fp.write(json.dumps(output) + '\n')
+        # flush after every record so finished work is on disk if the run is interrupted
+        output_fp.flush()
+
+    num_workers = args.eval_num_workers
+    logger.info(f'Using {num_workers} workers for evaluation.')
+
+    # the workspace mount is skipped only for CodeActAgent
+    skip_workspace_mount = agent_class == 'CodeActAgent'
+    logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
+    try:
+        with ProcessPoolExecutor(num_workers) as executor:
+            futures = []
+            for row_idx, instance in swe_bench_tests.iterrows():
+                future = executor.submit(
+                    process_instance,
+                    instance,
+                    agent_class,
+                    metadata,
+                    skip_workspace_mount,
+                    # per-instance log files are only used when running several workers
+                    reset_logger=bool(num_workers > 1),
+                )
+                future.add_done_callback(update_progress)
+                futures.append(future)
+
+            # Wait for all futures to complete
+            for future in futures:
+                future.result()
+    except KeyboardInterrupt:
+        print('KeyboardInterrupt received. Cleaning up...')
+        cleanup()
+
+    output_fp.close()
+    logger.info('Evaluation finished.')
@@ -0,0 +1,17 @@
+# Base image for SWE-Bench evaluation: the OpenDevin sandbox image plus build tools.
+FROM ghcr.io/opendevin/sandbox:latest
+
+# Install system packages, then drop the apt caches to keep the layer small
+RUN apt-get update && \
+    apt-get install -y libffi-dev bash gcc git jq wget pkg-config libfreetype-dev libfreetype6 libfreetype6-dev rsync && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Make /bin/sh point to bash
+RUN ln -sfn /bin/bash /bin/sh
+# World-writable log directory
+RUN mkdir -p /opendevin/logs && chmod 777 /opendevin/logs
+
+# Setup Git
+RUN git config --global user.email "swebench@swebench.ai"
+RUN git config --global user.name "swebench"
+
+CMD ["/bin/bash"]
+# How to build this image:
+# pushd evaluation/swe_bench
+# docker build -t ghcr.io/opendevin/eval-swe-bench:builder -f ./scripts/docker/Dockerfile.builder .
@@ -0,0 +1,19 @@
+# Builder image with Miniforge3 and the `swe-bench-eval` conda environment pre-installed.
+FROM ghcr.io/opendevin/eval-swe-bench:builder
+
+# Install Mamba/Conda (Miniforge3)
+RUN wget "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
+# install to /swe_util/miniforge3
+RUN mkdir /swe_util
+RUN bash Miniforge3-$(uname)-$(uname -m).sh -b -p /swe_util/miniforge3
+# `RUN export PATH=...` only affects the shell of that single build step and is lost
+# afterwards; ENV persists for later build steps and for containers run from the image.
+ENV PATH=/swe_util/miniforge3/bin:$PATH
+RUN /swe_util/miniforge3/bin/mamba init bash
+
+# Setup SWE-Bench Eval Env
+RUN /bin/bash -c "/swe_util/miniforge3/bin/mamba create -n swe-bench-eval python==3.11.5 -y"
+RUN /bin/bash -c ". /swe_util/miniforge3/etc/profile.d/conda.sh && conda activate swe-bench-eval && \
+pip install requests python-dotenv GitPython datasets pandas beautifulsoup4 ghapi"
+RUN /bin/bash -c ". /swe_util/miniforge3/etc/profile.d/conda.sh && conda config --set changeps1 False && conda config --append channels conda-forge"
+
+CMD ["/bin/bash"]
+# How to build this image:
+# pushd evaluation/swe_bench
+# docker build -t ghcr.io/opendevin/eval-swe-bench:builder_with_conda -f ./scripts/docker/Dockerfile.builder_with_conda .
@@ -0,0 +1,13 @@
+# Final evaluation image: the `full_deps` image plus the small files of ./eval_workspace.
+FROM ghcr.io/opendevin/eval-swe-bench:full_deps
+
+# ================== COPY Smaller things ==================
+# copy everything except the folder of `eval_data` or `miniforge3`
+# typically, this should be the OD codebase
+# ./eval_workspace is bind-mounted for this step only and rsync'ed into /swe_util
+RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
+    rsync -ar --progress \
+    --exclude='eval_data' \
+    --exclude='miniforge3' \
+    /eval_workspace/ /swe_util/
+
+# How to build this image (the build context must contain ./eval_workspace):
+# pushd evaluation/swe_bench
+# docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.0 -f ./scripts/docker/Dockerfile.full.v1.0 .
@@ -0,0 +1,72 @@
+FROM ghcr.io/opendevin/eval-swe-bench:builder
+
+# This Dockerfile is used to build the Docker image for the evaluation of the SWE-Bench.
+# YOU SHOULD ENSURE ./eval_workspace CONTAINS THE EVALUATION WORKSPACE (testbed, conda)
+# Check BUILD_TESTBED_AND_ENV.md for more details.
+
+RUN mkdir -p /swe_util
+
+# Use https://github.com/moby/moby/issues/15771#issuecomment-1762893340
+# to copy files from host to container with --exclude
+# (every step below bind-mounts ./eval_workspace and copies a part of it with rsync)
+
+# ================== Prepare Eval Data ==================
+# Copy everything in eval_data except the "testbeds"
+RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
+    rsync -ar --progress \
+    --exclude='testbeds' \
+    /eval_workspace/eval_data /swe_util/
+
+# Copy the testbeds except the matplotlib and scikit-learn ones (copied separately below)
+RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
+    rsync -ar --progress \
+    --exclude='matplotlib*' \
+    --exclude='scikit-learn*' \
+    /eval_workspace/eval_data/testbeds /swe_util/eval_data/
+
+# copy the larger ones in separate layers
+# (equivalent to: COPY ./eval_workspace/eval_data/testbeds/matplotlib* /swe_util/eval_data/testbeds/)
+RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
+    rsync -ar --progress \
+    /eval_workspace/eval_data/testbeds/matplotlib* /swe_util/eval_data/testbeds/
+
+# (equivalent to: COPY ./eval_workspace/eval_data/testbeds/scikit-learn* /swe_util/eval_data/testbeds/)
+RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
+    rsync -ar --progress \
+    /eval_workspace/eval_data/testbeds/scikit-learn* /swe_util/eval_data/testbeds/
+
+# ================== Prepare Miniforge3 ==================
+# Copy the conda installation located at ./eval_workspace/miniforge3
+# copy everything except the folder of `envs` & `pkgs` (two large folders)
+RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
+    rsync -ar --progress \
+    --exclude='envs' \
+    --exclude='pkgs' \
+    /eval_workspace/miniforge3 /swe_util/
+
+# copy pkgs in separate layers (~9.4GB)
+RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
+    rsync -ar --progress \
+    /eval_workspace/miniforge3/pkgs /swe_util/miniforge3/
+
+# copy envs in separate layers (except matplotlib, scikit-learn & pydata - larger ones)
+RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
+    rsync -ar --progress \
+    --exclude='matplotlib*' \
+    --exclude='scikit-learn*' \
+    --exclude='pydata*' \
+    /eval_workspace/miniforge3/envs /swe_util/miniforge3/
+
+# copy the larger ones in separate layers
+RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
+    rsync -ar --progress \
+    /eval_workspace/miniforge3/envs/matplotlib* /swe_util/miniforge3/envs/
+
+RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
+    rsync -ar --progress \
+    /eval_workspace/miniforge3/envs/scikit-learn* /swe_util/miniforge3/envs/
+
+RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
+    rsync -ar --progress \
+    /eval_workspace/miniforge3/envs/pydata* /swe_util/miniforge3/envs/
+
+# How to build this image (the build context must contain ./eval_workspace):
+# pushd evaluation/swe_bench
+# docker build -t ghcr.io/opendevin/eval-swe-bench:full_deps -f ./scripts/docker/Dockerfile.full_deps .
@@ -0,0 +1,13 @@
+# Docker Build Guide
+
+## Builder
+
+This builds the Docker images used by `evaluation/swe_bench/scripts/prepare_swe_utils.sh`, which downloads the datasets.
+
+```bash
+pushd evaluation/swe_bench
+# This builds base image with basic dependencies
+docker build -t ghcr.io/opendevin/eval-swe-bench:builder -f ./scripts/docker/Dockerfile.builder .
+# This builds image with SWE-Bench conda environment pre-installed
+docker build -t ghcr.io/opendevin/eval-swe-bench:builder_with_conda -f ./scripts/docker/Dockerfile.builder_with_conda .
+```
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+PROCESS_FILEPATH=$1
+if [ -z "$PROCESS_FILEPATH" ]; then
+    echo "Error: PROCESS_FILEPATH is empty. Usage: ./eval_infer.sh <output_file>"
+    exit 1
+fi
+
+if [ ! -f $PROCESS_FILEPATH ]; then
+    echo "Error: $PROCESS_FILEPATH is not a file"
+    exit 1
+fi
+
+PROCESS_FILEPATH=$(realpath $PROCESS_FILEPATH)
+FILE_DIR=$(dirname $PROCESS_FILEPATH)
+FILE_NAME=$(basename $PROCESS_FILEPATH)
+mkdir -p $FILE_DIR/eval_logs
+mkdir -p $FILE_DIR/swe_bench_format
+
+echo "Evaluating $FILE_NAME @ $FILE_DIR"
+echo "Merged output file with fine-grained report will be saved to $FILE_DIR"
+
+docker run --rm \
+    -v $FILE_DIR:/swe_bench_output \
+    -e MINICONDA3=/swe_util/miniforge3 \
+    -e OD_SWE_BENCH=/swe_util/OD-SWE-bench \
+    -e EVAL_DATA_DIR=/swe_util/eval_data \
+    -w /swe_util \
+    ghcr.io/opendevin/eval-swe-bench:full-v1.0 \
+    bash -c "./get_agent_report.sh --output-file /swe_bench_output/$FILE_NAME \
+    --agent-name CodeActAgent \
+    --dataset swe-bench-test-lite \
+    --experiment-name test_experiment \
+    --merge-report && cp -r /swe_util/eval_data/eval_logs/test_experiment/* /swe_bench_output/eval_logs \
+    && cp -r /swe_util/eval_data/outputs/* /swe_bench_output/swe_bench_format/"
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+AGENT=CodeActAgent
+AGENT_VERSION=v1.3
+MODEL_CONFIG=$1
+
+# You should add $MODEL_CONFIG in your `config.toml`
+
+poetry run python3 evaluation/swe_bench/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --max-iterations 50 \
+  --max-chars 10000000 \
+  --eval-num-workers 8 \
+  --eval-note $AGENT_VERSION
@@ -0,0 +1,81 @@
+#!/bin/bash
+# THIS SCRIPT ONLY NEED TO BE RUN ONCE BEFORE EVALUATION
+# Abort on the first failing command
+set -e
+
+# Set up conda, the `swe-bench-eval` environment and the testbeds for all instances.
+#   $1: name of the instance file inside /swe_util/eval_data/instances
+# Exits with status 1 when not run as user `opendevin` or when a testbed log
+# does not contain "Init Succeeded".
+function setup_environment_and_testbed {
+    local instance_file_name=$1
+
+    # throw error if user name is not opendevin
+    if [ "$USER" != "opendevin" ]; then
+        echo "Error: This script is intended to be run by the 'opendevin' user only." >&2
+        exit 1
+    fi
+
+    # =======================================================
+    # Install & Setup Conda
+
+    # assume /swe_util/miniforge3 already exists
+    # install if swe-util does NOT have conda
+    if [ ! -d /swe_util/miniforge3 ]; then
+        # NOTE(review): there is no matching popd; the working directory stays /swe_util
+        pushd /swe_util
+        echo "Downloading and installing Miniforge3"
+        wget "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
+        bash Miniforge3-$(uname)-$(uname -m).sh -b -p /swe_util/miniforge3
+    fi
+
+    # Make conda available both in future shells (~/.bashrc) and in this one (shell hook)
+    echo 'export PATH=/swe_util/miniforge3/bin:$PATH' >> ~/.bashrc
+    eval "$(/swe_util/miniforge3/bin/conda shell.bash hook)"
+    conda init bash
+    source ~/.bashrc
+    conda config --set changeps1 False
+    conda config --append channels conda-forge
+
+    # =======================================================
+    # Install swe-bench-eval environment if it does not exist
+    # ENV_EXISTS is empty when `conda info --envs` prints no line matching swe-bench-eval
+    ENV_EXISTS=$(conda info --envs | awk '/swe-bench-eval/ {print $1}')
+    echo "ENV_EXISTS: $ENV_EXISTS"
+    if [ -z "$ENV_EXISTS" ]; then
+        echo "Environment swe-bench-eval does not exist. Creating the environment."
+        conda create -n swe-bench-eval python==3.11.5 -y
+        conda activate swe-bench-eval
+        pip install requests python-dotenv GitPython datasets pandas beautifulsoup4 ghapi
+    fi
+    conda activate swe-bench-eval
+    echo 'swe-bench-eval environment is ready.'
+
+    # =======================================================
+    # Read the swe-bench-test-lite.json / swe-bench-test.json file and extract the required item based on instance_id
+    INSTANCE_DATA_FILE=/swe_util/eval_data/instances/$instance_file_name
+    echo "Instance data file loaded: $INSTANCE_DATA_FILE"
+
+    # =======================================================
+    # generate testbed & conda environment for ALL instances in the test file
+    echo "Generating testbed & conda environment for all instances in the test file"
+    export PYTHONPATH=/swe_util/OD-SWE-bench:$PYTHONPATH
+    python3 /swe_util/OD-SWE-bench/swebench/harness/engine_testbed.py \
+        --instances_path $INSTANCE_DATA_FILE \
+        --log_dir /swe_util/eval_data/testbed_logs \
+        --conda_path /swe_util/miniforge3 \
+        --testbed /swe_util/eval_data/testbeds \
+        --timeout 1000
+
+    # Check every log in /swe_util/eval_data/testbed_logs to see if it contains "Init Succeeded"
+    # If not, print the log file name and exit
+    for log_file in /swe_util/eval_data/testbed_logs/*; do
+        if ! grep -q "Init Succeeded" $log_file; then
+            echo "Error: $log_file does not contain 'Init Succeeded'"
+            exit 1
+        fi
+    done
+    echo "All logs contain 'Init Succeeded'. Testbed & conda environment setup is successful."
+}
+
+# check if $1 is either swe-bench-test-lite.json or swe-bench-test.json
+if [ "$1" != "swe-bench-test-lite.json" ] && [ "$1" != "swe-bench-test.json" ]; then
+    echo "Error: Invalid input file name. Please provide either swe-bench-test-lite.json or swe-bench-test.json"
+    exit 1
+fi
+
+# call the function
+echo "Calling setup_environment_and_testbed with $1"
+setup_environment_and_testbed $1
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+# Initialize variables
+output_file=""
+agent_name=""
+dataset=""
+num_processes=15
+experiment_name=""
+merge_report=false
+
+# Parse command-line arguments
+while [[ "$#" -gt 0 ]]; do
+    case $1 in
+        --output-file) output_file="$2"; shift ;;
+        --agent-name) agent_name="$2"; shift ;;
+        --dataset) dataset="$2"; shift ;;
+        --num-processes) num_processes="$2"; shift ;;
+        --experiment-name) experiment_name="$2"; shift ;;
+        --merge-report) merge_report=true ;;
+        *) echo "Unknown parameter passed: $1"; exit 1 ;;
+    esac
+    shift
+done
+
+# Check if arguments are provided
+if [[ -z "$output_file" || -z "$agent_name" || -z "$dataset" ]]; then
+    echo "output-file, agent-name and dataset are required!"
+    exit 1
+fi
+echo "output file: $output_file"
+echo "agent name: $agent_name"
+echo "dataset: $dataset"
+echo "num processes: $num_processes"
+if [ ! -z "$experiment_name" ]
+then
+    echo "use provided experiment name: $experiment_name"
+else
+    current_folder=$(basename $(dirname $output_file))
+    parent_foler=$(basename $(dirname $(dirname $output_file)))
+    experiment_name="${parent_foler}_${current_folder}"
+    echo "use generated experiment name: $experiment_name"
+fi
+
+# Convert the agent output to the SWE-Bench format
+if [ -z "$EVAL_DATA_DIR" ]; then
+    echo "EVAL_DATA_DIR is not set."
+    exit 1
+fi
+target_file="${EVAL_DATA_DIR}/outputs/${experiment_name}_${dataset}.json"
+python process_output_json_file.py $output_file $agent_name $target_file
+
+# Run the evaluation script
+if [ -z "$OD_SWE_BENCH" ]; then
+    echo "OD_SWE_BENCH is not set."
+    exit 1
+fi
+if [ -z "$MINICONDA3" ]; then
+    echo "MINICONDA3 is not set."
+    exit 1
+fi
+mkdir -p $EVAL_DATA_DIR/eval_logs/$experiment_name
+export PYTHONPATH=$OD_SWE_BENCH && cd $OD_SWE_BENCH && . $MINICONDA3/etc/profile.d/conda.sh && conda activate $MINICONDA3/envs/swe-bench-eval && python swebench/harness/run_evaluation.py \
+    --swe_bench_tasks $EVAL_DATA_DIR/instances/$dataset.json \
+    --temp_dir $EVAL_DATA_DIR/eval_temp \
+    --testbed $EVAL_DATA_DIR/testbeds \
+    --conda_path $MINICONDA3 \
+    --predictions_path $target_file \
+    --log_dir $EVAL_DATA_DIR/eval_logs/$experiment_name \
+    --num_processes 15 \
+    --skip_existing \
+    --timeout 1600 \
+    --verbose
+
+# Get the report
+cp $target_file $EVAL_DATA_DIR/eval_logs/$experiment_name
+export PYTHONPATH=$OD_SWE_BENCH && cd $OD_SWE_BENCH && . $MINICONDA3/etc/profile.d/conda.sh && conda activate $MINICONDA3/envs/swe-bench-eval && python swebench/metrics/get_model_report.py \
+	--model $agent_name \
+    --swe_bench_tasks $EVAL_DATA_DIR/instances/$dataset.json \
+    --predictions_path $EVAL_DATA_DIR/eval_logs/$experiment_name/${experiment_name}_${dataset}.json \
+    --log_dir $EVAL_DATA_DIR/eval_logs/$experiment_name/$agent_name
+
+# Merge report to the agent output
+if [ "$merge_report" = true ]; then
+    cd /swe_util && python merge_fine_grained_report.py --od_output_file $output_file \
+    --fine_grained_report_file $EVAL_DATA_DIR/eval_logs/$experiment_name/${experiment_name}_${dataset}.report.json
+fi
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+# Input arguments
+output_file=""
+model_name=""
+dataset=""
+num_processes=15
+experiment_name=""
+
+# Parse command-line arguments
+while [[ "$#" -gt 0 ]]; do
+    case $1 in
+        --output-file) output_file="$2"; shift ;;
+        --model-name) model_name="$2"; shift ;;
+        --dataset) dataset="$2"; shift ;;
+        --num-processes) num_processes="$2"; shift ;;
+        --experiment-name) experiment_name="$2"; shift ;;
+        *) echo "Unknown parameter passed: $1"; exit 1 ;;
+    esac
+    shift
+done
+
+# Check if arguments are provided
+if [[ -z "$output_file" || -z "$model_name" || -z "$dataset" ]]; then
+    echo "output-file, model-name and dataset are required!"
+    exit 1
+fi
+echo "output file: $output_file"
+echo "model name: $model_name"
+echo "dataset: $dataset"
+echo "num processes: $num_processes"
+if [ ! -z "$experiment_name" ]
+then
+    echo "use provided experiment name: $experiment_name"
+else
+    experiment_name=${model_name}__${dataset}
+    echo "use generated experiment name: $experiment_name"
+fi
+
+# Run the evaluation script
+mkdir -p $EVAL_DATA_DIR/eval_logs/$experiment_name
+export PYTHONPATH=$OD_SWE_BENCH && cd $OD_SWE_BENCH && . $MINICONDA3/etc/profile.d/conda.sh && conda activate $MINICONDA3/envs/swe-bench-eval && python swebench/harness/run_evaluation.py \
+    --swe_bench_tasks $EVAL_DATA_DIR/instances/$dataset.json \
+    --temp_dir $EVAL_DATA_DIR/eval_temp \
+    --testbed $EVAL_DATA_DIR/testbeds \
+    --conda_path $MINICONDA3 \
+    --predictions_path $output_file \
+    --log_dir $EVAL_DATA_DIR/eval_logs/$experiment_name \
+    --num_processes $num_processes \
+    --skip_existing \
+    --timeout 1600 \
+    --verbose
+
+# Get the report
+predictions_fname=$(basename $output_file)
+cp $output_file $EVAL_DATA_DIR/eval_logs/$experiment_name
+export PYTHONPATH=$OD_SWE_BENCH && cd $OD_SWE_BENCH && . $MINICONDA3/etc/profile.d/conda.sh && conda activate $MINICONDA3/envs/swe-bench-eval && python swebench/metrics/get_model_report.py \
+	--model $model_name \
+    --swe_bench_tasks $EVAL_DATA_DIR/instances/$dataset.json \
+    --predictions_path $EVAL_DATA_DIR/eval_logs/$experiment_name/$predictions_fname \
+    --log_dir $EVAL_DATA_DIR/eval_logs/$experiment_name/$model_name
@@ -0,0 +1,29 @@
+import argparse
+import json
+
+
+def merge_fine_grained_report(od_output_file, fine_grained_report_file):
+    merged_od_output_file = od_output_file.replace('.jsonl', '.merged.jsonl')
+    merged_report = []
+    fine_grained_report = json.load(open(fine_grained_report_file))
+    for line in open(od_output_file):
+        line = json.loads(line)
+        instance_id = line['instance_id']
+        line['fine_grained_report'] = fine_grained_report[instance_id]
+        merged_report.append(line)
+    # dump the merged report as a jsonl file
+    with open(merged_od_output_file, 'w') as f:
+        for line in merged_report:
+            f.write(json.dumps(line) + '\n')
+    print(f'Agent output with report merged created at {merged_od_output_file}')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--od_output_file', help='Path to the OD output file')
+    parser.add_argument(
+        '--fine_grained_report_file', help='Path to the fine grained report file'
+    )
+    args = parser.parse_args()
+
+    merge_fine_grained_report(args.od_output_file, args.fine_grained_report_file)
@@ -0,0 +1,27 @@
+#!/bin/bash
+#
+# Clones the OD-SWE-bench harness into the evaluation workspace and runs its
+# prepare_data.sh inside the eval Docker image, leaving the generated
+# eval_data directory at the top of the workspace.
+# Paths are relative, so run this from the repository root.
+
+set -e
+EVAL_WORKSPACE="evaluation/swe_bench/eval_workspace"
+mkdir -p "$EVAL_WORKSPACE"
+
+# 1. Prepare REPO
+echo "==== Prepare SWE-bench repo ===="
+OD_SWE_BENCH_REPO_PATH="https://github.com/OpenDevin/OD-SWE-bench.git"
+OD_SWE_BENCH_REPO_BRANCH="eval"
+# An unconditional clone fails when the target already exists and, with
+# `set -e`, would abort every rerun before the data step is reached.
+if [ -d "$EVAL_WORKSPACE/OD-SWE-bench/.git" ]; then
+    echo "Reusing existing checkout at $EVAL_WORKSPACE/OD-SWE-bench"
+else
+    git clone -b "$OD_SWE_BENCH_REPO_BRANCH" "$OD_SWE_BENCH_REPO_PATH" "$EVAL_WORKSPACE/OD-SWE-bench"
+fi
+
+# 2. Prepare DATA
+echo "==== Prepare SWE-bench data ===="
+EVAL_IMAGE=ghcr.io/opendevin/eval-swe-bench:builder_with_conda
+# Docker bind mounts need an absolute host path.
+EVAL_WORKSPACE=$(realpath "$EVAL_WORKSPACE")
+chmod +x "$EVAL_WORKSPACE/OD-SWE-bench/swebench/harness/prepare_data.sh"
+# Start from a clean slate so the `mv` at the end of the container command
+# does not collide with data from a previous run.
+if [ -d "$EVAL_WORKSPACE/eval_data" ]; then
+    rm -r "$EVAL_WORKSPACE/eval_data"
+fi
+# Run as the invoking user so files created in the mounted workspace are not
+# owned by root on the host.
+docker run \
+    -v "$EVAL_WORKSPACE":/workspace \
+    -w /workspace \
+    -u "$(id -u):$(id -g)" \
+    -e HF_DATASETS_CACHE="/tmp" \
+    --rm -it "$EVAL_IMAGE" \
+    bash -c "cd OD-SWE-bench/swebench/harness && /swe_util/miniforge3/bin/conda run -n swe-bench-eval ./prepare_data.sh && mv eval_data /workspace/"
@@ -0,0 +1,35 @@
+import json
+import sys
+
+
+def process_jsonl(input_file, model_name, output_file):
+    try:
+        with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
+            data = []
+            for line in infile:
+                if line.strip():  # Ensure the line is not empty
+                    json_obj = json.loads(line)
+                    # Create new object with required fields and new model_name
+                    new_obj = {
+                        'instance_id': json_obj['instance_id'],
+                        'model_patch': json_obj['git_patch'],
+                        'model_name_or_path': model_name,
+                    }
+                    data.append(new_obj)
+            json.dump(
+                data, outfile, indent=2
+            )  # Write the list of JSON objects to a file
+        print(f'Output JSON list created at {output_file}')
+    except Exception as e:
+        print(f'Error: {str(e)}')
+
+
+# Usage: python script.py input.jsonl model_name output.json
+if __name__ == '__main__':
+    if len(sys.argv) != 4:
+        print('Usage: python script.py <input_file> <model_name> <output_file>')
+    else:
+        input_file = sys.argv[1]
+        model_name = sys.argv[2]
+        output_file = sys.argv[3]
+        process_jsonl(input_file, model_name, output_file)
@@ -0,0 +1,96 @@
+#!/bin/bash
+#
+# Prepares the container for one SWE-bench instance: looks up the instance
+# selected by $SWE_INSTANCE_ID, dumps its patches and metadata under
+# /opendevin/swe_tasks, copies the matching testbed into /workspace, resets
+# it with reset_swe_env.py, records REPO_PATH / TEST_CMD in ~/.bashrc and
+# activates the instance's conda environment.
+#
+# NOTE(review): if this file is sourced rather than executed, every `exit`
+# below terminates the calling shell — confirm that is intended.
+
+# Abort on the first failing command; switched off again on the last line.
+set -e
+
+# assert user name is `root`
+if [ "$USER" != "root" ]; then
+    echo "Error: This script is intended to be run by the 'root' user only." >&2
+    exit 1
+fi
+
+# Pick up variables previously appended to ~/.bashrc.
+source ~/.bashrc
+
+SWEUTIL_DIR=/swe_util
+
+# Create logs directory
+LOG_DIR=/opendevin/logs
+mkdir -p $LOG_DIR && chmod 777 $LOG_DIR
+
+# FIXME: Cannot read SWE_INSTANCE_ID from the environment variable
+# SWE_INSTANCE_ID=django__django-11099
+if [ -z "$SWE_INSTANCE_ID" ]; then
+    echo "Error: SWE_INSTANCE_ID is not set." >&2
+    exit 1
+fi
+
+# Read the swe-bench-test-lite.json file and extract the required item based on instance_id
+# NOTE(review): the lookup uses swe-bench-test-lite.json while the reset step
+# further down passes swe-bench-test.json — confirm the difference is intended.
+item=$(jq --arg INSTANCE_ID "$SWE_INSTANCE_ID" '.[] | select(.instance_id == $INSTANCE_ID)' $SWEUTIL_DIR/eval_data/instances/swe-bench-test-lite.json)
+
+if [[ -z "$item" ]]; then
+  echo "No item found for the provided instance ID."
+  exit 1
+fi
+
+# Environment name is "<repo>__<version>" with every "/" replaced by "__".
+CONDA_ENV_NAME=$(echo "$item" | jq -r '.repo + "__" + .version | gsub("/"; "__")')
+
+echo "CONDA_ENV_NAME: $CONDA_ENV_NAME"
+
+SWE_TASK_DIR=/opendevin/swe_tasks
+mkdir -p $SWE_TASK_DIR
+# Dump test_patch to $SWE_TASK_DIR/test.patch
+echo "$item" | jq -r '.test_patch' > $SWE_TASK_DIR/test.patch
+# Dump patch to $SWE_TASK_DIR/gold.patch
+echo "$item" | jq -r '.patch' > $SWE_TASK_DIR/gold.patch
+# Dump the item to $SWE_TASK_DIR/instance.json except for the "test_patch" and "patch" fields
+echo "$item" | jq 'del(.test_patch, .patch)' > $SWE_TASK_DIR/instance.json
+
+# Clear the workspace
+rm -rf /workspace/*
+# Copy repo to workspace
+# (the glob above does not match dot-entries; this check removes any
+# leftover directory with the testbed's name before copying)
+if [ -d /workspace/$CONDA_ENV_NAME ]; then
+    rm -rf /workspace/$CONDA_ENV_NAME
+fi
+cp -r $SWEUTIL_DIR/eval_data/testbeds/$CONDA_ENV_NAME /workspace
+
+# Reset swe-bench testbed and install the repo
+. $SWEUTIL_DIR/miniforge3/etc/profile.d/conda.sh
+conda config --set changeps1 False
+conda config --append channels conda-forge
+conda activate swe-bench-eval
+
+mkdir -p $SWE_TASK_DIR/reset_testbed_temp
+mkdir -p $SWE_TASK_DIR/reset_testbed_log_dir
+SWE_BENCH_DIR=/swe_util/OD-SWE-bench
+# Run the reset inside a command substitution: the `cd` and PYTHONPATH
+# export stay confined to the subshell and stdout is captured for parsing.
+output=$(
+    export PYTHONPATH=$SWE_BENCH_DIR && \
+    cd $SWE_BENCH_DIR && \
+    python swebench/harness/reset_swe_env.py \
+    --swe_bench_tasks $SWEUTIL_DIR/eval_data/instances/swe-bench-test.json \
+    --temp_dir $SWE_TASK_DIR/reset_testbed_temp \
+    --testbed /workspace \
+    --conda_path $SWEUTIL_DIR/miniforge3 \
+    --instance_id $SWE_INSTANCE_ID \
+    --log_dir $SWE_TASK_DIR/reset_testbed_log_dir \
+    --timeout 900 \
+    --verbose
+)
+
+# Take the text after ": " on the output lines containing "repo_path:" and
+# "test_cmd:".
+REPO_PATH=$(echo "$output" | awk -F': ' '/repo_path:/ {print $2}')
+TEST_CMD=$(echo "$output" | awk -F': ' '/test_cmd:/ {print $2}')
+echo "Repo Path: $REPO_PATH"
+echo "Test Command: $TEST_CMD"
+
+# Persist the values so shells that read ~/.bashrc see them.
+# NOTE(review): these lines are appended before REPO_PATH is validated below.
+echo "export SWE_BENCH_DIR=\"$SWE_BENCH_DIR\"" >> ~/.bashrc
+echo "export REPO_PATH=\"$REPO_PATH\"" >> ~/.bashrc
+echo "export TEST_CMD=\"$TEST_CMD\"" >> ~/.bashrc
+
+# Presumably reset_swe_env.py prints the literal "None" when it could not
+# determine the repo path — TODO confirm against that script.
+if [[ "$REPO_PATH" == "None" ]]; then
+    echo "Error: Failed to retrieve repository path. Tests may not have passed or output was not as expected." >&2
+    exit 1
+fi
+
+# Activate instance-specific environment
+. $SWEUTIL_DIR/miniforge3/etc/profile.d/conda.sh
+conda activate $CONDA_ENV_NAME
+
+# Turn errexit off again so later commands in this shell may fail without
+# terminating it.
+set +e
@@ -0,0 +1,31 @@
+#!/bin/bash
+# THIS SCRIPT ONLY NEEDS TO BE RUN ONCE BEFORE EVALUATION
+#
+# Copies the SWE-bench helper scripts into the evaluation workspace and runs
+# swe_env_setup.sh inside the builder image with the workspace mounted at
+# /swe_util. Paths are relative, so run this from the repository root.
+
+# Stop at the first failure: continuing after a failed realpath or cp would
+# run Docker against a wrong or incomplete workspace.
+set -e
+
+EVAL_DOCKER_IMAGE=ghcr.io/opendevin/eval-swe-bench:builder
+EVAL_WORKSPACE="evaluation/swe_bench/eval_workspace"
+# Create the directory before resolving it, so realpath always has an
+# existing path to canonicalize.
+mkdir -p "$EVAL_WORKSPACE"
+EVAL_WORKSPACE=$(realpath "$EVAL_WORKSPACE")
+
+SETUP_INSTANCE_FILENAME=swe-bench-test.json # OR swe-bench-test-lite.json
+SCRIPT_DIR=evaluation/swe_bench/scripts/setup
+
+# Drop any previous copy of the setup script before installing the new one.
+rm -f "$EVAL_WORKSPACE/swe_env_setup.sh"
+
+# _swe_env_setup.sh is installed without its leading underscore.
+cp "$SCRIPT_DIR/_swe_env_setup.sh" "$EVAL_WORKSPACE/swe_env_setup.sh"
+cp "$SCRIPT_DIR/swe_entry.sh" "$EVAL_WORKSPACE/swe_entry.sh"
+cp "$SCRIPT_DIR/get_model_report.sh" "$EVAL_WORKSPACE/get_model_report.sh"
+cp "$SCRIPT_DIR/get_agent_report.sh" "$EVAL_WORKSPACE/get_agent_report.sh"
+cp "$SCRIPT_DIR/process_output_json_file.py" "$EVAL_WORKSPACE/process_output_json_file.py"
+cp "$SCRIPT_DIR/merge_fine_grained_report.py" "$EVAL_WORKSPACE/merge_fine_grained_report.py"
+
+# Inside the container, create an `opendevin` user with the host user's uid
+# and run the setup as that user, so files written to the mounted workspace
+# carry the host user's uid.
+docker run \
+    -v "$EVAL_WORKSPACE":/swe_util \
+    -e UID="$(id -u)" \
+    --rm -it "$EVAL_DOCKER_IMAGE" \
+    bash -c "useradd -rm -d /home/opendevin -s /bin/bash -u $(id -u) opendevin && su opendevin -c 'bash /swe_util/swe_env_setup.sh $SETUP_INSTANCE_FILENAME'"
@@ -0,0 +1,204 @@
+import sys
+import uuid
+
+from opendevin.core.config import config
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.runtime.docker.ssh_box import DockerSSHBox
+from opendevin.runtime.plugins import JupyterRequirement, SWEAgentCommandsRequirement
+
+SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.0'
+
+
+class SWEBenchSSHBox(DockerSSHBox):
+    def __init__(
+        self,
+        container_image: str,
+        timeout: int = 120,
+        sid: str | None = None,
+        swe_instance_id: str | None = None,
+        swe_instance: dict | None = None,
+        skip_workspace_mount: bool = True,
+    ):
+        if swe_instance_id is None:
+            raise ValueError('swe_instance_id must be provided!')
+        self.swe_instance_id = swe_instance_id
+        self.swe_instance = swe_instance
+        self.skip_workspace_mount = skip_workspace_mount
+
+        assert (
+            container_image is not None
+        ), 'container_image is required for SWEBenchSSHBox!'
+        # Need to run as root to use SWEBench container
+        sid = f'swe_bench_{swe_instance_id}' + str(uuid.uuid4())
+        super().__init__(container_image, timeout, sid)
+
+        exit_code, output = self.execute('mv ~/.bashrc ~/.bashrc.bak')
+        assert exit_code == 0, f'Failed to backup ~/.bashrc: {output}'
+
+        exit_code, output = self.execute(
+            f"echo 'export SWE_INSTANCE_ID={self.swe_instance_id}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo \"alias git='git --no-pager'\" >> ~/.bashrc"
+        )
+        assert exit_code == 0, f'Failed to set SWE_INSTANCE_ID in ~/.bashrc: {output}'
+
+        logger.info('Sourcing swe_entry.sh to set up environment variables')
+        # larger timeout for SWEBench init to account for long-running installations (e.g., require compilation)
+        exit_code, output = self.execute('source /swe_util/swe_entry.sh', timeout=600)
+        logger.info('exit code: %d', exit_code)
+        logger.info(output)
+        assert exit_code == 0, f'Failed to source swe_entry.sh: {output}'
+        logger.info('Sourced swe_entry.sh successfully')
+
+    @property
+    def volumes(self):
+        if self.skip_workspace_mount:
+            return {
+                k: v
+                for k, v in super().volumes.items()
+                if not v['bind'] == self.sandbox_workspace_dir
+            }
+        return super().volumes
+
+    @classmethod
+    def get_box_for_instance(
+        cls,
+        instance,
+        workspace_dir_name=None,
+        n_tries=5,
+        skip_workspace_mount: bool = True,
+        workspace_mount_path: str | None = None,
+    ) -> 'SWEBenchSSHBox':
+        if workspace_dir_name is None:
+            workspace_dir_name = f"{instance['repo']}__{instance['version']}".replace(
+                '/', '__'
+            )
+        config.workspace_base = workspace_mount_path
+        config.workspace_mount_path = workspace_mount_path
+        sandbox = cls(
+            container_image=SWE_BENCH_CONTAINER_IMAGE,
+            swe_instance_id=instance['instance_id'],
+            swe_instance=instance,
+            skip_workspace_mount=skip_workspace_mount,
+        )
+        logger.info(f"SSH box started for instance {instance['instance_id']}.")
+
+        # cd to the repo
+        exit_code, output = sandbox.execute(f'cd /workspace/{workspace_dir_name}')
+        if exit_code != 0:
+            logger.error(f'Failed to cd to the repo: {output}')
+            sys.exit(1)
+
+        # remove all future commits & remote following Devin
+        # https://www.cognition-labs.com/post/swe-bench-technical-report
+        exit_code, output = sandbox.execute('git reset --hard')
+        if exit_code != 0:
+            logger.error(f'Failed to reset the repo: {output}')
+            sys.exit(1)
+        exit_code, output = sandbox.execute(
+            'for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
+        )
+        if exit_code != 0:
+            logger.error(f'Failed to remove remote: {output}')
+            sys.exit(1)
+        return sandbox
+
+    def get_diff_patch(self):
+        # add everything to the index
+        exit_code, output = self.execute('git add --all')
+        if exit_code != 0:
+            logger.error('Failed to add everything to the index')
+            return ''
+
+        # get the git diff
+        exit_code, git_patch = self.execute(
+            f'git diff --no-color --cached {self.swe_instance["base_commit"]}'
+        )
+        if exit_code != 0:
+            logger.error('Failed to get git diff')
+            return ''
+        return git_patch
+
+
+if __name__ == '__main__':
+    EXAMPLE_INSTANCE = {
+        'repo': 'django/django',
+        'instance_id': 'django__django-11099',
+        'base_commit': 'd26b2424437dabeeca94d7900b37d2df4410da0c',
+        'patch': "diff --git a/django/contrib/auth/validators.py b/django/contrib/auth/validators.py\n--- a/django/contrib/auth/validators.py\n+++ b/django/contrib/auth/validators.py\n@@ -7,7 +7,7 @@\n \n @deconstructible\n class ASCIIUsernameValidator(validators.RegexValidator):\n-    regex = r'^[\\w.@+-]+$'\n+    regex = r'^[\\w.@+-]+\\Z'\n     message = _(\n         'Enter a valid username. This value may contain only English letters, '\n         'numbers, and @/./+/-/_ characters.'\n@@ -17,7 +17,7 @@ class ASCIIUsernameValidator(validators.RegexValidator):\n \n @deconstructible\n class UnicodeUsernameValidator(validators.RegexValidator):\n-    regex = r'^[\\w.@+-]+$'\n+    regex = r'^[\\w.@+-]+\\Z'\n     message = _(\n         'Enter a valid username. This value may contain only letters, '\n         'numbers, and @/./+/-/_ characters.'\n",
+        'test_patch': "diff --git a/tests/auth_tests/test_validators.py b/tests/auth_tests/test_validators.py\n--- a/tests/auth_tests/test_validators.py\n+++ b/tests/auth_tests/test_validators.py\n@@ -237,7 +237,7 @@ def test_unicode_validator(self):\n         invalid_usernames = [\n             \"o'connell\", \"عبد ال\",\n             \"zerowidth\\u200Bspace\", \"nonbreaking\\u00A0space\",\n-            \"en\\u2013dash\",\n+            \"en\\u2013dash\", 'trailingnewline\\u000A',\n         ]\n         v = validators.UnicodeUsernameValidator()\n         for valid in valid_usernames:\n@@ -250,7 +250,7 @@ def test_unicode_validator(self):\n \n     def test_ascii_validator(self):\n         valid_usernames = ['glenn', 'GLEnN', 'jean-marc']\n-        invalid_usernames = [\"o'connell\", 'Éric', 'jean marc', \"أحمد\"]\n+        invalid_usernames = [\"o'connell\", 'Éric', 'jean marc', \"أحمد\", 'trailingnewline\\n']\n         v = validators.ASCIIUsernameValidator()\n         for valid in valid_usernames:\n             with self.subTest(valid=valid):\n",
+        'problem_statement': "UsernameValidator allows trailing newline in usernames\nDescription\n\t\nASCIIUsernameValidator and UnicodeUsernameValidator use the regex \nr'^[\\w.@+-]+$'\nThe intent is to only allow alphanumeric characters as well as ., @, +, and -. However, a little known quirk of Python regexes is that $ will also match a trailing newline. Therefore, the user name validators will accept usernames which end with a newline. You can avoid this behavior by instead using \\A and \\Z to terminate regexes. For example, the validator regex could be changed to\nr'\\A[\\w.@+-]+\\Z'\nin order to reject usernames that end with a newline.\nI am not sure how to officially post a patch, but the required change is trivial - using the regex above in the two validators in contrib.auth.validators.\n",
+        'hints_text': '',
+        'created_at': '2019-03-20T03:46:18Z',
+        'version': '3.0',
+        'FAIL_TO_PASS': '["test_ascii_validator (auth_tests.test_validators.UsernameValidatorsTests)", "test_unicode_validator (auth_tests.test_validators.UsernameValidatorsTests)", "test_help_text (auth_tests.test_validators.UserAttributeSimilarityValidatorTest)"]',
+        'PASS_TO_PASS': '["test_help_text (auth_tests.test_validators.MinimumLengthValidatorTest)", "test_validate (auth_tests.test_validators.MinimumLengthValidatorTest)", "test_help_text (auth_tests.test_validators.NumericPasswordValidatorTest)", "test_validate (auth_tests.test_validators.NumericPasswordValidatorTest)", "test_validate (auth_tests.test_validators.UserAttributeSimilarityValidatorTest)", "test_validate_property (auth_tests.test_validators.UserAttributeSimilarityValidatorTest)", "test_empty_password_validator_help_text_html (auth_tests.test_validators.PasswordValidationTest)", "test_get_default_password_validators (auth_tests.test_validators.PasswordValidationTest)", "test_get_password_validators_custom (auth_tests.test_validators.PasswordValidationTest)", "test_password_changed (auth_tests.test_validators.PasswordValidationTest)", "test_password_changed_with_custom_validator (auth_tests.test_validators.PasswordValidationTest)", "test_password_validators_help_text_html (auth_tests.test_validators.PasswordValidationTest)", "test_password_validators_help_text_html_escaping (auth_tests.test_validators.PasswordValidationTest)", "test_password_validators_help_texts (auth_tests.test_validators.PasswordValidationTest)", "test_validate_password (auth_tests.test_validators.PasswordValidationTest)", "test_help_text (auth_tests.test_validators.CommonPasswordValidatorTest)", "test_validate (auth_tests.test_validators.CommonPasswordValidatorTest)", "test_validate_custom_list (auth_tests.test_validators.CommonPasswordValidatorTest)", "test_validate_django_supplied_file (auth_tests.test_validators.CommonPasswordValidatorTest)"]',
+        'environment_setup_commit': '419a78300f7cd27611196e1e464d50fd0385ff27',
+    }
+
+    sandbox = SWEBenchSSHBox.get_box_for_instance(instance=EXAMPLE_INSTANCE)
+
+    # in actual eval, this will be initialized by the controller
+    sandbox.init_plugins([JupyterRequirement(), SWEAgentCommandsRequirement()])
+
+    # PRE TEST
+    exit_code, output = sandbox.execute('cd $REPO_PATH')
+    assert exit_code == 0, 'Failed to cd $REPO_PATH'
+    logger.info(f'cd $REPO_PATH: {output}')
+
+    # apply test patch
+    exit_code, output = sandbox.execute('git apply $SWE_TASK_DIR/test.patch')
+    assert exit_code == 0, 'Failed to apply test patch'
+    logger.info(f'git apply $SWE_TASK_DIR/test.patch: {output}')
+
+    # TEST
+    exit_code, output = sandbox.execute(
+        './tests/runtests.py --verbosity 2 auth_tests.test_validators'
+    )
+    assert exit_code == 1, 'Expected exit code 1 (since this is a FAIL_TO_PASS)'
+    logger.info(f'$TEST_CMD:\n{output}')
+
+    # apply gold patch
+    exit_code, output = sandbox.execute('git apply $SWE_TASK_DIR/gold.patch')
+    logger.info('exit code: %d', exit_code)
+    logger.info(f'git apply $SWE_TASK_DIR/gold.patch: {output}')
+
+    # TEST
+    exit_code, output = sandbox.execute(
+        './tests/runtests.py --verbosity 2 auth_tests.test_validators'
+    )
+    assert exit_code == 0, 'Expected exit code 0 (since we applied the gold patch)'
+    logger.info(f'$TEST_CMD:\n{output}')
+
+    # Reset the repo
+    exit_code, output = sandbox.execute('git reset --hard')
+    assert exit_code == 0, 'Failed to reset the repo'
+    logger.info(f'git reset --hard: {output}')
+
+    bg_cmd = sandbox.execute_in_background(
+        "while true; do echo 'dot ' && sleep 10; done"
+    )
+
+    sys.stdout.flush()
+    try:
+        while True:
+            try:
+                user_input = input('>>> ')
+            except EOFError:
+                logger.info('Exiting...')
+                break
+            if user_input.lower() == 'exit':
+                logger.info('Exiting...')
+                break
+            if user_input.lower() == 'kill':
+                sandbox.kill_background(bg_cmd.pid)
+                logger.info('Background process killed')
+                continue
+            exit_code, output = sandbox.execute(user_input)
+            logger.info('exit code: %d', exit_code)
+            logger.info(output)
+            if bg_cmd.pid in sandbox.background_commands:
+                logs = sandbox.read_logs(bg_cmd.pid)
+                logger.info('background logs: %s', logs)
+            sys.stdout.flush()
+    except KeyboardInterrupt:
+        logger.info('Exiting...')
+    sandbox.close()
@@ -16,7 +16,7 @@ import AgentControlBar from "./components/AgentControlBar";
 import AgentStatusBar from "./components/AgentStatusBar";
 import Terminal from "./components/terminal/Terminal";
 import { initializeAgent } from "./services/agent";
-import { getSettings } from "./services/settings";
+import { settingsAreUpToDate } from "./services/settings";

 interface Props {
  setSettingOpen: (isOpen: boolean) => void;
@@ -73,7 +73,11 @@ function App(): JSX.Element {
    if (initOnce) return;
    initOnce = true;

-    initializeAgent(getSettings());
+    if (!settingsAreUpToDate()) {
+      onSettingsModalOpen();
+    } else {
+      initializeAgent();
+    }

    Socket.registerCallback("open", [getMsgTotal]);

@@ -4,47 +4,35 @@ import { useSelector } from "react-redux";
 import ArrowIcon from "#/assets/arrow";
 import PauseIcon from "#/assets/pause";
 import PlayIcon from "#/assets/play";
-import { changeTaskState } from "#/services/agentStateService";
+import { changeAgentState } from "#/services/agentStateService";
 import { clearMsgs } from "#/services/session";
 import store, { RootState } from "#/store";
-import AgentTaskAction from "#/types/AgentTaskAction";
-import AgentTaskState from "#/types/AgentTaskState";
+import AgentState from "#/types/AgentState";
 import { clearMessages } from "#/state/chatSlice";

-const TaskStateActionMap = {
-  [AgentTaskAction.START]: AgentTaskState.RUNNING,
-  [AgentTaskAction.PAUSE]: AgentTaskState.PAUSED,
-  [AgentTaskAction.RESUME]: AgentTaskState.RUNNING,
-  [AgentTaskAction.STOP]: AgentTaskState.STOPPED,
-};
-
-const IgnoreTaskStateMap: { [k: string]: AgentTaskState[] } = {
-  [AgentTaskAction.PAUSE]: [
-    AgentTaskState.INIT,
-    AgentTaskState.PAUSED,
-    AgentTaskState.STOPPED,
-    AgentTaskState.FINISHED,
-    AgentTaskState.AWAITING_USER_INPUT,
+const IgnoreTaskStateMap: { [k: string]: AgentState[] } = {
+  [AgentState.PAUSED]: [
+    AgentState.INIT,
+    AgentState.PAUSED,
+    AgentState.STOPPED,
+    AgentState.FINISHED,
+    AgentState.AWAITING_USER_INPUT,
  ],
-  [AgentTaskAction.RESUME]: [
-    AgentTaskState.INIT,
-    AgentTaskState.RUNNING,
-    AgentTaskState.STOPPED,
-    AgentTaskState.FINISHED,
-    AgentTaskState.AWAITING_USER_INPUT,
-  ],
-  [AgentTaskAction.STOP]: [
-    AgentTaskState.INIT,
-    AgentTaskState.STOPPED,
-    AgentTaskState.FINISHED,
+  [AgentState.RUNNING]: [
+    AgentState.INIT,
+    AgentState.RUNNING,
+    AgentState.STOPPED,
+    AgentState.FINISHED,
+    AgentState.AWAITING_USER_INPUT,
  ],
+  [AgentState.STOPPED]: [AgentState.INIT, AgentState.STOPPED],
 };

 interface ButtonProps {
  isDisabled: boolean;
  content: string;
-  action: AgentTaskAction;
-  handleAction: (action: AgentTaskAction) => void;
+  action: AgentState;
+  handleAction: (action: AgentState) => void;
  large?: boolean;
 }

@@ -75,53 +63,50 @@ ActionButton.defaultProps = {
 };

 function AgentControlBar() {
-  const { curTaskState } = useSelector((state: RootState) => state.agent);
-  const [desiredState, setDesiredState] = React.useState(AgentTaskState.INIT);
+  const { curAgentState } = useSelector((state: RootState) => state.agent);
+  const [desiredState, setDesiredState] = React.useState(AgentState.INIT);
  const [isLoading, setIsLoading] = React.useState(false);

-  const handleAction = (action: AgentTaskAction) => {
-    if (IgnoreTaskStateMap[action].includes(curTaskState)) {
+  const handleAction = (action: AgentState) => {
+    if (IgnoreTaskStateMap[action].includes(curAgentState)) {
      return;
    }

-    let act = action;
-
-    if (act === AgentTaskAction.STOP) {
-      act = AgentTaskAction.STOP;
+    if (action === AgentState.STOPPED) {
      clearMsgs().then().catch();
      store.dispatch(clearMessages());
    } else {
      setIsLoading(true);
    }

-    setDesiredState(TaskStateActionMap[act]);
-    changeTaskState(act);
+    setDesiredState(action);
+    changeAgentState(action);
  };

  useEffect(() => {
-    if (curTaskState === desiredState) {
-      if (curTaskState === AgentTaskState.STOPPED) {
+    if (curAgentState === desiredState) {
+      if (curAgentState === AgentState.STOPPED) {
        clearMsgs().then().catch();
        store.dispatch(clearMessages());
      }
      setIsLoading(false);
-    } else if (curTaskState === AgentTaskState.RUNNING) {
-      setDesiredState(AgentTaskState.RUNNING);
+    } else if (curAgentState === AgentState.RUNNING) {
+      setDesiredState(AgentState.RUNNING);
    }
-    // We only want to run this effect when curTaskState changes
+    // We only want to run this effect when curAgentState changes
    // eslint-disable-next-line react-hooks/exhaustive-deps
-  }, [curTaskState]);
+  }, [curAgentState]);

  return (
    <div className="flex items-center gap-3">
-      {curTaskState === AgentTaskState.PAUSED ? (
+      {curAgentState === AgentState.PAUSED ? (
        <ActionButton
          isDisabled={
            isLoading ||
-            IgnoreTaskStateMap[AgentTaskAction.RESUME].includes(curTaskState)
+            IgnoreTaskStateMap[AgentState.RUNNING].includes(curAgentState)
          }
          content="Resume the agent task"
-          action={AgentTaskAction.RESUME}
+          action={AgentState.RUNNING}
          handleAction={handleAction}
          large
        >
@@ -131,10 +116,10 @@ function AgentControlBar() {
        <ActionButton
          isDisabled={
            isLoading ||
-            IgnoreTaskStateMap[AgentTaskAction.PAUSE].includes(curTaskState)
+            IgnoreTaskStateMap[AgentState.PAUSED].includes(curAgentState)
          }
-          content="Pause the agent task"
-          action={AgentTaskAction.PAUSE}
+          content="Pause the current task"
+          action={AgentState.PAUSED}
          handleAction={handleAction}
          large
        >
@@ -143,8 +128,8 @@ function AgentControlBar() {
      )}
      <ActionButton
        isDisabled={isLoading}
-        content="Restart a new agent task"
-        action={AgentTaskAction.STOP}
+        content="Start a new task"
+        action={AgentState.STOPPED}
        handleAction={handleAction}
      >
        <ArrowIcon />
@@ -3,40 +3,47 @@ import { useTranslation } from "react-i18next";
 import { useSelector } from "react-redux";
 import { I18nKey } from "#/i18n/declaration";
 import { RootState } from "#/store";
-import AgentTaskState from "#/types/AgentTaskState";
+import AgentState from "#/types/AgentState";

 const AgentStatusMap: { [k: string]: { message: string; indicator: string } } =
  {
-    [AgentTaskState.INIT]: {
+    [AgentState.INIT]: {
      message: "Agent is initialized, waiting for task...",
      indicator: "bg-blue-500",
    },
-    [AgentTaskState.RUNNING]: {
+    [AgentState.RUNNING]: {
      message: "Agent is running task...",
      indicator: "bg-green-500",
    },
-    [AgentTaskState.AWAITING_USER_INPUT]: {
+    [AgentState.AWAITING_USER_INPUT]: {
      message: "Agent is awaiting user input...",
      indicator: "bg-orange-500",
    },
-    [AgentTaskState.PAUSED]: {
+    [AgentState.PAUSED]: {
      message: "Agent has paused.",
      indicator: "bg-yellow-500",
    },
-    [AgentTaskState.STOPPED]: {
+    [AgentState.LOADING]: {
+      message: "Agent is initializing...",
+      indicator: "bg-yellow-500",
+    },
+    [AgentState.STOPPED]: {
      message: "Agent has stopped.",
      indicator: "bg-red-500",
    },
-    [AgentTaskState.FINISHED]: {
+    [AgentState.FINISHED]: {
      message: "Agent has finished the task.",
      indicator: "bg-green-500",
    },
+    [AgentState.ERROR]: {
+      message: "Agent encountered an error.",
+      indicator: "bg-red-500",
+    },
  };

 function AgentStatusBar() {
  const { t } = useTranslation();
-  const { initialized } = useSelector((state: RootState) => state.task);
-  const { curTaskState } = useSelector((state: RootState) => state.agent);
+  const { curAgentState } = useSelector((state: RootState) => state.agent);

  // TODO: Extend the agent status, e.g.:
  // - Agent is typing
@@ -46,13 +53,13 @@ function AgentStatusBar() {
  // - Agent is not available
  return (
    <div className="flex items-center">
-      {initialized ? (
+      {curAgentState !== AgentState.LOADING ? (
        <>
          <div
-            className={`w-3 h-3 mr-2 rounded-full animate-pulse ${AgentStatusMap[curTaskState].indicator}`}
+            className={`w-3 h-3 mr-2 rounded-full animate-pulse ${AgentStatusMap[curAgentState].indicator}`}
          />
          <span className="text-sm text-stone-400">
-            {AgentStatusMap[curTaskState].message}
+            {AgentStatusMap[curAgentState].message}
          </span>
        </>
      ) : (
@@ -1,7 +1,7 @@
 import Editor, { Monaco } from "@monaco-editor/react";
 import { Tab, Tabs } from "@nextui-org/react";
 import type { editor } from "monaco-editor";
-import React, { useState } from "react";
+import React, { useMemo, useState } from "react";
 import { useTranslation } from "react-i18next";
 import { VscCode } from "react-icons/vsc";
 import { useDispatch, useSelector } from "react-redux";
@@ -10,10 +10,19 @@ import { selectFile } from "#/services/fileService";
 import { setCode } from "#/state/codeSlice";
 import { RootState } from "#/store";
 import FileExplorer from "./file-explorer/FileExplorer";
+import { CodeEditorContext } from "./CodeEditorContext";

 function CodeEditor(): JSX.Element {
  const { t } = useTranslation();
-  const [selectedFileName, setSelectedFileName] = useState("");
+  const [selectedFileAbsolutePath, setSelectedFileAbsolutePath] = useState("");
+  const selectedFileName = useMemo(() => {
+    const paths = selectedFileAbsolutePath.split("/");
+    return paths[paths.length - 1];
+  }, [selectedFileAbsolutePath]);
+  const codeEditorContext = useMemo(
+    () => ({ selectedFileAbsolutePath }),
+    [selectedFileAbsolutePath],
+  );

  const dispatch = useDispatch();
  const code = useSelector((state: RootState) => state.code.code);
@@ -38,10 +47,9 @@ function CodeEditor(): JSX.Element {

  const onSelectFile = async (absolutePath: string) => {
    const paths = absolutePath.split("/");
-    const fileName = paths[paths.length - 1];
    const rootlessPath = paths.slice(1).join("/");

-    setSelectedFileName(fileName);
+    setSelectedFileAbsolutePath(absolutePath);

    const newCode = await selectFile(rootlessPath);
    dispatch(setCode(newCode));
@@ -49,42 +57,44 @@ function CodeEditor(): JSX.Element {

  return (
    <div className="flex h-full w-full bg-neutral-900 transition-all duration-500 ease-in-out">
-      <FileExplorer onFileClick={onSelectFile} />
-      <div className="flex flex-col min-h-0 w-full">
-        <Tabs
-          disableCursorAnimation
-          classNames={{
-            base: "border-b border-divider border-neutral-600 mb-4",
-            tabList:
-              "w-full relative rounded-none bg-neutral-900 p-0 border-divider",
-            cursor: "w-full bg-neutral-600 rounded-none",
-            tab: "max-w-fit px-4 h-[36px]",
-            tabContent: "group-data-[selected=true]:text-white",
-          }}
-          aria-label="Options"
-        >
-          <Tab
-            key={selectedFileName.toLocaleLowerCase()}
-            title={selectedFileName}
-          />
-        </Tabs>
-        <div className="flex grow items-center justify-center">
-          {selectedFileName === "" ? (
-            <div className="flex flex-col items-center text-neutral-400">
-              <VscCode size={100} />
-              {t(I18nKey.CODE_EDITOR$EMPTY_MESSAGE)}
-            </div>
-          ) : (
-            <Editor
-              height="100%"
-              path={selectedFileName.toLocaleLowerCase()}
-              defaultValue=""
-              value={code}
-              onMount={handleEditorDidMount}
+      <CodeEditorContext.Provider value={codeEditorContext}>
+        <FileExplorer onFileClick={onSelectFile} />
+        <div className="flex flex-col min-h-0 w-full">
+          <Tabs
+            disableCursorAnimation
+            classNames={{
+              base: "border-b border-divider border-neutral-600 mb-4",
+              tabList:
+                "w-full relative rounded-none bg-neutral-900 p-0 border-divider",
+              cursor: "w-full bg-neutral-600 rounded-none",
+              tab: "max-w-fit px-4 h-[36px]",
+              tabContent: "group-data-[selected=true]:text-white",
+            }}
+            aria-label="Options"
+          >
+            <Tab
+              key={selectedFileName.toLocaleLowerCase()}
+              title={selectedFileName}
            />
-          )}
+          </Tabs>
+          <div className="flex grow items-center justify-center">
+            {selectedFileName === "" ? (
+              <div className="flex flex-col items-center text-neutral-400">
+                <VscCode size={100} />
+                {t(I18nKey.CODE_EDITOR$EMPTY_MESSAGE)}
+              </div>
+            ) : (
+              <Editor
+                height="100%"
+                path={selectedFileName.toLocaleLowerCase()}
+                defaultValue=""
+                value={code}
+                onMount={handleEditorDidMount}
+              />
+            )}
+          </div>
        </div>
-      </div>
+      </CodeEditorContext.Provider>
    </div>
  );
 }
@@ -0,0 +1,5 @@
+import { createContext } from "react";
+// Context exposing the absolute path of the file currently selected in CodeEditor.
+export const CodeEditorContext = createContext({
+  selectedFileAbsolutePath: "", // default when no Provider is mounted: no file selected
+});
@@ -1,10 +1,14 @@
-import React, { useEffect, useRef } from "react";
+import React, { useRef } from "react";
 import { useSelector } from "react-redux";
 import SyntaxHighlighter from "react-syntax-highlighter";
 import Markdown from "react-markdown";
 import { atomOneDark } from "react-syntax-highlighter/dist/esm/styles/hljs";
+import { VscArrowDown } from "react-icons/vsc";
+import { useTranslation } from "react-i18next";
 import { RootState } from "#/store";
 import { Cell } from "#/state/jupyterSlice";
+import { useScrollToBottom } from "#/hooks/useScrollToBottom";
+import { I18nKey } from "#/i18n/declaration";

 interface IJupyterCell {
  cell: Cell;
@@ -75,27 +79,47 @@ function JupyterCell({ cell }: IJupyterCell): JSX.Element {
 }

+/**
+ * Renders every Jupyter cell from the Redux store in a scrollable pane and
+ * shows a "to bottom" button while `hitBottom` (from useScrollToBottom) is false.
+ */
 function Jupyter(): JSX.Element {
+  const { t } = useTranslation();
+
   const { cells } = useSelector((state: RootState) => state.jupyter);
   const jupyterRef = useRef<HTMLDivElement>(null);

-  function scrollDomToBottom() {
-    const dom = jupyterRef.current;
-    if (dom) {
-      requestAnimationFrame(() => {
-        dom.scrollTo(0, dom.scrollHeight);
-      });
-    }
-  }
-
-  useEffect(() => {
-    scrollDomToBottom();
-  });
+  const { hitBottom, scrollDomToBottom, onChatBodyScroll } =
+    useScrollToBottom(jupyterRef);

   return (
-    <div className="flex-1 overflow-y-auto flex flex-col" ref={jupyterRef}>
-      {cells.map((cell, index) => (
-        <JupyterCell key={index} cell={cell} />
-      ))}
+    <div className="flex-1">
+      <div
+        className="overflow-y-auto h-full"
+        ref={jupyterRef}
+        onScroll={(e) => onChatBodyScroll(e.currentTarget)}
+      >
+        {cells.map((cell, index) => (
+          <JupyterCell key={index} cell={cell} />
+        ))}
+      </div>
+      {!hitBottom && (
+        <div className="sticky bottom-2 flex items-center justify-center">
+          {/* The handler lives on the button itself so keyboard activation and
+              clicks on the padding also scroll; nested spans need no handlers. */}
+          <button
+            type="button"
+            className="relative border-1 text-sm rounded px-3 py-1 border-neutral-600 bg-neutral-700 cursor-pointer select-none"
+            onClick={scrollDomToBottom}
+          >
+            <span className="flex items-center">
+              <VscArrowDown className="inline mr-2 w-3 h-3" />
+              <span className="inline-block">
+                {t(I18nKey.CHAT_INTERFACE$TO_BOTTOM)}
+              </span>
+            </span>
+          </button>
+        </div>
+      )}
     </div>
   );
 }
@@ -11,7 +11,7 @@ import {
 import { VscListOrdered } from "react-icons/vsc";
 import { useSelector } from "react-redux";
 import { I18nKey } from "#/i18n/declaration";
-import { Plan, Task, TaskState } from "#/services/planService";
+import { Task, TaskState } from "#/services/taskService";
 import { RootState } from "#/store";

 function StatusIcon({ status }: { status: TaskState }): JSX.Element {
@@ -53,14 +53,11 @@ function TaskCard({ task, level }: { task: Task; level: number }): JSX.Element {
  );
 }

-interface PlanProps {
-  plan: Plan;
-}
-
-function PlanContainer({ plan }: PlanProps): JSX.Element {
+function Planner(): JSX.Element {
  const { t } = useTranslation();
+  const task = useSelector((state: RootState) => state.task.task);

-  if (plan.mainGoal === undefined) {
+  if (!task || !task.subtasks?.length) {
    return (
      <div className="w-full h-full flex flex-col text-neutral-400 items-center justify-center">
        <VscListOrdered size={100} />
@@ -68,19 +65,14 @@ function PlanContainer({ plan }: PlanProps): JSX.Element {
      </div>
    );
  }
-  return (
-    <div className="p-2 overflow-y-auto h-full flex flex-col gap-2">
-      <TaskCard task={plan.task} level={0} />
-    </div>
-  );
-}
-
-function Planner(): JSX.Element {
-  const plan = useSelector((state: RootState) => state.plan.plan);

  return (
    <div className="h-full w-full bg-neutral-800">
-      <PlanContainer plan={plan} />
+      <div className="p-2 overflow-y-auto h-full flex flex-col gap-2">
+        {task.subtasks.map((subtask) => (
+          <TaskCard key={subtask.id} task={subtask} level={0} />
+        ))}
+      </div>
    </div>
  );
 }
--- a/Show More
+++ b/Show More