Fix PR #8099: Write configuration

🤖 Auto-fix Python linting issues
2026-04-29 03:00:45 -04:00 · 2025-05-04 00:23:33 +00:00 · 2025-05-04 00:05:04 +00:00 · 2025-05-03 23:11:13 +00:00 · 2025-05-04 01:10:08 +02:00 · 2025-05-03 21:42:02 +00:00
478 changed files with 35793 additions and 22736 deletions
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,12 +1,12 @@
 - [ ] This change is worth documenting at https://docs.all-hands.dev/
 - [ ] Include this change in the Release Notes. If checked, you **must** provide an **end-user friendly** description for your change below

-**End-user friendly description of the problem this fixes or functionality that this introduces.**
+**End-user friendly description of the problem this fixes or functionality this introduces.**


 ---
-**Give a summary of what the PR does, explaining any non-trivial design decisions.**
+**Summarize what the PR does, explaining any non-trivial design decisions.**


 ---
-**Link of any specific issues this addresses.**
+**Link to any specific issues this addresses:**
--- a/.github/workflows/dummy-agent-test.yml
+++ b/.github/workflows/dummy-agent-test.yml
@@ -1,53 +0,0 @@
-# Workflow that uses the DummyAgent to run a simple task
-name: Run E2E test with dummy agent
-
-# Always run on "main"
-# Always run on PRs
-on:
-  push:
-    branches:
-    - main
-  pull_request:
-
-# If triggered by a PR, it will be in the same group. However, each commit on main will be in its own unique group
-concurrency:
-  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  test:
-    runs-on: blacksmith-4vcpu-ubuntu-2204
-    steps:
-      - uses: actions/checkout@v4
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Install tmux
-        run: sudo apt-get update && sudo apt-get install -y tmux
-      - name: Setup Node.js
-        uses: useblacksmith/setup-node@v5
-        with:
-          node-version: '22.x'
-      - name: Install poetry via pipx
-        run: pipx install poetry
-      - name: Set up Python
-        uses: useblacksmith/setup-python@v6
-        with:
-          python-version: '3.12'
-          cache: 'poetry'
-      - name: Install Python dependencies using Poetry
-        run: poetry install --without evaluation
-      - name: Build Environment
-        run: make build
-      - name: Run tests
-        run: |
-          set -e
-          SANDBOX_FORCE_REBUILD_RUNTIME=True poetry run python3 openhands/core/main.py -t "do a flip" -d ./workspace/ -c DummyAgent
-      - name: Check exit code
-        run: |
-          if [ $? -ne 0 ]; then
-            echo "Test failed"
-            exit 1
-          else
-            echo "Test passed"
-          fi
--- a/.github/workflows/openhands-resolver.yml
+++ b/.github/workflows/openhands-resolver.yml
@@ -24,7 +24,7 @@ on:
      LLM_MODEL:
        required: false
        type: string
-        default: "anthropic/claude-3-5-sonnet-20241022"
+        default: "anthropic/claude-3-7-sonnet-20250219"
      LLM_API_VERSION:
        required: false
        type: string
@@ -179,7 +179,7 @@ jobs:

          echo "MAX_ITERATIONS=${{ inputs.max_iterations || 50 }}" >> $GITHUB_ENV
          echo "SANDBOX_ENV_GITHUB_TOKEN=${{ secrets.PAT_TOKEN || github.token }}" >> $GITHUB_ENV
-          echo "SANDBOX_ENV_BASE_CONTAINER_IMAGE=${{ inputs.base_container_image }}" >> $GITHUB_ENV
+          echo "SANDBOX_BASE_CONTAINER_IMAGE=${{ inputs.base_container_image }}" >> $GITHUB_ENV

          # Set branch variables
          echo "TARGET_BRANCH=${{ inputs.target_branch || 'main' }}" >> $GITHUB_ENV
--- a/.openhands/microagents/config_write_task.md
+++ b/.openhands/microagents/config_write_task.md
@@ -0,0 +1,40 @@
+# Status: Implementing User Configuration Persistence
+
+This PR implements a new configuration persistence system that allows saving user-specific settings to a dedicated TOML file (`~/.openhands/config.toml`), while keeping secrets in a separate JSON file (`~/.openhands/secrets.json`). This split architecture provides better separation of concerns and more appropriate storage formats for each type of data.
+
+## Current Status
+
+### Completed
+1. **Core TOML Writing Mechanism**
+   * Implemented `save_setting_to_user_toml` in `config_save.py` with comprehensive validation and error handling
+   * Added snapshot mechanism to track TOML-sourced values vs. env/cli overrides
+   * Implemented helpers for safe value access and default value retrieval
+   * Added extensive unit tests covering all key scenarios
+
+2. **Settings Store Integration**
+   * Updated `FileSettingsStore` to use the new TOML writing mechanism
+   * Implemented mapping between Settings model and AppConfig paths
+   * Added unit tests for store operations
+
+3. **Secrets Handling**
+   * Secrets (API keys, tokens) are now stored separately in `secrets.json`
+   * Implemented `FileSecretsStore` for dedicated secrets management
+   * Secrets remain in JSON format for better compatibility with existing code
+
+### Remaining Tasks
+1. **Runtime Config Updates**
+   * Implement mechanism to update runtime `AppConfig` instance when settings change
+   * Add callbacks or direct access to global config object
+   * Add tests for runtime updates
+
+2. **CLI Integration**
+   * Add CLI commands for viewing/modifying user TOML settings
+   * Ensure CLI respects the same validation rules as the API
+   * Add CLI integration tests
+
+3. **Documentation & Cleanup**
+   * Update configuration documentation to reflect the new architecture
+   * Document the split between settings (TOML) and secrets (JSON)
+   * Remove references to old `settings.json` mechanism
+
+
--- a/Development.md
+++ b/Development.md
@@ -118,7 +118,7 @@ poetry run pytest ./tests/unit/test_*.py
 To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker container image by
 setting the SANDBOX_RUNTIME_CONTAINER_IMAGE environment variable to the desired Docker image.

-Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.32-nikolaik`
+Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.36-nikolaik`

 ## Develop inside Docker container

--- a/15
+++ b/15
@@ -39,6 +39,7 @@ ifeq ($(INSTALL_DOCKER),)
 	@$(MAKE) -s check-docker
 endif
 	@$(MAKE) -s check-poetry
+	@$(MAKE) -s check-tmux
 	@echo "$(GREEN)Dependencies checked successfully.$(RESET)"

 check-system:
@@ -101,6 +102,18 @@ check-docker:
 		exit 1; \
 	fi

+check-tmux:
+	@echo "$(YELLOW)Checking tmux installation...$(RESET)"
+	@if command -v tmux > /dev/null; then \
+		echo "$(BLUE)$(shell tmux -V) is already installed.$(RESET)"; \
+	else \
+		echo "$(YELLOW)╔════════════════════════════════════════════════════════════════════════════╗$(RESET)"; \
+		echo "$(YELLOW)║ OPTIONAL: tmux is not installed.                                          ║$(RESET)"; \
+		echo "$(YELLOW)║ Some advanced terminal features may not work without tmux.                ║$(RESET)"; \
+		echo "$(YELLOW)║ You can install it if needed, but it's not required for development.      ║$(RESET)"; \
+		echo "$(YELLOW)╚════════════════════════════════════════════════════════════════════════════╝$(RESET)"; \
+	fi
+
 check-poetry:
 	@echo "$(YELLOW)Checking Poetry installation...$(RESET)"
 	@if command -v poetry > /dev/null; then \
@@ -175,7 +188,7 @@ install-pre-commit-hooks:

 lint-backend:
 	@echo "$(YELLOW)Running linters...$(RESET)"
-	@poetry run pre-commit run --files openhands/**/* agenthub/**/* evaluation/**/* --show-diff-on-failure --config $(PRE_COMMIT_CONFIG_PATH)
+	@poetry run pre-commit run --files openhands/**/* evaluation/**/* tests/**/* --show-diff-on-failure --config $(PRE_COMMIT_CONFIG_PATH)

 lint-frontend:
 	@echo "$(YELLOW)Running linters for frontend...$(RESET)"
--- a/README.md
+++ b/README.md
@@ -9,7 +9,6 @@
 <div align="center">
  <a href="https://github.com/All-Hands-AI/OpenHands/graphs/contributors"><img src="https://img.shields.io/github/contributors/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Contributors"></a>
  <a href="https://github.com/All-Hands-AI/OpenHands/stargazers"><img src="https://img.shields.io/github/stars/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Stargazers"></a>
-  <a href="https://codecov.io/github/All-Hands-AI/OpenHands?branch=main"><img alt="CodeCov" src="https://img.shields.io/codecov/c/github/All-Hands-AI/OpenHands?style=for-the-badge&color=blue"></a>
  <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE"><img src="https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="MIT License"></a>
  <br/>
  <a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
@@ -18,7 +17,7 @@
  <br/>
  <a href="https://docs.all-hands.dev/modules/usage/getting-started"><img src="https://img.shields.io/badge/Documentation-000?logo=googledocs&logoColor=FFE165&style=for-the-badge" alt="Check out the documentation"></a>
  <a href="https://arxiv.org/abs/2407.16741"><img src="https://img.shields.io/badge/Paper%20on%20Arxiv-000?logoColor=FFE165&logo=arxiv&style=for-the-badge" alt="Paper on Arxiv"></a>
-  <a href="https://huggingface.co/spaces/OpenHands/evaluation"><img src="https://img.shields.io/badge/Benchmark%20score-000?logoColor=FFE165&logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark Score"></a>
+  <a href="https://docs.google.com/spreadsheets/d/1wOUdFCMyY6Nt0AIqF705KN4JKOWgeI4wUGUP60krXXs/edit?gid=0#gid=0"><img src="https://img.shields.io/badge/Benchmark%20score-000?logoColor=FFE165&logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark Score"></a>
  <hr>
 </div>

@@ -52,23 +51,23 @@ system requirements and more information.


 ```bash
-docker pull docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik

 docker run -it --rm --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik \
    -e LOG_ALL_EVENTS=true \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -v ~/.openhands-state:/.openhands-state \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app \
-    docker.all-hands.dev/all-hands-ai/openhands:0.32
+    docker.all-hands.dev/all-hands-ai/openhands:0.36
 ```

 You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)!

 When you open the application, you'll be asked to choose an LLM provider and add an API key.
-[Anthropic's Claude 3.5 Sonnet](https://www.anthropic.com/api) (`anthropic/claude-3-5-sonnet-20241022`)
+[Anthropic's Claude 3.7 Sonnet](https://www.anthropic.com/api) (`anthropic/claude-3-7-sonnet-20250219`)
 works best, but you have [many options](https://docs.all-hands.dev/modules/usage/llms).

 ## 💡 Other ways to run OpenHands
--- a/config.template.toml
+++ b/config.template.toml
@@ -221,9 +221,22 @@ enable_browsing = true
 # Whether the LLM draft editor is enabled
 enable_llm_editor = false

+# Whether the standard editor tool (str_replace_editor) is enabled
+# Only has an effect if enable_llm_editor is false
+enable_editor = true
+
 # Whether the IPython tool is enabled
 enable_jupyter = true

+# Whether the command tool is enabled
+enable_cmd = true
+
+# Whether the think tool is enabled
+enable_think = true
+
+# Whether the finish tool is enabled
+enable_finish = true
+
 # LLM config group to use
 #llm_config = 'your-llm-config-group'

@@ -378,7 +391,7 @@ type = "noop"
 #[llm.condenser]
 #model = "gpt-4o"
 #temperature = 0.1
-#max_tokens = 1024
+#max_input_tokens = 1024

 #################################### Eval ####################################
 # Configuration for the evaluation, please refer to the specific evaluation
--- a/containers/dev/Dockerfile
+++ b/containers/dev/Dockerfile
@@ -61,8 +61,8 @@ RUN add-apt-repository ppa:deadsnakes/ppa \
    && apt-get install -y python3.12 python3.12-venv python3.12-dev python3-pip \
    && ln -s /usr/bin/python3.12 /usr/bin/python

-# NodeJS >= 18.17.1
-RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
+# NodeJS >= 22.x
+RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - \
    && apt-get install -y nodejs

 # Poetry >= 1.8
@@ -108,7 +108,7 @@ WORKDIR /app

 # cache build dependencies
 RUN \
-  --mount=type=bind,source=./,target=/app/ \
+  --mount=type=bind,source=./,target=/app/,rw \
  <<EOF
 #!/bin/bash
 make -s clean
--- a/containers/dev/compose.yml
+++ b/containers/dev/compose.yml
@@ -11,7 +11,7 @@ services:
      - BACKEND_HOST=${BACKEND_HOST:-"0.0.0.0"}
      - SANDBOX_API_HOSTNAME=host.docker.internal
      #
-      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.32-nikolaik}
+      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.36-nikolaik}
      - SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
      - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
    ports:
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -7,7 +7,7 @@ services:
    image: openhands:latest
    container_name: openhands-app-${DATE:-}
    environment:
-      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik}
+      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik}
      #- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234} # enable this only if you want a specific non-root sandbox user but you will have to manually adjust permissions of openhands-state for this user
      - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
    ports:
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -3,6 +3,7 @@

 # Production
 /build
+/static/swagger-ui

 # Generated files
 .docusaurus
--- a/docs/docusaurus.config.ts
+++ b/docs/docusaurus.config.ts
@@ -36,6 +36,14 @@ const config: Config = {
    mermaid: true,
  },
  themes: ['@docusaurus/theme-mermaid'],
+  plugins: [
+    [
+      require.resolve('docusaurus-lunr-search'),
+      {
+        languages: ['en', 'zh', 'fr', 'ja', 'pt']
+      }
+    ]
+  ],
  presets: [
    [
      'classic',
@@ -75,10 +83,19 @@ const config: Config = {
          position: 'left',
          label: 'User Guides',
        },
+        {
+          href: 'https://docs.all-hands.dev/swagger-ui/', // FIXME: this should be a relative path, but Docusaurus steals the click
+          label: 'API',
+          position: 'left',
+        },
        {
          type: 'localeDropdown',
          position: 'left',
        },
+        {
+          type: 'search',
+          position: 'left',
+        },
        {
          href: 'https://all-hands.dev',
          label: 'Company',
--- a/docs/generate-swagger-ui.js
+++ b/docs/generate-swagger-ui.js
@@ -0,0 +1,102 @@
+const fs = require('fs');
+const path = require('path');
+const swaggerUiDist = require('swagger-ui-dist');
+
+/**
+ * This script manually sets up Swagger UI for the Docusaurus documentation.
+ *
+ * Why we need this approach:
+ * 1. Docusaurus doesn't have a built-in way to integrate Swagger UI
+ * 2. We need to copy the necessary files from swagger-ui-dist to our static directory
+ * 3. We need to create a custom index.html file that points to our OpenAPI spec
+ * 4. This approach allows us to customize the Swagger UI to match our documentation style
+ */
+
+// Get the absolute path to the swagger-ui-dist package
+const swaggerUiDistPath = swaggerUiDist.getAbsoluteFSPath();
+
+// Create the target directory if it doesn't exist
+const targetDir = path.join(__dirname, 'static', 'swagger-ui');
+if (!fs.existsSync(targetDir)) {
+  fs.mkdirSync(targetDir, { recursive: true });
+}
+
+// Copy all files from swagger-ui-dist to our target directory
+const files = fs.readdirSync(swaggerUiDistPath);
+files.forEach(file => {
+  const sourcePath = path.join(swaggerUiDistPath, file);
+  const targetPath = path.join(targetDir, file);
+
+  // Skip directories and non-essential files
+  if (fs.statSync(sourcePath).isDirectory() ||
+      file === 'package.json' ||
+      file === 'README.md' ||
+      file.endsWith('.map')) {
+    return;
+  }
+
+  fs.copyFileSync(sourcePath, targetPath);
+});
+
+// Create a custom index.html file that points to our OpenAPI spec
+const indexHtml = `
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <title>OpenHands API Documentation</title>
+  <link rel="stylesheet" type="text/css" href="./swagger-ui.css" />
+  <link rel="icon" type="image/png" href="./favicon-32x32.png" sizes="32x32" />
+  <link rel="icon" type="image/png" href="./favicon-16x16.png" sizes="16x16" />
+  <style>
+    html {
+      box-sizing: border-box;
+      overflow: -moz-scrollbars-vertical;
+      overflow-y: scroll;
+    }
+
+    *,
+    *:before,
+    *:after {
+      box-sizing: inherit;
+    }
+
+    body {
+      margin: 0;
+      background: #fafafa;
+    }
+  </style>
+</head>
+
+<body>
+  <div id="swagger-ui"></div>
+
+  <script src="./swagger-ui-bundle.js" charset="UTF-8"> </script>
+  <script src="./swagger-ui-standalone-preset.js" charset="UTF-8"> </script>
+  <script>
+    window.onload = function() {
+      // Begin Swagger UI call region
+      const ui = SwaggerUIBundle({
+        url: "/openapi.json",
+        dom_id: '#swagger-ui',
+        deepLinking: true,
+        presets: [
+          SwaggerUIBundle.presets.apis,
+          SwaggerUIStandalonePreset
+        ],
+        plugins: [
+          SwaggerUIBundle.plugins.DownloadUrl
+        ],
+        layout: "StandaloneLayout"
+      });
+      // End Swagger UI call region
+      window.ui = ui;
+    };
+  </script>
+</body>
+</html>
+`;
+
+fs.writeFileSync(path.join(targetDir, 'index.html'), indexHtml);
+
+console.log('Swagger UI files generated successfully in static/swagger-ui/');
--- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md
+++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md
@@ -52,7 +52,7 @@ LLM_API_KEY="sk_test_12345"
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -61,7 +61,7 @@ docker run -it \
    -v /var/run/docker.sock:/var/run/docker.sock \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.32 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.36 \
    python -m openhands.core.cli
 ```

--- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md
+++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md
@@ -46,7 +46,7 @@ LLM_API_KEY="sk_test_12345"
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -56,6 +56,6 @@ docker run -it \
    -v /var/run/docker.sock:/var/run/docker.sock \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.32 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.36 \
    python -m openhands.core.main -t "write a bash script that prints hi" --no-auto-continue
 ```
--- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/installation.mdx
+++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/installation.mdx
@@ -13,16 +13,16 @@
 La façon la plus simple d'exécuter OpenHands est avec Docker.

 ```bash
-docker pull docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik

 docker run -it --rm --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik \
    -e LOG_ALL_EVENTS=true \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app \
-    docker.all-hands.dev/all-hands-ai/openhands:0.32
+    docker.all-hands.dev/all-hands-ai/openhands:0.36
 ```

 Vous pouvez également exécuter OpenHands en mode [headless scriptable](https://docs.all-hands.dev/modules/usage/how-to/headless-mode), en tant que [CLI interactive](https://docs.all-hands.dev/modules/usage/how-to/cli-mode), ou en utilisant l'[Action GitHub OpenHands](https://docs.all-hands.dev/modules/usage/how-to/github-action).
--- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/runtimes.md
+++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/runtimes.md
@@ -13,7 +13,7 @@ C'est le Runtime par défaut qui est utilisé lorsque vous démarrez OpenHands.

 ```
 docker run # ...
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik \
    -v /var/run/docker.sock:/var/run/docker.sock \
    # ...
 ```
--- a/docs/i18n/ja/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md
+++ b/docs/i18n/ja/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md
@@ -34,7 +34,7 @@ Docker で OpenHands を CLI モードで実行するには:
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -44,7 +44,7 @@ docker run -it \
    -v ~/.openhands-state:/.openhands-state \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.32 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.36 \
    python -m openhands.core.cli
 ```

--- a/docs/i18n/ja/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md
+++ b/docs/i18n/ja/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md
@@ -31,7 +31,7 @@ DockerでOpenHandsをヘッドレスモードで実行するには:
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -42,7 +42,7 @@ docker run -it \
    -v ~/.openhands-state:/.openhands-state \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.32 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.36 \
    python -m openhands.core.main -t "write a bash script that prints hi"
 ```

--- a/docs/i18n/ja/docusaurus-plugin-content-docs/current/usage/prompting/microagents-overview.md
+++ b/docs/i18n/ja/docusaurus-plugin-content-docs/current/usage/prompting/microagents-overview.md
@@ -13,7 +13,7 @@ OpenHandsがリポジトリで動作する際:

 1. リポジトリに`.openhands/microagents/`が存在する場合、そこからリポジトリ固有の指示を読み込みます。
 2. 会話のキーワードによってトリガーされる一般的なガイドラインを読み込みます。
-現在の[パブリックMicroagents](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents/knowledge)を参照してください。
+現在の[パブリックMicroagents](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents)を参照してください。

 ## Microagentのフォーマット

--- a/docs/i18n/ja/docusaurus-plugin-content-docs/current/usage/prompting/microagents-public.md
+++ b/docs/i18n/ja/docusaurus-plugin-content-docs/current/usage/prompting/microagents-public.md
@@ -88,4 +88,4 @@ triggers:
 - ビルド時間とイメージサイズを最適化
 ```

-より多くの例については、[現在のパブリックマイクロエージェント](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents/knowledge)をご覧ください。
+より多くの例については、[現在のパブリックマイクロエージェント](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents)をご覧ください。
--- a/docs/i18n/ja/docusaurus-plugin-content-docs/current/usage/runtimes/docker.md
+++ b/docs/i18n/ja/docusaurus-plugin-content-docs/current/usage/runtimes/docker.md
@@ -25,7 +25,7 @@ nikolaik の `SANDBOX_RUNTIME_CONTAINER_IMAGE` は、ランタイムサーバー

    ```bash
    docker run # ...
-        -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik \
+        -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik \
        -e SANDBOX_USER_ID=$(id -u) \
        -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
        -v $WORKSPACE_BASE:/opt/workspace_base \
@@ -82,5 +82,5 @@ docker network create openhands-network
 # 分離されたネットワークで OpenHands を実行
 docker run # ... \
    --network openhands-network \
-    docker.all-hands.dev/all-hands-ai/openhands:0.32
+    docker.all-hands.dev/all-hands-ai/openhands:0.36
 ```
--- a/docs/i18n/pt-BR/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md
+++ b/docs/i18n/pt-BR/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md
@@ -35,7 +35,7 @@ Para executar o OpenHands no modo CLI com Docker:
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -45,7 +45,7 @@ docker run -it \
    -v ~/.openhands-state:/.openhands-state \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.32 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.36 \
    python -m openhands.core.cli
 ```

--- a/docs/i18n/pt-BR/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md
+++ b/docs/i18n/pt-BR/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md
@@ -32,7 +32,7 @@ Para executar o OpenHands no modo Headless com Docker:
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -43,7 +43,7 @@ docker run -it \
    -v ~/.openhands-state:/.openhands-state \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.32 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.36 \
    python -m openhands.core.main -t "escreva um script bash que imprima oi"
 ```

--- a/docs/i18n/pt-BR/docusaurus-plugin-content-docs/current/usage/installation.mdx
+++ b/docs/i18n/pt-BR/docusaurus-plugin-content-docs/current/usage/installation.mdx
@@ -58,17 +58,17 @@
 A maneira mais fácil de executar o OpenHands é no Docker.

 ```bash
-docker pull docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik

 docker run -it --rm --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik \
    -e LOG_ALL_EVENTS=true \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -v ~/.openhands-state:/.openhands-state \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app \
-    docker.all-hands.dev/all-hands-ai/openhands:0.32
+    docker.all-hands.dev/all-hands-ai/openhands:0.36
 ```

 Você encontrará o OpenHands em execução em http://localhost:3000!
--- a/docs/i18n/pt-BR/docusaurus-plugin-content-docs/current/usage/prompting/microagents-overview.md
+++ b/docs/i18n/pt-BR/docusaurus-plugin-content-docs/current/usage/prompting/microagents-overview.md
@@ -13,7 +13,7 @@ Quando o OpenHands trabalha com um repositório, ele:

 1. Carrega instruções específicas do repositório de `.openhands/microagents/`, se presentes no repositório.
 2. Carrega diretrizes gerais acionadas por palavras-chave nas conversas.
-Veja os [Microagentes Públicos](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents/knowledge) atuais.
+Veja os [Microagentes Públicos](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents) atuais.

 ## Formato do Microagente

--- a/docs/i18n/pt-BR/docusaurus-plugin-content-docs/current/usage/prompting/microagents-public.md
+++ b/docs/i18n/pt-BR/docusaurus-plugin-content-docs/current/usage/prompting/microagents-public.md
@@ -4,7 +4,7 @@

 Microagentes públicos são diretrizes especializadas acionadas por palavras-chave para todos os usuários do OpenHands.
 Eles são definidos em arquivos markdown no diretório
-[`microagents/knowledge/`](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents/knowledge).
+[`microagents/`](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents).

 Microagentes públicos:
 - Monitoram comandos recebidos em busca de suas palavras-chave de acionamento.
@@ -15,7 +15,7 @@ Microagentes públicos:
 ## Microagentes Públicos Atuais

 Para mais informações sobre microagentes específicos, consulte seus arquivos de documentação individuais no
-diretório [`microagents/knowledge/`](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents/knowledge/).
+diretório [`microagents/`](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents/).

 ### Agente GitHub
 **Arquivo**: `github.md`
@@ -59,7 +59,7 @@ yes | npm install package-name
 ## Contribuindo com um Microagente Público

 Você pode criar seus próprios microagentes públicos adicionando novos arquivos markdown ao
-diretório [`microagents/knowledge/`](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents/knowledge/).
+diretório [`microagents/`](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents/).

 ### Melhores Práticas para Microagentes Públicos

@@ -81,7 +81,7 @@ Antes de criar um microagente público, considere:

 #### 2. Crie o Arquivo

-Crie um novo arquivo markdown em [`microagents/knowledge/`](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents/knowledge/)
+Crie um novo arquivo markdown em [`microagents/`](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents/)
 com um nome descritivo (por exemplo, `docker.md` para um agente focado em Docker).

 Atualize o arquivo com o frontmatter necessário [de acordo com o formato exigido](./microagents-overview#microagent-format)
@@ -149,5 +149,5 @@ Lembre-se de:
 - Otimizar para tempo de build e tamanho da imagem
 ```

-Veja os [microagentes públicos atuais](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents/knowledge) para
+Veja os [microagentes públicos atuais](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents) para
 mais exemplos.
--- a/docs/i18n/pt-BR/docusaurus-plugin-content-docs/current/usage/runtimes.md
+++ b/docs/i18n/pt-BR/docusaurus-plugin-content-docs/current/usage/runtimes.md
@@ -13,7 +13,7 @@ Este é o Runtime padrão que é usado quando você inicia o OpenHands. Você po

 ```
 docker run # ...
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik \
    -v /var/run/docker.sock:/var/run/docker.sock \
    # ...
 ```
--- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md
+++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md
@@ -50,7 +50,7 @@ LLM_API_KEY="sk_test_12345"
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -59,7 +59,7 @@ docker run -it \
    -v /var/run/docker.sock:/var/run/docker.sock \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.32 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.36 \
    python -m openhands.core.cli
 ```

--- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md
+++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md
@@ -47,7 +47,7 @@ LLM_API_KEY="sk_test_12345"
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -57,6 +57,6 @@ docker run -it \
    -v /var/run/docker.sock:/var/run/docker.sock \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.32 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.36 \
    python -m openhands.core.main -t "write a bash script that prints hi" --no-auto-continue
 ```
--- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/installation.mdx
+++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/installation.mdx
@@ -11,16 +11,16 @@
 在 Docker 中运行 OpenHands 是最简单的方式。

 ```bash
-docker pull docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik

 docker run -it --rm --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik \
    -e LOG_ALL_EVENTS=true \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app \
-    docker.all-hands.dev/all-hands-ai/openhands:0.32
+    docker.all-hands.dev/all-hands-ai/openhands:0.36
 ```

 你也可以在可脚本化的[无头模式](https://docs.all-hands.dev/modules/usage/how-to/headless-mode)下运行 OpenHands，作为[交互式 CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode)，或使用 [OpenHands GitHub Action](https://docs.all-hands.dev/modules/usage/how-to/github-action)。
--- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/runtimes.md
+++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/runtimes.md
@@ -11,7 +11,7 @@

 ```
 docker run # ...
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik \
    -v /var/run/docker.sock:/var/run/docker.sock \
    # ...
 ```
--- a/docs/modules/usage/cloud/openhands-cloud.mdx
+++ b/docs/modules/usage/cloud/openhands-cloud.mdx
@@ -8,18 +8,22 @@ OpenHands Cloud can be accessed at https://app.all-hands.dev/.

 ## Getting Started

-After visiting OpenHands Cloud, you will be asked to connect with your GitHub account:
-1. After reading and accepting the terms of service, click `Connect to GitHub`.
+After visiting OpenHands Cloud, you will be asked to connect with your GitHub or GitLab account:
+
+1. After reading and accepting the terms of service, click `Log in with GitHub` or `Log in with GitLab`.
 2. Review the permissions requested by OpenHands and then click `Authorize OpenHands AI`.
-   - OpenHands will require some permissions from your GitHub account. To read more about these permissions,
-     you can click the `Learn more` link on the GitHub authorize page.
+   - OpenHands will require some permissions from your GitHub or GitLab account. To read more about these permissions:
+     - GitHub: You can click the `Learn more` link on the GitHub authorize page.
+     - GitLab: You can expand each permission request on the GitLab authorize page.

 ## Repository Access

-### Adding Repository Access
+### GitHub
+
+#### Adding Repository Access

 You can grant OpenHands specific repository access:
-1. Click the `Select a GitHub project` dropdown, select `Add more repositories...`.
+1. Click `Add GitHub repos` on the Home page.
 2. Select the organization, then choose the specific repositories to grant OpenHands access to.
   <details>
     <summary>Permission Details for Repository Access</summary>
@@ -42,11 +46,15 @@ You can grant OpenHands specific repository access:

 3. Click on `Install & Authorize`.

-### Modifying Repository Access
+#### Modifying Repository Access

-You can modify repository access at any time by:
-* Using the same `Select a GitHub project > Add more repositories` workflow, or
-* Visiting the Settings page and selecting `Configure GitHub Repositories` under the `GitHub Settings` section.
+You can modify GitHub repository access at any time by:
+* Using the same `Add GitHub repos` workflow, or
+* Visiting the Settings page and selecting `Configure GitHub Repositories` under the `Git Settings` section.
+
+### GitLab
+
+When using your GitLab account, OpenHands will automatically have access to your repositories.

 ## Conversation Persistence

--- a/docs/modules/usage/customization/repository.md
+++ b/docs/modules/usage/customization/repository.md
@@ -1,18 +1,17 @@
 # Repository Customization

-You can customize how OpenHands works with your repository by creating a
+You can customize how OpenHands interacts with your repository by creating a
 `.openhands` directory at the root level.

 ## Microagents
-You can use microagents to extend the OpenHands prompts with information
-about your project and how you want OpenHands to work. See
-[Repository Microagents](../prompting/microagents-repo) for more information.
+
+Microagents allow you to extend OpenHands prompts with information specific to your project and define how OpenHands
+should function. See [Microagents Overview](../prompting/microagents-overview) for more information.


 ## Setup Script
-You can add `.openhands/setup.sh`, which will be run every time OpenHands begins
-working with your repository. This is a good place to install dependencies, set
-environment variables, etc.
+You can add a `.openhands/setup.sh` file, which will run every time OpenHands begins working with your repository.
+This is an ideal location for installing dependencies, setting environment variables, and performing other setup tasks.

 For example:
 ```bash
--- a/docs/modules/usage/how-to/cli-mode.md
+++ b/docs/modules/usage/how-to/cli-mode.md
@@ -35,7 +35,7 @@ To run OpenHands in CLI mode with Docker:
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -45,7 +45,7 @@ docker run -it \
    -v ~/.openhands-state:/.openhands-state \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.32 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.36 \
    python -m openhands.core.cli
 ```

--- a/docs/modules/usage/how-to/custom-sandbox-guide.md
+++ b/docs/modules/usage/how-to/custom-sandbox-guide.md
@@ -1,7 +1,8 @@
 # Custom Sandbox

 :::note
-This guide is for users that would like to use their own custom Docker image for the runtime, e.g. with certain tools or programming languages pre-installed
+This guide is for users that would like to use their own custom Docker image for the runtime, for example,
+one with certain tools or programming languages pre-installed.
 :::

 The sandbox is where the agent performs its tasks. Instead of running commands directly on your computer
--- a/docs/modules/usage/how-to/gui-mode.md
+++ b/docs/modules/usage/how-to/gui-mode.md
@@ -24,9 +24,8 @@ OpenHands supports multiple version control providers. You can configure tokens

 #### GitHub Token Setup

-OpenHands automatically exports a `GITHUB_TOKEN` to the shell environment if it is available. This can happen in two ways:
+OpenHands automatically exports a `GITHUB_TOKEN` to the shell environment if provided:

-**Local Installation**: The user directly inputs their GitHub token.
 <details>
  <summary>Setting Up a GitHub Token</summary>

@@ -40,9 +39,8 @@ OpenHands automatically exports a `GITHUB_TOKEN` to the shell environment if it
     - Minimal Permissions ( Select `Meta Data = Read-only` read for search, `Pull Requests = Read and Write` and `Content = Read and Write` for branch creation)
  2. **Enter Token in OpenHands**:
   - Click the Settings button (gear icon).
-   - Navigate to the `Git Provider Settings` section.
   - Paste your token in the `GitHub Token` field.
-   - Click `Save Changes` to apply the changes.
+   - Click `Save` to apply the changes.
 </details>

 <details>
@@ -83,26 +81,9 @@ OpenHands automatically exports a `GITHUB_TOKEN` to the shell environment if it
     - Check the browser console for any error messages.
 </details>

-**OpenHands Cloud**: The token is obtained through GitHub OAuth authentication.
-
-<details>
-  <summary>OAuth Authentication</summary>
-
-  When using OpenHands Cloud, the GitHub OAuth flow requests the following permissions:
-   - Repository access (read/write)
-   - Workflow management
-   - Organization read access
-
-  To authenticate OpenHands:
-   - Click `Sign in with GitHub` when prompted.
-   - Review the requested permissions.
-   - Authorize OpenHands to access your GitHub account.
-   - If using an organization, authorize organization access if prompted.
-</details>
-
 #### GitLab Token Setup

-OpenHands automatically exports a `GITLAB_TOKEN` to the shell environment, for local installations only, if it is available.
+OpenHands automatically exports a `GITLAB_TOKEN` to the shell environment if provided:

 <details>
  <summary>Setting Up a GitLab Token</summary>
@@ -117,10 +98,9 @@ OpenHands automatically exports a `GITLAB_TOKEN` to the shell environment, for l
   - Set an expiration date or leave it blank for a non-expiring token.
  2. **Enter Token in OpenHands**:
   - Click the Settings button (gear icon).
-   - Navigate to the `Git Provider Settings` section.
   - Paste your token in the `GitLab Token` field.
   - Enter your GitLab instance URL if using self-hosted GitLab.
-   - Click `Save Changes` to apply the changes.
+   - Click `Save` to apply the changes.
 </details>

 <details>
@@ -156,7 +136,6 @@ OpenHands automatically exports a `GITLAB_TOKEN` to the shell environment, for l
 ## Tips for Effective Use

 - Be specific in your requests to get the most accurate and helpful responses, as described in the [prompting best practices](../prompting/prompting-best-practices).
- Use the workspace panel to explore your project structure.
 - Use one of the recommended models, as described in the [LLMs section](usage/llms/llms.md).

 Remember, the GUI mode of OpenHands is designed to make your interaction with the AI assistant as smooth and intuitive
--- a/docs/modules/usage/how-to/headless-mode.md
+++ b/docs/modules/usage/how-to/headless-mode.md
@@ -32,7 +32,7 @@ To run OpenHands in Headless mode with Docker:
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -43,7 +43,7 @@ docker run -it \
    -v ~/.openhands-state:/.openhands-state \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.32 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.36 \
    python -m openhands.core.main -t "write a bash script that prints hi"
 ```

--- a/docs/modules/usage/installation.mdx
+++ b/docs/modules/usage/installation.mdx
@@ -58,17 +58,17 @@ A system with a modern processor and a minimum of **4GB RAM** is recommended to
 The easiest way to run OpenHands is in Docker.

 ```bash
-docker pull docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik

 docker run -it --rm --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.32-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.36-nikolaik \
    -e LOG_ALL_EVENTS=true \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -v ~/.openhands-state:/.openhands-state \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app \
-    docker.all-hands.dev/all-hands-ai/openhands:0.32
+    docker.all-hands.dev/all-hands-ai/openhands:0.36
 ```

 You'll find OpenHands running at http://localhost:3000!
--- a/docs/modules/usage/key-features.md
+++ b/docs/modules/usage/key-features.md
@@ -6,23 +6,24 @@
 - Displays the conversation between the user and OpenHands.
 - OpenHands explains its actions in this panel.

-### Workspace
- Browse project files and directories.
- Use the `Open in VS Code` option to:
-  * Modify files
-  * Upload and download files
+### Changes
+- Shows the file changes performed by OpenHands.
+
+### VS Code
+- Embedded VS Code for browsing and modifying files.
+- Can also be used to upload and download files.
+
+### Terminal
+- A space for OpenHands and users to run terminal commands.

 ### Jupyter
 - Shows all Python commands that were executed by OpenHands.
 - Particularly handy when using OpenHands to perform data visualization tasks.

 ### App
- Shows the web server when OpenHands runs an application.
+- Displays the web server when OpenHands runs an application.
 - Users can interact with the running application.

 ### Browser
 - Used by OpenHands to browse websites.
 - The browser is non-interactive.
-
-### Terminal
- A space for OpenHands and users to run terminal commands.
--- a/docs/modules/usage/llms/llms.md
+++ b/docs/modules/usage/llms/llms.md
@@ -17,6 +17,8 @@ Based on these findings and community feedback, the following models have been v
 - [gemini/gemini-2.5-pro](https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/)
 - [deepseek/deepseek-chat](https://api-docs.deepseek.com/)
 - [openai/o3-mini](https://openai.com/index/openai-o3-mini/)
+- [openai/o3](https://openai.com/index/introducing-o3-and-o4-mini/)
+- [openai/o4-mini](https://openai.com/index/introducing-o3-and-o4-mini/)
 - [all-hands/openhands-lm-32b-v0.1](https://www.all-hands.dev/blog/introducing-openhands-lm-32b----a-strong-open-coding-agent-model) -- available through [OpenRouter](https://openrouter.ai/all-hands/openhands-lm-32b-v0.1)


--- a/docs/modules/usage/llms/local-llms.md
+++ b/docs/modules/usage/llms/local-llms.md
@@ -15,7 +15,7 @@ It is highly recommended that you use GPUs to serve local models for optimal exp
 For example, to download [OpenHands LM 32B v0.1](https://huggingface.co/all-hands/openhands-lm-32b-v0.1):

 ```bash
-huggingface-cli download all-hands/openhands-lm-32b-v0.1 --local-dir my_folder/openhands-lm-32b-v0.1
+huggingface-cli download all-hands/openhands-lm-32b-v0.1 --local-dir all-hands/openhands-lm-32b-v0.1
 ```

 ## Create an OpenAI-Compatible Endpoint With a Model Serving Framework
@@ -27,7 +27,7 @@ huggingface-cli download all-hands/openhands-lm-32b-v0.1 --local-dir my_folder/o

 ```bash
 SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 python3 -m sglang.launch_server \
-    --model my_folder/openhands-lm-32b-v0.1 \
+    --model all-hands/openhands-lm-32b-v0.1 \
    --served-model-name openhands-lm-32b-v0.1 \
    --port 8000 \
    --tp 2 --dp 1 \
@@ -41,7 +41,7 @@ SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 python3 -m sglang.launch_server \
 - Example launch command for OpenHands LM 32B (with at least 2 GPUs):

 ```bash
-vllm serve my_folder/openhands-lm-32b-v0.1 \
+vllm serve all-hands/openhands-lm-32b-v0.1 \
    --host 0.0.0.0 --port 8000 \
    --api-key mykey \
    --tensor-parallel-size 2 \
@@ -67,7 +67,7 @@ Ensure `config.toml` exists by running `make setup-config` which will create one
 workspace_base="/path/to/your/workspace"

 [llm]
-embedding_model="local"
+model="openhands-lm-32b-v0.1"
 ollama_base_url="http://localhost:8000"
 ```

--- a/docs/modules/usage/mcp.md
+++ b/docs/modules/usage/mcp.md
@@ -0,0 +1,96 @@
+# Model Context Protocol (MCP)
+
+:::note
+This page outlines how to configure and use the Model Context Protocol (MCP) in OpenHands, allowing you to extend the agent's capabilities with custom tools.
+:::
+
+## Overview
+
+Model Context Protocol (MCP) is a mechanism that allows OpenHands to communicate with external tool servers. These servers can provide additional functionality to the agent, such as specialized data processing, external API access, or custom tools. MCP is based on the open standard defined at [modelcontextprotocol.io](https://modelcontextprotocol.io).
+
+## Configuration
+
+MCP configuration is defined in the `[mcp]` section of your `config.toml` file.
+
+### Configuration Example
+
+```toml
+[mcp]
+# SSE Servers - External servers that communicate via Server-Sent Events
+sse_servers = [
+    # Basic SSE server with just a URL
+    "http://example.com:8080/mcp",
+
+    # SSE server with API key authentication
+    {url="https://secure-example.com/mcp", api_key="your-api-key"}
+]
+
+# Stdio Servers - Local processes that communicate via standard input/output
+stdio_servers = [
+    # Basic stdio server
+    {name="fetch", command="uvx", args=["mcp-server-fetch"]},
+
+    # Stdio server with environment variables
+    {
+        name="data-processor",
+        command="python",
+        args=["-m", "my_mcp_server"],
+        env={
+            "DEBUG": "true",
+            "PORT": "8080"
+        }
+    }
+]
+```
+
+## Configuration Options
+
+### SSE Servers
+
+SSE servers are configured using either a string URL or an object with the following properties:
+
+- `url` (required)
+  - Type: `str`
+  - Description: The URL of the SSE server
+
+- `api_key` (optional)
+  - Type: `str`
+  - Default: `None`
+  - Description: API key for authentication with the SSE server
+
+### Stdio Servers
+
+Stdio servers are configured using an object with the following properties:
+
+- `name` (required)
+  - Type: `str`
+  - Description: A unique name for the server
+
+- `command` (required)
+  - Type: `str`
+  - Description: The command to run the server
+
+- `args` (optional)
+  - Type: `list of str`
+  - Default: `[]`
+  - Description: Command-line arguments to pass to the server
+
+- `env` (optional)
+  - Type: `dict of str to str`
+  - Default: `{}`
+  - Description: Environment variables to set for the server process
+
+## How MCP Works
+
+When OpenHands starts, it:
+
+1. Reads the MCP configuration from `config.toml`
+2. Connects to any configured SSE servers
+3. Starts any configured stdio servers
+4. Registers the tools provided by these servers with the agent
+
+The agent can then use these tools just like any built-in tool. When the agent calls an MCP tool:
+
+1. OpenHands routes the call to the appropriate MCP server
+2. The server processes the request and returns a response
+3. OpenHands converts the response to an observation and presents it to the agent
--- a/docs/modules/usage/prompting/microagents-keyword.md
+++ b/docs/modules/usage/prompting/microagents-keyword.md
@@ -0,0 +1,38 @@
+# Keyword-Triggered Microagents
+
+## Purpose
+
+Keyword-triggered microagents provide OpenHands with specific instructions that are activated when certain keywords
+appear in the prompt. This is useful for tailoring behavior based on particular tools, languages, or frameworks.
+
+## Usage
+
+These microagents are only loaded when a prompt includes one of the trigger words.
+
+## Frontmatter Syntax
+
+Frontmatter is required for keyword-triggered microagents. It must be placed at the top of the file,
+above the guidelines.
+
+Enclose the frontmatter in triple dashes (---) and include the following fields:
+
+| Field      | Description                                      | Required | Default          |
+|------------|--------------------------------------------------|----------|------------------|
+| `triggers` | A list of keywords that activate the microagent. | Yes      | None             |
+| `agent`    | The agent this microagent applies to.            | No       | 'CodeActAgent'   |
+
+
+## Example
+
+Keyword-triggered microagent file example located at `.openhands/microagents/yummy.md`:
+```
+---
+triggers:
+- yummyhappy
+- happyyummy
+---
+
+The user has said the magic word. Respond with "That was delicious!"
+```
+
+[See examples of microagents triggered by keywords in the official OpenHands repository](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents)
--- a/docs/modules/usage/prompting/microagents-overview.md
+++ b/docs/modules/usage/prompting/microagents-overview.md
@@ -1,31 +1,40 @@
 # Microagents Overview

-Microagents are specialized prompts that enhance OpenHands with domain-specific knowledge, repository-specific context
-and task-specific workflows. They help by providing expert guidance, automating common tasks, and ensuring
-consistent practices across projects.
+Microagents are specialized prompts that enhance OpenHands with domain-specific knowledge.
+They provide expert guidance, automate common tasks, and ensure consistent practices across projects.

-## Microagent Categories
+## Microagent Types

-Currently OpenHands supports two categories of microagents:
+Currently, OpenHands supports the following types of microagents:

- [Repository-specific Microagents](./microagents-repo): Repository-specific context and guidelines for OpenHands.
- [Public Microagents](./microagents-public): General guidelines triggered by keywords for all OpenHands users.
+- [General Repository Microagents](./microagents-repo): General guidelines for OpenHands about the repository.
+- [Keyword-Triggered Microagents](./microagents-keyword): Guidelines activated by specific keywords in prompts.

-A microagent is classified as repository-specific or public depending on its location:
+To customize OpenHands' behavior, create a `.openhands/microagents/` directory in the root of your repository and
+add `<microagent_name>.md` files inside.

- Repository-specific microagents are located in a repository's `.openhands/microagents/` directory
- Public microagents are located in the official OpenHands repository inside the `/microagents` folder
+:::note
+Loaded microagents take up space in the context window.
+These microagents, alongside user messages, inform OpenHands about the task and the environment.
+:::

-When OpenHands works with a repository, it:
+Example repository structure:

-1. Loads **repository-specific** microagents from `.openhands/microagents/` if present in the repository.
-2. Loads **public knowledge** microagents triggered by keywords in conversations
-3. Loads **public tasks** microagents when explicitly requested by the user
+```
+some-repository/
+└── .openhands/
+    └── microagents/
+        ├── repo.md            # General repository guidelines
+        ├── trigger_this.md    # Microagent triggered by specific keywords
+        └── trigger_that.md    # Microagent triggered by specific keywords
+```

-You can check out the existing public microagents at the [official OpenHands repository](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents/).
+## Microagents Frontmatter Requirements

-## Microagent Format
+Each microagent file may include frontmatter that provides additional information. In some cases, this frontmatter
+is required:

-All microagents use markdown files with YAML frontmatter that have special instructions to help OpenHands activate them.
-
-Check out the [syntax documentation](./microagents-syntax) for a comprehensive guide on how to configure your microagents.
+| Microagent Type                  | Required |
+|----------------------------------|----------|
+| `General Repository Microagents` | No       |
+| `Keyword-Triggered Microagents`  | Yes      |
--- a/docs/modules/usage/prompting/microagents-public.md
+++ b/docs/modules/usage/prompting/microagents-public.md
@@ -1,35 +1,17 @@
-# Public Microagents
+# Global Microagents

 ## Overview

-Public microagents provide specialized context and capabilities for all OpenHands users, regardless of their repository configuration. Unlike repository-specific microagents, public microagents are globally available across all repositories.
+Global microagents are [keyword-triggered microagents](./microagents-keyword) that apply to all OpenHands users. A list of the current
+global microagents can be found [in the OpenHands repository](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents).

-Public microagents come in two types:
+## Contributing a Global Microagent

- **Knowledge microagents**: Automatically activated when keywords in conversations match their triggers
- **Task microagents**: Explicitly invoked by users to guide through specific workflows
-
-Both types follow the same syntax and structure as repository-specific microagents, using markdown files with YAML frontmatter that define their behavior and capabilities. They are located in the official OpenHands repository under:
-
- [`microagents/knowledge/`](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents/knowledge) for knowledge microagents
- [`microagents/tasks/`](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents/tasks) for task microagents
-
-Public microagents:
-
- Monitor incoming commands for their trigger words.
- Activate when relevant triggers are detected.
- Apply their specialized knowledge and capabilities.
- Follow their specific guidelines and restrictions.
-
-When loading public microagents, OpenHands scans the official repository's microagents directories recursively, processing all markdown files except README.md. The system categorizes each microagent based on its `type` field in the YAML frontmatter, regardless of its exact file location within the knowledge or tasks directories.
-
-## Contributing a Public Microagent
-
-You can create public microagents and share with the community by opening a pull request to the official repository.
+You can create global microagents and share them with the community by opening a pull request to the official repository.

 See the [CONTRIBUTING.md](https://github.com/All-Hands-AI/OpenHands/blob/main/CONTRIBUTING.md) for specific instructions on how to contribute to OpenHands.

-### Public Microagents Best Practices
+### Global Microagents Best Practices

 - **Clear Scope**: Keep the microagent focused on a specific domain or task.
 - **Explicit Instructions**: Provide clear, unambiguous guidelines.
@@ -37,11 +19,11 @@ See the [CONTRIBUTING.md](https://github.com/All-Hands-AI/OpenHands/blob/main/CO
 - **Safety First**: Include necessary warnings and constraints.
 - **Integration Awareness**: Consider how the microagent interacts with other components.

-### Steps to Contribute a Public Microagent
+### Steps to Contribute a Global Microagent

-#### 1. Plan the Public Microagent
+#### 1. Plan the Global Microagent

-Before creating a public microagent, consider:
+Before creating a global microagent, consider:

 - What specific problem or use case will it address?
 - What unique capabilities or knowledge should it have?
@@ -51,23 +33,19 @@ Before creating a public microagent, consider:
 #### 2. Create File

 Create a new Markdown file with a descriptive name in the appropriate directory:
+[`microagents/`](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents)

- [`microagents/knowledge/`](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents/knowledge) for knowledge microagents
- [`microagents/tasks/`](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents/tasks) for task microagents
+#### 3. Testing the Global Microagent

-Ensure it follows the correct [syntax](./microagents-syntax.md) and [best practices](./microagents-syntax.md#markdown-content-best-practices).
-
-#### 3. Testing the Public Microagent
-
- Test the agent with various prompts
- Verify trigger words activate the agent correctly
- Ensure instructions are clear and comprehensive
- Check for potential conflicts and overlaps with existing agents
+- Test the agent with various prompts.
+- Verify trigger words activate the agent correctly.
+- Ensure instructions are clear and comprehensive.
+- Check for potential conflicts and overlaps with existing agents.

 #### 4. Submission Process

 Submit a pull request with:

- The new microagent file
- Updated documentation if needed
- Description of the agent's purpose and capabilities
+- The new microagent file.
+- Updated documentation if needed.
+- Description of the agent's purpose and capabilities.
--- a/docs/modules/usage/prompting/microagents-repo.md
+++ b/docs/modules/usage/prompting/microagents-repo.md
@@ -1,117 +1,31 @@
-# Repository-specific Microagents
+# General Repository Microagents

-## Overview
+## Purpose

-OpenHands can be customized to work more effectively with specific repositories by providing repository-specific context and guidelines.
+General guidelines for OpenHands to work more effectively with the repository.

-This section explains how to optimize OpenHands for your project.
+## Usage

-## Creating Repository Microagents
+These microagents are always loaded as part of the context.

-You can customize OpenHands' behavior for your repository by creating a `.openhands/microagents/` directory in your repository's root.
+## Frontmatter Syntax

-You can enhance OpenHands' performance by adding custom microagents to your repository:
+The frontmatter for this type of microagent is optional.

-1. For overall repository-specific instructions, create a `.openhands/microagents/repo.md` file
-2. For reusable domain knowledge triggered by keywords, add multiple `.md` files to `.openhands/microagents/knowledge/`
-3. For common workflows and tasks, create multiple `.md` files to `.openhands/microagents/tasks/`
+Frontmatter should be enclosed in triple dashes (---) and may include the following fields:

-Check out the [best practices](./microagents-syntax.md#markdown-content-best-practices) for formatting the content of your custom microagent.
+| Field     | Description                             | Required | Default        |
+|-----------|-----------------------------------------|----------|----------------|
+| `agent`   | The agent this microagent applies to    | No       | 'CodeActAgent' |

-Keep in mind that loaded microagents take up space in the context window. It's crucial to strike a balance between the additional context provided by microagents and the instructions provided in the user's inputs.
-
-Note that you can use OpenHands to create new microagents. The public microagent [`add_agent`](https://github.com/All-Hands-AI/OpenHands/blob/main/microagents/knowledge/add_agent.md) is loaded to all OpenHands instance and can support you on this.
-
-## Types of Microagents
-
-OpenHands supports three primary types of microagents, each with specific purposes and features to enhance agent performance:
-
- [repository](#repository-microagents)
- [knowledge](#knowledge-microagents)
- [tasks](#tasks-microagents)
-
-The standard directory structure within a repository is:
-
- One main `repo.md` file containing repository-specific instructions
- Additional `Knowledge` agents in `.openhands/microagents/knowledge/` directory
- Additional `Task` agents in `.openhands/microagents/tasks/` directory
-
-When processing the `.openhands/microagents/` directory, OpenHands will recursively scan all subfolders and process any `.md` files (except `README.md`) it finds. The system determines the microagent type based on the `type` field in the YAML frontmatter, not by the file's location. However, for organizational clarity, it's recommended to follow the standard directory structure.
-
-### Repository Microagents
-
-The `Repository` microagent is loaded specifically from `.openhands/microagents/repo.md` and serves as the main
-repository-specific instruction file. This single file is automatically loaded whenever OpenHands works with that repository
-without requiring any keyword matching or explicit call from the user.
-
-OpenHands does not support multiple `repo.md` files in different locations or multiple microagents with type `repo`.
-
-If you need to organize different types of repository information, the recommended approach is to use a single `repo.md` file with well-structured sections rather than trying to create multiple microagents with the type `repo`.
-
-The best practice is to include project-specific instructions, team practices, coding standards, and architectural guidelines that are relevant for **all** prompts in that repository.
-
-Example structure:
+## Example

+General repository microagent file example located at `.openhands/microagents/repo.md`:
 ```
-your-repository/
-└── .openhands/
-    └── microagents/
-        └── repo.md    # Repository-specific instructions
+This project is a TODO application that allows users to track TODO items.
+
+To set it up, you can run `npm run build`.
+Always make sure the tests are passing before committing changes. You can run the tests by running `npm run test`.
 ```

-[See the example in the official OpenHands repository](https://github.com/All-Hands-AI/OpenHands/blob/main/.openhands/microagents/repo.md?plain=1)
-
-### Knowledge Microagents
-
-Knowledge microagents provide specialized domain expertise:
-
- Recommended to be located in `.openhands/microagents/knowledge/`
- Triggered by specific keywords in conversations
- Contain expertise on tools, languages, frameworks, and common practices
-
-Use knowledge microagents to trigger additional context relevant to specific technologies, tools, or workflows. For example, mentioning "git" in your conversation will automatically trigger git-related expertise to help with Git operations.
-
-Examples structure:
-
-```
-your-repository/
-└── .openhands/
-    └── microagents/
-        └── knowledge/
-            └── git.md
-            └── docker.md
-            └── python.md
-            └── ...
-        └── repo.md
-```
-
-You can find several real examples of `Knowledge` microagents in the [offical OpenHands repository](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents/knowledge)
-
-### Tasks Microagents
-
-Task microagents guide users through interactive workflows:
-
- Recommended to be located in `.openhands/microagents/tasks/`
- Provide step-by-step processes for common development tasks
- Accept inputs and adapt to different scenarios
- Ensure consistent outcomes for complex operations
-
-Task microagents are a convenient way to store multi-step processes you perform regularly. For instance, you can create a `update_pr_description.md` microagent to automatically generate better pull request descriptions based on code changes.
-
-Examples structure:
-
-```
-your-repository/
-└── .openhands/
-    └── microagents/
-        └── tasks/
-            └── update_pr_description.md
-            └── address_pr_comments.md
-            └── get_test_to_pass.md
-            └── ...
-        └── knowledge/
-            └── ...
-        └── repo.md
-```
-
-You can find several real examples of `Tasks` microagents in the [offical OpenHands repository](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents/tasks)
+[See more examples of general repository microagents here.](https://github.com/All-Hands-AI/OpenHands/tree/main/.openhands/microagents)
--- a/docs/modules/usage/prompting/microagents-syntax.md
+++ b/docs/modules/usage/prompting/microagents-syntax.md
@@ -1,128 +0,0 @@
-# Microagents Syntax
-
-Microagents are defined using markdown files with YAML frontmatter that specify their behavior, triggers, and capabilities.
-
-Find below a comprehensive description of the frontmatter syntax and other details about how to use each type of microagent available at OpenHands.
-
-## Frontmatter Schema
-
-Every microagent requires a YAML frontmatter section at the beginning of the file, enclosed by triple dashes (`---`). The fields are:
-
-| Field      | Description                                        | Required                 | Used By          |
-| ---------- | -------------------------------------------------- | ------------------------ | ---------------- |
-| `name`     | Unique identifier for the microagent               | Yes                      | All types        |
-| `type`     | Type of microagent: `repo`, `knowledge`, or `task` | Yes                      | All types        |
-| `version`  | Version number (Semantic versioning recommended)   | Yes                      | All types        |
-| `agent`    | The agent type (typically `CodeActAgent`)          | Yes                      | All types        |
-| `author`   | Creator of the microagent                          | No                       | All types        |
-| `triggers` | List of keywords that activate the microagent      | Yes for knowledge agents | Knowledge agents |
-| `inputs`   | Defines required user inputs for task execution    | Yes for task agents      | Task agents      |
-
-## Core Fields
-
-### `agent`
-
-**Purpose**: Specifies which agent implementation processes the microagent (typically `CodeActAgent`).
-
- Defines a single agent responsible for processing the microagent
- Must be available in the OpenHands system (see the [agent hub](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/agenthub))
- If the specified agent is not active, the microagent will not be used
-
-### `triggers`
-
-**Purpose**: Defines keywords that activate the `knowledge` microagent.
-
-**Example**:
-
-```yaml
-triggers:
-  - kubernetes
-  - k8s
-  - docker
-  - security
-  - containers cluster
-```
-
-**Key points**:
-
- Can include both single words and multi-word phrases
- Case-insensitive matching is typically used
- More specific triggers (like "docker compose") prevent false activations
- Multiple triggers increase the chance of activation in relevant contexts
- Unique triggers like "flarglebargle" can be used for testing or special functionality
- Triggers should be carefully chosen to avoid unwanted activations or conflicts with other microagents
- Common terms used in many conversations may cause the microagent to be activated too frequently
-
-When using multiple triggers, the microagent will be activated if any of the trigger words or phrases appear in the
-conversation.
-
-### `inputs`
-
-**Purpose**: Defines parameters required from the user when a `task` microagent is activated.
-
-**Schema**:
-
-```yaml
-inputs:
-  - name: INPUT_NAME # Used with {{ INPUT_NAME }}
-    description: 'Description of what this input is for'
-    required: true # Optional, defaults to true
-```
-
-**Key points**:
-
- The `name` and `description` properties are required for each input
- The `required` property is optional and defaults to `true`
- Input values are referenced in the microagent body using double curly braces (e.g., `{{ INPUT_NAME }}`)
- All inputs defined will be collected from the user before the task microagent executes
-
-**Variable Usage**: Reference input values using double curly braces `{{ INPUT_NAME }}`.
-
-## Example Formats
-
-### Repository Microagent
-
-Repository microagents provide context and guidelines for a specific repository.
-
- Located at: `.openhands/microagents/repo.md`
- Automatically loaded when working with the repository
- Only one per repository
-
-The `Repository` microagent is loaded specifically from `.openhands/microagents/repo.md` and serves as the main
-repository-specific instruction file. This single file is automatically loaded whenever OpenHands works with that repository
-without requiring any keyword matching or explicit call from the user.
-
-[See the example in the official OpenHands repository](https://github.com/All-Hands-AI/OpenHands/blob/main/.openhands/microagents/repo.md?plain=1)
-
-### Knowledge Microagent
-
-Provides specialized domain expertise triggered by keywords.
-
-You can find several real examples of `Knowledge` microagents in the [offical OpenHands repository](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents/knowledge)
-
-### Task Microagent
-
-When explicitly asked by the user, will guide through interactive workflows with specific inputs.
-
-You can find several real examples of `Tasks` microagents in the [offical OpenHands repository](https://github.com/All-Hands-AI/OpenHands/tree/main/microagents/tasks)
-
-## Markdown Content Best Practices
-
-After the frontmatter, compose the microagent body using Markdown syntax. Examples of elements you can include are:
-
- Clear, concise instructions outlining the microagent's purpose and responsibilities
- Specific guidelines and constraints the microagent should adhere to
- Relevant code snippets and practical examples to illustrate key points
- Step-by-step procedures for task agents, guiding users through workflows
-
-**Design Tips**:
-
- Keep microagents focused with a clear purpose
- Provide specific guidelines rather than general advice
- Use distinctive triggers for knowledge agents
- Keep content concise to minimize context window usage
- Break large microagents into smaller, focused ones
-
-Aim for clarity, brevity, and practicality in your writing. Use formatting like bullet points, code blocks, and emphasis to enhance readability and comprehension.
-
-Remember that balancing microagents details with user input space is important for maintaining effective interactions.
--- a/docs/package-lock.json
+++ b/docs/package-lock.json
--- a/docs/package.json
+++ b/docs/package.json
@@ -4,23 +4,27 @@
  "private": true,
  "scripts": {
    "docusaurus": "docusaurus",
-    "start": "docusaurus start",
-    "build": "docusaurus build",
+    "start": "node generate-swagger-ui.js && docusaurus start",
+    "build": "node generate-swagger-ui.js && docusaurus build",
    "swizzle": "docusaurus swizzle",
    "deploy": "docusaurus deploy",
    "clear": "docusaurus clear",
    "serve": "docusaurus serve",
    "write-translations": "docusaurus write-translations",
    "write-heading-ids": "docusaurus write-heading-ids",
-    "typecheck": "tsc"
+    "typecheck": "tsc",
+    "generate-swagger-ui": "node generate-swagger-ui.js"
  },
+  "// Note": "The OpenAPI spec is stored in docs/static/openapi.json so it's accessible at /openapi.json in the deployed site",
  "dependencies": {
    "@docusaurus/core": "^3.7.0",
    "@docusaurus/plugin-content-pages": "^3.7.0",
    "@docusaurus/preset-classic": "^3.7.0",
    "@docusaurus/theme-mermaid": "^3.7.0",
    "@mdx-js/react": "^3.1.0",
+    "@node-rs/jieba": "^2.0.1",
    "clsx": "^2.0.0",
+    "docusaurus-lunr-search": "^3.6.0",
    "prism-react-renderer": "^2.4.1",
    "react": "^19.1.0",
    "react-dom": "^19.1.0",
@@ -31,6 +35,8 @@
    "@docusaurus/module-type-aliases": "^3.5.1",
    "@docusaurus/tsconfig": "^3.7.0",
    "@docusaurus/types": "^3.5.1",
+    "swagger-cli": "^4.0.4",
+    "swagger-ui-dist": "^5.21.0",
    "typescript": "~5.8.3"
  },
  "browserslist": {
@@ -47,5 +53,6 @@
  },
  "engines": {
    "node": ">=18.0"
-  }
+  },
+  "packageManager": "npm@10.5.0"
 }
--- a/docs/sidebars.ts
+++ b/docs/sidebars.ts
@@ -66,18 +66,18 @@ const sidebars: SidebarsConfig = {
            },
            {
              type: 'doc',
-              label: 'Repository-specific',
+              label: 'General Repository Microagents',
              id: 'usage/prompting/microagents-repo',
            },
            {
              type: 'doc',
-              label: 'Public',
-              id: 'usage/prompting/microagents-public',
+              label: 'Keyword-Triggered Microagents',
+              id: 'usage/prompting/microagents-keyword',
            },
            {
              type: 'doc',
-              label: 'Syntax',
-              id: 'usage/prompting/microagents-syntax',
+              label: 'Global Microagents',
+              id: 'usage/prompting/microagents-public',
            },
          ],
        },
--- a/docs/src/components/HomepageHeader/HomepageHeader.tsx
+++ b/docs/src/components/HomepageHeader/HomepageHeader.tsx
@@ -45,7 +45,6 @@ export function HomepageHeader() {
        <div align="center" className="header-links">
          <a href="https://github.com/All-Hands-AI/OpenHands/graphs/contributors"><img src="https://img.shields.io/github/contributors/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Contributors" /></a>
          <a href="https://github.com/All-Hands-AI/OpenHands/stargazers"><img src="https://img.shields.io/github/stars/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Stargazers" /></a>
-          <a href="https://codecov.io/github/All-Hands-AI/OpenHands?branch=main"><img alt="CodeCov" src="https://img.shields.io/codecov/c/github/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" /></a>
          <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE"><img src="https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="MIT License" /></a>
          <br/>
          <a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community" /></a>
@@ -53,7 +52,7 @@ export function HomepageHeader() {
          <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/CREDITS.md"><img src="https://img.shields.io/badge/Project-Credits-blue?style=for-the-badge&color=FFE165&logo=github&logoColor=white" alt="Credits" /></a>
          <br/>
          <a href="https://arxiv.org/abs/2407.16741"><img src="https://img.shields.io/badge/Paper%20on%20Arxiv-000?logoColor=FFE165&logo=arxiv&style=for-the-badge" alt="Paper on Arxiv" /></a>
-          <a href="https://huggingface.co/spaces/OpenHands/evaluation"><img src="https://img.shields.io/badge/Benchmark%20score-000?logoColor=FFE165&logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark Score" /></a>
+          <a href="https://docs.google.com/spreadsheets/d/1wOUdFCMyY6Nt0AIqF705KN4JKOWgeI4wUGUP60krXXs/edit?gid=0#gid=0"><img src="https://img.shields.io/badge/Benchmark%20score-000?logoColor=FFE165&logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark Score" /></a>
        </div>
      </div>
    </div>
--- a/docs/static/README.md
+++ b/docs/static/README.md
@@ -0,0 +1,15 @@
+# Static Files for OpenHands Documentation
+
+This directory contains static files that are copied directly to the build output of the Docusaurus documentation.
+
+## OpenAPI Specification
+
+The `openapi.json` file in this directory is the OpenAPI specification for the OpenHands API. It is copied to the build output and is accessible at `/openapi.json` in the deployed site.
+
+This file is used by the Swagger UI interface, which is accessible at `/swagger-ui/` in the deployed site.
+
+## Why is the OpenAPI spec in the static directory?
+
+The OpenAPI specification is placed in the static directory so that it's accessible at a predictable URL in the deployed site. This allows the Swagger UI to reference it directly.
+
+We only need one copy of the OpenAPI spec file, which is this one in the static directory.
--- a/docs/static/img/oh-features.png
+++ b/docs/static/img/oh-features.png
--- a/docs/static/openapi.json
+++ b/docs/static/openapi.json
--- a/docs/yarn.lock
+++ b/docs/yarn.lock
--- a/evaluation/benchmarks/lca_ci_build_repair/.gitignore
+++ b/evaluation/benchmarks/lca_ci_build_repair/.gitignore
@@ -0,0 +1 @@
+config.yaml
--- a/evaluation/benchmarks/lca_ci_build_repair/README.MD
+++ b/evaluation/benchmarks/lca_ci_build_repair/README.MD
@@ -0,0 +1,35 @@
+# CI Builds Repair Benchmark Integration
+
+This module integrates the CI Builds Repair benchmark developed by [JetBrains-Research](https://github.com/JetBrains-Research/lca-baselines/tree/main/ci-builds-repair/ci-builds-repair-benchmark).
+
+For more information, refer to the [GitHub repository](https://github.com/JetBrains-Research/lca-baselines/tree/main/ci-builds-repair/ci-builds-repair-benchmark) and the associated [research paper](https://arxiv.org/abs/2406.11612).
+See notice below for details
+
+## Setup
+
+Before running any scripts, make sure to configure the benchmark by setting up `config.yaml`.
+This benchmark pushes to JetBrains' private GitHub repository. You will need to request a `token_gh` from their team to run this benchmark.
+
+## Inference
+
+To run inference with your model:
+
+```bash
+./evaluation/benchmarks/lca_ci_build_repair/scripts/run_infer.sh llm.yourmodel
+```
+
+## Evaluation
+
+To evaluate the predictions:
+
+```bash
+./evaluation/benchmarks/lca_ci_build_repair/scripts/eval_infer.sh predictions_path_containing_output
+```
+
+## Results
+The benchmark contains 68 instances. We skip instances #126 and #145 due to dockerization errors and run only the remaining 66.
+
+Due to running in live GitHub machines, the benchmark is sensitive to the date it is run. Even the golden patches in the dataset might present failures due to updates.
+For example, on 2025-04-09, running the benchmark against the golden patches gave 57/67 successes, with 1 job left in the waiting list.
+
+On 2025-04-10, running the full benchmark with OpenHands (OH) and no oracle, 37 instances succeeded. That is 54% of the complete set of 68 instances and 64% of the 57 that succeed with golden patches.
--- a/evaluation/benchmarks/lca_ci_build_repair/config_template.yaml
+++ b/evaluation/benchmarks/lca_ci_build_repair/config_template.yaml
@@ -0,0 +1,11 @@
+LCA_PATH: path # where to clone the lca-ci repo
+model_name: OpenHands
+benchmark_owner: ICML-25-BenchName-builds-repair
+token_gh: your_token
+#for lca-ci-repo
+repos_folder: /path/to/repos # here the cloned repos would be stored
+out_folder: /out/folder # here the result files would be stored
+data_cache_dir: /data/cache/dir/ # here the cached dataset would be stored
+username_gh: username-gh # your GitHub username
+# test_username: test_user # username that would be displayed in the benchmark. Optional. If omitted, username_gh would be used
+language: Python # dataset language (now only Python is available)
--- a/evaluation/benchmarks/lca_ci_build_repair/eval_infer.py
+++ b/evaluation/benchmarks/lca_ci_build_repair/eval_infer.py
@@ -0,0 +1,242 @@
+"""Implements evaluation on JetBrains CI builds repair baselines
+
+Please see https://github.com/JetBrains-Research/lca-baselines/tree/main/ci-builds-repair
+and https://huggingface.co/datasets/JetBrains-Research/lca-ci-builds-repair
+
+TODOs:
+- Add more flags
+"""
+
+import json
+import os
+from pathlib import Path
+
+import ruamel.yaml
+
+from evaluation.utils.shared import (
+    EvalMetadata,
+    get_default_sandbox_config_for_eval,
+    make_metadata,
+)
+from openhands.core.config import (
+    AppConfig,
+    LLMConfig,
+    get_parser,
+    load_app_config,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.main import create_runtime
+from openhands.events.action import CmdRunAction
+from openhands.events.observation import CmdOutputObservation
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
+
+
+def get_config(metadata: EvalMetadata) -> AppConfig:
+    """Build the AppConfig used when evaluating this benchmark.
+
+    The sandbox uses the ``python:3.12-bookworm`` base image, the runtime is
+    ``docker``, no workspace is mounted, and prompt extensions are switched
+    off for the agent class named in ``metadata``.
+
+    Args:
+        metadata: Evaluation metadata supplying the agent class, the iteration
+            limit and the LLM config.
+
+    Returns:
+        The populated AppConfig.
+    """
+    sandbox = get_default_sandbox_config_for_eval()
+    sandbox.base_container_image = 'python:3.12-bookworm'
+
+    app_config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_openhands=False,
+        runtime='docker',
+        max_iterations=metadata.max_iterations,
+        sandbox=sandbox,
+        # The workspace is intentionally left unmounted.
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    app_config.set_llm_config(metadata.llm_config)
+    app_config.get_agent_config(metadata.agent_class).enable_prompt_extensions = False
+    return app_config
+
+
+# Module-level application config, loaded once at import time.
+config = load_app_config()
+
+
+def load_bench_config():
+    """Parse the ``config.yaml`` file that sits beside this script.
+
+    Returns:
+        Whatever the ruamel YAML loader (``typ='rt'``) produces for the file.
+    """
+    here = os.path.dirname(os.path.abspath(__file__))
+    loader = ruamel.yaml.YAML(typ='rt')
+    with open(os.path.join(here, 'config.yaml'), 'r') as handle:
+        parsed = loader.load(handle)
+    return parsed
+
+
+# Loaded at import time, so a missing config.yaml makes the import itself fail.
+bench_config = load_bench_config()
+
+
+def _run_cmd(
+    runtime: Runtime,
+    command: str,
+    check: bool = False,
+    log_as: str = '',
+) -> CmdOutputObservation:
+    """Run one shell command in the runtime, log it, and return the observation.
+
+    Args:
+        runtime: Runtime the command is executed in.
+        command: Shell command line to run.
+        check: When True, assert that the command exited with status 0.
+        log_as: Text to log instead of the action itself; use it for commands
+            that embed secrets. Empty means "log the action as-is".
+
+    Returns:
+        The observation returned by ``runtime.run_action``.
+
+    Raises:
+        AssertionError: If ``check`` is True and the exit code is non-zero.
+    """
+    action = CmdRunAction(command=command)
+    logger.info(log_as or action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    if check:
+        assert obs.exit_code == 0, f'Command failed: {log_as or command}'
+    return obs
+
+
+def run_eval(
+    runtime: Runtime,
+):
+    """Run the evaluation and create report
+
+    Clones the lca-baselines fork into ``LCA_PATH`` inside the runtime, checks
+    out the ``open-hands-integration`` branch, copies the local ``config.yaml``
+    next to the benchmark, installs dependencies with poetry and runs
+    ``run_eval_jobs.py`` on ``/tmp/output_lca.jsonl``.
+
+    NOTE(review): the ``cd`` and ``export`` commands only make sense if the
+    runtime keeps one shell session across actions -- confirm.
+
+    Args:
+        runtime: Connected runtime in which all commands are executed.
+
+    Returns:
+        The output of ``cat``-ing ``testfile.jsonl`` from the configured
+        ``out_folder``, as a string.
+
+    Raises:
+        AssertionError: If any of the setup commands (mkdir, cd, clone,
+            switch) exits with a non-zero status.
+    """
+    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
+
+    lca_path = bench_config['LCA_PATH']
+    lca_ci_path = os.path.join(
+        lca_path, 'lca-baselines', 'ci-builds-repair', 'ci-builds-repair-benchmark'
+    )
+
+    model_name = bench_config['model_name']
+
+    # -p creates missing parent directories and does not fail when the
+    # directory already exists.
+    _run_cmd(runtime, f'mkdir -p {lca_path}', check=True)
+    _run_cmd(runtime, f'cd {lca_path}', check=True)
+
+    lca_repo_url = 'https://github.com/juanmichelini/lca-baselines'
+    _run_cmd(runtime, f'git clone {lca_repo_url}', check=True)
+    _run_cmd(runtime, f'cd {lca_ci_path}', check=True)
+    _run_cmd(runtime, 'git switch open-hands-integration', check=True)
+
+    script_dir = os.path.dirname(
+        os.path.abspath(__file__)
+    )  # Get the absolute path of the script
+    config_path = os.path.join(script_dir, 'config.yaml')
+    runtime.copy_to(config_path, lca_ci_path)
+
+    # The GitHub token is a secret: log a redacted stand-in, not the command.
+    token_gh = bench_config['token_gh']
+    _run_cmd(
+        runtime,
+        f'export TOKEN_GH={token_gh}',
+        log_as='export TOKEN_GH=<redacted>',
+    )
+
+    _run_cmd(runtime, 'poetry install')
+
+    # Set up the task environment
+    commandf = f'poetry run python run_eval_jobs.py --model-name "{model_name}" --config-path "{lca_ci_path}/config.yaml" --job-ids-file "/tmp/output_lca.jsonl" --result-filename "testfile.jsonl"  > /tmp/single_output.txt'
+    obs = _run_cmd(runtime, commandf)
+    logger.info(f'run_eval_jobs.py gave {obs.content} !')
+    # The exit status is intentionally not asserted here; the captured output
+    # is logged below instead.
+
+    commandf = 'cat /tmp/single_output.txt'
+    obs = _run_cmd(runtime, commandf)
+    logger.info(f' {commandf} gave {obs.content}!')
+
+    testfile_path = os.path.join(bench_config['out_folder'], 'testfile.jsonl')
+    obs = _run_cmd(runtime, f'cat {testfile_path}')
+    report_str = obs.content
+
+    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
+    return report_str
+
+
+def process_predictions(predictions_path: str):
+    """Extract each prediction's ``test_result`` into a sibling ``*_lca.jsonl`` file.
+
+    Args:
+        predictions_path: Path to a ``.jsonl`` file with one JSON object per line.
+
+    Returns:
+        The path, as a string, of the file written next to the input. Its name
+        is the input's stem with ``_lca.jsonl`` appended.
+
+    Raises:
+        ValueError: If ``predictions_path`` does not have a ``.jsonl`` suffix.
+    """
+    output_path = Path(predictions_path)
+    if output_path.suffix != '.jsonl':
+        raise ValueError('output_path must end in .jsonl')
+
+    output_lca_path = output_path.with_name(output_path.stem + '_lca.jsonl')
+
+    with output_path.open() as infile, output_lca_path.open('w') as outfile:
+        for line in infile:
+            # Blank lines (e.g. a trailing empty line) are not JSON; skip them
+            # rather than aborting the whole conversion.
+            if not line.strip():
+                continue
+            data = json.loads(line)
+            # Records without a 'test_result' key are written as JSON null.
+            json.dump(data.get('test_result'), outfile)
+            outfile.write('\n')
+
+    return str(output_lca_path)
+
+
+if __name__ == '__main__':
+    # Entry point: convert the predictions, run the benchmark inside a fresh
+    # runtime, then write results.jsonl and a summary report.jsonl next to the
+    # predictions.
+    parser = get_parser()
+    parser.add_argument(
+        '-s',
+        '--eval-split',
+        type=str,
+        default='test',
+        choices=['test'],
+        help='data split to evaluate on, must be test',
+    )
+    parser.add_argument(
+        '--predictions-path',
+        type=str,
+        # Every step below joins onto this path, so a missing value should be
+        # a usage error rather than a TypeError later on.
+        required=True,
+        help='Path to the directory containing the output.jsonl with the predictions.',
+    )
+    args, _ = parser.parse_known_args()
+
+    data_split = args.eval_split
+
+    # No LLM is called during evaluation; a placeholder config is enough to
+    # build the metadata.
+    llm_config = LLMConfig(model='dummy_model')
+
+    metadata = make_metadata(
+        llm_config,
+        f'jetbrains-lca-ci--{data_split}',
+        args.agent_cls,
+        args.max_iterations,
+        args.eval_note,
+        args.predictions_path,
+    )
+
+    # prepare image
+    config = get_config(metadata)
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
+    logger.info('Converting output.jsonl into output_lca.jsonl')
+    predictions_lca_path = process_predictions(
+        os.path.join(args.predictions_path, 'output.jsonl')
+    )
+    runtime.copy_to(predictions_lca_path, '/tmp')
+
+    # get results
+    results_str = run_eval(runtime)
+    results_path = os.path.join(args.predictions_path, 'results.jsonl')
+    with open(results_path, 'w') as file:
+        file.write(results_str)
+    logger.info(f'Saved results to {results_path}')
+
+    # make a summary
+    resolved_instances = []
+    unresolved_instances = []
+    for line in results_str.splitlines():
+        # Skip blank lines anywhere in the output, not only at the edges.
+        if not line.strip():
+            continue
+        data = json.loads(line)
+        conclusion = data.get('conclusion')
+        if conclusion == 'success':
+            resolved_instances.append(data)
+        elif conclusion == 'failure':
+            unresolved_instances.append(data)
+        # Any other conclusion is counted in neither bucket.
+
+    completed_instances = resolved_instances + unresolved_instances
+
+    report = {
+        'success': len(resolved_instances),
+        'failure': len(unresolved_instances),
+        'resolved_instances': resolved_instances,
+        'unresolved_instances': unresolved_instances,
+        'completed_instances': completed_instances,
+    }
+
+    print(f'Results: {report}')
+    report_path = os.path.join(args.predictions_path, 'report.jsonl')
+    with open(report_path, 'w') as out_f:
+        out_f.write(json.dumps(report) + '\n')
+
+    logger.info(f'Saved report of results in swebench format to {report_path}')
--- a/evaluation/benchmarks/lca_ci_build_repair/run_infer.py
+++ b/evaluation/benchmarks/lca_ci_build_repair/run_infer.py
@@ -0,0 +1,406 @@
+"""Implements inference on JetBrains CI builds repair baselines
+
+Please see https://github.com/JetBrains-Research/lca-baselines/tree/main/ci-builds-repair
+and https://huggingface.co/datasets/JetBrains-Research/lca-ci-builds-repair
+
+TODOs:
+- Add EXP_NAME
+"""
+
+import asyncio
+import json
+import os
+from typing import Any
+
+import pandas as pd
+import ruamel.yaml
+from datasets import load_dataset
+
+from evaluation.utils.shared import (
+    EvalMetadata,
+    EvalOutput,
+    codeact_user_response,
+    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
+    make_metadata,
+    prepare_dataset,
+    reset_logger_for_multiprocessing,
+    run_evaluation,
+)
+from openhands.controller.state.state import State
+from openhands.core.config import (
+    AppConfig,
+    get_llm_config_arg,
+    get_parser,
+    load_app_config,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import CmdRunAction, MessageAction
+from openhands.events.observation import CmdOutputObservation
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
+
+
+def get_config(
+    metadata: EvalMetadata,
+) -> AppConfig:
+    """Build the AppConfig for a run: docker runtime, python:3.12-bookworm sandbox."""
+    sandbox = get_default_sandbox_config_for_eval()
+    sandbox.base_container_image = 'python:3.12-bookworm'
+    app_config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_openhands=False,
+        runtime='docker',
+        max_iterations=metadata.max_iterations,
+        sandbox=sandbox,
+        # the workspace is deliberately left unmounted
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    app_config.set_llm_config(metadata.llm_config)
+    app_config.get_agent_config(metadata.agent_class).enable_prompt_extensions = False
+    return app_config
+
+
+config = load_app_config()
+
+
+def load_bench_config():
+    """Parse the config.yaml that sits next to this script (ruamel round-trip mode)."""
+    # resolve the path against this file so the current working directory is irrelevant
+    here = os.path.dirname(os.path.abspath(__file__))
+    loader = ruamel.yaml.YAML(typ='rt')
+    with open(os.path.join(here, 'config.yaml'), 'r') as stream:
+        parsed = loader.load(stream)
+    return parsed
+
+
+bench_config = load_bench_config()
+
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+}
+
+AGENT_CLS_TO_INST_SUFFIX = {
+    'CodeActAgent': 'When you think you have completed the task, please finish the interaction using the "finish" tool.\n'
+}
+
+
+def initialize_runtime(
+    runtime: Runtime,
+    instance: pd.Series,
+):
+    """Prepare the sandbox before the agent runs.
+
+    Clones lca-baselines, writes config.yaml, exports TOKEN_GH, runs `poetry install`,
+    fetches the datapoint (its output is kept in bench_config) and cds into the repo.
+    """
+    import shlex  # local import; only needed to quote the config text below
+
+    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
+    obs: CmdOutputObservation
+    lca_path = bench_config['LCA_PATH']
+    lca_ci_path = os.path.join(
+        lca_path, 'lca-baselines', 'ci-builds-repair', 'ci-builds-repair-benchmark'
+    )
+
+    repo_name = instance['repo_name']
+    repos_path = bench_config['repos_folder']
+    repo_owner = instance['repo_owner']
+    repo_path = os.path.join(repos_path, f'{repo_owner}__{repo_name}')
+    model_name = bench_config['model_name']
+
+    action = CmdRunAction(command=f'mkdir -p {lca_path}')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    action = CmdRunAction(command=f'cd {lca_path}')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    lca_repo_url = 'https://github.com/juanmichelini/lca-baselines'
+    action = CmdRunAction(command=f'git clone {lca_repo_url}')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    action = CmdRunAction(command=f'cd {lca_ci_path}')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    action = CmdRunAction(command='git switch open-hands-integration')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    config_path = os.path.join(script_dir, 'config.yaml')
+    with open(config_path, 'r') as file:
+        config_as_text = file.read()
+
+    # shlex.quote survives quotes in the YAML; not logged since it contains token_gh
+    commandf = f'echo {shlex.quote(config_as_text)} > config.yaml'
+    action = CmdRunAction(command=commandf)
+    logger.info('Writing config.yaml into the sandbox (contents not logged)')
+    obs = runtime.run_action(action)
+
+    commandf = f"export TOKEN_GH={bench_config['token_gh']}"
+    action = CmdRunAction(command=commandf)
+    logger.info('Exporting TOKEN_GH into the sandbox (value not logged)')
+    obs = runtime.run_action(action)
+
+    action = CmdRunAction(command='poetry install')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+
+    # Set up the task environment
+    commandf = f'poetry run python run_get_datapoint.py --model-name {model_name} --id {instance["id"]} > branch_name.txt'
+    action = CmdRunAction(command=commandf)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    if obs.exit_code != 0:
+        print(f'run_get_datapoint.py failed at {instance["id"]} with {obs.content}')
+    assert obs.exit_code == 0
+
+    commandf = 'cat branch_name.txt'
+    action = CmdRunAction(command=commandf)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    bench_config['user_branch_name'] = obs.content.strip()
+
+    # Navigate to the task's code path
+    action = CmdRunAction(command=f'cd {repo_path}')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+
+    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
+
+
+def complete_runtime(
+    runtime: Runtime,
+    instance: pd.Series,
+) -> dict[str, Any]:
+    """Collect the result for `instance` from the sandbox.
+
+    This function is called after the agent has run: it exports TOKEN_GH, runs
+    run_push_datapoint.py from the lca-baselines benchmark directory and returns
+    the JSON that script wrote to single_output.json.
+    """
+    logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
+    obs: CmdOutputObservation
+
+    model_name = bench_config['model_name']
+
+    lca_path = bench_config['LCA_PATH']
+    lca_ci_path = os.path.join(
+        lca_path, 'lca-baselines', 'ci-builds-repair', 'ci-builds-repair-benchmark'
+    )
+
+    user_branch_name = bench_config['user_branch_name']
+
+    token_gh = bench_config['token_gh']
+    commandf = f'export TOKEN_GH={token_gh}'
+    action = CmdRunAction(command=commandf)
+    logger.info('Exporting TOKEN_GH into the sandbox (value not logged)')
+    obs = runtime.run_action(action)
+
+    # Navigate to the lca-baselines scripts path
+    action = CmdRunAction(command=f'cd {lca_ci_path}')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    commandf = f'poetry run python run_push_datapoint.py --id {instance["id"]} --model-name {model_name} --user-branch-name {user_branch_name} > single_output.json'
+    logger.info(f'Running push script: {commandf}')
+    action = CmdRunAction(command=commandf)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    # NOTE(review): the exit code is not checked here (assert was disabled) - confirm intent
+
+    commandf = 'cat single_output.json'
+    action = CmdRunAction(command=commandf)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    result = json.loads(obs.content)
+
+    logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
+
+    return result
+
+
+def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = True):
+    """Run the agent on one CI-repair datapoint and return its EvalOutput."""
+    config = get_config(metadata)
+
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, instance['instance_id'], log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance["instance_id"]}.')
+
+    repo_name = instance['repo_name']
+    repo_workflow = instance['workflow_path']
+    repo_logs = instance['logs']
+    repos_path = bench_config['repos_folder']
+    repo_owner = instance['repo_owner']
+    repo_path = os.path.join(repos_path, f'{repo_owner}__{repo_name}')
+
+    # Prepare the task instruction
+    instruction_no_oracle = f"""
+<uploaded_files>
+{repo_path}
+</uploaded_files>
+
+I've uploaded a python code repository in the directory {repo_path}, Consider the following issue:
+
+<issue_description>
+The repository must pass the CI workflow {repo_workflow}.
+but it gave the following error
+{repo_logs}
+</issue_description>
+
+Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?
+I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!
+Also the development Python environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.
+Your task is to make the minimal changes to non-test files in the {repo_path} directory to ensure the <issue_description> is satisfied.
+
+Follow these phases to resolve the issue:
+
+Phase 1. READING: read the problem and reword it in clearer terms
+   1.1 If there are code or config snippets. Express in words any best practices or conventions in them.
+   1.2 Hightlight message errors, method names, variables, file names, stack traces, and technical details.
+   1.3 Explain the problem in clear terms.
+   1.4 Enumerate the steps to reproduce the problem.
+   1.5 Hightlight any best practices to take into account when testing and fixing the issue
+
+Phase 2. RUNNING: install and run the tests on the repository
+   2.1 Follow the readme
+   2.2 Install the environment and anything needed
+   2.2 Iterate and figure out how to run the tests
+
+Phase 3. EXPLORATION: find the files that are related to the problem and possible solutions
+   3.1 Use `grep` to search for relevant methods, classes, keywords and error messages.
+   3.2 Identify all files related to the problem statement.
+   3.3 Propose the methods and files to fix the issue and explain why.
+   3.4 From the possible file locations, select the most likely location to fix the issue.
+
+Phase 4. TEST CREATION: before implementing any fix, create a script to reproduce and verify the issue.
+   4.1 Look at existing test files in the repository to understand the test format/structure.
+   4.2 Create a minimal reproduction script that reproduces the located issue.
+   4.3 Run the reproduction script to confirm you are reproducing the issue.
+   4.4 Adjust the reproduction script as necessary.
+
+Phase 5. FIX ANALYSIS: state clearly the problem and how to fix it
+   5.1 State clearly what the problem is.
+   5.2 State clearly where the problem is located.
+   5.3 State clearly how the test reproduces the issue.
+   5.4 State clearly the best practices to take into account in the fix.
+   5.5 State clearly how to fix the problem.
+
+Phase 6. FIX IMPLEMENTATION: Edit the source code to implement your chosen solution.
+   6.1 Make minimal, focused changes to fix the issue.
+
+Phase 7. VERIFICATION: Test your implementation thoroughly.
+   7.1 Run your reproduction script to verify the fix works.
+   7.2 Add edge cases to your test script to ensure comprehensive coverage.
+   7.3 Run existing tests related to the modified code to ensure you haven't broken anything. Run any tests in the repository related to:
+     7.2.1 The issue you are fixing
+     7.2.2 The files you modified
+     7.2.3 The functions you changed
+   7.4 If any tests fail, revise your implementation until all tests pass
+
+Phase 8. REVIEW: Carefully re-read the problem description and compare your changes with the base commit {instance["sha_fail"]}.
+   8.1 Ensure you've fully addressed all requirements.
+
+Once all phases are done, announce: 'Agent Task Complete'.
+Be thorough in your exploration, testing, and reasoning. It's fine if your thinking process is lengthy - quality and completeness are more important than brevity.
+"""
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
+    try:
+        initialize_runtime(runtime, instance)
+
+        # Run the agent
+        state: State | None = asyncio.run(
+            run_controller(
+                config=config,
+                initial_user_action=MessageAction(content=instruction_no_oracle),
+                runtime=runtime,
+                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
+                    metadata.agent_class
+                ),
+            )
+        )
+        assert state is not None
+        metrics = state.metrics.get() if state.metrics else {}
+        test_result = complete_runtime(runtime, instance)
+    finally:  # always release the sandbox, even when a step above raised
+        runtime.close()
+
+    # rebuild (Action, Observation) pairs from the event stream; remove once unnecessary
+    histories = compatibility_for_eval_history_pairs(state.history)
+
+    # Save the output
+    output = EvalOutput(
+        instance_id=instance['instance_id'],
+        instruction=instruction_no_oracle,
+        metadata=metadata,
+        history=histories,
+        test_result=test_result,
+        metrics=metrics,
+    )
+    return output
+
+
+if __name__ == '__main__':
+    parser = get_parser()
+    parser.add_argument(
+        '-s',
+        '--eval-split',
+        type=str,
+        default='test',
+        choices=['test'],
+        help='data split to evaluate on, must be test',
+    )
+    args, _ = parser.parse_known_args()
+
+    data_split = args.eval_split
+
+    bench = load_dataset(
+        'JetBrains-Research/lca-ci-builds-repair', split=data_split
+    ).to_pandas()
+    # todo: see why 126 is giving problems on inference
+    # todo: see why 145 is giving problems on eval
+    bench = bench[bench['id'] != 126]
+    bench = bench[bench['id'] != 145]
+    # bench = bench.iloc[0:56]
+    # add an instance_id column for compatibility with the OpenHands repo; the old id column must be kept for the lca repo
+    bench['instance_id'] = bench['id'].astype(str)
+
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation, for reproducibility and accuracy of results
+        llm_config.modify_params = False
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
+
+    metadata = make_metadata(
+        llm_config,
+        f'jetbrains-lca-ci--{data_split}',
+        args.agent_cls,
+        args.max_iterations,
+        args.eval_note,
+        args.eval_output_dir,
+    )
+    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
+    instances = prepare_dataset(bench, output_file, args.eval_n_limit)
+
+    run_evaluation(
+        instances, metadata, output_file, args.eval_num_workers, process_instance
+    )
--- a/evaluation/benchmarks/lca_ci_build_repair/scripts/eval_infer.sh
+++ b/evaluation/benchmarks/lca_ci_build_repair/scripts/eval_infer.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+set -eo pipefail
+# Usage: ./eval_infer.sh <predictions_dir>   (the directory that contains output.jsonl)
+source "evaluation/utils/version_control.sh"
+
+PROCESS_FILEPATH=$1
+if [ -z "$PROCESS_FILEPATH" ]; then
+    echo "Error: PROCESS_FILEPATH is empty. Usage: ./eval_infer.sh <predictions_dir>"
+    exit 1
+fi
+
+get_openhands_version
+
+PROCESS_FILEPATH=$(realpath "$PROCESS_FILEPATH")
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
+echo "PROCESS_FILEPATH: $PROCESS_FILEPATH"
+
+EVAL_NOTE="$OPENHANDS_VERSION"
+if [ -n "$EXP_NAME" ]; then
+  EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
+fi
+
+function run_eval() {
+  COMMAND=(poetry run python ./evaluation/benchmarks/lca_ci_build_repair/eval_infer.py
+    --predictions-path "$PROCESS_FILEPATH")
+
+  echo "RUNNING: ${COMMAND[*]}"
+  # Run the command as an argument array so paths with spaces are not re-split
+  "${COMMAND[@]}"
+}
+
+unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
+run_eval
--- a/evaluation/benchmarks/lca_ci_build_repair/scripts/run_infer.sh
+++ b/evaluation/benchmarks/lca_ci_build_repair/scripts/run_infer.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+set -eo pipefail
+# Usage: ./run_infer.sh <llm_config_group>
+source "evaluation/utils/version_control.sh"
+
+MODEL_CONFIG=$1
+
+get_openhands_version
+
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+EVAL_NOTE="$OPENHANDS_VERSION"
+if [ -n "$EXP_NAME" ]; then
+  EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
+fi
+
+function run_eval() {
+  COMMAND=(poetry run python ./evaluation/benchmarks/lca_ci_build_repair/run_infer.py
+    --llm-config "$MODEL_CONFIG")
+
+  # Run the command as an argument array instead of re-parsing a string with eval
+  "${COMMAND[@]}"
+}
+
+#unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
+run_eval
--- a/evaluation/benchmarks/lca_ci_build_repair/setup.py
+++ b/evaluation/benchmarks/lca_ci_build_repair/setup.py
@@ -0,0 +1,60 @@
+"""Installs LCA CI Build Repair benchmark with scripts for OH integration."""
+
+import os
+import shutil
+import subprocess
+
+import yaml
+
+
+def setup():
+    """Clone lca-baselines under LCA_PATH, switch branch, copy config.yaml, install deps."""
+    print('Reading config.yaml')
+    script_dir = os.path.dirname(
+        os.path.abspath(__file__)
+    )  # Get the absolute path of the script
+    config_path = os.path.join(script_dir, 'config.yaml')
+    with open(config_path, 'r') as f:
+        config = yaml.safe_load(f)
+
+    lca_path = config['LCA_PATH']
+    lca_ci_path = os.path.join(
+        lca_path, 'lca-baselines', 'ci-builds-repair', 'ci-builds-repair-benchmark'
+    )
+    repo_url = 'https://github.com/juanmichelini/lca-baselines'
+
+    # Clone the repository into LCA_PATH (a failure only prints a warning)
+    print(f'Cloning lca-baselines repository from {repo_url} into {lca_path}')
+    result = subprocess.run(
+        ['git', 'clone', repo_url], cwd=lca_path, capture_output=True, text=True
+    )
+    if result.returncode != 0:
+        print(f'Warning cloning repository: {result.stderr}')
+
+    # Switch the clone to the open-hands-integration branch
+    print('Switching branches')
+    result = subprocess.run(
+        ['git', 'switch', 'open-hands-integration'],
+        cwd=lca_ci_path,
+        capture_output=True,
+        text=True,
+    )
+    if result.returncode != 0:
+        print(f'Warning switching repository: {result.stderr}')
+
+    # Copy this benchmark's config.yaml into the benchmark directory (overwrite if exists)
+    lca_ci_config_path = os.path.join(lca_ci_path, 'config.yaml')
+    print(f'Copying config.yaml to {lca_ci_config_path}')
+    shutil.copy(config_path, lca_ci_config_path)
+
+    # Run poetry install in LCA_CI_PATH
+    print(f"Running 'poetry install' in {lca_ci_path}")
+    result = subprocess.run(
+        ['poetry', 'install'], cwd=lca_ci_path, capture_output=True, text=True
+    )
+    if result.returncode != 0:
+        print(f'Warning during poetry install: {result.stderr}')
+
+
+if __name__ == '__main__':
+    setup()
--- a/evaluation/benchmarks/multi_swe_bench/README.md
+++ b/evaluation/benchmarks/multi_swe_bench/README.md
@@ -0,0 +1,65 @@
+# Multi-swe-bench Evaluation with OpenHands
+
+## LLM Setup
+
+Please follow the setup instructions [here](../../README.md#setup).
+
+## Dataset Preparation
+
+Please download the [**Multi-SWE-Bench** dataset](https://huggingface.co/datasets/bytedance-research/Multi-SWE-Bench).
+Then convert the dataset with the [data_change.py script](scripts/data/data_change.py).
+
+```bash
+python evaluation/benchmarks/multi_swe_bench/scripts/data/data_change.py
+```
+
+## Docker image download
+
+Please download the multi-swe-bench docker images from [here](https://github.com/multi-swe-bench/multi-swe-bench?tab=readme-ov-file#run-evaluation).
+
+## Generate patch
+
+Please edit the [script](infer.sh) and run it.
+
+```bash
+bash evaluation/benchmarks/multi_swe_bench/infer.sh
+```
+
+Script variable explanation:
+
+- `models`, e.g. `llm.eval_gpt4_1106_preview`, is the config group name for your
+LLM settings, as defined in your `config.toml`.
+- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version you would
+like to evaluate. It could also be a release tag like `0.6.2`.
+- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting to `CodeActAgent`.
+- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By
+default, the script evaluates 500 issues, or the whole dataset if it contains fewer.
+- `max_iter`, e.g. `20`, is the maximum number of iterations for the agent to run. By
+default, it is set to 50.
+- `num_workers`, e.g. `3`, is the number of parallel workers to run the evaluation. By
+default, it is set to 1.
+- `language`, the language of the dataset you are evaluating.
+- `dataset`, the absolute path to the dataset jsonl file.
+
+The results will be written to `evaluation/evaluation_outputs/outputs/XXX/CodeActAgent/YYY/output.jsonl`; see the [example](examples/output.jsonl).
+
+## Running evaluation
+
+First, install [multi-swe-bench](https://github.com/multi-swe-bench/multi-swe-bench).
+
+```bash
+pip install multi-swe-bench
+```
+
+Second, convert the output.jsonl to patch.jsonl with the [convert.py script](scripts/eval/convert.py); see the [example](examples/patch.jsonl).
+
+```bash
+python evaluation/benchmarks/multi_swe_bench/scripts/eval/convert.py
+```
+
+Finally, evaluate with multi-swe-bench.
+For the config file config.json, refer to the [example](examples/config.json) or [github](https://github.com/multi-swe-bench/multi-swe-bench/tree/main?tab=readme-ov-file#configuration-file-example).
+
+```bash
+python -m multi_swe_bench.harness.run_evaluation --config config.json
+```
--- a/evaluation/benchmarks/multi_swe_bench/init.py
+++ b/evaluation/benchmarks/multi_swe_bench/init.py
--- a/evaluation/benchmarks/multi_swe_bench/eval_infer.py
+++ b/evaluation/benchmarks/multi_swe_bench/eval_infer.py
@@ -0,0 +1,456 @@
+import json
+import os
+import subprocess
+import tempfile
+import time
+from functools import partial
+
+import pandas as pd
+from swebench.harness.grading import get_eval_report
+from swebench.harness.run_evaluation import (
+    APPLY_PATCH_FAIL,
+    APPLY_PATCH_PASS,
+)
+from swebench.harness.test_spec import SWEbenchInstance, TestSpec, make_test_spec
+from swebench.harness.utils import load_swebench_dataset
+from tqdm import tqdm
+
+from evaluation.benchmarks.swe_bench.resource.mapping import (
+    get_instance_resource_factor,
+)
+from evaluation.benchmarks.swe_bench.run_infer import get_instance_docker_image
+from evaluation.utils.shared import (
+    EvalMetadata,
+    EvalOutput,
+    get_default_sandbox_config_for_eval,
+    prepare_dataset,
+    reset_logger_for_multiprocessing,
+    run_evaluation,
+)
+from openhands.core.config import (
+    AppConfig,
+    LLMConfig,
+    get_parser,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.main import create_runtime
+from openhands.events.action import CmdRunAction
+from openhands.events.observation import CmdOutputObservation
+from openhands.utils.async_utils import call_async_from_sync
+
+# TODO: migrate all swe-bench docker to ghcr.io/openhands
+DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/')
+logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')
+
+
+def process_git_patch(patch):
+    """Normalise a raw git patch captured from command output.
+
+    Non-string or blank input yields ''. Otherwise CRLF becomes LF, anything
+    before the first line starting with 'diff --git' is dropped, and the result
+    is right-stripped and terminated by a single newline.
+    """
+    if not isinstance(patch, str):
+        return ''
+    if not patch.strip():
+        # nothing but whitespace: treat as an empty patch
+        return ''
+
+    # Inference output can leave junk ahead of the diff, for example an echoed
+    #   git diff --no-color --cached 895f28f9cbed817c00ab68770433170d83132d90
+    # line followed by a run of terminal escape sequences; skip to the real start.
+    rows = patch.replace('\r\n', '\n').split('\n')
+
+    # index of the first real diff header, or None when there is none
+    start = next(
+        (idx for idx, row in enumerate(rows) if row.startswith('diff --git')),
+        None,
+    )
+    if start is not None:
+        rows = rows[start:]
+
+    # right-strip, then terminate with one newline
+    return '\n'.join(rows).rstrip() + '\n'
+
+
+def get_config(metadata: EvalMetadata, instance: pd.Series) -> AppConfig:
+    # Each instance is evaluated in its own container image, looked up by instance_id
+    base_container_image = get_instance_docker_image(instance['instance_id'])
+    logger.info(
+        f'Using instance container image: {base_container_image}. '
+        f'Please make sure this image exists. '
+        f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
+    )
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = base_container_image
+    sandbox_config.remote_runtime_resource_factor = get_instance_resource_factor(
+        dataset_name=metadata.dataset,
+        instance_id=instance['instance_id'],
+    )
+    config = AppConfig(
+        run_as_openhands=False,
+        runtime=os.environ.get('RUNTIME', 'docker'),
+        sandbox=sandbox_config,
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    return config
+
+
+def process_instance(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+    reset_logger: bool = True,
+    log_dir: str | None = None,
+    runtime_failure_count: int = 0,
+) -> EvalOutput:
+    """
+    Evaluate agent performance on a SWE-bench problem instance.
+
+    Note that this signature differs from the expected input to `run_evaluation`. Use
+    `functools.partial` to provide optional arguments before passing to the evaluation harness.
+
+    Args:
+        log_dir (str | None, default=None): Path to directory where log files will be written. Must
+        be provided if `reset_logger` is set.
+
+    Raises:
+        AssertionError: if the `reset_logger` flag is set without a provided log directory.
+    """
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        assert (
+            log_dir is not None
+        ), "Can't reset logger without a provided log directory."
+        os.makedirs(log_dir, exist_ok=True)
+        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance.instance_id}.')
+
+    config = get_config(metadata, instance)
+    instance_id = instance.instance_id
+    model_patch = instance['model_patch']
+    test_spec: TestSpec = instance['test_spec']
+    logger.info(f'Starting evaluation for instance {instance_id}.')
+
+    if 'test_result' not in instance.keys():
+        instance['test_result'] = {}
+    instance['test_result']['report'] = {
+        'empty_generation': False,
+        'resolved': False,
+        'failed_apply_patch': False,
+        'error_eval': False,
+        'test_timeout': False,
+    }
+
+    if model_patch == '':
+        instance['test_result']['report']['empty_generation'] = True
+        return EvalOutput(
+            instance_id=instance_id,
+            test_result=instance['test_result'],
+            metadata=metadata,
+        )
+
+    # Increase resource_factor with increasing attempt_id
+    if runtime_failure_count > 0:
+        config.sandbox.remote_runtime_resource_factor = min(
+            config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
+            8,
+        )
+        logger.warning(
+            f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
+        )
+
+    try:
+        runtime = create_runtime(config)
+        call_async_from_sync(runtime.connect)
+        # Get patch and save it to /tmp/patch.diff
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Patch file
+            patch_file_path = os.path.join(temp_dir, 'patch.diff')
+            with open(patch_file_path, 'w') as f:
+                f.write(model_patch)
+            runtime.copy_to(patch_file_path, '/tmp')
+            # Eval script
+            eval_script_path = os.path.join(temp_dir, 'eval.sh')
+            with open(eval_script_path, 'w') as f:
+                f.write(test_spec.eval_script)
+            runtime.copy_to(eval_script_path, '/tmp')
+
+        # Set +x
+        action = CmdRunAction(command='chmod +x /tmp/eval.sh')
+        action.set_hard_timeout(600)
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.exit_code == 0
+
+        # Apply patch
+        exec_command = (
+            'cd /testbed && '
+            "(git apply -v /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
+            "(echo 'Failed to apply patch with git apply, trying with patch command...' && "
+            "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
+            "echo 'APPLY_PATCH_FAIL')))"
+        )
+        action = CmdRunAction(command=exec_command)
+        action.set_hard_timeout(600)
+        obs = runtime.run_action(action)
+        assert isinstance(obs, CmdOutputObservation)
+        apply_patch_output = obs.content
+        assert isinstance(apply_patch_output, str)
+        instance['test_result']['apply_patch_output'] = apply_patch_output
+
+        if 'APPLY_PATCH_FAIL' in apply_patch_output:
+            logger.info(f'[{instance_id}] {APPLY_PATCH_FAIL}:\n{apply_patch_output}')
+            instance['test_result']['report']['failed_apply_patch'] = True
+
+            return EvalOutput(
+                instance_id=instance_id,
+                test_result=instance['test_result'],
+                metadata=metadata,
+            )
+        elif 'APPLY_PATCH_PASS' in apply_patch_output:
+            logger.info(f'[{instance_id}] {APPLY_PATCH_PASS}:\n{apply_patch_output}')
+
+            # Run eval script in background and save output to log file
+            log_file = '/tmp/eval_output.log'
+            action = CmdRunAction(command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!')
+            action.set_hard_timeout(300)  # Short timeout just to get the process ID
+            obs = runtime.run_action(action)
+
+            if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0:
+                pid = obs.content.split()[-1].strip()
+                logger.info(
+                    f'[{instance_id}] Evaluation process started with PID: {pid}'
+                )
+
+                # Poll for completion
+                start_time = time.time()
+                timeout = 1800  # 30 minutes
+                while True:
+                    seconds_elapsed = time.time() - start_time
+                    if seconds_elapsed > timeout:
+                        logger.info(
+                            f'[{instance_id}] Evaluation timed out after {timeout} seconds'
+                        )
+                        instance['test_result']['report']['test_timeout'] = True
+                        break
+                    check_action = CmdRunAction(
+                        command=f'ps -p {pid} > /dev/null; echo $?'
+                    )
+                    check_action.set_hard_timeout(300)
+                    check_obs = runtime.run_action(check_action)
+                    if (
+                        isinstance(check_obs, CmdOutputObservation)
+                        and check_obs.content.split()[-1].strip() == '1'
+                    ):
+                        logger.info(
+                            f'[{instance_id}] Evaluation process completed after {seconds_elapsed} seconds'
+                        )
+                        break
+                    logger.info(
+                        f'[{instance_id}] [{seconds_elapsed:.0f}s] Evaluation still running, waiting...'
+                    )
+                    time.sleep(30)  # Wait for 30 seconds before checking again
+
+                # Read the log file
+                cat_action = CmdRunAction(command=f'cat {log_file}')
+                cat_action.set_hard_timeout(300)
+                cat_obs = runtime.run_action(cat_action)
+
+                # Grade answer
+                if isinstance(cat_obs, CmdOutputObservation) and cat_obs.exit_code == 0:
+                    test_output = cat_obs.content
+                    assert isinstance(test_output, str)
+                    instance['test_result']['test_output'] = test_output
+
+                    # Get report from test output
+                    logger.info(f'[{instance_id}] Grading answer...')
+                    with tempfile.TemporaryDirectory() as temp_dir:
+                        # Create a directory structure that matches the expected format
+                        # NOTE: this is a hack to make the eval report format consistent
+                        # with the original SWE-Bench eval script
+                        log_dir = os.path.join(temp_dir, 'logs', instance_id.lower())
+                        os.makedirs(log_dir, exist_ok=True)
+                        test_output_path = os.path.join(log_dir, 'test_output.txt')
+                        with open(test_output_path, 'w') as f:
+                            f.write(test_output)
+                        try:
+                            _report = get_eval_report(
+                                test_spec=test_spec,
+                                prediction={
+                                    'model_patch': model_patch,
+                                    'instance_id': instance_id,
+                                },
+                                log_path=test_output_path,
+                                include_tests_status=True,
+                            )
+                            report = _report[instance_id]
+                            logger.info(
+                                f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}"
+                            )
+                            instance['test_result']['report']['resolved'] = report[
+                                'resolved'
+                            ]
+                        except Exception as e:
+                            logger.error(
+                                f'[{instance_id}] Error when getting eval report: {e}'
+                            )
+                            instance['test_result']['report']['resolved'] = False
+                            instance['test_result']['report']['error_eval'] = True
+            else:
+                logger.info(f'[{instance_id}] Error when starting eval:\n{obs.content}')
+                instance['test_result']['report']['error_eval'] = True
+
+            return EvalOutput(
+                instance_id=instance_id,
+                test_result=instance['test_result'],
+                metadata=metadata,
+            )
+        else:
+            logger.info(
+                f'[{instance_id}] Unexpected output when applying patch:\n{apply_patch_output}'
+            )
+            raise RuntimeError(
+                instance_id,
+                f'Unexpected output when applying patch:\n{apply_patch_output}',
+                logger,
+            )
+    finally:
+        runtime.close()
+
+
+if __name__ == '__main__':
+    # Script entry point: load predictions from a .jsonl file, evaluate each one
+    # against the selected SWE-Bench dataset/split, write the results next to the
+    # input file, and log summary counts for the report fields.
+    parser = get_parser()
+    parser.add_argument(
+        '--input-file',
+        type=str,
+        help='Path to input predictions file',
+        required=True,
+    )
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        default='princeton-nlp/SWE-bench',
+        help='data set to evaluate on, either full-test or lite-test',
+    )
+    parser.add_argument(
+        '--split',
+        type=str,
+        default='test',
+        help='split to evaluate on',
+    )
+    args, _ = parser.parse_known_args()
+
+    # Load SWE-Bench dataset
+    full_dataset: list[SWEbenchInstance] = load_swebench_dataset(
+        args.dataset, args.split
+    )
+    instance_id_to_instance = {
+        instance['instance_id']: instance for instance in full_dataset
+    }
+    logger.info(
+        f'Loaded dataset {args.dataset} with split {args.split} to run inference on.'
+    )
+
+    # Load predictions, keeping only the fields this script consumes
+    assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.'
+    required_fields = ['instance_id', 'model_patch', 'test_result']
+    with open(args.input_file) as f:
+        predictions = pd.DataFrame.from_records(
+            [
+                {k: v for k, v in json.loads(line).items() if k in required_fields}
+                for line in tqdm(f, desc='Loading predictions')
+            ]
+        )
+    assert (
+        'instance_id' in predictions.columns
+    ), 'Input file must contain instance_id column.'
+
+    # A patch must be available either as a top-level `model_patch` column or as
+    # the `git_patch` field of `test_result` (the fallback used further below).
+    # Reject the input only when NEITHER source is present.
+    if 'model_patch' not in predictions.columns and not (
+        'test_result' in predictions.columns
+        and 'git_patch' in predictions['test_result'].iloc[0]
+    ):
+        raise ValueError(
+            'Input file must contain model_patch column OR test_result column with git_patch field.'
+        )
+    assert len(predictions['instance_id'].unique()) == len(
+        predictions
+    ), 'instance_id column must be unique.'
+
+    # Fall back to the patch stored inside `test_result` when there is no
+    # dedicated `model_patch` column.
+    if 'model_patch' not in predictions.columns:
+        predictions['model_patch'] = predictions['test_result'].apply(
+            lambda x: x.get('git_patch', '')
+        )
+    assert {'instance_id', 'model_patch'}.issubset(
+        set(predictions.columns)
+    ), 'Input file must contain instance_id and model_patch columns.'
+
+    # Process model_patch
+    predictions['model_patch'] = predictions['model_patch'].apply(process_git_patch)
+
+    # Merge predictions with dataset
+    predictions['instance'] = predictions['instance_id'].apply(
+        lambda x: instance_id_to_instance[x]
+    )
+    predictions['test_spec'] = predictions['instance'].apply(make_test_spec)
+
+    # Prepare dataset
+    output_file = args.input_file.replace('.jsonl', '.swebench_eval.jsonl')
+    instances = prepare_dataset(predictions, output_file, args.eval_n_limit)
+
+    # If possible, load the relevant metadata to avoid issues with `run_evaluation`.
+    metadata: EvalMetadata | None = None
+    metadata_filepath = os.path.join(os.path.dirname(args.input_file), 'metadata.json')
+    if os.path.exists(metadata_filepath):
+        with open(metadata_filepath, 'r') as metadata_file:
+            data = metadata_file.read()
+            metadata = EvalMetadata.model_validate_json(data)
+    else:
+        # Initialize with a dummy metadata when file doesn't exist
+        metadata = EvalMetadata(
+            agent_class='dummy_agent',  # Placeholder agent class
+            llm_config=LLMConfig(model='dummy_model'),  # Minimal LLM config
+            max_iterations=1,  # Minimal iterations
+            eval_output_dir=os.path.dirname(
+                args.input_file
+            ),  # Use input file dir as output dir
+            start_time=time.strftime('%Y-%m-%d %H:%M:%S'),  # Current time
+            git_commit=subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+            .decode('utf-8')
+            .strip(),  # Current commit
+            dataset=args.dataset,  # Dataset name from args
+        )
+
+    # The evaluation harness constrains the signature of `process_instance_func` but we need to
+    # pass extra information. Build a new function object to avoid issues with multiprocessing.
+    process_instance_func = partial(
+        process_instance, log_dir=output_file.replace('.jsonl', '.logs')
+    )
+
+    run_evaluation(
+        instances,
+        metadata=metadata,
+        output_file=output_file,
+        num_workers=args.eval_num_workers,
+        process_instance_func=process_instance_func,
+    )
+
+    # Load evaluated predictions & print number of resolved predictions
+    evaluated_predictions = pd.read_json(output_file, lines=True)
+    fields = ['resolved', 'failed_apply_patch', 'error_eval', 'empty_generation']
+
+    def count_report_field(row, field):
+        # Pull one flag out of the nested per-instance report.
+        return row['test_result']['report'][field]
+
+    report = {}
+    for field in fields:
+        # Summing the per-row flags yields the number of rows where it is set.
+        count = evaluated_predictions.apply(
+            count_report_field, args=(field,), axis=1
+        ).sum()
+        report[field] = count
+        logger.info(
+            f'# {field}: {count} / {len(evaluated_predictions)}. ({count / len(evaluated_predictions):.2%})'
+        )
--- a/evaluation/benchmarks/multi_swe_bench/examples/config.json
+++ b/evaluation/benchmarks/multi_swe_bench/examples/config.json
@@ -0,0 +1,24 @@
+{
+    "mode": "evaluation",
+    "workdir": "./data/workdir",
+    "patch_files": [
+        "./data/patches/<your_patch_file>.jsonl"
+    ],
+    "dataset_files": [
+        "./data/patches/<to_evaluate_dataset_file>.jsonl"
+    ],
+    "force_build": false,
+    "output_dir": "./data/dataset",
+    "specifics": [],
+    "skips": [],
+    "repo_dir": "./data/repos",
+    "need_clone": false,
+    "global_env": [],
+    "clear_env": true,
+    "stop_on_error": true,
+    "max_workers": 8,
+    "max_workers_build_image": 8,
+    "max_workers_run_instance": 8,
+    "log_dir": "./data/logs",
+    "log_level": "DEBUG"
+}
--- a/evaluation/benchmarks/multi_swe_bench/examples/output.jsonl
+++ b/evaluation/benchmarks/multi_swe_bench/examples/output.jsonl
--- a/evaluation/benchmarks/multi_swe_bench/examples/patch.jsonl
+++ b/evaluation/benchmarks/multi_swe_bench/examples/patch.jsonl
@@ -0,0 +1,3 @@
+{"org": "ponylang", "repo": "ponyc", "number": "4595", "fix_patch": "diff --git a/src/libponyc/ast/parser.c b/src/libponyc/ast/parser.c\nindex 9852922f..2c37d6b8 100644\n--- a/src/libponyc/ast/parser.c\n+++ b/src/libponyc/ast/parser.c\n@@ -693,6 +693,7 @@ DEF(idseqsingle);\n   AST_NODE(TK_LET);\n   TOKEN(\"variable name\", TK_ID);\n   AST_NODE(TK_NONE);  // Type\n+  SET_FLAG(AST_FLAG_IN_PARENS);\n   DONE();\n \n // idseq"}
+{"org": "ponylang", "repo": "ponyc", "number": "4593", "fix_patch": "diff --git a/packages/cli/command_parser.pony b/packages/cli/command_parser.pony\nindex a5acce8e..fa97808b 100644\n--- a/packages/cli/command_parser.pony\n+++ b/packages/cli/command_parser.pony\n@@ -100,6 +100,7 @@ class CommandParser\n             | let cs: CommandSpec box =>\n               return CommandParser._sub(cs, this).\n                 _parse_command(tokens, options, args, envsmap, opt_stop)\n+// Correctly handle parent default options\n             end\n           else\n             return SyntaxError(token, \"unknown command\")"}
+{"org": "ponylang", "repo": "ponyc", "number": "4588", "fix_patch": "diff --git a/src/libponyc/expr/match.c b/src/libponyc/expr/match.c\nindex 7d16066f..c2ec7056 100644\n--- a/src/libponyc/expr/match.c\n+++ b/src/libponyc/expr/match.c\n@@ -314,8 +314,10 @@ static ast_t* make_pattern_type(pass_opt_t* opt, ast_t* pattern)\n     case TK_DONTCAREREF:\n     case TK_MATCH_CAPTURE:\n     case TK_MATCH_DONTCARE:\n+      if (ast_id(pattern_type) == TK_ISO) pattern_type = set_cap_and_ephemeral(pattern_type, TK_TRN, TK_EPHEMERAL);\n       return pattern_type;\n \n+\n     case TK_TUPLE:\n     {\n       ast_t* pattern_child = ast_child(pattern);"}
--- a/evaluation/benchmarks/multi_swe_bench/infer.sh
+++ b/evaluation/benchmarks/multi_swe_bench/infer.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+#
+# Runs the Multi-SWE-bench inference script once for every entry in MODELS,
+# passing the same settings each time. The MODELS, LANGUAGE and DATASET values
+# below look like placeholders; presumably they must be edited before use.
+
+
+BASE_SCRIPT="./evaluation/benchmarks/multi_swe_bench/scripts/run_infer.sh"
+
+MODELS=("aaa" "bbb" "ccc" "ddd" "fff")
+GIT_VERSION="HEAD"
+AGENT_NAME="CodeActAgent"
+EVAL_LIMIT="500"
+MAX_ITER="50"
+NUM_WORKERS="1"
+LANGUAGE="XXX"
+DATASET="XXX"
+
+
+for MODEL in "${MODELS[@]}"; do
+    echo "=============================="
+    echo "Running benchmark for MODEL: $MODEL"
+    echo "=============================="
+
+    # Report the real outcome instead of always claiming success. A failed run
+    # does not abort the loop, so the remaining models are still benchmarked.
+    if "$BASE_SCRIPT" \
+        "$MODEL" \
+        "$GIT_VERSION" \
+        "$AGENT_NAME" \
+        "$EVAL_LIMIT" \
+        "$MAX_ITER" \
+        "$NUM_WORKERS" \
+        "$DATASET" \
+        "$LANGUAGE"; then
+        echo "Completed $MODEL"
+    else
+        # In the else branch, $? still holds the exit status of the run script.
+        echo "Failed $MODEL (exit code $?)" >&2
+    fi
+done
--- a/evaluation/benchmarks/multi_swe_bench/resource/mapping.py
+++ b/evaluation/benchmarks/multi_swe_bench/resource/mapping.py
@@ -0,0 +1,39 @@
+"""Mapping instance_id to resource_factor.
+
+Different instances may have different resource requirements.
+e.g., some instances may require more memory/CPU to run inference.
+This file tracks the resource requirements of different instances.
+"""
+
+import json
+import os
+
+from openhands.core.logger import openhands_logger as logger
+
+# Directory that holds this module; the per-dataset JSON files are looked up here.
+CUR_DIR = os.path.dirname(os.path.abspath(__file__))
+# Fallback factor, overridable through the environment variable of the same name.
+DEFAULT_RUNTIME_RESOURCE_FACTOR = int(os.getenv('DEFAULT_RUNTIME_RESOURCE_FACTOR', 1))
+
+# Cache of loaded mappings, keyed by dataset name.
+_global_resource_mapping: dict[str, dict[str, float]] = {}
+
+
+def get_resource_mapping(dataset_name: str) -> dict[str, float] | None:
+    """Return the instance_id -> resource factor mapping for a dataset.
+
+    The mapping is read from `<dataset_name>.json` located next to this module
+    and cached in `_global_resource_mapping`, so a given file is parsed at most
+    once per process.
+
+    Args:
+        dataset_name: Base name (without the `.json` suffix) of the mapping file.
+
+    Returns:
+        The loaded mapping, or None when no mapping file exists for the dataset.
+    """
+    if dataset_name not in _global_resource_mapping:
+        file_path = os.path.join(CUR_DIR, f'{dataset_name}.json')
+        if not os.path.exists(file_path):
+            # A missing file is not cached, so the lookup (and this warning)
+            # repeats on every call for that dataset.
+            logger.warning(f'Resource mapping for {dataset_name} not found.')
+            return None
+
+        with open(file_path, 'r') as f:
+            _global_resource_mapping[dataset_name] = json.load(f)
+        logger.info(f'Loaded resource mapping for {dataset_name}')
+    return _global_resource_mapping[dataset_name]
+
+
+def get_instance_resource_factor(dataset_name: str, instance_id: str) -> int:
+    """Look up the resource factor for one instance of a dataset.
+
+    Falls back to DEFAULT_RUNTIME_RESOURCE_FACTOR when the dataset has no
+    mapping at all or when the mapping has no entry for `instance_id`.
+    """
+    mapping = get_resource_mapping(dataset_name)
+    factor = (
+        DEFAULT_RUNTIME_RESOURCE_FACTOR
+        if mapping is None
+        else mapping.get(instance_id, DEFAULT_RUNTIME_RESOURCE_FACTOR)
+    )
+    return int(factor)
--- a/evaluation/benchmarks/multi_swe_bench/run_infer.py
+++ b/evaluation/benchmarks/multi_swe_bench/run_infer.py
@@ -0,0 +1,847 @@
+import asyncio
+import json
+import os
+import tempfile
+from typing import Any
+
+import pandas as pd
+import toml
+from datasets import load_dataset
+
+import openhands.agenthub
+from evaluation.benchmarks.swe_bench.resource.mapping import (
+    get_instance_resource_factor,
+)
+from evaluation.utils.shared import (
+    EvalException,
+    EvalMetadata,
+    EvalOutput,
+    assert_and_raise,
+    codeact_user_response,
+    get_default_sandbox_config_for_eval,
+    get_metrics,
+    is_fatal_evaluation_error,
+    make_metadata,
+    prepare_dataset,
+    reset_logger_for_multiprocessing,
+    run_evaluation,
+    update_llm_config_for_completions_logging,
+)
+from openhands.controller.state.state import State
+from openhands.core.config import (
+    AgentConfig,
+    AppConfig,
+    get_llm_config_arg,
+    get_parser,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import CmdRunAction, FileReadAction, MessageAction
+from openhands.events.observation import CmdOutputObservation, ErrorObservation
+from openhands.events.serialization.event import event_to_dict
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
+from openhands.utils.shutdown_listener import sleep_if_should_continue
+
+# Boolean switches read from the environment; a switch is on only when its
+# variable equals "true" (compared case-insensitively).
+USE_HINT_TEXT = os.getenv('USE_HINT_TEXT', 'false').lower() == 'true'
+USE_INSTANCE_IMAGE = os.getenv('USE_INSTANCE_IMAGE', 'true').lower() == 'true'
+RUN_WITH_BROWSING = os.getenv('RUN_WITH_BROWSING', 'false').lower() == 'true'
+
+# TODO: migrate all swe-bench docker to ghcr.io/openhands
+# TODO: adapt to all languages
+DOCKER_IMAGE_PREFIX = os.getenv('EVAL_DOCKER_IMAGE_PREFIX', '')
+LANGUAGE = os.getenv('LANGUAGE', 'python')
+logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')
+
+
+# Maps an agent class name to the function used as its fake user response.
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+}
+
+
+def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:
+    """Build the workspace directory name from the instance's repo and version.
+
+    The two values are joined with a double underscore, and every `/` in the
+    result is replaced by a double underscore as well.
+    """
+    raw_name = f'{instance.repo}__{instance.version}'
+    return raw_name.replace('/', '__')
+
+
+def get_instruction(instance: pd.Series, metadata: EvalMetadata):
+    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
+    # Prepare instruction
+
+    # Instruction based on Anthropic's official trajectory
+    # https://github.com/eschluntz/swe-bench-experiments/tree/main/evaluation/verified/20241022_tools_claude-3-5-sonnet-updated/trajs
+    instructions = {
+        'python': (
+            '<uploaded_files>\n'
+            f'/workspace/{workspace_dir_name}\n'
+            '</uploaded_files>\n'
+            f"I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following issue description:\n\n"
+            f'<issue_description>\n'
+            f'{instance.problem_statement}\n'
+            '</issue_description>\n\n'
+            'Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n'
+            "I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n"
+            "Also the development Python environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.\n"
+            'Your task is to make the minimal changes to non-test files in the /workspace directory to ensure the <issue_description> is satisfied.\n'
+            'Follow these steps to resolve the issue:\n'
+            '1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.\n'
+            '2. Create a script to reproduce the error and execute it with `python <filename.py>` using the BashTool, to confirm the error.\n'
+            '3. Edit the sourcecode of the repo to resolve the issue.\n'
+            '4. Rerun your reproduce script and confirm that the error is fixed!\n'
+            '5. Think about edgecases, add comprehensive tests for them in your reproduce script, and run them to make sure your fix handles them as well.\n'
+            f'6. Once you are done with the initial implementation, please carefully re-read the problem description and check the difference between the current code and the base commit {instance["base_commit"]}. Do you think that the issue has been completely and comprehensively solved? Write tests to check the correctness of the solution, specifically focusing on tests that may point out any remaining problems that are not yet solved. Run all of the tests in the repo and check if any of them fail, and if they do fix the code. Repeat this process of carefully reading the problem description and current implementation, testing, and fixing any problems until you are confident that the current implementation is correct. Find and run any tests in the repo that are related to:\n'
+            '   - The issue you are fixing\n'
+            '   - The files you modified\n'
+            '   - The functions you changed\n'
+            '   Make sure all these tests pass with your changes.\n'
+            "Your thinking should be thorough and so it's fine if it's very long.\n"
+        ),
+        'java': (
+            '<uploaded_files>\n'
+            f'/workspace/{workspace_dir_name}\n'
+            '</uploaded_files>\n'
+            f"I've uploaded a Java code repository in the directory {workspace_dir_name}. Consider the following issue description:\n\n"
+            f'<issue_description>\n'
+            f'{instance.problem_statement}\n'
+            '</issue_description>\n\n'
+            "Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n"
+            "I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n"
+            "Also the development Java environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.\n"
+            "Your task is to make the minimal changes to non-test files in the /workspace directory to ensure the <issue_description> is satisfied.\n"
+            "Follow these steps to resolve the issue:\n"
+            "1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.\n"
+            '2. Create a Java class to reproduce the error and execute it by first compiling with `javac <classname>.java` and then running with `java <classname>` using the BashTool, to confirm the error\n'
+            "3. Edit the sourcecode of the repo to resolve the issue.\n"
+            "4. Rerun your reproduce script or class and confirm that the error is fixed!\n"
+            "5. Think about edgecases, add comprehensive tests for them in your reproduce class or script, and run them to make sure your fix handles these cases as well.\n"
+            f"6. Once you are done with the initial implementation, please carefully re-read the problem description and check the difference between the current code and the base commit {instance['base_commit']}. Do you think that the issue has been completely and comprehensively solved? Write tests to check the correctness of the solution, specifically focusing on tests that may point out any remaining problems that are not yet solved. Run all of the tests in the repo and check if any of them fail, and if they do fix the code. Repeat this process of carefully reading the problem description and current implementation, testing, and fixing any problems until you are confident that the current implementation is correct. Find and run any tests in the repo that are related to:\n"
+            "   - The issue you are fixing\n"
+            "   - The files you modified\n"
+            "   - The functions or classes you changed\n"
+            "   Make sure all these tests pass with your changes.\n"
+            "Your thinking should be thorough and so it's fine if it's very long.\n"
+        ),
+        'go': (
+            '<uploaded_files>\n'
+            f'/workspace/{workspace_dir_name}\n'
+            '</uploaded_files>\n'
+            f"I've uploaded a Go code repository in the directory {workspace_dir_name}. Consider the following issue description:\n\n"
+            f'<issue_description>\n'
+            f'{instance.problem_statement}\n'
+            '</issue_description>\n\n'
+            'Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n'
+            "I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n"
+            "Also the development Go environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.\n"
+            'Your task is to make the minimal changes to non-test files in the /workspace directory to ensure the <issue_description> is satisfied.\n'
+            'Follow these steps to resolve the issue:\n'
+            '1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.\n'
+            '2. Create a script or a function to reproduce the error and execute it with `go run <filename.go>` using the BashTool, to confirm the error.\n'
+            '3. Edit the sourcecode of the repo to resolve the issue.\n'
+            '4. Rerun your reproduce script and confirm that the error is fixed!\n'
+            '5. Think about edgecases, add comprehensive tests for them in your reproduce script, and run them to make sure your fix handles them as well.\n'
+            f'6. Once you are done with the initial implementation, please carefully re-read the problem description and check the difference between the current code and the base commit {instance["base_commit"]}. Do you think that the issue has been completely and comprehensively solved? Write tests to check the correctness of the solution, specifically focusing on tests that may point out any remaining problems that are not yet solved. Run all of the tests in the repo and check if any of them fail, and if they do fix the code. Repeat this process of carefully reading the problem description and current implementation, testing, and fixing any problems until you are confident that the current implementation is correct. Find and run any tests in the repo that are related to:\n'
+            '   - The issue you are fixing\n'
+            '   - The files you modified\n'
+            '   - The functions you changed\n'
+            '   Make sure all these tests pass with your changes.\n'
+            "Your thinking should be thorough and so it's fine if it's very long.\n"
+        ),
+        'c': (
+            '<uploaded_files>\n'
+            f'/workspace/{workspace_dir_name}\n'
+            '</uploaded_files>\n'
+            f"I've uploaded a C code repository in the directory {workspace_dir_name}. Consider the following issue description:\n\n"
+            f'<issue_description>\n'
+            f'{instance.problem_statement}\n'
+            '</issue_description>\n\n'
+            'Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n'
+            "I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n"
+            "Also the development C environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.\n"
+            'Your task is to make the minimal changes to non-test files in the /workspace directory to ensure the <issue_description> is satisfied.\n'
+            'Follow these steps to resolve the issue:\n'
+            '1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.\n'
+            '2. Create a script to reproduce the error by compiling your C code (for example, using `gcc <filename.c> -o <executable>`) and then running the executable using the BashTool, to confirm the error.\n'
+            '3. Edit the sourcecode of the repo to resolve the issue.\n'
+            '4. Rerun your reproduce script and confirm that the error is fixed!\n'
+            '5. Think about edgecases, add comprehensive tests for them in your reproduce script, and run them to make sure your fix handles them as well.\n'
+            f'6. Once you are done with the initial implementation, please carefully re-read the problem description and check the difference between the current code and the base commit {instance["base_commit"]}. Do you think that the issue has been completely and comprehensively solved? Write tests to check the correctness of the solution, specifically focusing on tests that may point out any remaining problems that are not yet solved. Run all of the tests in the repo and check if any of them fail, and if they do fix the code. Repeat this process of carefully reading the problem description and current implementation, testing, and fixing any problems until you are confident that the current implementation is correct. Find and run any tests in the repo that are related to:\n'
+            '   - The issue you are fixing\n'
+            '   - The files you modified\n'
+            '   - The functions you changed\n'
+            '   Make sure all these tests pass with your changes.\n'
+            "Your thinking should be thorough and so it's fine if it's very long.\n"
+        ),
+        'cpp': (
+            '<uploaded_files>\n'
+            f'/workspace/{workspace_dir_name}\n'
+            '</uploaded_files>\n'
+            f"I've uploaded a C++ code repository in the directory {workspace_dir_name}. Consider the following issue description:\n\n"
+            f'<issue_description>\n'
+            f'{instance.problem_statement}\n'
+            '</issue_description>\n\n'
+            'Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n'
+            "I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n"
+            "Also the development C++ environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.\n"
+            'Your task is to make the minimal changes to non-test files in the /workspace directory to ensure the <issue_description> is satisfied.\n'
+            'Follow these steps to resolve the issue:\n'
+            '1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.\n'
+            '2. Create or adapt a small executable (e.g., a main file or a test driver) to reproduce the issue. Build and run it (for example, by using `g++ -o reproduce reproduce.cpp && ./reproduce` via the BashTool) to confirm the error.\n'
+            '3. Edit the sourcecode of the repo to resolve the issue.\n'
+            '4. Rerun your reproduce script and confirm that the error is fixed!\n'
+            '5. Think about edgecases, add comprehensive tests for them in your reproduce script, and run them to make sure your fix handles them as well.\n'
+            f'6. Once you are done with the initial implementation, please carefully re-read the problem description and check the difference between the current code and the base commit {instance["base_commit"]}. Do you think that the issue has been completely and comprehensively solved? Write tests to check the correctness of the solution, specifically focusing on tests that may point out any remaining problems that are not yet solved. Run all of the tests in the repo and check if any of them fail, and if they do fix the code. Repeat this process of carefully reading the problem description and current implementation, testing, and fixing any problems until you are confident that the current implementation is correct. Find and run any tests in the repo that are related to:\n'
+            '   - The issue you are fixing\n'
+            '   - The files you modified\n'
+            '   - The functions you changed\n'
+            '   Make sure all these tests pass with your changes.\n'
+            "Your thinking should be thorough and so it's fine if it's very long.\n"
+        ),
+        'javascript': (
+            '<uploaded_files>\n'
+            f'/workspace/{workspace_dir_name}\n'
+            '</uploaded_files>\n'
+            f"I've uploaded a Javascript code repository in the directory {workspace_dir_name}. Consider the following issue description:\n\n"
+            f'<issue_description>\n'
+            f'{instance.problem_statement}\n'
+            '</issue_description>\n\n'
+            'Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n'
+            "I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n"
+            "Also the development Javascript environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.\n"
+            'Your task is to make the minimal changes to non-test files in the /workspace directory to ensure the <issue_description> is satisfied.\n'
+            'Follow these steps to resolve the issue:\n'
+            '1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.\n'
+            '2. Create a script to reproduce the error and execute it with `node <filename.js>` using the BashTool, to confirm the error.\n'
+            '3. Edit the sourcecode of the repo to resolve the issue.\n'
+            '4. Rerun your reproduce script and confirm that the error is fixed!\n'
+            '5. Think about edgecases, add comprehensive tests for them in your reproduce script, and run them to make sure your fix handles them as well.\n'
+            f'6. Once you are done with the initial implementation, please carefully re-read the problem description and check the difference between the current code and the base commit {instance["base_commit"]}. Do you think that the issue has been completely and comprehensively solved? Write tests to check the correctness of the solution, specifically focusing on tests that may point out any remaining problems that are not yet solved. Run all of the tests in the repo and check if any of them fail, and if they do fix the code. Repeat this process of carefully reading the problem description and current implementation, testing, and fixing any problems until you are confident that the current implementation is correct. Find and run any tests in the repo that are related to:\n'
+            '   - The issue you are fixing\n'
+            '   - The files you modified\n'
+            '   - The functions you changed\n'
+            '   Make sure all these tests pass with your changes.\n'
+            "Your thinking should be thorough and so it's fine if it's very long.\n"
+        ),
+        'typescript': (
+            '<uploaded_files>\n'
+            f'/workspace/{workspace_dir_name}\n'
+            '</uploaded_files>\n'
+            f"I've uploaded a Typescript code repository in the directory {workspace_dir_name}. Consider the following issue description:\n\n"
+            f'<issue_description>\n'
+            f'{instance.problem_statement}\n'
+            '</issue_description>\n\n'
+            'Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n'
+            "I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n"
+            "Also the development Typescript environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.\n"
+            'Your task is to make the minimal changes to non-test files in the /workspace directory to ensure the <issue_description> is satisfied.\n'
+            'Follow these steps to resolve the issue:\n'
+            '1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.\n'
+            '2. Create a script to reproduce the error and execute it with `ts-node <filename.ts>` using the BashTool, to confirm the error.\n'
+            '3. Edit the sourcecode of the repo to resolve the issue.\n'
+            '4. Rerun your reproduce script and confirm that the error is fixed!\n'
+            '5. Think about edgecases, add comprehensive tests for them in your reproduce script, and run them to make sure your fix handles them as well.\n'
+            f'6. Once you are done with the initial implementation, please carefully re-read the problem description and check the difference between the current code and the base commit {instance["base_commit"]}. Do you think that the issue has been completely and comprehensively solved? Write tests to check the correctness of the solution, specifically focusing on tests that may point out any remaining problems that are not yet solved. Run all of the tests in the repo and check if any of them fail, and if they do fix the code. Repeat this process of carefully reading the problem description and current implementation, testing, and fixing any problems until you are confident that the current implementation is correct. Find and run any tests in the repo that are related to:\n'
+            '   - The issue you are fixing\n'
+            '   - The files you modified\n'
+            '   - The functions you changed\n'
+            '   Make sure all these tests pass with your changes.\n'
+            "Your thinking should be thorough and so it's fine if it's very long.\n"
+        ),
+        'rust': (
+            '<uploaded_files>\n'
+            f'/workspace/{workspace_dir_name}\n'
+            '</uploaded_files>\n'
+            f"I've uploaded a Rust code repository in the directory {workspace_dir_name}. Consider the following issue description:\n\n"
+            f'<issue_description>\n'
+            f'{instance.problem_statement}\n'
+            '</issue_description>\n\n'
+            'Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n'
+            "I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n"
+            "Also the development Rust environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.\n"
+            'Your task is to make the minimal changes to non-test files in the /workspace directory to ensure the <issue_description> is satisfied.\n'
+            'Follow these steps to resolve the issue:\n'
+            '1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.\n'
+            '2. Create a reproduction script (or binary) that triggers the error and execute it with `cargo run --bin <filename>` using the BashTool, to confirm the error.\n'
+            '3. Edit the sourcecode of the repo to resolve the issue.\n'
+            '4. Rerun your reproduce script and confirm that the error is fixed!\n'
+            '5. Think about edgecases, add comprehensive tests for them in your reproduce script, and run them to make sure your fix handles them as well.\n'
+            f'6. Once you are done with the initial implementation, please carefully re-read the problem description and check the difference between the current code and the base commit {instance["base_commit"]}. Do you think that the issue has been completely and comprehensively solved? Write tests to check the correctness of the solution, specifically focusing on tests that may point out any remaining problems that are not yet solved. Run all of the tests in the repo and check if any of them fail, and if they do fix the code. Repeat this process of carefully reading the problem description and current implementation, testing, and fixing any problems until you are confident that the current implementation is correct. Find and run any tests in the repo that are related to:\n'
+            '   - The issue you are fixing\n'
+            '   - The files you modified\n'
+            '   - The functions you changed\n'
+            '   Make sure all these tests pass with your changes.\n'
+            "Your thinking should be thorough and so it's fine if it's very long.\n"
+        ),
+    }
+    instruction = instructions.get(LANGUAGE.lower())
+
+    if instruction and RUN_WITH_BROWSING:
+        instruction += (
+            '<IMPORTANT!>\n'
+            'You SHOULD NEVER attempt to browse the web. '
+            '</IMPORTANT!>\n'
+        )
+    return instruction
+
+
+# TODO: adapt this to all languages
+# def get_instance_docker_image(instance_id: str) -> str:
+#     image_name = 'sweb.eval.x86_64.' + instance_id
+#     if LANGUAGE == 'python':
+#         image_name = image_name.replace(
+#             '__', '_s_'
+#         )  # to comply with docker image naming convention
+#         return (DOCKER_IMAGE_PREFIX.rstrip('/') + '/' + image_name).lower()
+#     else:
+#         return image_name.lower() ## load the local image
+def get_instance_docker_image(instance: pd.Series):
+    """Return the docker image reference used as the sandbox base for `instance`.
+
+    When LANGUAGE is 'python' the name is built from the instance id and placed
+    under DOCKER_IMAGE_PREFIX. For every other language it is built from the
+    instance's `repo` and the last '-'-separated piece of its `instance_id`.
+    """
+    if LANGUAGE != 'python':
+        # '/' in the repo name becomes '_m_'.
+        repo_slug = instance.get('repo', '').lower().replace('/', '_m_')
+        full_id = instance.get('instance_id', '')
+        # Presumably the pull-request number -- TODO confirm against the dataset.
+        id_tail = full_id.split('-')[-1] if full_id else ''
+        return f'mswebench/{repo_slug}:pr-{id_tail}'
+
+    # '__' -> '_s_' to comply with docker image naming convention
+    python_image = ('sweb.eval.x86_64.' + instance['instance_id']).replace('__', '_s_')
+    return (DOCKER_IMAGE_PREFIX.rstrip('/') + '/' + python_image).lower()
+
+
+def get_config(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+) -> AppConfig:
+    """Assemble the AppConfig used to evaluate one instance.
+
+    The sandbox base image is the per-instance image when USE_INSTANCE_IMAGE is
+    set, otherwise the shared swe-bench image. The LLM and agent settings are
+    taken from `metadata`.
+    """
+    if not USE_INSTANCE_IMAGE:
+        image = 'ghcr.io/opendevin/eval-swe-bench:full-v1.2.1'
+        logger.info(f'Using swe-bench container image: {image}')
+    else:
+        # Each instance is evaluated in its own image.
+        image = get_instance_docker_image(instance)
+        logger.info(
+            f'Using instance container image: {image}. '
+            'Please make sure this image exists. '
+            'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
+        )
+
+    sandbox = get_default_sandbox_config_for_eval()
+    sandbox.base_container_image = image
+    sandbox.enable_auto_lint = True
+    sandbox.use_host_network = False
+    # Add platform to the sandbox config to solve issue 4401
+    sandbox.platform = 'linux/amd64'
+    sandbox.remote_runtime_resource_factor = get_instance_resource_factor(
+        dataset_name=metadata.dataset,
+        instance_id=instance['instance_id'],
+    )
+
+    app_config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_openhands=False,
+        max_iterations=metadata.max_iterations,
+        runtime=os.environ.get('RUNTIME', 'docker'),
+        sandbox=sandbox,
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    llm_config = update_llm_config_for_completions_logging(
+        metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
+    )
+    app_config.set_llm_config(llm_config)
+    app_config.set_agent_config(
+        AgentConfig(
+            enable_jupyter=False,
+            enable_browsing=RUN_WITH_BROWSING,
+            enable_llm_editor=False,
+            condenser=metadata.condenser_config,
+            enable_prompt_extensions=False,
+        )
+    )
+    return app_config
+
+
+def _run_setup_command(
+    runtime: Runtime,
+    command: str,
+    failure_msg: str,
+    timeout: int = 600,
+) -> None:
+    """Run `command` in the sandbox, log it, and raise unless it exits with code 0.
+
+    Args:
+        runtime: Runtime whose `run_action` executes the command.
+        command: Shell command line to run.
+        failure_msg: Prefix of the error message handed to `assert_and_raise`.
+        timeout: Value passed to the action's `set_hard_timeout`.
+    """
+    action = CmdRunAction(command=command)
+    action.set_hard_timeout(timeout)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    # Check the observation type before reading `exit_code`, so that a
+    # non-command observation is reported through assert_and_raise instead of
+    # failing on the attribute lookup.
+    assert_and_raise(
+        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
+        f'{failure_msg}: {str(obs)}',
+    )
+
+
+def initialize_runtime(
+    runtime: Runtime,
+    instance: pd.Series,
+):
+    """Initialize the runtime for the agent.
+
+    This function is called before the runtime is used to run the agent. It
+    exports instance variables into ~/.bashrc, sources the swe entry script
+    (injecting the instance data first when USE_INSTANCE_IMAGE is set), then
+    changes into the instance workspace, hard-resets the checkout and removes
+    every git remote. Any failing step raises through `assert_and_raise`.
+
+    Args:
+        runtime: Connected runtime in which the setup commands are run.
+        instance: Dataset row; its `repo` and `instance_id` are read here.
+    """
+    logger.info('-' * 30)
+    logger.info('BEGIN Runtime Initialization Fn')
+    logger.info('-' * 30)
+    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
+
+    REPO_NAME = instance['repo'].split('/')[-1]
+    # Set instance id
+    _run_setup_command(
+        runtime,
+        f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc && echo 'export REPO_NAME={REPO_NAME}' >> ~/.bashrc""",
+        'Failed to export SWE_INSTANCE_ID',
+    )
+    _run_setup_command(
+        runtime,
+        """export USER=$(whoami); echo USER=${USER} """,
+        'Failed to export USER',
+    )
+
+    if USE_INSTANCE_IMAGE:
+        # inject the init script
+        script_dir = os.path.dirname(__file__)
+
+        # inject the instance info
+        _run_setup_command(
+            runtime,
+            'mkdir -p /swe_util/eval_data/instances',
+            'Failed to create /swe_util/eval_data/instances',
+        )
+
+        swe_instance_json_name = 'swe-bench-instance.json'
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Construct the full path for the desired file name within the temporary directory
+            temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
+            # Write to the file with the desired name within the temporary directory
+            with open(temp_file_path, 'w') as f:
+                if not isinstance(instance, dict):
+                    json.dump([instance.to_dict()], f)
+                else:
+                    json.dump([instance], f)
+
+            # Copy the file to the desired location
+            runtime.copy_to(temp_file_path, '/swe_util/eval_data/instances/')
+
+        # inject the instance swe entry
+        runtime.copy_to(
+            str(os.path.join(script_dir, 'scripts/setup/instance_swe_entry.sh')),
+            '/swe_util/',
+        )
+        # Printing ~/.bashrc puts the exported variables into the log.
+        _run_setup_command(runtime, 'cat ~/.bashrc', 'Failed to cat ~/.bashrc')
+        _run_setup_command(runtime, 'source ~/.bashrc', 'Failed to source ~/.bashrc')
+        _run_setup_command(
+            runtime,
+            'source /swe_util/instance_swe_entry.sh',
+            'Failed to source /swe_util/instance_swe_entry.sh',
+        )
+    else:
+        # This entry script gets a longer hard timeout than the other steps.
+        _run_setup_command(
+            runtime,
+            'source /swe_util/swe_entry.sh',
+            'Failed to source /swe_util/swe_entry.sh',
+            timeout=1800,
+        )
+
+    _run_setup_command(
+        runtime,
+        f'cd /workspace/{workspace_dir_name}',
+        f'Failed to cd to /workspace/{workspace_dir_name}',
+    )
+    _run_setup_command(runtime, 'git reset --hard', 'Failed to git reset --hard')
+    _run_setup_command(
+        runtime,
+        'for remote_name in $(git remote); do git remote remove "${remote_name}"; done',
+        'Failed to remove git remotes',
+    )
+    # TODO: decide whether the environment of the other languages needs to be
+    # verified here. The Python-only check (`which python` resolving inside
+    # the testbed) is disabled.
+
+    logger.info('-' * 30)
+    logger.info('END Runtime Initialization Fn')
+    logger.info('-' * 30)
+
+
+def complete_runtime(
+    runtime: Runtime,
+    instance: pd.Series,  # provides the workspace dir name and the base commit
+) -> dict[str, Any]:
+    """Collect the agent's changes from the sandbox as a git patch.
+
+    This function is called after the agent has run. It stages every change in
+    the instance workspace, removes changed files that look like executables or
+    are marked binary, writes the staged diff against the instance's
+    `base_commit` to `patch.diff`, and reads that file back.
+    If you need to do something in the sandbox to get the correctness metric after
+    the agent has run, modify this function.
+
+    Returns:
+        A dict with the single key `git_patch` holding the content of `patch.diff`.
+
+    Raises:
+        Whatever `assert_and_raise` raises when a step fails, including when the
+        diff could not be written after all retries or `patch.diff` could not
+        be read.
+    """
+    logger.info('-' * 30)
+    logger.info('BEGIN Runtime Completion Fn')
+    logger.info('-' * 30)
+    obs: CmdOutputObservation
+    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
+
+    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
+    action.set_hard_timeout(600)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+    if obs.exit_code == -1:
+        # The previous command is still running
+        # We need to kill previous command
+        logger.info('The previous command is still running, trying to kill it...')
+        action = CmdRunAction(command='C-c')
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+        # Then run the command again
+        action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
+        action.set_hard_timeout(600)
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+    assert_and_raise(
+        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
+        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
+    )
+
+    action = CmdRunAction(command='git config --global core.pager ""')
+    action.set_hard_timeout(600)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
+        f'Failed to git config --global core.pager "": {str(obs)}',
+    )
+
+    action = CmdRunAction(command='git add -A')
+    action.set_hard_timeout(600)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
+        f'Failed to git add -A: {str(obs)}',
+    )
+
+    # Remove binary files: every modified/added/untracked path that `file`
+    # reports as an executable or that git attributes mark as binary.
+    action = CmdRunAction(
+        command="""
+        for file in $(git status --porcelain | grep -E "^(M| M|\\?\\?|A| A)" | cut -c4-); do
+            if [ -f "$file" ] && (file "$file" | grep -q "executable" || git check-attr binary "$file" | grep -q "binary: set"); then
+                git rm -f "$file" 2>/dev/null || rm -f "$file"
+                echo "Removed: $file"
+            fi
+        done
+        """
+    )
+    action.set_hard_timeout(600)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
+        f'Failed to remove binary files: {str(obs)}',
+    )
+
+    n_retries = 0
+    diff_written = False
+    while n_retries < 5:
+        # The diff is redirected to a file and read back below instead of
+        # being taken from the command output.
+        action = CmdRunAction(
+            command=f'git diff --no-color --cached {instance["base_commit"]} > patch.diff'
+        )
+        action.set_hard_timeout(max(300 + 100 * n_retries, 600))
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        n_retries += 1
+        if isinstance(obs, CmdOutputObservation):
+            if obs.exit_code == 0:
+                diff_written = True
+                break
+            else:
+                logger.info('Failed to get git diff, retrying...')
+                sleep_if_should_continue(10)
+        elif isinstance(obs, ErrorObservation):
+            logger.error(f'Error occurred: {obs.content}. Retrying...')
+            sleep_if_should_continue(10)
+        else:
+            assert_and_raise(False, f'Unexpected observation type: {str(obs)}')
+
+    # Without this check a stale or partial patch.diff would be returned as
+    # if the diff had succeeded.
+    assert_and_raise(
+        diff_written, f'Failed to get git diff after {n_retries} attempts'
+    )
+
+    action = FileReadAction(path='patch.diff')
+    action.set_hard_timeout(max(300 + 100 * n_retries, 600))
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    # An error observation carries an error message in `content`; it must not
+    # be mistaken for the patch.
+    assert_and_raise(
+        not isinstance(obs, ErrorObservation),
+        f'Failed to read patch.diff: {str(obs)}',
+    )
+    git_patch = obs.content
+
+    assert_and_raise(git_patch is not None, 'Failed to get git diff (None)')
+
+    logger.info('-' * 30)
+    logger.info('END Runtime Completion Fn')
+    logger.info('-' * 30)
+    return {'git_patch': git_patch}
+
+
+def _remove_binary_diffs(patch_text):
+    """Drop every per-file section of a git patch that reports a binary file.
+
+    A section starts at a `diff --git ` line and runs up to the next one. A
+    section is dropped when it contains a line starting with `Binary files `;
+    everything else, including any text before the first `diff --git ` line,
+    is kept. The kept lines are joined with newlines, without a trailing one.
+    """
+    kept_lines = []
+    section = []
+    section_is_binary = False
+
+    for line in patch_text.splitlines():
+        if line.startswith('diff --git '):
+            # A new file section begins: flush the previous one if it is text.
+            if section and not section_is_binary:
+                kept_lines.extend(section)
+            section = [line]
+            section_is_binary = False
+            continue
+        # Match only at the start of the line. Added, removed and context
+        # lines of a text diff start with '+', '-' or ' ', so file content
+        # that merely mentions "Binary files" does not discard its section.
+        if line.startswith('Binary files '):
+            section_is_binary = True
+        section.append(line)
+
+    if section and not section_is_binary:
+        kept_lines.extend(section)
+    return '\n'.join(kept_lines)
+
+
+def process_instance(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+    reset_logger: bool = True,
+    runtime_failure_count: int = 0,
+) -> EvalOutput:
+    """Run the agent on one instance and return its evaluation record.
+
+    Args:
+        instance: Dataset row describing the task.
+        metadata: Evaluation settings (agent class, LLM config, output dir, ...).
+        reset_logger: When true, the logger is re-targeted to a per-instance
+            file under `<eval_output_dir>/infer_logs`.
+        runtime_failure_count: Number of earlier runtime failures for this
+            instance; a positive value raises the remote runtime resource
+            factor (doubling per failure, capped at 8).
+
+    Returns:
+        An EvalOutput whose `test_result['git_patch']` is the agent's patch
+        with binary file sections removed.
+
+    Raises:
+        ValueError: If the controller returns no state.
+        EvalException: If the final state carries a fatal evaluation error.
+    """
+    config = get_config(instance, metadata)
+
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance.instance_id}.')
+
+    # Increase resource_factor with increasing attempt_id
+    if runtime_failure_count > 0:
+        config.sandbox.remote_runtime_resource_factor = min(
+            config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
+            8,
+        )
+        logger.warning(
+            f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
+        )
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
+
+    try:
+        initialize_runtime(runtime, instance)
+
+        instruction = get_instruction(instance, metadata)
+
+        # Here's how you can run the agent (similar to the `main` function) and get the final task state
+        state: State | None = asyncio.run(
+            run_controller(
+                config=config,
+                initial_user_action=MessageAction(content=instruction),
+                runtime=runtime,
+                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
+                    metadata.agent_class
+                ],
+            )
+        )
+
+        # Checked before the first use of `state`; a missing state would
+        # otherwise fail on the attribute access just below.
+        if state is None:
+            raise ValueError('State should not be None.')
+
+        # if fatal error, throw EvalError to trigger re-run
+        if is_fatal_evaluation_error(state.last_error):
+            raise EvalException('Fatal error detected: ' + state.last_error)
+
+        # ======= THIS IS SWE-Bench specific =======
+        # Get git patch
+        return_val = complete_runtime(runtime, instance)
+        git_patch = return_val['git_patch']
+        logger.info(
+            f'Got git diff for instance {instance.instance_id}:\n--------\n{git_patch}\n--------'
+        )
+    finally:
+        runtime.close()
+    # ==========================================
+
+    # ======= Attempt to evaluate the agent's edits =======
+    # we use eval_infer.sh to evaluate the agent's edits, not here
+    # because the agent may alter the environment / testcases
+    git_patch = _remove_binary_diffs(git_patch)
+    test_result = {
+        'git_patch': git_patch,
+    }
+
+    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+
+    # NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events
+    histories = [event_to_dict(event) for event in state.history]
+    metrics = get_metrics(state)
+
+    # Save the output
+    output = EvalOutput(
+        instance_id=instance.instance_id,
+        instruction=instruction,
+        instance=instance.to_dict(),  # SWE Bench specific
+        test_result=test_result,
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state.last_error else None,
+    )
+    return output
+
+
+def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
+    """Restrict `dataset` to the tasks selected for this run.
+
+    If a `config.toml` next to this file defines `selected_ids`, only rows
+    whose `filter_column` value is listed there are kept. Otherwise rows whose
+    `filter_column` value appears in the comma-separated `SKIP_IDS` environment
+    variable are dropped. With neither set, `dataset` is returned unchanged.
+
+    Args:
+        dataset: Tasks to filter.
+        filter_column: Name of the column holding the task ids.
+
+    Returns:
+        The filtered frame, or `dataset` itself when no filter applies.
+    """
+    file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.toml')
+    if os.path.exists(file_path):
+        with open(file_path, 'r') as file:
+            data = toml.load(file)
+        if 'selected_ids' in data:
+            selected_ids = data['selected_ids']
+            logger.info(f'Filtering {len(selected_ids)} tasks from "selected_ids"...')
+            subset = dataset[dataset[filter_column].isin(selected_ids)]
+            logger.info(f'Retained {subset.shape[0]} tasks after filtering')
+            return subset
+    # ''.split(',') yields [''], so empty entries are dropped; otherwise an
+    # unset SKIP_IDS would still count as one id to skip.
+    skip_ids = [
+        skip_id.strip()
+        for skip_id in os.environ.get('SKIP_IDS', '').split(',')
+        if skip_id.strip()
+    ]
+    if skip_ids:
+        logger.info(f'Filtering {len(skip_ids)} tasks from "SKIP_IDS"...')
+        return dataset[~dataset[filter_column].isin(skip_ids)]
+    return dataset
+
+
+# Script entry point: parse the arguments, load and filter the dataset, build
+# the run metadata, and hand every remaining instance to `process_instance`.
+if __name__ == '__main__':
+    # pdb.set_trace()
+    parser = get_parser()
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        default='princeton-nlp/SWE-bench',
+        help='data set to evaluate on, either full-test or lite-test',
+    )
+    parser.add_argument(
+        '--split',
+        type=str,
+        default='test',
+        help='split to evaluate on',
+    )
+    args, _ = parser.parse_known_args()
+
+    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
+    # so we don't need to manage file uploading to OpenHands's repo
+    # dataset = load_dataset(args.dataset, split=args.split)
+    # dataset = load_dataset(args.dataset)
+    # `--dataset` is passed as `data_files` to the JSON loader here, i.e. it is
+    # used as a data file location rather than a hub dataset name.
+    # NOTE(review): the `--dataset` default and help text above still describe
+    # hub dataset names -- confirm and update them.
+    dataset = load_dataset('json', data_files=args.dataset)
+    dataset = dataset[args.split]
+    swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
+    logger.info(
+        f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
+    )
+
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config.log_completions = True
+        # modify_params must be False for evaluation purpose, for reproducibility and accuracy of results
+        llm_config.modify_params = False
+
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
+
+    details = {}
+    _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)
+
+    # '/' in the dataset and split names is replaced by '__' for the description.
+    dataset_descrption = (
+        args.dataset.replace('/', '__') + '-' + args.split.replace('/', '__')
+    )
+    metadata = make_metadata(
+        llm_config,
+        dataset_descrption,
+        args.agent_cls,
+        args.max_iterations,
+        args.eval_note,
+        args.eval_output_dir,
+        details=details,
+    )
+
+    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
+    print(f'### OUTPUT FILE: {output_file} ###')
+    instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit)
+
+    # When the first FAIL_TO_PASS value is not already a string, both test-list
+    # columns are converted to their string form.
+    if len(instances) > 0 and not isinstance(
+        instances['FAIL_TO_PASS'][instances['FAIL_TO_PASS'].index[0]], str
+    ):
+        for col in ['PASS_TO_PASS', 'FAIL_TO_PASS']:
+            instances[col] = instances[col].apply(lambda x: str(x))
+    # if LANGUAGE == "java": ##TODO: adapt for the multi-language version
+    #     for col in ['issue_numbers', 'created_at']:
+    #         instances[col] = instances[col].apply(lambda x: str(x))
+    run_evaluation(
+        instances,
+        metadata,
+        output_file,
+        args.eval_num_workers,
+        process_instance,
+        timeout_seconds=120 * 60,  # 2 hour PER instance should be more than enough
+        max_retries=5,
+    )
--- a/evaluation/benchmarks/multi_swe_bench/scripts/data/data_change.py
+++ b/evaluation/benchmarks/multi_swe_bench/scripts/data/data_change.py
@@ -0,0 +1,43 @@
+"""Convert a JSONL file of source records into per-instance JSONL records.
+
+Each input record is read for its `org`, `repo`, `number`, `resolved_issues`
+and `base` fields; each output record carries `repo`, `instance_id`,
+`problem_statement`, `FAIL_TO_PASS`, `PASS_TO_PASS`, `base_commit` and `version`.
+"""
+
+import json
+
+input_file = 'XXX.jsonl'
+output_file = 'YYY.jsonl'
+
+with open(input_file, 'r', encoding='utf-8') as fin, open(
+    output_file, 'w', encoding='utf-8'
+) as fout:
+    for line in fin:
+        line = line.strip()
+        if not line:
+            continue
+
+        item = json.loads(line)
+
+        # Extract the original data
+        org = item.get('org', '')
+        repo = item.get('repo', '')
+        number = str(item.get('number', ''))
+
+        # Only the first resolved issue is used. `or ''` covers a title/body
+        # that is present but null, which would break the concatenation below.
+        issue = item['resolved_issues'][0]
+        title = issue.get('title') or ''
+        body = issue.get('body') or ''
+
+        new_item = {
+            'repo': f'{org}/{repo}',
+            'instance_id': f'{org}__{repo}-{number}',
+            'problem_statement': title + '\n' + body,
+            'FAIL_TO_PASS': [],
+            'PASS_TO_PASS': [],
+            'base_commit': item['base'].get('sha', ''),
+            'version': '0.1',  # depends
+        }
+        fout.write(json.dumps(new_item, ensure_ascii=False) + '\n')
--- a/evaluation/benchmarks/multi_swe_bench/scripts/eval/convert.py
+++ b/evaluation/benchmarks/multi_swe_bench/scripts/eval/convert.py
@@ -0,0 +1,39 @@
+"""Convert the records of ``output.jsonl`` into per-instance patch records."""
+
+import json
+import re
+
+IN_FILE = 'output.jsonl'
+OUT_FILE = 'patch.jsonl'
+
+# Instance ids have the form ``{org}__{repo}-{number}``. The org part is matched
+# lazily so that the first ``__`` is the separator even when the repo name
+# itself contains ``__``; the greedy repo part makes the last ``-`` the separator
+# before the number, so repo names may contain hyphens.
+INSTANCE_ID_PATTERN = re.compile(r'(.*?)__(.*)-(.*)')
+
+
+def main():
+    """Read ``IN_FILE`` and write one org/repo/number/fix_patch record per line to ``OUT_FILE``.
+
+    Raises:
+        ValueError: If an ``instance_id`` does not match ``{org}__{repo}-{number}``.
+    """
+    with open(IN_FILE, 'r') as fin, open(OUT_FILE, 'w') as fout:
+        for line in fin:
+            data = json.loads(line)
+            instance_id = data['instance_id']
+            groups = INSTANCE_ID_PATTERN.match(instance_id)
+            if groups is None:
+                raise ValueError(f'Unexpected instance_id format: {instance_id!r}')
+            patch = {
+                'org': groups.group(1),
+                'repo': groups.group(2),
+                'number': groups.group(3),
+                'fix_patch': data['test_result']['git_patch'],
+            }
+            fout.write(json.dumps(patch) + '\n')
+
+
+if __name__ == '__main__':
+    main()
--- a/evaluation/benchmarks/multi_swe_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/multi_swe_bench/scripts/run_infer.sh
@@ -0,0 +1,155 @@
+#!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
+MODEL_CONFIG=$1
+COMMIT_HASH=$2
+AGENT=$3
+EVAL_LIMIT=$4
+MAX_ITER=$5
+NUM_WORKERS=$6
+DATASET=$7
+# SPLIT=$8
+LANGUAGE=$8
+# N_RUNS=$10
+
+if [ -z "$NUM_WORKERS" ]; then
+  NUM_WORKERS=1
+  echo "Number of workers not specified, use default $NUM_WORKERS"
+fi
+checkout_eval_branch
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+if [ -z "$MAX_ITER" ]; then
+  echo "MAX_ITER not specified, use default 100"
+  MAX_ITER=100
+fi
+
+if [ -z "$USE_INSTANCE_IMAGE" ]; then
+  echo "USE_INSTANCE_IMAGE not specified, use default true"
+  USE_INSTANCE_IMAGE=true
+fi
+
+if [ -z "$RUN_WITH_BROWSING" ]; then
+  echo "RUN_WITH_BROWSING not specified, use default false"
+  RUN_WITH_BROWSING=false
+fi
+
+
+if [ -z "$DATASET" ]; then
+  echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
+  DATASET="princeton-nlp/SWE-bench_Lite"
+fi
+
+if [ -z "$LANGUAGE" ]; then
+  echo "LANGUAGE not specified, use default python"
+  LANGUAGE="python"
+fi
+# SPLIT is not read from the positional arguments above, so it comes from the environment or this default.
+if [ -z "$SPLIT" ]; then
+  echo "SPLIT not specified, use default train"
+  SPLIT="train"
+fi
+
+## TODO: adapt to the multi-language version
+# if [ -z "$SPLIT" ]; then
+#   if [ "$LANGUAGE" = "python" ]; then
+#   echo "SPLIT is test as LANUGUAGE is python"
+#     SPLIT="test"
+#   elif [ "$LANGUAGE" = "java" ]; then
+#   echo "SPLIT is java_verified as LANUGUAGE is java"
+#     SPLIT="java_verified"
+#   fi
+# fi
+
+if [ -z "$EVAL_DOCKER_IMAGE_PREFIX" ]; then
+  if [ "$LANGUAGE" = "python" ]; then
+    echo "EVAL_DOCKER_IMAGE_PREFIX is docker.io/xingyaoww/ as default as LANGUAGE is python"
+    EVAL_DOCKER_IMAGE_PREFIX="docker.io/xingyaoww/"
+  elif [ "$LANGUAGE" = "java" ]; then
+    echo "EVAL_DOCKER_IMAGE_PREFIX is empty as default as LANGUAGE is java"
+    EVAL_DOCKER_IMAGE_PREFIX=""
+  fi
+fi
+
+export EVAL_DOCKER_IMAGE_PREFIX=$EVAL_DOCKER_IMAGE_PREFIX
+echo "EVAL_DOCKER_IMAGE_PREFIX: $EVAL_DOCKER_IMAGE_PREFIX"
+export USE_INSTANCE_IMAGE=$USE_INSTANCE_IMAGE
+echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
+export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
+echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
+export LANGUAGE=$LANGUAGE
+echo "LANGUAGE: $LANGUAGE"
+
+get_openhands_version
+
+echo "AGENT: $AGENT"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+echo "DATASET: $DATASET"
+echo "SPLIT: $SPLIT"
+
+# Default to NOT use Hint
+if [ -z "$USE_HINT_TEXT" ]; then
+  export USE_HINT_TEXT=false
+fi
+echo "USE_HINT_TEXT: $USE_HINT_TEXT"
+EVAL_NOTE="$OPENHANDS_VERSION"
+# if not using Hint, add -no-hint to the eval note
+if [ "$USE_HINT_TEXT" = false ]; then
+  EVAL_NOTE="$EVAL_NOTE-no-hint"
+fi
+
+if [ "$RUN_WITH_BROWSING" = true ]; then
+  EVAL_NOTE="$EVAL_NOTE-with-browsing"
+fi
+
+if [ -n "$EXP_NAME" ]; then
+  EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
+fi
+
+function run_eval() {
+  local eval_note=$1
+  COMMAND="poetry run python evaluation/benchmarks/multi_swe_bench/run_infer.py \
+    --agent-cls $AGENT \
+    --llm-config $MODEL_CONFIG \
+    --max-iterations $MAX_ITER \
+    --eval-num-workers $NUM_WORKERS \
+    --eval-note $eval_note \
+    --dataset $DATASET \
+    --split $SPLIT"
+
+  if [ -n "$EVAL_LIMIT" ]; then
+    echo "EVAL_LIMIT: $EVAL_LIMIT"
+    COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+  fi
+
+  # Run the command
+  eval $COMMAND
+}
+
+unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
+if [ -z "$N_RUNS" ]; then
+  N_RUNS=1
+  echo "N_RUNS not specified, use default $N_RUNS"
+fi
+
+# Skip runs if the run number is in the SKIP_RUNS list
+# read from env variable SKIP_RUNS as a comma separated list of run numbers
+SKIP_RUNS=(${SKIP_RUNS//,/ })
+for i in $(seq 1 $N_RUNS); do
+  if [[ " ${SKIP_RUNS[@]} " =~ " $i " ]]; then
+    echo "Skipping run $i"
+    continue
+  fi
+  current_eval_note="$EVAL_NOTE-run_$i"
+  echo "EVAL_NOTE: $current_eval_note"
+  run_eval $current_eval_note
+done
+
+checkout_original_branch
--- a/evaluation/benchmarks/multi_swe_bench/scripts/setup/compare_patch_filename.py
+++ b/evaluation/benchmarks/multi_swe_bench/scripts/setup/compare_patch_filename.py
@@ -0,0 +1,76 @@
+"""This script compares gold patches with OpenHands-generated patches and checks whether
+OpenHands found the right (set of) files to modify.
+"""
+
+import argparse
+import json
+import re
+
+
+def extract_modified_files(patch):
+    """Return the set of paths found in the ``diff --git a/<path> b/`` lines of ``patch``."""
+    modified_files = set()
+    file_pattern = re.compile(r'^diff --git a/(.*?) b/')
+
+    for line in patch.split('\n'):
+        match = file_pattern.match(line)
+        if match:
+            modified_files.add(match.group(1))
+
+    return modified_files
+
+
+def process_report(oh_output_file):
+    """Report, per instance, whether the generated patch touches every gold-patch file.
+
+    Prints one line for each mismatching instance and a final summary line.
+
+    Args:
+        oh_output_file: Path to a JSON-lines file whose records provide
+            ``instance_id``, ``swe_instance['patch']`` and ``git_patch``.
+    """
+    succ = 0
+    fail = 0
+    # The context manager closes the file even if a record fails to parse.
+    with open(oh_output_file) as report:
+        for line in report:
+            if not line.strip():
+                # Tolerate blank lines between or after records.
+                continue
+            line = json.loads(line)
+            instance_id = line['instance_id']
+            gold_patch = line['swe_instance']['patch']
+            generated_patch = line['git_patch']
+            gold_modified_files = extract_modified_files(gold_patch)
+            # swe-bench lite only: a gold patch always contains exactly one file
+            assert len(gold_modified_files) == 1
+            generated_modified_files = extract_modified_files(generated_patch)
+
+            # Check if all files in gold_patch are also in generated_patch
+            all_files_in_generated = gold_modified_files.issubset(
+                generated_modified_files
+            )
+            if all_files_in_generated:
+                succ += 1
+            else:
+                fail += 1
+                print(
+                    f'{instance_id}: file mismatch, gold = {gold_modified_files}, generated = {generated_modified_files}'
+                )
+
+    total = succ + fail
+    if total == 0:
+        # Without this guard an empty report raises ZeroDivisionError in the summary.
+        print('\nSUMMARY: no instances found in the report')
+        return
+    print(
+        f'\nSUMMARY: {succ} out of {total} instances found correct files to edit, success rate = {succ / float(total)}'
+    )
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--oh_output_file', help='Path to the OH output file')
+    args = parser.parse_args()
+
+    process_report(args.oh_output_file)
--- a/evaluation/benchmarks/multi_swe_bench/scripts/setup/instance_swe_entry.sh
+++ b/evaluation/benchmarks/multi_swe_bench/scripts/setup/instance_swe_entry.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+source ~/.bashrc
+SWEUTIL_DIR=/swe_util
+
+# FIXME: Cannot read SWE_INSTANCE_ID from the environment variable
+# SWE_INSTANCE_ID=django__django-11099
+if [ -z "$SWE_INSTANCE_ID" ]; then
+    echo "Error: SWE_INSTANCE_ID is not set." >&2
+    exit 1
+fi
+
+if [ -z "$REPO_NAME" ]; then
+    echo "Error: REPO_NAME is not set." >&2
+    exit 1
+fi
+
+# Read the swe-bench-instance.json file and extract the required item based on instance_id
+item=$(jq --arg INSTANCE_ID "$SWE_INSTANCE_ID" '.[] | select(.instance_id == $INSTANCE_ID)' $SWEUTIL_DIR/eval_data/instances/swe-bench-instance.json)
+
+if [[ -z "$item" ]]; then
+  echo "No item found for the provided instance ID."
+  exit 1
+fi
+
+WORKSPACE_NAME=$(echo "$item" | jq -r '(.repo | tostring) + "__" + (.version | tostring) | gsub("/"; "__")')
+
+echo "WORKSPACE_NAME: $WORKSPACE_NAME"
+
+# Clear the workspace
+if [ -d /workspace ]; then
+    rm -rf /workspace/*
+else
+    mkdir /workspace
+fi
+# Copy repo to workspace; the expansions are quoted so a name containing spaces or glob characters stays one argument
+if [ -d "/workspace/$WORKSPACE_NAME" ]; then
+    rm -rf "/workspace/$WORKSPACE_NAME"
+fi
+mkdir -p /workspace
+cp -r "/home/$REPO_NAME" "/workspace/$WORKSPACE_NAME"
+
+# Activate instance-specific environment
+# . /opt/miniconda3/etc/profile.d/conda.sh
+# conda activate testbed
--- a/evaluation/benchmarks/multi_swe_bench/scripts/setup/prepare_swe_utils.sh
+++ b/evaluation/benchmarks/multi_swe_bench/scripts/setup/prepare_swe_utils.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+set -e
+EVAL_WORKSPACE="evaluation/benchmarks/swe_bench/eval_workspace"
+mkdir -p $EVAL_WORKSPACE
+
+# 1. Prepare REPO
+echo "==== Prepare SWE-bench repo ===="
+OH_SWE_BENCH_REPO_PATH="https://github.com/All-Hands-AI/SWE-bench.git"
+OH_SWE_BENCH_REPO_BRANCH="eval"
+git clone -b $OH_SWE_BENCH_REPO_BRANCH $OH_SWE_BENCH_REPO_PATH $EVAL_WORKSPACE/OH-SWE-bench
+
+# 2. Prepare DATA
+echo "==== Prepare SWE-bench data ===="
+EVAL_IMAGE=ghcr.io/all-hands-ai/eval-swe-bench:builder_with_conda
+EVAL_WORKSPACE=$(realpath $EVAL_WORKSPACE)
+chmod +x $EVAL_WORKSPACE/OH-SWE-bench/swebench/harness/prepare_data.sh
+if [ -d $EVAL_WORKSPACE/eval_data ]; then
+    rm -r $EVAL_WORKSPACE/eval_data
+fi
+docker run \
+    -v $EVAL_WORKSPACE:/workspace \
+    -w /workspace \
+    -u $(id -u):$(id -g) \
+    -e HF_DATASETS_CACHE="/tmp" \
+    --rm -it $EVAL_IMAGE \
+    bash -c "cd OH-SWE-bench/swebench/harness && /swe_util/miniforge3/bin/conda run -n swe-bench-eval ./prepare_data.sh && mv eval_data /workspace/"
--- a/evaluation/benchmarks/multi_swe_bench/scripts/setup/swe_entry.sh
+++ b/evaluation/benchmarks/multi_swe_bench/scripts/setup/swe_entry.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+
+set -e
+
+# assert user name is `root`
+if [ "$USER" != "root" ]; then
+    echo "Error: This script is intended to be run by the 'root' user only." >&2
+    exit 1
+fi
+
+source ~/.bashrc
+
+SWEUTIL_DIR=/swe_util
+
+# Create logs directory
+LOG_DIR=/openhands/logs
+mkdir -p $LOG_DIR && chmod 777 $LOG_DIR
+
+# FIXME: Cannot read SWE_INSTANCE_ID from the environment variable
+# SWE_INSTANCE_ID=django__django-11099
+if [ -z "$SWE_INSTANCE_ID" ]; then
+    echo "Error: SWE_INSTANCE_ID is not set." >&2
+    exit 1
+fi
+
+# Read the swe-bench-test-lite.json file and extract the required item based on instance_id
+item=$(jq --arg INSTANCE_ID "$SWE_INSTANCE_ID" '.[] | select(.instance_id == $INSTANCE_ID)' $SWEUTIL_DIR/eval_data/instances/swe-bench-test-lite.json)
+
+if [[ -z "$item" ]]; then
+  echo "No item found for the provided instance ID."
+  exit 1
+fi
+
+CONDA_ENV_NAME=$(echo "$item" | jq -r '.repo + "__" + .version | gsub("/"; "__")')
+
+echo "CONDA_ENV_NAME: $CONDA_ENV_NAME"
+
+SWE_TASK_DIR=/openhands/swe_tasks
+mkdir -p $SWE_TASK_DIR
+# Dump test_patch to $SWE_TASK_DIR/test.patch
+echo "$item" | jq -r '.test_patch' > $SWE_TASK_DIR/test.patch
+# Dump patch to $SWE_TASK_DIR/gold.patch
+echo "$item" | jq -r '.patch' > $SWE_TASK_DIR/gold.patch
+# Dump the item to $SWE_TASK_DIR/instance.json except for the "test_patch" and "patch" fields
+echo "$item" | jq 'del(.test_patch, .patch)' > $SWE_TASK_DIR/instance.json
+
+# Clear the workspace
+rm -rf /workspace/*
+# Copy repo to workspace
+if [ -d /workspace/$CONDA_ENV_NAME ]; then
+    rm -rf /workspace/$CONDA_ENV_NAME
+fi
+cp -r $SWEUTIL_DIR/eval_data/testbeds/$CONDA_ENV_NAME /workspace
+
+# Reset swe-bench testbed and install the repo
+. $SWEUTIL_DIR/miniforge3/etc/profile.d/conda.sh
+conda config --set changeps1 False
+conda config --append channels conda-forge
+conda activate swe-bench-eval
+
+mkdir -p $SWE_TASK_DIR/reset_testbed_temp
+mkdir -p $SWE_TASK_DIR/reset_testbed_log_dir
+SWE_BENCH_DIR=/swe_util/OH-SWE-bench
+output=$(
+    export PYTHONPATH=$SWE_BENCH_DIR && \
+    cd $SWE_BENCH_DIR && \
+    python swebench/harness/reset_swe_env.py \
+    --swe_bench_tasks $SWEUTIL_DIR/eval_data/instances/swe-bench-test.json \
+    --temp_dir $SWE_TASK_DIR/reset_testbed_temp \
+    --testbed /workspace \
+    --conda_path $SWEUTIL_DIR/miniforge3 \
+    --instance_id $SWE_INSTANCE_ID \
+    --log_dir $SWE_TASK_DIR/reset_testbed_log_dir \
+    --timeout 900 \
+    --verbose
+)
+
+REPO_PATH=$(echo "$output" | awk -F': ' '/repo_path:/ {print $2}')
+TEST_CMD=$(echo "$output" | awk -F': ' '/test_cmd:/ {print $2}')
+echo "Repo Path: $REPO_PATH"
+echo "Test Command: $TEST_CMD"
+
+echo "export SWE_BENCH_DIR=\"$SWE_BENCH_DIR\"" >> ~/.bashrc
+echo "export REPO_PATH=\"$REPO_PATH\"" >> ~/.bashrc
+echo "export TEST_CMD=\"$TEST_CMD\"" >> ~/.bashrc
+
+if [[ "$REPO_PATH" == "None" ]]; then
+    echo "Error: Failed to retrieve repository path. Tests may not have passed or output was not as expected." >&2
+    exit 1
+fi
+
+# Activate instance-specific environment
+. $SWEUTIL_DIR/miniforge3/etc/profile.d/conda.sh
+conda activate $CONDA_ENV_NAME
+
+set +e
--- a/evaluation/benchmarks/swe_bench/README.md
+++ b/evaluation/benchmarks/swe_bench/README.md
@@ -2,6 +2,8 @@

 This folder contains the evaluation harness that we built on top of the original [SWE-Bench benchmark](https://www.swebench.com/) ([paper](https://arxiv.org/abs/2310.06770)).

+**UPDATE (4/8/2025): We now support running SWT-Bench evaluation! For more details, check out [the corresponding section](#swt-bench-evaluation).**
+
 **UPDATE (03/27/2025): We now support SWE-Bench multimodal evaluation! Simply use "princeton-nlp/SWE-bench_Multimodal" as the dataset name in the `run_infer.sh` script to evaluate on multimodal instances.**

 **UPDATE (2/18/2025): We now support running SWE-Gym using the same evaluation harness here. For more details, checkout [this README](./SWE-Gym.md).**
@@ -141,7 +143,7 @@ With `output.jsonl` file, you can run `eval_infer.sh` to evaluate generated patc
 ./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL [instance_id] [dataset_name] [split]

 # Example
-./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl
+./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Lite/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl
 ```

 The script now accepts optional arguments:
@@ -182,3 +184,58 @@ To clean-up all existing runtimes that you've already started, run:
 ```bash
 ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/utils/scripts/cleanup_remote_runtime.sh
 ```
+
+## SWT-Bench Evaluation
+
+[SWT-Bench](https://swtbench.com/) ([paper](https://arxiv.org/abs/2406.12952)) is a benchmark for evaluating the capability of LLMs at creating unit tests. It is performed on the same instances as SWE-Bench, but requires a separate evaluation harness to capture coverage and issue reproduction. We therefore detail below how to leverage the inference script in this folder to run inference on SWT-Bench and how to use the SWT-Bench evaluation harness to evaluate them.
+
+### Run inference on SWT-Bench
+
+To run inference on SWT-Bench, you can use the same `run_infer.sh` script as described for evaluation on plain SWE-Bench. The only difference is that you need to set the `mode` parameter to `swt` or `swt-ci` when running the script. For example, to run inference on SWT-Bench Verified, run the following command:
+
+```bash
+./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [swe-dataset] test 1 swt
+
+# Example - This runs evaluation on CodeActAgent for 500 instances on "SWT-bench_Verified"'s test set (corresponding to SWE-bench_Verified), with a maximum of 100 iterations per instance and 1 worker running in parallel
+./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4o-2024-11-20 HEAD CodeActAgent 500 100 1 princeton-nlp/SWE-bench_Verified test 1 swt
+```
+
+The two modes `swt` and `swt-ci` have the following effect:
+- `swt`: This mode will change the prompt to instruct the agent to generate reproducing test cases instead of resolving the issue.
+- `swt-ci`: In addition to the changes by `swt`, this mode sets up the CI environment by i) pre-installing the environment in the docker image, such that the test framework can be executed without errors and ii) telling the model the exact command to run the test framework.
+
+### Run evaluation for SWT-bench
+
+The evaluation of these results is done leveraging [the SWT-Bench evaluation harness](https://github.com/logic-star-ai/swt-bench/tree/master).
+
+#### Extracting results into SWT-Bench harness format
+In order to run evaluation of the obtained inference results in the SWT-Bench harness, we transform the results to a format that the SWT-Bench evaluation harness expects.
+
+```bash
+python3 evaluation/benchmarks/swe_bench/scripts/swtbench/convert.py --prediction_file [output.jsonl] > [output_swt.jsonl]
+
+# Example
+python3 evaluation/benchmarks/swe_bench/scripts/swtbench/convert.py --prediction_file "evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Verified-test/CodeActAgent/gpt-4o-2024-11-20_maxiter_100_N_v0.31.0-no-hint-swt-run_1/output.jsonl" > OpenHands-gpt-4o-2024-11-20.jsonl
+```
+
+#### Running the results in SWT-Bench
+
+Next, we run the [SWT-Bench evaluation harness](https://github.com/logic-star-ai/swt-bench/tree/master) with these results.
+First, set up and validate the environment as described in the harness [here](https://github.com/logic-star-ai/swt-bench/tree/master?tab=readme-ov-file#-set-up).
+Then, run the evaluation with the following command:
+
+```bash
+# Example
+python3 -m src.main \
+    --dataset_name princeton-nlp/SWE-bench_Verified \
+    --predictions_path <pathTo>/OpenHands-gpt-4o-2024-11-20.jsonl \
+    --max_workers 12 \
+    --run_id OpenHands-CodeAct-gpt-4o-2024-11-20  --patch_types vanilla  --build_mode api
+```
+
+The results of the evaluation can be obtained by running the reporting script of the harness.
+
+```bash
+# Example
+python -m src.report run_instance_swt_logs/OpenHands-CodeAct-gpt-4o-2024-11-20/OpenHands__CodeActAgent__gpt-4o-2024-11-20 --dataset verified
+```
--- a/evaluation/benchmarks/swe_bench/resource/swt_bench_constants.py
+++ b/evaluation/benchmarks/swe_bench/resource/swt_bench_constants.py
@@ -0,0 +1,842 @@
+# Based on https://github.com/logic-star-ai/swt-bench/blob/master/src/constants.py
+
+# Constants - Installation Specifications
+MAP_VERSION_TO_INSTALL_SKLEARN = {
+    k: {
+        'python': '3.6',
+        'packages': 'numpy scipy cython pytest pandas matplotlib',
+        'install': 'python -m pip install -v --no-use-pep517 --no-build-isolation -e .',
+        'pip_packages': [
+            'cython',
+            'numpy==1.19.2',
+            'setuptools',
+            'scipy==1.5.2',
+        ],
+    }
+    for k in ['0.20', '0.21', '0.22']
+}
+MAP_VERSION_TO_INSTALL_SKLEARN.update(
+    {
+        k: {
+            'python': '3.9',
+            'packages': "'numpy==1.19.2' 'scipy==1.5.2' 'cython==3.0.10' pytest 'pandas<2.0.0' 'matplotlib<3.9.0' setuptools pytest joblib threadpoolctl",
+            'install': 'python -m pip install -v --no-use-pep517 --no-build-isolation -e .',
+            'pip_packages': ['cython', 'setuptools', 'numpy', 'scipy'],
+        }
+        for k in ['1.3', '1.4']
+    }
+)
+MAP_VERSION_TO_INSTALL_FLASK = {
+    '2.0': {
+        'python': '3.9',
+        'packages': 'requirements.txt',
+        'install': 'python -m pip install -e .',
+        'pip_packages': [
+            'setuptools==70.0.0',
+            'Werkzeug==2.3.7',
+            'Jinja2==3.0.1',
+            'itsdangerous==2.1.2',
+            'click==8.0.1',
+            'MarkupSafe==2.1.3',
+        ],
+    },
+    '2.1': {
+        'python': '3.10',
+        'packages': 'requirements.txt',
+        'install': 'python -m pip install -e .',
+        'pip_packages': [
+            'click==8.1.3',
+            'itsdangerous==2.1.2',
+            'Jinja2==3.1.2',
+            'MarkupSafe==2.1.1',
+            'Werkzeug==2.3.7',
+        ],
+    },
+}
+MAP_VERSION_TO_INSTALL_FLASK.update(
+    {
+        k: {
+            'python': '3.11',
+            'packages': 'requirements.txt',
+            'install': 'python -m pip install -e .',
+            'pip_packages': [
+                'click==8.1.3',
+                'itsdangerous==2.1.2',
+                'Jinja2==3.1.2',
+                'MarkupSafe==2.1.1',
+                'Werkzeug==2.3.7',
+            ],
+        }
+        for k in ['2.2', '2.3']
+    }
+)
+MAP_VERSION_TO_INSTALL_DJANGO = {
+    k: {
+        'python': '3.5',
+        'packages': 'requirements.txt',
+        'pre_install': [
+            'apt-get update && apt-get install -y locales',
+            "echo 'en_US UTF-8' > /etc/locale.gen",
+            'locale-gen en_US.UTF-8',
+        ],
+        'install': 'python setup.py install',
+        'pip_packages': ['setuptools'],
+        'eval_commands': [
+            'export LANG=en_US.UTF-8',
+            'export LC_ALL=en_US.UTF-8',
+            'export PYTHONIOENCODING=utf8',
+            'export LANGUAGE=en_US:en',
+        ],
+    }
+    for k in ['1.7', '1.8', '1.9', '1.10', '1.11', '2.0', '2.1', '2.2']
+}
+MAP_VERSION_TO_INSTALL_DJANGO.update(
+    {
+        k: {'python': '3.5', 'install': 'python setup.py install'}
+        for k in ['1.4', '1.5', '1.6']
+    }
+)
+MAP_VERSION_TO_INSTALL_DJANGO.update(
+    {
+        k: {
+            'python': '3.6',
+            'packages': 'requirements.txt',
+            'install': 'python -m pip install -e .',
+            'eval_commands': [
+                "sed -i '/en_US.UTF-8/s/^# //g' /etc/locale.gen && locale-gen",
+                'export LANG=en_US.UTF-8',
+                'export LANGUAGE=en_US:en',
+                'export LC_ALL=en_US.UTF-8',
+            ],
+        }
+        for k in ['3.0', '3.1', '3.2']
+    }
+)
+MAP_VERSION_TO_INSTALL_DJANGO.update(
+    {
+        k: {
+            'python': '3.8',
+            'packages': 'requirements.txt',
+            'install': 'python -m pip install -e .',
+        }
+        for k in ['4.0']
+    }
+)
+MAP_VERSION_TO_INSTALL_DJANGO.update(
+    {
+        k: {
+            'python': '3.9',
+            'packages': 'requirements.txt',
+            'install': 'python -m pip install -e .',
+        }
+        for k in ['4.1', '4.2']
+    }
+)
+MAP_VERSION_TO_INSTALL_DJANGO.update(
+    {
+        k: {
+            'python': '3.11',
+            'packages': 'requirements.txt',
+            'install': 'python -m pip install -e .',
+        }
+        for k in ['5.0']
+    }
+)
+MAP_VERSION_TO_INSTALL_REQUESTS = {
+    k: {'python': '3.9', 'packages': 'pytest', 'install': 'python -m pip install .'}
+    for k in ['0.7', '0.8', '0.9', '0.11', '0.13', '0.14', '1.1', '1.2', '2.0', '2.2']
+    + ['2.3', '2.4', '2.5', '2.7', '2.8', '2.9', '2.10', '2.11', '2.12', '2.17']
+    + ['2.18', '2.19', '2.22', '2.26', '2.25', '2.27', '3.0']
+}
+MAP_VERSION_TO_INSTALL_SEABORN = {
+    k: {
+        'python': '3.9',
+        'install': 'python -m pip install -e .',
+        'pip_packages': [
+            'contourpy==1.1.0',
+            'cycler==0.11.0',
+            'fonttools==4.42.1',
+            'importlib-resources==6.0.1',
+            'kiwisolver==1.4.5',
+            'matplotlib==3.7.2',
+            'numpy==1.25.2',
+            'packaging==23.1',
+            'pandas==1.3.5',  # 2.0.3
+            'pillow==10.0.0',
+            'pyparsing==3.0.9',
+            'pytest',
+            'python-dateutil==2.8.2',
+            'pytz==2023.3.post1',
+            'scipy==1.11.2',
+            'six==1.16.0',
+            'tzdata==2023.1',
+            'zipp==3.16.2',
+        ],
+    }
+    for k in ['0.11']
+}
+MAP_VERSION_TO_INSTALL_SEABORN.update(
+    {
+        k: {
+            'python': '3.9',
+            'install': 'python -m pip install -e .[dev]',
+            'pip_packages': [
+                'contourpy==1.1.0',
+                'cycler==0.11.0',
+                'fonttools==4.42.1',
+                'importlib-resources==6.0.1',
+                'kiwisolver==1.4.5',
+                'matplotlib==3.7.2',
+                'numpy==1.25.2',
+                'packaging==23.1',
+                'pandas==2.0.0',
+                'pillow==10.0.0',
+                'pyparsing==3.0.9',
+                'pytest',
+                'python-dateutil==2.8.2',
+                'pytz==2023.3.post1',
+                'scipy==1.11.2',
+                'six==1.16.0',
+                'tzdata==2023.1',
+                'zipp==3.16.2',
+            ],
+        }
+        for k in ['0.12', '0.13']
+    }
+)
+MAP_VERSION_TO_INSTALL_PYTEST = {
+    k: {'python': '3.9', 'install': 'python -m pip install -e .'}
+    for k in [
+        '4.4',
+        '4.5',
+        '4.6',
+        '5.0',
+        '5.1',
+        '5.2',
+        '5.3',
+        '5.4',
+        '6.0',
+        '6.2',
+        '6.3',
+        '7.0',
+        '7.1',
+        '7.2',
+        '7.4',
+        '8.0',
+    ]
+}
+MAP_VERSION_TO_INSTALL_PYTEST['4.4']['pip_packages'] = [
+    'atomicwrites==1.4.1',
+    'attrs==23.1.0',
+    'more-itertools==10.1.0',
+    'pluggy==0.13.1',
+    'py==1.11.0',
+    'setuptools==68.0.0',
+    'six==1.16.0',
+]
+MAP_VERSION_TO_INSTALL_PYTEST['4.5']['pip_packages'] = [
+    'atomicwrites==1.4.1',
+    'attrs==23.1.0',
+    'more-itertools==10.1.0',
+    'pluggy==0.11.0',
+    'py==1.11.0',
+    'setuptools==68.0.0',
+    'six==1.16.0',
+    'wcwidth==0.2.6',
+]
+MAP_VERSION_TO_INSTALL_PYTEST['4.6']['pip_packages'] = [
+    'atomicwrites==1.4.1',
+    'attrs==23.1.0',
+    'more-itertools==10.1.0',
+    'packaging==23.1',
+    'pluggy==0.13.1',
+    'py==1.11.0',
+    'six==1.16.0',
+    'wcwidth==0.2.6',
+]
+for k in ['5.0', '5.1', '5.2']:
+    MAP_VERSION_TO_INSTALL_PYTEST[k]['pip_packages'] = [
+        'atomicwrites==1.4.1',
+        'attrs==23.1.0',
+        'more-itertools==10.1.0',
+        'packaging==23.1',
+        'pluggy==0.13.1',
+        'py==1.11.0',
+        'wcwidth==0.2.6',
+    ]
+MAP_VERSION_TO_INSTALL_PYTEST['5.3']['pip_packages'] = [
+    'attrs==23.1.0',
+    'more-itertools==10.1.0',
+    'packaging==23.1',
+    'pluggy==0.13.1',
+    'py==1.11.0',
+    'wcwidth==0.2.6',
+]
+MAP_VERSION_TO_INSTALL_PYTEST['5.4']['pip_packages'] = [
+    'py==1.11.0',
+    'packaging==23.1',
+    'attrs==23.1.0',
+    'more-itertools==10.1.0',
+    'pluggy==0.13.1',
+]
+MAP_VERSION_TO_INSTALL_PYTEST['6.0']['pip_packages'] = [
+    'attrs==23.1.0',
+    'iniconfig==2.0.0',
+    'more-itertools==10.1.0',
+    'packaging==23.1',
+    'pluggy==0.13.1',
+    'py==1.11.0',
+    'toml==0.10.2',
+]
+for k in ['6.2', '6.3']:
+    MAP_VERSION_TO_INSTALL_PYTEST[k]['pip_packages'] = [
+        'attrs==23.1.0',
+        'iniconfig==2.0.0',
+        'packaging==23.1',
+        'pluggy==0.13.1',
+        'py==1.11.0',
+        'toml==0.10.2',
+    ]
+MAP_VERSION_TO_INSTALL_PYTEST['7.0']['pip_packages'] = [
+    'attrs==23.1.0',
+    'iniconfig==2.0.0',
+    'packaging==23.1',
+    'pluggy==0.13.1',
+    'py==1.11.0',
+]
+for k in ['7.1', '7.2']:
+    MAP_VERSION_TO_INSTALL_PYTEST[k]['pip_packages'] = [
+        'attrs==23.1.0',
+        'iniconfig==2.0.0',
+        'packaging==23.1',
+        'pluggy==0.13.1',
+        'py==1.11.0',
+        'tomli==2.0.1',
+    ]
+MAP_VERSION_TO_INSTALL_PYTEST['7.4']['pip_packages'] = [
+    'iniconfig==2.0.0',
+    'packaging==23.1',
+    'pluggy==1.3.0',
+    'exceptiongroup==1.1.3',
+    'tomli==2.0.1',
+]
+MAP_VERSION_TO_INSTALL_PYTEST['8.0']['pip_packages'] = [
+    'iniconfig==2.0.0',
+    'packaging==23.1',
+    'pluggy==1.3.0',
+    'exceptiongroup==1.1.3',
+    'tomli==2.0.1',
+]
+MAP_VERSION_TO_INSTALL_MATPLOTLIB = {
+    k: {
+        'python': '3.11',
+        'packages': 'environment.yml',
+        'install': 'python -m pip install -e .',
+        'pre_install': [
+            'apt-get -y update && apt-get -y upgrade && apt-get install -y imagemagick ffmpeg texlive texlive-latex-extra texlive-fonts-recommended texlive-xetex texlive-luatex cm-super dvipng'
+        ],
+        'pip_packages': [
+            'contourpy==1.1.0',
+            'cycler==0.11.0',
+            'fonttools==4.42.1',
+            'ghostscript',
+            'kiwisolver==1.4.5',
+            'numpy==1.25.2',
+            'packaging==23.1',
+            'pillow==10.0.0',
+            'pikepdf',
+            'pyparsing==3.0.9',
+            'python-dateutil==2.8.2',
+            'six==1.16.0',
+            'setuptools==68.1.2',
+            'setuptools-scm==7.1.0',
+            'typing-extensions==4.7.1',
+        ],
+    }
+    for k in ['3.5', '3.6', '3.7']
+}
+MAP_VERSION_TO_INSTALL_MATPLOTLIB.update(
+    {
+        k: {
+            'python': '3.8',
+            'packages': 'requirements.txt',
+            'install': 'python -m pip install -e .',
+            'pre_install': [
+                'apt-get -y update && apt-get -y upgrade && apt-get install -y imagemagick ffmpeg libfreetype6-dev pkg-config texlive texlive-latex-extra texlive-fonts-recommended texlive-xetex texlive-luatex cm-super'
+            ],
+            'pip_packages': ['pytest', 'ipython'],
+        }
+        for k in ['3.1', '3.2', '3.3', '3.4']
+    }
+)
+MAP_VERSION_TO_INSTALL_MATPLOTLIB.update(
+    {
+        k: {
+            'python': '3.7',
+            'packages': 'requirements.txt',
+            'install': 'python -m pip install -e .',
+            'pre_install': [
+                'apt-get -y update && apt-get -y upgrade && apt-get install -y imagemagick ffmpeg libfreetype6-dev pkg-config'
+            ],
+            'pip_packages': ['pytest'],
+        }
+        for k in ['3.0']
+    }
+)
+MAP_VERSION_TO_INSTALL_MATPLOTLIB.update(
+    {
+        k: {
+            'python': '3.5',
+            'install': 'python setup.py build; python setup.py install',
+            'pre_install': [
+                'apt-get -y update && apt-get -y upgrade && apt-get install -y imagemagick ffmpeg'
+            ],
+            'pip_packages': ['pytest'],
+            'execute_test_as_nonroot': True,
+        }
+        for k in ['2.0', '2.1', '2.2', '1.0', '1.1', '1.2', '1.3', '1.4', '1.5']
+    }
+)
+MAP_VERSION_TO_INSTALL_SPHINX = {
+    k: {
+        'python': '3.9',
+        'pip_packages': ['tox==4.16.0', 'tox-current-env==0.0.11'],
+        'install': 'python -m pip install -e .[test]',
+        'pre_install': ["sed -i 's/pytest/pytest -rA/' tox.ini"],
+    }
+    for k in ['1.5', '1.6', '1.7', '1.8', '2.0', '2.1', '2.2', '2.3', '2.4', '3.0']
+    + ['3.1', '3.2', '3.3', '3.4', '3.5', '4.0', '4.1', '4.2', '4.3', '4.4']
+    + ['4.5', '5.0', '5.1', '5.2', '5.3', '6.0', '6.2', '7.0', '7.1', '7.2']
+}
+for k in ['3.0', '3.1', '3.2', '3.3', '3.4', '3.5', '4.0', '4.1', '4.2', '4.3', '4.4']:
+    MAP_VERSION_TO_INSTALL_SPHINX[k]['pre_install'].extend(
+        [
+            "sed -i 's/Jinja2>=2.3/Jinja2<3.0/' setup.py",
+            "sed -i 's/sphinxcontrib-applehelp/sphinxcontrib-applehelp<=1.0.7/' setup.py",
+            "sed -i 's/sphinxcontrib-devhelp/sphinxcontrib-devhelp<=1.0.5/' setup.py",
+            "sed -i 's/sphinxcontrib-qthelp/sphinxcontrib-qthelp<=1.0.6/' setup.py",
+            "sed -i 's/alabaster>=0.7,<0.8/alabaster>=0.7,<0.7.12/' setup.py",
+            "sed -i \"s/'packaging',/'packaging', 'markupsafe<=2.0.1',/\" setup.py",
+        ]
+    )
+    if k in ['4.2', '4.3', '4.4']:
+        MAP_VERSION_TO_INSTALL_SPHINX[k]['pre_install'].extend(
+            [
+                "sed -i 's/sphinxcontrib-htmlhelp>=2.0.0/sphinxcontrib-htmlhelp>=2.0.0,<=2.0.4/' setup.py",
+                "sed -i 's/sphinxcontrib-serializinghtml>=1.1.5/sphinxcontrib-serializinghtml>=1.1.5,<=1.1.9/' setup.py",
+            ]
+        )
+    elif k == '4.1':
+        MAP_VERSION_TO_INSTALL_SPHINX[k]['pre_install'].extend(
+            [
+                (
+                    "grep -q 'sphinxcontrib-htmlhelp>=2.0.0' setup.py && "
+                    "sed -i 's/sphinxcontrib-htmlhelp>=2.0.0/sphinxcontrib-htmlhelp>=2.0.0,<=2.0.4/' setup.py || "
+                    "sed -i 's/sphinxcontrib-htmlhelp/sphinxcontrib-htmlhelp<=2.0.4/' setup.py"
+                ),
+                (
+                    "grep -q 'sphinxcontrib-serializinghtml>=1.1.5' setup.py && "
+                    "sed -i 's/sphinxcontrib-serializinghtml>=1.1.5/sphinxcontrib-serializinghtml>=1.1.5,<=1.1.9/' setup.py || "
+                    "sed -i 's/sphinxcontrib-serializinghtml/sphinxcontrib-serializinghtml<=1.1.9/' setup.py"
+                ),
+            ]
+        )
+    else:
+        MAP_VERSION_TO_INSTALL_SPHINX[k]['pre_install'].extend(
+            [
+                "sed -i 's/sphinxcontrib-htmlhelp/sphinxcontrib-htmlhelp<=2.0.4/' setup.py",
+                "sed -i 's/sphinxcontrib-serializinghtml/sphinxcontrib-serializinghtml<=1.1.9/' setup.py",
+            ]
+        )
+MAP_VERSION_TO_INSTALL_SPHINX['7.2']['pre_install'] += [
+    'apt-get update && apt-get install -y graphviz'
+]
+MAP_VERSION_TO_INSTALL_ASTROPY = {
+    k: {
+        'python': '3.9',
+        'install': 'python -m pip install -e .[test] --verbose',
+        'pip_packages': [
+            'attrs==23.1.0',
+            'exceptiongroup==1.1.3',
+            'execnet==2.0.2',
+            'hypothesis==6.82.6',
+            'iniconfig==2.0.0',
+            'numpy==1.25.2',
+            'packaging==23.1',
+            'pluggy==1.3.0',
+            'psutil==5.9.5',
+            'pyerfa==2.0.0.3',
+            'pytest-arraydiff==0.5.0',
+            'pytest-astropy-header==0.2.2',
+            'pytest-astropy==0.10.0',
+            'pytest-cov==4.1.0',
+            'pytest-doctestplus==1.0.0',
+            'pytest-filter-subpackage==0.1.2',
+            'pytest-mock==3.11.1',
+            'pytest-openfiles==0.5.0',
+            'pytest-remotedata==0.4.0',
+            'pytest-xdist==3.3.1',
+            'pytest==7.4.0',
+            'PyYAML==6.0.1',
+            'setuptools==68.0.0',
+            'sortedcontainers==2.4.0',
+            'tomli==2.0.1',
+        ],
+    }
+    for k in ['0.1', '0.2', '0.3', '0.4', '1.1', '1.2', '1.3', '3.0', '3.1', '3.2']
+    + ['4.1', '4.2', '4.3', '5.0', '5.1', '5.2']
+}
+for k in ['4.1', '4.2', '4.3', '5.0', '5.1', '5.2']:
+    MAP_VERSION_TO_INSTALL_ASTROPY[k]['pre_install'] = [
+        'sed -i \'s/requires = \\["setuptools",/requires = \\["setuptools==68.0.0",/\' pyproject.toml'
+    ]
+MAP_VERSION_TO_INSTALL_SYMPY = {
+    k: {
+        'python': '3.9',
+        'packages': 'mpmath flake8',
+        'pip_packages': ['mpmath==1.3.0', 'flake8-comprehensions'],
+        'install': 'python -m pip install -e .',
+    }
+    for k in ['0.7', '1.0', '1.1', '1.10', '1.11', '1.12', '1.2', '1.4', '1.5', '1.6']
+    + ['1.7', '1.8', '1.9']
+}
+MAP_VERSION_TO_INSTALL_SYMPY.update(
+    {
+        k: {
+            'python': '3.9',
+            'packages': 'requirements.txt',
+            'install': 'python -m pip install -e .',
+            'pip_packages': ['mpmath==1.3.0'],
+        }
+        for k in ['1.13']
+    }
+)
+MAP_VERSION_TO_INSTALL_PYLINT = {
+    k: {
+        'python': '3.9',
+        'packages': 'requirements.txt',
+        'install': 'python -m pip install -e .',
+    }
+    for k in [
+        '2.10',
+        '2.11',
+        '2.13',
+        '2.14',
+        '2.15',
+        '2.16',
+        '2.17',
+        '2.8',
+        '2.9',
+        '3.0',
+    ]
+}
+MAP_VERSION_TO_INSTALL_PYLINT['2.8']['pip_packages'] = ['pyenchant==3.2']
+MAP_VERSION_TO_INSTALL_PYLINT['2.8']['pre_install'] = [
+    'apt-get update && apt-get install -y libenchant-2-dev hunspell-en-us'
+]
+MAP_VERSION_TO_INSTALL_PYLINT.update(
+    {
+        k: {
+            **MAP_VERSION_TO_INSTALL_PYLINT[k],
+            'pip_packages': ['astroid==3.0.0a6', 'setuptools'],
+        }
+        for k in ['3.0']
+    }
+)
+
+MAP_VERSION_TO_INSTALL_XARRAY = {
+    k: {
+        'python': '3.10',
+        'packages': 'environment.yml',
+        'install': 'python -m pip install -e .',
+        'pip_packages': [
+            'numpy==1.23.0',
+            'packaging==23.1',
+            'pandas==1.5.3',
+            'pytest==7.4.0',
+            'python-dateutil==2.8.2',
+            'pytz==2023.3',
+            'six==1.16.0',
+            'scipy==1.11.1',
+            'setuptools==68.0.0',
+        ],
+        'no_use_env': True,
+    }
+    for k in ['0.12', '0.18', '0.19', '0.20', '2022.03', '2022.06', '2022.09']
+}
+
+MAP_VERSION_TO_INSTALL_SQLFLUFF = {
+    k: {
+        'python': '3.9',
+        'packages': 'requirements.txt',
+        'install': 'python -m pip install -e .',
+    }
+    for k in [
+        '0.10',
+        '0.11',
+        '0.12',
+        '0.13',
+        '0.4',
+        '0.5',
+        '0.6',
+        '0.8',
+        '0.9',
+        '1.0',
+        '1.1',
+        '1.2',
+        '1.3',
+        '1.4',
+        '2.0',
+        '2.1',
+        '2.2',
+    ]
+}
+MAP_VERSION_TO_INSTALL_DBT_CORE = {
+    k: {
+        'python': '3.9',
+        'packages': 'requirements.txt',
+        'install': 'python -m pip install -e .',
+    }
+    for k in [
+        '0.13',
+        '0.14',
+        '0.15',
+        '0.16',
+        '0.17',
+        '0.18',
+        '0.19',
+        '0.20',
+        '0.21',
+        '1.0',
+        '1.1',
+        '1.2',
+        '1.3',
+        '1.4',
+        '1.5',
+        '1.6',
+        '1.7',
+    ]
+}
+MAP_VERSION_TO_INSTALL_PYVISTA = {
+    k: {
+        'python': '3.9',
+        'install': 'python -m pip install -e .',
+        'pip_packages': ['pytest'],
+    }
+    for k in ['0.20', '0.21', '0.22', '0.23']
+}
+MAP_VERSION_TO_INSTALL_PYVISTA.update(
+    {
+        k: {
+            'python': '3.9',
+            'packages': 'requirements.txt',
+            'install': 'python -m pip install -e .',
+            'pip_packages': ['pytest'],
+        }
+        for k in [
+            '0.24',
+            '0.25',
+            '0.26',
+            '0.27',
+            '0.28',
+            '0.29',
+            '0.30',
+            '0.31',
+            '0.32',
+            '0.33',
+            '0.34',
+            '0.35',
+            '0.36',
+            '0.37',
+            '0.38',
+            '0.39',
+            '0.40',
+            '0.41',
+            '0.42',
+            '0.43',
+        ]
+    }
+)
+MAP_VERSION_TO_INSTALL_ASTROID = {
+    k: {
+        'python': '3.9',
+        'install': 'python -m pip install -e .',
+        'pip_packages': ['pytest'],
+    }
+    for k in [
+        '2.10',
+        '2.12',
+        '2.13',
+        '2.14',
+        '2.15',
+        '2.16',
+        '2.5',
+        '2.6',
+        '2.7',
+        '2.8',
+        '2.9',
+        '3.0',
+    ]
+}
+MAP_VERSION_TO_INSTALL_MARSHMALLOW = {
+    k: {
+        'python': '3.9',
+        'install': "python -m pip install -e '.[dev]'",
+    }
+    for k in [
+        '2.18',
+        '2.19',
+        '2.20',
+        '3.0',
+        '3.1',
+        '3.10',
+        '3.11',
+        '3.12',
+        '3.13',
+        '3.15',
+        '3.16',
+        '3.19',
+        '3.2',
+        '3.4',
+        '3.8',
+        '3.9',
+    ]
+}
+MAP_VERSION_TO_INSTALL_PVLIB = {
+    k: {
+        'python': '3.9',
+        'install': 'python -m pip install -e .[all]',
+        'packages': 'pandas scipy',
+        'pip_packages': ['jupyter', 'ipython', 'matplotlib', 'pytest', 'flake8'],
+    }
+    for k in ['0.1', '0.2', '0.3', '0.4', '0.5', '0.6', '0.7', '0.8', '0.9']
+}
+MAP_VERSION_TO_INSTALL_PYDICOM = {
+    k: {'python': '3.6', 'install': 'python -m pip install -e .', 'packages': 'numpy'}
+    for k in [
+        '1.0',
+        '1.1',
+        '1.2',
+        '1.3',
+        '1.4',
+        '2.0',
+        '2.1',
+        '2.2',
+        '2.3',
+        '2.4',
+        '3.0',
+    ]
+}
+MAP_VERSION_TO_INSTALL_PYDICOM.update(
+    {k: {**MAP_VERSION_TO_INSTALL_PYDICOM[k], 'python': '3.8'} for k in ['1.4', '2.0']}
+)
+MAP_VERSION_TO_INSTALL_PYDICOM.update(
+    {k: {**MAP_VERSION_TO_INSTALL_PYDICOM[k], 'python': '3.9'} for k in ['2.1', '2.2']}
+)
+MAP_VERSION_TO_INSTALL_PYDICOM.update(
+    {k: {**MAP_VERSION_TO_INSTALL_PYDICOM[k], 'python': '3.10'} for k in ['2.3']}
+)
+MAP_VERSION_TO_INSTALL_PYDICOM.update(
+    {k: {**MAP_VERSION_TO_INSTALL_PYDICOM[k], 'python': '3.11'} for k in ['2.4', '3.0']}
+)
+MAP_VERSION_TO_INSTALL_HUMANEVAL = {k: {'python': '3.9'} for k in ['1.0']}
+MAP_VERSION_TO_INSTALL_HUMANEVAL_FIX = {
+    k: {'python': '3.10', 'packages': 'pytest'} for k in ['0.0.1']
+}
+
+# Constants - Task Instance Installation Environment
+MAP_VERSION_TO_INSTALL = {
+    'astropy/astropy': MAP_VERSION_TO_INSTALL_ASTROPY,
+    'dbt-labs/dbt-core': MAP_VERSION_TO_INSTALL_DBT_CORE,
+    'django/django': MAP_VERSION_TO_INSTALL_DJANGO,
+    'matplotlib/matplotlib': MAP_VERSION_TO_INSTALL_MATPLOTLIB,
+    'marshmallow-code/marshmallow': MAP_VERSION_TO_INSTALL_MARSHMALLOW,
+    'mwaskom/seaborn': MAP_VERSION_TO_INSTALL_SEABORN,
+    'pallets/flask': MAP_VERSION_TO_INSTALL_FLASK,
+    'psf/requests': MAP_VERSION_TO_INSTALL_REQUESTS,
+    'pvlib/pvlib-python': MAP_VERSION_TO_INSTALL_PVLIB,
+    'pydata/xarray': MAP_VERSION_TO_INSTALL_XARRAY,
+    'pydicom/pydicom': MAP_VERSION_TO_INSTALL_PYDICOM,
+    'pylint-dev/astroid': MAP_VERSION_TO_INSTALL_ASTROID,
+    'pylint-dev/pylint': MAP_VERSION_TO_INSTALL_PYLINT,
+    'pytest-dev/pytest': MAP_VERSION_TO_INSTALL_PYTEST,
+    'pyvista/pyvista': MAP_VERSION_TO_INSTALL_PYVISTA,
+    'scikit-learn/scikit-learn': MAP_VERSION_TO_INSTALL_SKLEARN,
+    'sphinx-doc/sphinx': MAP_VERSION_TO_INSTALL_SPHINX,
+    'sqlfluff/sqlfluff': MAP_VERSION_TO_INSTALL_SQLFLUFF,
+    'swe-bench/humaneval': MAP_VERSION_TO_INSTALL_HUMANEVAL,
+    'nielstron/humaneval_fix': MAP_VERSION_TO_INSTALL_HUMANEVAL_FIX,
+    'sympy/sympy': MAP_VERSION_TO_INSTALL_SYMPY,
+}
+
+# Constants - Repository Specific Installation Instructions
+MAP_REPO_TO_INSTALL = {}
+
+# Constants - Task Instance Test Frameworks
+TEST_PYTEST_VERBOSE = 'pytest -rA --tb=long -p no:cacheprovider'
+MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE = {
+    'astropy/astropy': {
+        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_ASTROPY.keys()
+    },
+    'django/django': {
+        k: './tests/runtests.py --verbosity 2 --settings=test_sqlite --parallel 1'
+        for k in MAP_VERSION_TO_INSTALL_DJANGO.keys()
+    },
+    'marshmallow-code/marshmallow': {
+        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_MARSHMALLOW.keys()
+    },
+    'matplotlib/matplotlib': {
+        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_MATPLOTLIB.keys()
+    },
+    'mwaskom/seaborn': {
+        k: 'pytest -rA --tb=long' for k in MAP_VERSION_TO_INSTALL_SEABORN.keys()
+    },
+    'pallets/flask': {
+        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_FLASK.keys()
+    },
+    'psf/requests': {
+        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_REQUESTS.keys()
+    },
+    'pvlib/pvlib-python': {
+        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_PVLIB.keys()
+    },
+    'pydata/xarray': {
+        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_XARRAY.keys()
+    },
+    'pydicom/pydicom': {
+        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_PYDICOM.keys()
+    },
+    'pylint-dev/astroid': {
+        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_ASTROID.keys()
+    },
+    'pylint-dev/pylint': {
+        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_PYLINT.keys()
+    },
+    'pytest-dev/pytest': {
+        k: 'pytest -rA --tb=long' for k in MAP_VERSION_TO_INSTALL_PYTEST.keys()
+    },
+    'pyvista/pyvista': {
+        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_PYVISTA.keys()
+    },
+    'scikit-learn/scikit-learn': {
+        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_SKLEARN.keys()
+    },
+    'sphinx-doc/sphinx': {
+        k: 'tox -epy39 -v --' for k in MAP_VERSION_TO_INSTALL_SPHINX.keys()
+    },
+    'sqlfluff/sqlfluff': {
+        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_SQLFLUFF.keys()
+    },
+    'swe-bench/humaneval': {
+        k: 'python' for k in MAP_VERSION_TO_INSTALL_HUMANEVAL.keys()
+    },
+    'nielstron/humaneval_fix': {
+        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_HUMANEVAL_FIX.keys()
+    },
+    'sympy/sympy': {
+        k: 'bin/test -C --verbose' for k in MAP_VERSION_TO_INSTALL_SYMPY.keys()
+    },
+}
+MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE['django/django']['1.9'] = (
+    './tests/runtests.py --verbosity 2'
+)
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -3,7 +3,7 @@ import copy
 import json
 import os
 import tempfile
-from typing import Any
+from typing import Any, Literal

 import pandas as pd
 import toml
@@ -17,6 +17,11 @@ from evaluation.benchmarks.swe_bench.binary_patch_utils import (
 from evaluation.benchmarks.swe_bench.resource.mapping import (
    get_instance_resource_factor,
 )
+from evaluation.benchmarks.swe_bench.resource.swt_bench_constants import (
+    MAP_REPO_TO_INSTALL,
+    MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE,
+    MAP_VERSION_TO_INSTALL,
+)
 from evaluation.utils.shared import (
    EvalException,
    EvalMetadata,
@@ -55,6 +60,7 @@ from openhands.utils.shutdown_listener import sleep_if_should_continue

 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
 RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
+BenchMode = Literal['swe', 'swt', 'swt-ci']


 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
@@ -68,7 +74,36 @@ def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:

 def get_instruction(instance: pd.Series, metadata: EvalMetadata) -> MessageAction:
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
-    instruction = f"""
+    mode = metadata.details['mode']
+    if mode.startswith('swt'):
+        test_instructions = (
+            f'The following command can be used to run the tests: `{list(MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE[instance.repo].values())[0]}`. Make sure they fail in the expected way.\n'
+            if mode.endswith('ci')
+            else ''
+        )
+        instruction = f"""\
+<uploaded_files>
+/workspace/{workspace_dir_name}
+</uploaded_files>
+I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following issue description:
+
+<issue_description>
+{instance.problem_statement}
+</issue_description>
+
+
+Can you help me implement the necessary changes to the repository to test whether the issue in <issue_description> was resolved?
+I will take care of all changes to any of the non-test files. This means you DON'T have to modify the actual logic and ONLY have to update test logic and tests!
+Your task is to make the minimal changes to tests files in the /workspace directory to reproduce the issue in the <issue_description>, i.e., such that the generated tests fail in the current state (where the issue is unresolved) and pass when the issue will be resolved.
+Follow these steps to reproduce the issue:
+1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.
+2. Create a script `reproduction.py` to reproduce the error and execute it with `python reproduction.py` using the BashTool, to confirm the error
+3. Edit the sourcecode of the repo to integrate your reproduction script into the test framework
+4. Run the test framework and make sure your tests fail! Only submit FAILING tests! Never submit passing tests.
+{test_instructions}Your thinking should be thorough and so it's fine if it's very long.
+"""
+    else:
+        instruction = f"""
 <uploaded_files>
 /workspace/{workspace_dir_name}
 </uploaded_files>
@@ -356,6 +391,30 @@ def initialize_runtime(
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')

+    if metadata.details['mode'] == 'swt-ci':
+        # set up repo
+        setup_commands = []
+        if instance['repo'] in MAP_REPO_TO_INSTALL:
+            setup_commands.append(MAP_REPO_TO_INSTALL[instance['repo']])
+
+        # Run pre-install set up if provided
+        install = MAP_VERSION_TO_INSTALL.get(instance['repo'], {}).get(
+            instance['version'], []
+        )
+        if 'pre_install' in install:
+            for pre_install in install['pre_install']:
+                setup_commands.append(pre_install)
+
+        if 'install' in install:
+            setup_commands.append(install['install'])
+
+        for command in setup_commands:
+            action = CmdRunAction(command=command)
+            action.set_hard_timeout(600)
+            logger.info(action, extra={'msg_type': 'ACTION'})
+            obs = runtime.run_action(action)
+            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
    if 'multimodal' not in metadata.dataset.lower():
        # Only for non-multimodal datasets, we need to activate the testbed environment for Python
        # SWE-Bench multimodal datasets are not using the testbed environment
@@ -678,6 +737,13 @@ if __name__ == '__main__':
        default='test',
        help='split to evaluate on',
    )
+    parser.add_argument(
+        '--mode',
+        type=str,
+        default='swe',
+        choices=['swe', 'swt', 'swt-ci'],
+        help="mode to run the evaluation, either 'swe', 'swt', or 'swt-ci'",
+    )
    args, _ = parser.parse_known_args()

    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
@@ -714,7 +780,7 @@ if __name__ == '__main__':
    if llm_config is None:
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

-    details = {}
+    details = {'mode': args.mode}
    _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)

    dataset_descrption = (
--- a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh
@@ -12,6 +12,7 @@ NUM_WORKERS=$6
 DATASET=$7
 SPLIT=$8
 N_RUNS=$9
+MODE=${10}

 if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1
@@ -45,6 +46,11 @@ if [ -z "$SPLIT" ]; then
  SPLIT="test"
 fi

+if [ -z "$MODE" ]; then
+  MODE="swe"
+  echo "MODE not specified, use default $MODE"
+fi
+
 export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
 echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"

@@ -55,6 +61,10 @@ echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "DATASET: $DATASET"
 echo "SPLIT: $SPLIT"
+echo "MAX_ITER: $MAX_ITER"
+echo "NUM_WORKERS: $NUM_WORKERS"
+echo "COMMIT_HASH: $COMMIT_HASH"
+echo "MODE: $MODE"

 # Default to NOT use Hint
 if [ -z "$USE_HINT_TEXT" ]; then
@@ -74,9 +84,13 @@ fi
 if [ -n "$EXP_NAME" ]; then
  EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
 fi
+# if mode != swe, add mode to the eval note
+if [ "$MODE" != "swe" ]; then
+  EVAL_NOTE="${EVAL_NOTE}-${MODE}"
+fi

 function run_eval() {
-  local eval_note=$1
+  local eval_note="${1}"
  COMMAND="poetry run python evaluation/benchmarks/swe_bench/run_infer.py \
    --agent-cls $AGENT \
    --llm-config $MODEL_CONFIG \
@@ -84,7 +98,8 @@ function run_eval() {
    --eval-num-workers $NUM_WORKERS \
    --eval-note $eval_note \
    --dataset $DATASET \
-    --split $SPLIT"
+    --split $SPLIT \
+    --mode $MODE"

  if [ -n "$EVAL_LIMIT" ]; then
    echo "EVAL_LIMIT: $EVAL_LIMIT"
--- a/evaluation/benchmarks/swe_bench/scripts/swtbench/convert.py
+++ b/evaluation/benchmarks/swe_bench/scripts/swtbench/convert.py
@@ -0,0 +1,95 @@
+import argparse
+import json
+import logging
+
+import unidiff
+
+from evaluation.benchmarks.swe_bench.resource.swt_bench_constants import (
+    MAP_VERSION_TO_INSTALL,
+)
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def remove_setup_files(model_patch: str, instance: dict, delete_setup_changes: bool):
+    """Drop from the patch all changes to setup files edited by the pre_install sed commands and all changes to top-level files (e.g. reproduction scripts)."""
+    setup_files = ['setup.py', 'tox.ini', 'pyproject.toml']
+    pre_install = (
+        MAP_VERSION_TO_INSTALL.get(instance['repo'], {})
+        .get(instance['version'], {})
+        .get('pre_install', [])
+    )
+    relevant_files = (
+        [
+            file
+            for file in setup_files
+            if any(file in install and 'sed' in install for install in pre_install)
+        ]
+        if delete_setup_changes
+        else []
+    )
+    for i in range(10):
+        try:
+            # Apparently outputs.jsonl has .strip() applied, so we try to reconstruct the original patch by appending up to 9 trailing newlines
+            patch = unidiff.PatchSet(model_patch + i * '\n')
+            break
+        except unidiff.UnidiffParseError:
+            pass
+
+    to_delete = []
+    for i, file in enumerate(patch):
+        if (
+            any(f in file.source_file for f in relevant_files)
+            or file.target_file.count('/') == 1
+        ):
+            to_delete.append(i)
+    for i in reversed(to_delete):
+        del patch[i]
+    return str(patch)
+
+
+def main(
+    prediction_file: str,
+):
+    """Extract the model patches from the OpenHands prediction file and print them to stdout, one JSON record per line, in the expected SWT-Bench format."""
+    with open(prediction_file) as f:
+        for line in f:
+            pred = json.loads(line)
+            try:
+                git_diff = pred['test_result']['git_patch']
+            except KeyError:
+                _LOGGER.warning(
+                    'Warning: No git diff found for instance %s', pred['instance_id']
+                )
+                continue
+            ci_mode = pred['metadata']['details'].get('mode', '') == 'swt-ci'
+            try:
+                git_diff = remove_setup_files(git_diff, pred['instance'], ci_mode)
+            except Exception:  # best-effort: fall back to the unfiltered diff
+                _LOGGER.warning(
+                    'Warning: Invalid git diff found for instance %s',
+                    pred['instance_id'],
+                )
+            print(
+                json.dumps(
+                    {
+                        'instance_id': pred['instance_id'],
+                        'model_name_or_path': f'{pred["metadata"]["llm_config"]["openrouter_app_name"]}__{pred["metadata"]["agent_class"]}__{pred["metadata"]["llm_config"]["model"]}',
+                        'model_patch': git_diff,
+                        'full_output': json.dumps(pred),
+                    }
+                )
+            )
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--prediction_file',
+        type=str,
+        required=True,
+        help='Path to the prediction file (.../outputs.jsonl)',
+    )
+    args = parser.parse_args()
+
+    main(args.prediction_file)
--- a/frontend/.husky/pre-commit
+++ b/frontend/.husky/pre-commit
@@ -1,3 +1,10 @@
+# Run frontend checks
+echo "Running frontend checks..."
 cd frontend
 npm run check-unlocalized-strings
 npx lint-staged
+
+# Run backend pre-commit
+echo "Running backend pre-commit..."
+cd ..
+pre-commit run --files openhands/**/* evaluation/**/* tests/**/* --show-diff-on-failure --config ./dev_config/python/.pre-commit-config.yaml
--- a/frontend/tests/components/chat/chat-input.test.tsx
+++ b/frontend/tests/components/chat/chat-input.test.tsx
@@ -223,7 +223,7 @@ describe("ChatInput", () => {
    render(<ChatInput onSubmit={onSubmitMock} />);
    const textarea = screen.getByRole("textbox");
    expect(textarea).toBeInTheDocument();
-    
+
    // The actual verification of maxRows=16 is handled internally by the TextareaAutosize component
    // and affects how many rows the textarea can expand to
  });
--- a/frontend/tests/components/chat/chat-interface.test.tsx
+++ b/frontend/tests/components/chat/chat-interface.test.tsx
@@ -1,8 +1,8 @@
 import { afterEach, beforeAll, describe, expect, it, vi } from "vitest";
-import type { Message } from "#/message";
 import { act, screen, waitFor, within } from "@testing-library/react";
 import userEvent from "@testing-library/user-event";
 import { renderWithProviders } from "test-utils";
+import type { Message } from "#/message";
 import { addUserMessage } from "#/state/chat-slice";
 import { SUGGESTIONS } from "#/utils/suggestions";
 import * as ChatSlice from "#/state/chat-slice";
@@ -45,7 +45,15 @@ describe("Empty state", () => {
  it("should render suggestions if empty", () => {
    const { store } = renderWithProviders(<ChatInterface />, {
      preloadedState: {
-        chat: { messages: [] },
+        chat: {
+          messages: [],
+          systemMessage: {
+            content: "",
+            tools: [],
+            openhands_version: null,
+            agent_class: null
+          }
+        },
      },
    });

@@ -68,7 +76,15 @@ describe("Empty state", () => {
  it("should render the default suggestions", () => {
    renderWithProviders(<ChatInterface />, {
      preloadedState: {
-        chat: { messages: [] },
+        chat: {
+          messages: [],
+          systemMessage: {
+            content: "",
+            tools: [],
+            openhands_version: null,
+            agent_class: null
+          }
+        },
      },
    });

@@ -98,7 +114,15 @@ describe("Empty state", () => {
      const user = userEvent.setup();
      const { store } = renderWithProviders(<ChatInterface />, {
        preloadedState: {
-          chat: { messages: [] },
+          chat: {
+            messages: [],
+            systemMessage: {
+              content: "",
+              tools: [],
+              openhands_version: null,
+              agent_class: null
+            }
+          },
        },
      });

@@ -127,7 +151,15 @@ describe("Empty state", () => {
      const user = userEvent.setup();
      const { rerender } = renderWithProviders(<ChatInterface />, {
        preloadedState: {
-          chat: { messages: [] },
+          chat: {
+            messages: [],
+            systemMessage: {
+              content: "",
+              tools: [],
+              openhands_version: null,
+              agent_class: null
+            }
+          },
        },
      });

--- a/frontend/tests/components/chat/expandable-message.test.tsx
+++ b/frontend/tests/components/chat/expandable-message.test.tsx
@@ -95,6 +95,23 @@ describe("ExpandableMessage", () => {
    expect(screen.queryByTestId("status-icon")).not.toBeInTheDocument();
  });

+  it("should render with neutral border and no icon for action messages with undefined success (timeout case)", () => {
+    renderWithProviders(
+      <ExpandableMessage
+        id="OBSERVATION_MESSAGE$RUN"
+        message="Command timed out"
+        type="action"
+        success={undefined}
+      />,
+    );
+    const element = screen.getByText("OBSERVATION_MESSAGE$RUN");
+    const container = element.closest(
+      "div.flex.gap-2.items-center.justify-start",
+    );
+    expect(container).toHaveClass("border-neutral-300");
+    expect(screen.queryByTestId("status-icon")).not.toBeInTheDocument();
+  });
+
  it("should render the out of credits message when the user is out of credits", async () => {
    const getConfigSpy = vi.spyOn(OpenHands, "getConfig");
    // @ts-expect-error - We only care about the APP_MODE and FEATURE_FLAGS fields
--- a/frontend/tests/components/features/auth-modal.test.tsx
+++ b/frontend/tests/components/features/auth-modal.test.tsx
@@ -1,53 +1,48 @@
 import { render, screen } from "@testing-library/react";
-import { it, describe, expect, vi, beforeAll, afterAll } from "vitest";
+import { it, describe, expect, vi, beforeEach, afterEach } from "vitest";
 import userEvent from "@testing-library/user-event";
 import { AuthModal } from "#/components/features/waitlist/auth-modal";
-import * as CaptureConsent from "#/utils/handle-capture-consent";
+import * as AuthHook from "#/context/auth-context";
+
+// Mock the useAuthUrl hook
+vi.mock("#/hooks/use-auth-url", () => ({
+  useAuthUrl: () => "https://gitlab.com/oauth/authorize"
+}));

 describe("AuthModal", () => {
-  beforeAll(() => {
+  beforeEach(() => {
    vi.stubGlobal("location", { href: "" });
+    vi.spyOn(AuthHook, "useAuth").mockReturnValue({
+      providersAreSet: false,
+      setProvidersAreSet: vi.fn(),
+      providerTokensSet: [],
+      setProviderTokensSet: vi.fn()
+    });
  });

-  afterAll(() => {
+  afterEach(() => {
    vi.unstubAllGlobals();
+    vi.resetAllMocks();
  });

-  it("should render a tos checkbox that is unchecked by default", () => {
-    render(<AuthModal githubAuthUrl={null} />);
-    const checkbox = screen.getByRole("checkbox");
+  it("should render the GitHub and GitLab buttons", () => {
+    render(<AuthModal githubAuthUrl="mock-url" appMode="saas" />);

-    expect(checkbox).not.toBeChecked();
+    const githubButton = screen.getByRole("button", { name: "GITHUB$CONNECT_TO_GITHUB" });
+    const gitlabButton = screen.getByRole("button", { name: "GITLAB$CONNECT_TO_GITLAB" });
+
+    expect(githubButton).toBeInTheDocument();
+    expect(gitlabButton).toBeInTheDocument();
  });

-  it("should only enable the GitHub button if the tos checkbox is checked", async () => {
+  it("should redirect to GitHub auth URL when GitHub button is clicked", async () => {
    const user = userEvent.setup();
-    render(<AuthModal githubAuthUrl={null} />);
-    const checkbox = screen.getByRole("checkbox");
-    const button = screen.getByRole("button", { name: "GITHUB$CONNECT_TO_GITHUB" });
+    const mockUrl = "https://github.com/login/oauth/authorize";
+    render(<AuthModal githubAuthUrl={mockUrl} appMode="saas" />);

-    expect(button).toBeDisabled();
+    const githubButton = screen.getByRole("button", { name: "GITHUB$CONNECT_TO_GITHUB" });
+    await user.click(githubButton);

-    await user.click(checkbox);
-
-    expect(button).not.toBeDisabled();
-  });
-
-  it("should set user analytics consent to true when the user checks the tos checkbox", async () => {
-    const handleCaptureConsentSpy = vi.spyOn(
-      CaptureConsent,
-      "handleCaptureConsent",
-    );
-
-    const user = userEvent.setup();
-    render(<AuthModal githubAuthUrl="mock-url" />);
-
-    const checkbox = screen.getByRole("checkbox");
-    await user.click(checkbox);
-
-    const button = screen.getByRole("button", { name: "GITHUB$CONNECT_TO_GITHUB" });
-    await user.click(button);
-
-    expect(handleCaptureConsentSpy).toHaveBeenCalledWith(true);
+    expect(window.location.href).toBe(mockUrl);
  });
 });
--- a/frontend/tests/components/features/chat/path-component.test.tsx
+++ b/frontend/tests/components/features/chat/path-component.test.tsx
@@ -0,0 +1,34 @@
+import { describe, expect, it } from "vitest";
+import { isLikelyDirectory } from "#/components/features/chat/path-component";
+
+describe("isLikelyDirectory", () => {
+  it("should return false for empty path", () => {
+    expect(isLikelyDirectory("")).toBe(false);
+  });
+
+  it("should return true for paths ending with forward slash", () => {
+    expect(isLikelyDirectory("/path/to/dir/")).toBe(true);
+    expect(isLikelyDirectory("dir/")).toBe(true);
+  });
+
+  it("should return true for paths ending with backslash", () => {
+    expect(isLikelyDirectory("C:\\path\\to\\dir\\")).toBe(true);
+    expect(isLikelyDirectory("dir\\")).toBe(true);
+  });
+
+  it("should return true for paths without extension", () => {
+    expect(isLikelyDirectory("/path/to/dir")).toBe(true);
+    expect(isLikelyDirectory("dir")).toBe(true);
+  });
+
+  it("should return false for paths ending with dot", () => {
+    expect(isLikelyDirectory("/path/to/dir.")).toBe(false);
+    expect(isLikelyDirectory("dir.")).toBe(false);
+  });
+
+  it("should return false for paths with file extensions", () => {
+    expect(isLikelyDirectory("/path/to/file.txt")).toBe(false);
+    expect(isLikelyDirectory("file.js")).toBe(false);
+    expect(isLikelyDirectory("script.test.ts")).toBe(false);
+  });
+});
--- a/frontend/tests/components/features/git/git-repo-selector.test.tsx
+++ b/frontend/tests/components/features/git/git-repo-selector.test.tsx
@@ -56,12 +56,16 @@ describe("GitRepositorySelector", () => {
        full_name: "test/repo1",
        git_provider: "github" as Provider,
        stargazers_count: 100,
+        is_public: true,
+        pushed_at: "2023-01-01T00:00:00Z",
      },
      {
        id: 2,
        full_name: "test/repo2",
        git_provider: "github" as Provider,
        stargazers_count: 200,
+        is_public: true,
+        pushed_at: "2023-01-02T00:00:00Z",
      },
    ];

--- a/frontend/tests/components/features/home/home-header.test.tsx
+++ b/frontend/tests/components/features/home/home-header.test.tsx
@@ -0,0 +1,73 @@
+import { QueryClientProvider, QueryClient } from "@tanstack/react-query";
+import { render, screen } from "@testing-library/react";
+import { Provider } from "react-redux";
+import { createRoutesStub } from "react-router";
+import { setupStore } from "test-utils";
+import { describe, expect, it, vi } from "vitest";
+import userEvent from "@testing-library/user-event";
+import { AuthProvider } from "#/context/auth-context";
+import { HomeHeader } from "#/components/features/home/home-header";
+import OpenHands from "#/api/open-hands";
+
+const renderHomeHeader = () => {
+  const RouterStub = createRoutesStub([
+    {
+      Component: HomeHeader,
+      path: "/",
+    },
+    {
+      Component: () => <div data-testid="conversation-screen" />,
+      path: "/conversations/:conversationId",
+    },
+  ]);
+
+  return render(<RouterStub />, {
+    wrapper: ({ children }) => (
+      <Provider store={setupStore()}>
+        <AuthProvider initialProvidersAreSet>
+          <QueryClientProvider client={new QueryClient()}>
+            {children}
+          </QueryClientProvider>
+        </AuthProvider>
+      </Provider>
+    ),
+  });
+};
+
+describe("HomeHeader", () => {
+  it("should create an empty conversation and redirect when pressing the launch from scratch button", async () => {
+    const createConversationSpy = vi.spyOn(OpenHands, "createConversation");
+
+    renderHomeHeader();
+
+    const launchButton = screen.getByRole("button", {
+      name: /launch from scratch/i,
+    });
+    await userEvent.click(launchButton);
+
+    expect(createConversationSpy).toHaveBeenCalledExactlyOnceWith(
+      "gui",
+      undefined,
+      undefined,
+      undefined,
+      [],
+      undefined,
+      undefined,
+    );
+
+    // expect to be redirected to /conversations/:conversationId
+    await screen.findByTestId("conversation-screen");
+  });
+
+  it("should change the launch button text to 'Loading...' when creating a conversation", async () => {
+    renderHomeHeader();
+
+    const launchButton = screen.getByRole("button", {
+      name: /launch from scratch/i,
+    });
+    await userEvent.click(launchButton);
+
+    expect(launchButton).toHaveTextContent(/Loading/i);
+    expect(launchButton).toBeDisabled();
+  });
+});
--- a/frontend/tests/components/features/home/repo-connector.test.tsx
+++ b/frontend/tests/components/features/home/repo-connector.test.tsx
@@ -0,0 +1,224 @@
+import { render, screen, waitFor, within } from "@testing-library/react";
+import { describe, expect, it, vi } from "vitest";
+import userEvent from "@testing-library/user-event";
+import { QueryClientProvider, QueryClient } from "@tanstack/react-query";
+import { setupStore } from "test-utils";
+import { Provider } from "react-redux";
+import { createRoutesStub, Outlet } from "react-router";
+import OpenHands from "#/api/open-hands";
+import { AuthProvider } from "#/context/auth-context";
+import { GitRepository } from "#/types/git";
+import { RepoConnector } from "#/components/features/home/repo-connector";
+
+const renderRepoConnector = (initialProvidersAreSet = true) => {
+  const mockRepoSelection = vi.fn();
+  const RouterStub = createRoutesStub([
+    {
+      Component: () => <RepoConnector onRepoSelection={mockRepoSelection} />,
+      path: "/",
+    },
+    {
+      Component: () => <div data-testid="conversation-screen" />,
+      path: "/conversations/:conversationId",
+    },
+    {
+      Component: Outlet,
+      path: "/settings",
+      children: [
+        {
+          Component: () => <div data-testid="settings-screen" />,
+          path: "/settings",
+        },
+        {
+          Component: () => <div data-testid="git-settings-screen" />,
+          path: "/settings/git",
+        },
+      ],
+    },
+  ]);
+
+  return render(<RouterStub />, {
+    wrapper: ({ children }) => (
+      <Provider store={setupStore()}>
+        <AuthProvider initialProvidersAreSet={initialProvidersAreSet}>
+          <QueryClientProvider client={new QueryClient()}>
+            {children}
+          </QueryClientProvider>
+        </AuthProvider>
+      </Provider>
+    ),
+  });
+};
+
+const MOCK_RESPOSITORIES: GitRepository[] = [
+  {
+    id: 1,
+    full_name: "rbren/polaris",
+    git_provider: "github",
+    is_public: true,
+  },
+  {
+    id: 2,
+    full_name: "All-Hands-AI/OpenHands",
+    git_provider: "github",
+    is_public: true,
+  },
+];
+
+describe("RepoConnector", () => {
+  it("should render the repository connector section", () => {
+    renderRepoConnector();
+    screen.getByTestId("repo-connector");
+  });
+
+  it("should render the available repositories in the dropdown", async () => {
+    const retrieveUserGitRepositoriesSpy = vi.spyOn(
+      OpenHands,
+      "retrieveUserGitRepositories",
+    );
+    retrieveUserGitRepositoriesSpy.mockResolvedValue(MOCK_RESPOSITORIES);
+
+    renderRepoConnector();
+
+    // findBy* retries until the loading state is replaced with the dropdown
+    const dropdown = await screen.findByTestId("repo-dropdown");
+    await userEvent.click(dropdown);
+
+    await waitFor(() => {
+      screen.getByText("rbren/polaris");
+      screen.getByText("All-Hands-AI/OpenHands");
+    });
+  });
+
+  it("should only enable the launch button if a repo is selected", async () => {
+    const retrieveUserGitRepositoriesSpy = vi.spyOn(
+      OpenHands,
+      "retrieveUserGitRepositories",
+    );
+    retrieveUserGitRepositoriesSpy.mockResolvedValue(MOCK_RESPOSITORIES);
+
+    renderRepoConnector();
+
+    const launchButton = screen.getByTestId("repo-launch-button");
+    expect(launchButton).toBeDisabled();
+
+    // findBy* retries until the loading state is replaced with the dropdown
+    const dropdown = await screen.findByTestId("repo-dropdown");
+    await userEvent.click(dropdown);
+    await userEvent.click(screen.getByText("rbren/polaris"));
+
+    expect(launchButton).toBeEnabled();
+  });
+
+  it("should render the 'add git(hub|lab) repos' links if saas mode", async () => {
+    const getConfigSpy = vi.spyOn(OpenHands, "getConfig");
+    // @ts-expect-error - only return the APP_MODE
+    getConfigSpy.mockResolvedValue({
+      APP_MODE: "saas",
+    });
+
+    renderRepoConnector();
+
+    await screen.findByText("Add GitHub repos");
+  });
+
+  it("should not render the 'add git(hub|lab) repos' links if oss mode", async () => {
+    const getConfigSpy = vi.spyOn(OpenHands, "getConfig");
+    // @ts-expect-error - only return the APP_MODE
+    getConfigSpy.mockResolvedValue({
+      APP_MODE: "oss",
+    });
+
+    renderRepoConnector();
+    // NOTE(review): asserted before the mocked config resolves - confirm.
+    expect(screen.queryByText("Add GitHub repos")).not.toBeInTheDocument();
+    expect(screen.queryByText("Add GitLab repos")).not.toBeInTheDocument();
+  });
+
+  it("should create a conversation and redirect with the selected repo when pressing the launch button", async () => {
+    const createConversationSpy = vi.spyOn(OpenHands, "createConversation");
+    const retrieveUserGitRepositoriesSpy = vi.spyOn(
+      OpenHands,
+      "retrieveUserGitRepositories",
+    );
+    retrieveUserGitRepositoriesSpy.mockResolvedValue(MOCK_RESPOSITORIES);
+
+    renderRepoConnector();
+
+    const repoConnector = screen.getByTestId("repo-connector");
+    const launchButton =
+      within(repoConnector).getByTestId("repo-launch-button");
+    await userEvent.click(launchButton);
+
+    // repo not selected yet
+    expect(createConversationSpy).not.toHaveBeenCalled();
+
+    // select a repository from the dropdown
+    const dropdown = await waitFor(() =>
+      within(repoConnector).getByTestId("repo-dropdown"),
+    );
+    await userEvent.click(dropdown);
+
+    const repoOption = screen.getByText("rbren/polaris");
+    await userEvent.click(repoOption);
+    await userEvent.click(launchButton);
+
+    expect(createConversationSpy).toHaveBeenCalledExactlyOnceWith(
+      "gui",
+      "rbren/polaris",
+      "github",
+      undefined,
+      [],
+      undefined,
+      undefined,
+    );
+  });
+
+  it("should change the launch button text to 'Loading...' when creating a conversation", async () => {
+    const retrieveUserGitRepositoriesSpy = vi.spyOn(
+      OpenHands,
+      "retrieveUserGitRepositories",
+    );
+    retrieveUserGitRepositoriesSpy.mockResolvedValue(MOCK_RESPOSITORIES);
+
+    renderRepoConnector();
+
+    const launchButton = screen.getByTestId("repo-launch-button");
+
+    // findBy* retries until the loading state is replaced with the dropdown
+    const dropdown = await screen.findByTestId("repo-dropdown");
+    await userEvent.click(dropdown);
+    await userEvent.click(screen.getByText("rbren/polaris"));
+
+    await userEvent.click(launchButton);
+    expect(launchButton).toBeDisabled();
+    expect(launchButton).toHaveTextContent(/Loading/i);
+  });
+
+  it("should not display a button to settings if the user is signed in with their git provider", async () => {
+    renderRepoConnector(true);
+    expect(
+      screen.queryByTestId("navigate-to-settings-button"),
+    ).not.toBeInTheDocument();
+  });
+
+  it("should display a button to settings if the user needs to sign in with their git provider", async () => {
+    renderRepoConnector(false);
+
+    const goToSettingsButton = await screen.findByTestId(
+      "navigate-to-settings-button",
+    );
+    const dropdown = screen.queryByTestId("repo-dropdown");
+    const launchButton = screen.queryByTestId("repo-launch-button");
+    const providerLinks = screen.queryAllByText(/add git(hub|lab) repos/i);
+
+    expect(dropdown).not.toBeInTheDocument();
+    expect(launchButton).not.toBeInTheDocument();
+    expect(providerLinks.length).toBe(0);
+
+    expect(goToSettingsButton).toBeInTheDocument();
+
+    await userEvent.click(goToSettingsButton);
+    await screen.findByTestId("git-settings-screen");
+  });
+});
--- a/frontend/tests/components/features/home/task-card.test.tsx
+++ b/frontend/tests/components/features/home/task-card.test.tsx
@@ -0,0 +1,111 @@
+import { render, screen } from "@testing-library/react";
+import { beforeEach, describe, expect, it, vi } from "vitest";
+import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
+import userEvent from "@testing-library/user-event";
+import { Provider } from "react-redux";
+import { createRoutesStub } from "react-router";
+import { setupStore } from "test-utils";
+import { SuggestedTask } from "#/components/features/home/tasks/task.types";
+import OpenHands from "#/api/open-hands";
+import { AuthProvider } from "#/context/auth-context";
+import { TaskCard } from "#/components/features/home/tasks/task-card";
+import { GitRepository } from "#/types/git";
+
+const MOCK_TASK_1: SuggestedTask = {
+  issue_number: 123,
+  repo: "repo1",
+  title: "Task 1",
+  task_type: "MERGE_CONFLICTS",
+  git_provider: "github",
+};
+
+const MOCK_RESPOSITORIES: GitRepository[] = [
+  { id: 1, full_name: "repo1", git_provider: "github", is_public: true },
+  { id: 2, full_name: "repo2", git_provider: "github", is_public: true },
+  { id: 3, full_name: "repo3", git_provider: "gitlab", is_public: true },
+  { id: 4, full_name: "repo4", git_provider: "gitlab", is_public: true },
+];
+
+const renderTaskCard = (task = MOCK_TASK_1) => {
+  const RouterStub = createRoutesStub([
+    {
+      Component: () => <TaskCard task={task} />,
+      path: "/",
+    },
+    {
+      Component: () => <div data-testid="conversation-screen" />,
+      path: "/conversations/:conversationId",
+    },
+  ]);
+
+  return render(<RouterStub />, {
+    wrapper: ({ children }) => (
+      <Provider store={setupStore()}>
+        <AuthProvider initialProvidersAreSet>
+          <QueryClientProvider client={new QueryClient()}>
+            {children}
+          </QueryClientProvider>
+        </AuthProvider>
+      </Provider>
+    ),
+  });
+};
+
+describe("TaskCard", () => {
+  it("format the issue id", async () => {
+    renderTaskCard();
+
+    const taskId = screen.getByTestId("task-id");
+    expect(taskId).toHaveTextContent(/#123/i);
+  });
+
+  it("should call createConversation when clicking the launch button", async () => {
+    const createConversationSpy = vi.spyOn(OpenHands, "createConversation");
+
+    renderTaskCard();
+
+    const launchButton = screen.getByTestId("task-launch-button");
+    await userEvent.click(launchButton);
+
+    expect(createConversationSpy).toHaveBeenCalled();
+  });
+
+  describe("creating suggested task conversation", () => {
+    beforeEach(() => {
+      const retrieveUserGitRepositoriesSpy = vi.spyOn(
+        OpenHands,
+        "retrieveUserGitRepositories",
+      );
+      retrieveUserGitRepositoriesSpy.mockResolvedValue(MOCK_RESPOSITORIES);
+    });
+
+    it("should call create conversation with suggest task trigger and selected suggested task", async () => {
+      const createConversationSpy = vi.spyOn(OpenHands, "createConversation");
+
+      renderTaskCard(MOCK_TASK_1);
+
+      const launchButton = screen.getByTestId("task-launch-button");
+      await userEvent.click(launchButton);
+
+      expect(createConversationSpy).toHaveBeenCalledWith(
+        "suggested_task",
+        MOCK_RESPOSITORIES[0].full_name,
+        MOCK_RESPOSITORIES[0].git_provider,
+        undefined,
+        [],
+        undefined,
+        MOCK_TASK_1,
+      );
+    });
+  });
+
+  it("should disable the launch button and update text content when creating a conversation", async () => {
+    renderTaskCard();
+
+    const launchButton = screen.getByTestId("task-launch-button");
+    await userEvent.click(launchButton);
+
+    expect(launchButton).toHaveTextContent(/Loading/i);
+    expect(launchButton).toBeDisabled();
+  });
+});
--- a/frontend/tests/components/features/home/task-suggestions.test.tsx
+++ b/frontend/tests/components/features/home/task-suggestions.test.tsx
@@ -0,0 +1,99 @@
+import { render, screen, waitFor } from "@testing-library/react";
+import { afterEach, describe, expect, it, vi } from "vitest";
+import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
+import { Provider } from "react-redux";
+import { createRoutesStub } from "react-router";
+import { setupStore } from "test-utils";
+import { TaskSuggestions } from "#/components/features/home/tasks/task-suggestions";
+import { SuggestionsService } from "#/api/suggestions-service/suggestions-service.api";
+import { MOCK_TASKS } from "#/mocks/task-suggestions-handlers";
+import { AuthProvider } from "#/context/auth-context";
+
+const renderTaskSuggestions = (initialProvidersAreSet = true) => {
+  const RouterStub = createRoutesStub([
+    {
+      Component: TaskSuggestions,
+      path: "/",
+    },
+    {
+      Component: () => <div data-testid="conversation-screen" />,
+      path: "/conversations/:conversationId",
+    },
+    {
+      Component: () => <div data-testid="settings-screen" />,
+      path: "/settings",
+    },
+  ]);
+
+  return render(<RouterStub />, {
+    wrapper: ({ children }) => (
+      <Provider store={setupStore()}>
+        <AuthProvider initialProvidersAreSet={initialProvidersAreSet}>
+          <QueryClientProvider client={new QueryClient()}>
+            {children}
+          </QueryClientProvider>
+        </AuthProvider>
+      </Provider>
+    ),
+  });
+};
+
+describe("TaskSuggestions", () => {
+  const getSuggestedTasksSpy = vi.spyOn(
+    SuggestionsService,
+    "getSuggestedTasks",
+  );
+
+  afterEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it("should render the task suggestions section", () => {
+    renderTaskSuggestions();
+    screen.getByTestId("task-suggestions");
+  });
+
+  it("should render an empty message if there are no tasks", async () => {
+    getSuggestedTasksSpy.mockResolvedValue([]);
+    renderTaskSuggestions();
+    await screen.findByText(/No tasks available/i);
+  });
+
+  it("should render the task groups with the correct titles", async () => {
+    getSuggestedTasksSpy.mockResolvedValue(MOCK_TASKS);
+    renderTaskSuggestions();
+
+    await waitFor(() => {
+      MOCK_TASKS.forEach((taskGroup) => {
+        screen.getByText(taskGroup.title);
+      });
+    });
+  });
+
+  it("should render the task cards with the correct task details", async () => {
+    getSuggestedTasksSpy.mockResolvedValue(MOCK_TASKS);
+    renderTaskSuggestions();
+    // NOTE(review): identical to the group-title test; are cards verified?
+    await waitFor(() => {
+      MOCK_TASKS.forEach((task) => {
+        screen.getByText(task.title);
+      });
+    });
+  });
+
+  it("should render skeletons when loading", async () => {
+    getSuggestedTasksSpy.mockResolvedValue(MOCK_TASKS);
+    renderTaskSuggestions();
+
+    const skeletons = screen.getAllByTestId("task-group-skeleton");
+    expect(skeletons.length).toBeGreaterThan(0);
+
+    await waitFor(() => {
+      MOCK_TASKS.forEach((taskGroup) => {
+        screen.getByText(taskGroup.title);
+      });
+    });
+
+    expect(screen.queryByTestId("task-group-skeleton")).not.toBeInTheDocument();
+  });
+});
--- a/frontend/tests/components/features/payment/payment-form.test.tsx
+++ b/frontend/tests/components/features/payment/payment-form.test.tsx
@@ -61,25 +61,25 @@ describe("PaymentForm", () => {
    renderPaymentForm();

    const topUpInput = await screen.findByTestId("top-up-input");
-    await user.type(topUpInput, "50.12");
+    await user.type(topUpInput, "50");

    const topUpButton = screen.getByText("PAYMENT$ADD_CREDIT");
    await user.click(topUpButton);

-    expect(createCheckoutSessionSpy).toHaveBeenCalledWith(50.12);
+    expect(createCheckoutSessionSpy).toHaveBeenCalledWith(50);
  });

-  it("should round the top-up amount to two decimal places", async () => {
+  it("should only accept integer values", async () => {
    const user = userEvent.setup();
    renderPaymentForm();

    const topUpInput = await screen.findByTestId("top-up-input");
-    await user.type(topUpInput, "50.125456");
+    await user.type(topUpInput, "50");

    const topUpButton = screen.getByText("PAYMENT$ADD_CREDIT");
    await user.click(topUpButton);

-    expect(createCheckoutSessionSpy).toHaveBeenCalledWith(50.13);
+    expect(createCheckoutSessionSpy).toHaveBeenCalledWith(50);
  });

  it("should disable the top-up button if the user enters an invalid amount", async () => {
@@ -100,7 +100,7 @@ describe("PaymentForm", () => {
    renderPaymentForm();

    const topUpInput = await screen.findByTestId("top-up-input");
-    await user.type(topUpInput, "50.12");
+    await user.type(topUpInput, "50");

    const topUpButton = screen.getByText("PAYMENT$ADD_CREDIT");
    await user.click(topUpButton);
@@ -114,7 +114,7 @@ describe("PaymentForm", () => {
      renderPaymentForm();

      const topUpInput = await screen.findByTestId("top-up-input");
-      await user.type(topUpInput, "-50.12");
+      await user.type(topUpInput, "-50");

      const topUpButton = screen.getByText("PAYMENT$ADD_CREDIT");
      await user.click(topUpButton);
@@ -139,6 +139,8 @@ describe("PaymentForm", () => {
      const user = userEvent.setup();
      renderPaymentForm();

+      // With type="number", the browser would prevent non-numeric input,
+      // but we'll test the validation logic anyway
      const topUpInput = await screen.findByTestId("top-up-input");
      await user.type(topUpInput, "abc");

@@ -160,5 +162,19 @@ describe("PaymentForm", () => {

      expect(createCheckoutSessionSpy).not.toHaveBeenCalled();
    });
+
+    test("user enters a decimal value", async () => {
+      const user = userEvent.setup();
+      renderPaymentForm();
+
+      // With step="1", the browser would validate this, but we'll test our validation logic
+      const topUpInput = await screen.findByTestId("top-up-input");
+      await user.type(topUpInput, "50.5");
+
+      const topUpButton = screen.getByText("PAYMENT$ADD_CREDIT");
+      await user.click(topUpButton);
+
+      expect(createCheckoutSessionSpy).not.toHaveBeenCalled();
+    });
  });
 });
--- a/frontend/tests/components/file-explorer/explorer-tree.test.tsx
+++ b/frontend/tests/components/file-explorer/explorer-tree.test.tsx
@@ -1,28 +0,0 @@
-import { screen } from "@testing-library/react";
-import { renderWithProviders } from "test-utils";
-import { describe, afterEach, vi, it, expect } from "vitest";
-import { ExplorerTree } from "#/components/features/file-explorer/explorer-tree";
-
-const FILES = ["file-1-1.ts", "folder-1-2"];
-
-describe.skip("ExplorerTree", () => {
-  afterEach(() => {
-    vi.resetAllMocks();
-  });
-
-  it("should render the explorer", () => {
-    renderWithProviders(<ExplorerTree files={FILES} defaultOpen />);
-
-    expect(screen.getByText("file-1-1.ts")).toBeInTheDocument();
-    expect(screen.getByText("folder-1-2")).toBeInTheDocument();
-    // TODO: make sure children render
-  });
-
-  it("should render the explorer given the defaultExpanded prop", () => {
-    renderWithProviders(<ExplorerTree files={FILES} />);
-
-    expect(screen.queryByText("file-1-1.ts")).toBeInTheDocument();
-    expect(screen.queryByText("folder-1-2")).toBeInTheDocument();
-    // TODO: make sure children don't render
-  });
-});
--- a/frontend/tests/components/file-explorer/file-explorer.test.tsx
+++ b/frontend/tests/components/file-explorer/file-explorer.test.tsx
@@ -1,64 +0,0 @@
-import { screen } from "@testing-library/react";
-import userEvent from "@testing-library/user-event";
-import { renderWithProviders } from "test-utils";
-import { describe, it, expect, vi, afterEach } from "vitest";
-import { AgentState } from "#/types/agent-state";
-import { FileExplorer } from "#/components/features/file-explorer/file-explorer";
-import { FileService } from "#/api/file-service/file-service.api";
-
-const getFilesSpy = vi.spyOn(FileService, "getFiles");
-
-vi.mock("../../services/fileService", async () => ({
-  uploadFiles: vi.fn(),
-}));
-
-const renderFileExplorerWithRunningAgentState = () =>
-  renderWithProviders(<FileExplorer isOpen onToggle={() => {}} />, {
-    preloadedState: {
-      agent: {
-        curAgentState: AgentState.RUNNING,
-      },
-    },
-  });
-
-describe.skip("FileExplorer", () => {
-  afterEach(() => {
-    vi.clearAllMocks();
-  });
-
-  it("should get the workspace directory", async () => {
-    renderFileExplorerWithRunningAgentState();
-
-    expect(await screen.findByText("folder1")).toBeInTheDocument();
-    expect(await screen.findByText("file1.ts")).toBeInTheDocument();
-    expect(getFilesSpy).toHaveBeenCalledTimes(1); // once for root
-  });
-
-  it("should refetch the workspace when clicking the refresh button", async () => {
-    const user = userEvent.setup();
-    renderFileExplorerWithRunningAgentState();
-
-    expect(await screen.findByText("folder1")).toBeInTheDocument();
-    expect(await screen.findByText("file1.ts")).toBeInTheDocument();
-    expect(getFilesSpy).toHaveBeenCalledTimes(1); // once for root
-
-    const refreshButton = screen.getByTestId("refresh");
-    await user.click(refreshButton);
-
-    expect(getFilesSpy).toHaveBeenCalledTimes(2); // once for root, once for refresh button
-  });
-
-  it("should toggle the explorer visibility when clicking the toggle button", async () => {
-    const user = userEvent.setup();
-    renderFileExplorerWithRunningAgentState();
-
-    const folder1 = await screen.findByText("folder1");
-    expect(folder1).toBeInTheDocument();
-
-    const toggleButton = screen.getByTestId("toggle");
-    await user.click(toggleButton);
-
-    expect(folder1).toBeInTheDocument();
-    expect(folder1).not.toBeVisible();
-  });
-});
--- a/Show More
+++ b/Show More