Merge branch 'main' into test-user

Add Logging
2026-04-29 03:00:45 -04:00 · 2025-09-17 14:02:52 -04:00 · 2025-09-07 21:55:39 -04:00 · 2025-09-05 20:52:48 -04:00 · 2025-09-05 14:20:10 -04:00 · 2025-09-05 13:16:03 -04:00
386 changed files with 5793 additions and 22680 deletions
--- a/.github/scripts/update_pr_description.sh
+++ b/.github/scripts/update_pr_description.sh
@@ -18,7 +18,7 @@ DOCKER_RUN_COMMAND="docker run -it --rm \
  docker.all-hands.dev/all-hands-ai/openhands:${SHORT_SHA}"

 # Define the uvx command
-UVX_RUN_COMMAND="uvx --python 3.12 --from git+https://github.com/All-Hands-AI/OpenHands@${BRANCH_NAME}#subdirectory=openhands-cli openhands"
+UVX_RUN_COMMAND="uvx --python 3.12 --from git+https://github.com/All-Hands-AI/OpenHands@${BRANCH_NAME} openhands"

 # Get the current PR body
 PR_BODY=$(gh pr view "$PR_NUMBER" --json body --jq .body)
--- a/.github/workflows/cli-build-test.yml
+++ b/.github/workflows/cli-build-test.yml
@@ -1,58 +0,0 @@
-# Workflow that builds and tests the CLI binary executable
-name: CLI - Build and Test Binary
-
-# Run on pushes to main branch and all pull requests, but only when CLI files change
-on:
-  push:
-    branches:
-      - main
-    paths:
-      - "openhands-cli/**"
-  pull_request:
-    paths:
-      - "openhands-cli/**"
-
-# Cancel previous runs if a new commit is pushed
-concurrency:
-  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  build-and-test-binary:
-    name: Build and test binary executable
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: 3.12
-
-      - name: Install uv
-        uses: astral-sh/setup-uv@v3
-        with:
-          version: "latest"
-
-      - name: Install dependencies
-        working-directory: openhands-cli
-        run: |
-          uv sync
-
-      - name: Build binary executable
-        working-directory: openhands-cli
-        run: |
-          ./build.sh --install-pyinstaller | tee output.log
-          echo "Full output:"
-          cat output.log
-
-          if grep -q "❌" output.log; then
-            echo "❌ Found failure marker in output"
-            exit 1
-          fi
-
-          echo "✅ Build & test finished without ❌ markers"
--- a/.github/workflows/ghcr-build.yml
+++ b/.github/workflows/ghcr-build.yml
@@ -46,7 +46,6 @@ jobs:
          else
            json=$(jq -n -c '[
                { image: "nikolaik/python-nodejs:python3.12-nodejs22", tag: "nikolaik" },
-                { image: "ghcr.io/all-hands-ai/python-nodejs:python3.13-nodejs22-trixie", tag: "trixie" },
                { image: "ubuntu:24.04", tag: "ubuntu" }
              ]')
          fi
@@ -137,7 +136,6 @@ jobs:
        if: github.event.pull_request.head.repo.fork != true
        shell: bash
        run: |
-
          ./containers/build.sh -i runtime -o ${{ env.REPO_OWNER }} -t ${{ matrix.base_image.tag }} --dry

          DOCKER_BUILD_JSON=$(jq -c . < docker-build-dry.json)
@@ -213,8 +211,6 @@ jobs:
            latest=auto
            prefix=
            suffix=
-        env:
-          DOCKER_METADATA_PR_HEAD_SHA: true
      - name: Determine app image tag
        shell: bash
        run: |
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -73,24 +73,6 @@ jobs:
        working-directory: ./enterprise
        run: pre-commit run --all-files --config ./dev_config/python/.pre-commit-config.yaml

-  lint-cli-python:
-    name: Lint CLI python
-    runs-on: blacksmith-4vcpu-ubuntu-2204
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-      - name: Set up python
-        uses: useblacksmith/setup-python@v6
-        with:
-          python-version: 3.12
-          cache: "pip"
-      - name: Install pre-commit
-        run: pip install pre-commit==4.2.0
-      - name: Run pre-commit hooks
-        working-directory: ./openhands-cli
-        run: pre-commit run --all-files --config ./dev_config/python/.pre-commit-config.yaml
-
  # Check version consistency across documentation
  check-version-consistency:
    name: Check version consistency
--- a/.github/workflows/py-tests.yml
+++ b/.github/workflows/py-tests.yml
@@ -19,16 +19,12 @@ jobs:
  # Run python tests on Linux
  test-on-linux:
    name: Python Tests on Linux
-    runs-on: blacksmith-4vcpu-ubuntu-2404
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    env:
      INSTALL_DOCKER: "0" # Set to '0' to skip Docker installation
    strategy:
      matrix:
        python-version: ["3.12"]
-    permissions:
-      # For coverage comment and python-coverage-comment-action branch
-      pull-requests: write
-      contents: write
    steps:
      - uses: actions/checkout@v4
      - name: Set up Docker Buildx
@@ -52,21 +48,10 @@ jobs:
      - name: Build Environment
        run: make build
      - name: Run Unit Tests
-        run: PYTHONPATH=".:$PYTHONPATH" poetry run pytest --forked -n auto -s ./tests/unit --cov=openhands --cov-branch
-        env:
-          COVERAGE_FILE: ".coverage.${{ matrix.python_version }}"
+        run: PYTHONPATH=".:$PYTHONPATH" poetry run pytest --forked -n auto -svv ./tests/unit
      - name: Run Runtime Tests with CLIRuntime
-        run: PYTHONPATH=".:$PYTHONPATH" TEST_RUNTIME=cli poetry run pytest -s tests/runtime/test_bash.py --cov=openhands --cov-branch
-        env:
-          COVERAGE_FILE: ".coverage.runtime.${{ matrix.python_version }}"
-      - name: Store coverage file
-        uses: actions/upload-artifact@v4
-        with:
-          name: coverage-openhands
-          path: |
-            .coverage.${{ matrix.python_version }}
-            .coverage.runtime.${{ matrix.python_version }}
-          include-hidden-files: true
+        run: PYTHONPATH=".:$PYTHONPATH" TEST_RUNTIME=cli poetry run pytest -svv tests/runtime/test_bash.py
+
  # Run specific Windows python tests
  test-on-windows:
    name: Python Tests on Windows
@@ -100,7 +85,7 @@ jobs:
          DEBUG: "1"
  test-enterprise:
    name: Enterprise Python Unit Tests
-    runs-on: blacksmith-4vcpu-ubuntu-2404
+    runs-on: blacksmith-4vcpu-ubuntu-2204
    strategy:
      matrix:
        python-version: ["3.12"]
@@ -117,87 +102,5 @@ jobs:
        working-directory: ./enterprise
        run: poetry install --with dev,test
      - name: Run Unit Tests
-        # Use base working directory for coverage paths to line up.
-        run: PYTHONPATH=".:$PYTHONPATH" poetry run --project=enterprise pytest --forked -n auto -s -p no:ddtrace -p no:ddtrace.pytest_bdd -p no:ddtrace.pytest_benchmark ./enterprise/tests/unit --cov=enterprise --cov-branch
-        env:
-          COVERAGE_FILE: ".coverage.enterprise.${{ matrix.python_version }}"
-      - name: Store coverage file
-        uses: actions/upload-artifact@v4
-        with:
-          name: coverage-enterprise
-          path: ".coverage.enterprise.${{ matrix.python_version }}"
-          include-hidden-files: true
-
-  # Run CLI unit tests
-  test-cli-python:
-    name: CLI Unit Tests
-    runs-on: blacksmith-4vcpu-ubuntu-2404
-    strategy:
-      matrix:
-        python-version: ["3.12"]
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Set up Python
-        uses: useblacksmith/setup-python@v6
-        with:
-          python-version: ${{ matrix.python-version }}
-
-      - name: Install uv
-        uses: astral-sh/setup-uv@v3
-        with:
-          version: "latest"
-
-      - name: Install dependencies
-        working-directory: ./openhands-cli
-        run: |
-          uv sync --group dev
-
-      - name: Run CLI unit tests
-        working-directory: ./openhands-cli
-        env:
-          # write coverage to repo root so the merge step finds it
-          COVERAGE_FILE: "${{ github.workspace }}/.coverage.openhands-cli.${{ matrix.python-version }}"
-        run: |
-          uv run pytest --forked -n auto -s \
-            -p no:ddtrace -p no:ddtrace.pytest_bdd -p no:ddtrace.pytest_benchmark \
-            tests --cov=openhands_cli --cov-branch
-
-      - name: Store coverage file
-        uses: actions/upload-artifact@v4
-        with:
-          name: coverage-openhands-cli
-          path: ".coverage.openhands-cli.${{ matrix.python-version }}"
-          include-hidden-files: true
-
-
-  coverage-comment:
-    name: Coverage Comment
-    if: github.event_name == 'pull_request'
-    runs-on: ubuntu-latest
-    needs: [test-on-linux, test-enterprise, test-cli-python]
-
-    permissions:
-      pull-requests: write
-      contents: write
-    steps:
-      - uses: actions/checkout@v4
-
-      - uses: actions/download-artifact@v5
-        id: download
-        with:
-          pattern: coverage-*
-          merge-multiple: true
-
-      - name: Create symlink for CLI source files
-        run: ln -sf openhands-cli/openhands_cli openhands_cli
-
-      - name: Coverage comment
-        id: coverage_comment
-        uses: py-cov-action/python-coverage-comment-action@v3
-        with:
-          GITHUB_TOKEN: ${{ github.token }}
-          MERGE_COVERAGE_FILES: true
+        working-directory: ./enterprise
+        run: PYTHONPATH=".:$PYTHONPATH" poetry run pytest --forked -n auto -svv -p no:ddtrace -p no:ddtrace.pytest_bdd -p no:ddtrace.pytest_benchmark ./tests/unit
--- a/.github/workflows/pypi-release.yml
+++ b/.github/workflows/pypi-release.yml
@@ -1,7 +1,7 @@
 # Publishes the OpenHands PyPi package
 name: Publish PyPi Package

-
+# Triggered manually
 on:
  workflow_dispatch:
    inputs:
@@ -9,9 +9,6 @@ on:
        description: 'Reason for manual trigger'
        required: true
        default: ''
-  push:
-    tags:
-      - "*"

 jobs:
  release:
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -15,8 +15,8 @@ jobs:
          stale-issue-message: 'This issue is stale because it has been open for 40 days with no activity. Remove the stale label or leave a comment, otherwise it will be closed in 10 days.'
          stale-pr-message: 'This PR is stale because it has been open for 40 days with no activity. Remove the stale label or leave a comment, otherwise it will be closed in 10 days.'
          days-before-stale: 40
-          exempt-issue-labels: roadmap,backlog,app-team
+          exempt-issue-labels: roadmap,backlog
          close-issue-message: 'This issue was automatically closed due to 50 days of inactivity. We do this to help keep the issues somewhat manageable and focus on active issues.'
          close-pr-message: 'This PR was closed because it had no activity for 50 days. If you feel this was closed in error, and you would like to continue the PR, please resubmit or let us know.'
          days-before-close: 10
-          operations-per-run: 300
+          operations-per-run: 150
--- a/.github/workflows/welcome-good-first-issue.yml
+++ b/.github/workflows/welcome-good-first-issue.yml
@@ -45,7 +45,7 @@ jobs:
                    "This issue has been labeled as **good first issue**, which means it's a great place to get started with the OpenHands project.\n\n" +
                    "If you're interested in working on it, feel free to! No need to ask for permission.\n\n" +
                    "Be sure to check out our [development setup guide](" + repoUrl + "/blob/main/Development.md) to get your environment set up, and follow our [contribution guidelines](" + repoUrl + "/blob/main/CONTRIBUTING.md) when you're ready to submit a fix.\n\n" +
-                    "Feel free to join our developer community on [Slack](https://all-hands.dev/joinslack). You can ask for [help](https://openhands-ai.slack.com/archives/C078L0FUGUX), [feedback](https://openhands-ai.slack.com/archives/C086ARSNMGA), and even ask for a [PR review](https://openhands-ai.slack.com/archives/C08D8FJ5771).\n\n" +
+                    "Feel free to join our developer community on [Slack](https://dub.sh/openhands). You can ask for [help](https://openhands-ai.slack.com/archives/C078L0FUGUX), [feedback](https://openhands-ai.slack.com/archives/C086ARSNMGA), and even ask for a [PR review](https://openhands-ai.slack.com/archives/C08D8FJ5771).\n\n" +
                    "🙌 Happy hacking! 🙌\n\n" +
                    "<!-- auto-comment:good-first-issue -->"
            });
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -113,19 +113,19 @@ individual, or aggression toward or disparagement of classes of individuals.
 **Consequence**: A permanent ban from any sort of public interaction within the
 community.

-### Slack Etiquettes
+### Slack and Discord Etiquettes

-These Slack etiquette guidelines are designed to foster an inclusive, respectful, and productive environment for all community members. By following these best practices, we ensure effective communication and collaboration while minimizing disruptions. Let’s work together to build a supportive and welcoming community!
+These Slack and Discord etiquette guidelines are designed to foster an inclusive, respectful, and productive environment for all community members. By following these best practices, we ensure effective communication and collaboration while minimizing disruptions. Let’s work together to build a supportive and welcoming community!

 - Communicate respectfully and professionally, avoiding sarcasm or harsh language, and remember that tone can be difficult to interpret in text.
 - Use threads for specific discussions to keep channels organized and easier to follow.
 - Tag others only when their input is critical or urgent, and use @here, @channel or @everyone sparingly to minimize disruptions.
 - Be patient, as open-source contributors and maintainers often have other commitments and may need time to respond.
- Post questions or discussions in the most relevant channel (e.g., for [slack - #general](https://openhands-ai.slack.com/archives/C06P5NCGSFP) for general topics, [slack - #questions](https://openhands-ai.slack.com/archives/C06U8UTKSAD) for queries/questions.
+- Post questions or discussions in the most relevant channel (e.g., [slack - #general](https://openhands-ai.slack.com/archives/C06P5NCGSFP) for general topics, [slack - #questions](https://openhands-ai.slack.com/archives/C06U8UTKSAD) for queries/questions, or [discord - #general](https://discord.com/channels/1222935860639563850/1222935861386018885)).
 - When asking for help or raising issues, include necessary details like links, screenshots, or clear explanations to provide context.
 - Keep discussions in public channels whenever possible to allow others to benefit from the conversation, unless the matter is sensitive or private.
 - Always adhere to [our standards](https://github.com/All-Hands-AI/OpenHands/blob/main/CODE_OF_CONDUCT.md#our-standards) to ensure a welcoming and collaborative environment.
- If you choose to mute a channel, consider setting up alerts for topics that still interest you to stay engaged. For Slack, Go to Settings → Notifications → My Keywords to add specific keywords that will notify you when mentioned. For example, if you're here for discussions about LLMs, mute the channel if it’s too busy, but set notifications to alert you only when “LLMs” appears in messages.
+- If you choose to mute a channel, consider setting up alerts for topics that still interest you to stay engaged. For Slack, go to Settings → Notifications → My Keywords to add specific keywords that will notify you when mentioned. For example, if you're here for discussions about LLMs, mute the channel if it’s too busy, but set notifications to alert you only when “LLMs” appears in messages. For Discord, open the channel's notification settings and choose the option that best fits your needs.

 ## Attribution

--- a/Development.md
+++ b/Development.md
@@ -159,7 +159,7 @@ poetry run pytest ./tests/unit/test_*.py
 To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker
 container image by setting the SANDBOX_RUNTIME_CONTAINER_IMAGE environment variable to the desired Docker image.

-Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.58-nikolaik`
+Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.56-nikolaik`

 ## Develop inside Docker container

--- a/README.md
+++ b/README.md
@@ -11,7 +11,8 @@
  <a href="https://github.com/All-Hands-AI/OpenHands/stargazers"><img src="https://img.shields.io/github/stars/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Stargazers"></a>
  <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE"><img src="https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="MIT License"></a>
  <br/>
-  <a href="https://all-hands.dev/joinslack"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
+  <a href="https://dub.sh/openhands"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
+  <a href="https://discord.gg/ESHStjSjD4"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community"></a>
  <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/CREDITS.md"><img src="https://img.shields.io/badge/Project-Credits-blue?style=for-the-badge&color=FFE165&logo=github&logoColor=white" alt="Credits"></a>
  <br/>
  <a href="https://docs.all-hands.dev/usage/getting-started"><img src="https://img.shields.io/badge/Documentation-000?logo=googledocs&logoColor=FFE165&style=for-the-badge" alt="Check out the documentation"></a>
@@ -43,6 +44,8 @@ Learn more at [docs.all-hands.dev](https://docs.all-hands.dev), or [sign up for
 > [this short form](https://docs.google.com/forms/d/e/1FAIpQLSet3VbGaz8z32gW9Wm-Grl4jpt5WgMXPgJ4EDPVmCETCBpJtQ/viewform)
 > to join our Design Partner program, where you'll get early access to commercial features and the opportunity to provide input on our product roadmap.

+![App screenshot](./docs/static/img/screenshot.png)
+
 ## ☁️ OpenHands Cloud
 The easiest way to get started with OpenHands is on [OpenHands Cloud](https://app.all-hands.dev),
 which comes with $20 in free credits for new users.
@@ -76,17 +79,17 @@ You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)
 You can also run OpenHands directly with Docker:

 ```bash
-docker pull docker.all-hands.dev/all-hands-ai/runtime:0.58-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.56-nikolaik

 docker run -it --rm --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.58-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.56-nikolaik \
    -e LOG_ALL_EVENTS=true \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -v ~/.openhands:/.openhands \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app \
-    docker.all-hands.dev/all-hands-ai/openhands:0.58
+    docker.all-hands.dev/all-hands-ai/openhands:0.56
 ```

 </details>
@@ -100,7 +103,7 @@ docker run -it --rm --pull=always \
 ### Getting Started

 When you open the application, you'll be asked to choose an LLM provider and add an API key.
-[Anthropic's Claude Sonnet 4.5](https://www.anthropic.com/api) (`anthropic/claude-sonnet-4-5-20250929`)
+[Anthropic's Claude Sonnet 4](https://www.anthropic.com/api) (`anthropic/claude-sonnet-4-20250514`)
 works best, but you have [many options](https://docs.all-hands.dev/usage/llms).

 See the [Running OpenHands](https://docs.all-hands.dev/usage/installation) guide for
@@ -137,9 +140,10 @@ troubleshooting resources, and advanced configuration options.
 ## 🤝 How to Join the Community

 OpenHands is a community-driven project, and we welcome contributions from everyone. We do most of our communication
-through Slack, so this is the best place to start, but we also are happy to have you contact us on Github:
+through Slack, so this is the best place to start, but we are also happy to have you contact us on Discord or GitHub:

- [Join our Slack workspace](https://all-hands.dev/joinslack) - Here we talk about research, architecture, and future development.
+- [Join our Slack workspace](https://dub.sh/openhands) - Here we talk about research, architecture, and future development.
+- [Join our Discord server](https://discord.gg/ESHStjSjD4) - This is a community-run server for general discussion, questions, and feedback.
 - [Read or post Github Issues](https://github.com/All-Hands-AI/OpenHands/issues) - Check out the issues we're working on, or add your own ideas.

 See more about the community in [COMMUNITY.md](./COMMUNITY.md) or find details on contributing in [CONTRIBUTING.md](./CONTRIBUTING.md).
--- a/README_CN.md
+++ b/README_CN.md
@@ -0,0 +1,148 @@
+
+<a name="readme-top"></a>
+
+<div align="center">
+  <img src="./docs/static/img/logo.png" alt="Logo" width="200">
+  <h1 align="center">OpenHands: 少写代码，多做事</h1>
+</div>
+
+
+<div align="center">
+  <a href="https://github.com/All-Hands-AI/OpenHands/graphs/contributors"><img src="https://img.shields.io/github/contributors/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Contributors"></a>
+  <a href="https://github.com/All-Hands-AI/OpenHands/stargazers"><img src="https://img.shields.io/github/stars/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Stargazers"></a>
+  <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE"><img src="https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="MIT License"></a>
+  <br/>
+  <a href="https://dub.sh/openhands"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="加入我们的Slack社区"></a>
+  <a href="https://discord.gg/ESHStjSjD4"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="加入我们的Discord社区"></a>
+  <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/CREDITS.md"><img src="https://img.shields.io/badge/Project-Credits-blue?style=for-the-badge&color=FFE165&logo=github&logoColor=white" alt="致谢"></a>
+  <br/>
+  <a href="https://docs.all-hands.dev/usage/getting-started"><img src="https://img.shields.io/badge/Documentation-000?logo=googledocs&logoColor=FFE165&style=for-the-badge" alt="查看文档"></a>
+  <a href="https://arxiv.org/abs/2407.16741"><img src="https://img.shields.io/badge/Paper%20on%20Arxiv-000?logoColor=FFE165&logo=arxiv&style=for-the-badge" alt="Arxiv论文"></a>
+  <a href="https://docs.google.com/spreadsheets/d/1wOUdFCMyY6Nt0AIqF705KN4JKOWgeI4wUGUP60krXXs/edit?gid=0#gid=0"><img src="https://img.shields.io/badge/Benchmark%20score-000?logoColor=FFE165&logo=huggingface&style=for-the-badge" alt="评估基准分数"></a>
+  <hr>
+</div>
+
+欢迎使用OpenHands（前身为OpenDevin），这是一个由AI驱动的软件开发代理平台。
+
+OpenHands代理可以完成人类开发者能做的任何事情：修改代码、运行命令、浏览网页、调用API，甚至从StackOverflow复制代码片段。
+
+在[docs.all-hands.dev](https://docs.all-hands.dev)了解更多信息，或[注册OpenHands Cloud](https://app.all-hands.dev)开始使用。
+
+> [!IMPORTANT]
+> 在工作中使用OpenHands？我们很想与您交流！填写
+> [这份简短表格](https://docs.google.com/forms/d/e/1FAIpQLSet3VbGaz8z32gW9Wm-Grl4jpt5WgMXPgJ4EDPVmCETCBpJtQ/viewform)
+> 加入我们的设计合作伙伴计划，您将获得商业功能的早期访问权限，并有机会对我们的产品路线图提供意见。
+
+![应用截图](./docs/static/img/screenshot.png)
+
+## ☁️ OpenHands Cloud
+开始使用OpenHands的最简单方式是在[OpenHands Cloud](https://app.all-hands.dev)上，
+新用户可获得$20的免费额度。
+
+## 💻 在本地运行OpenHands
+
+OpenHands也可以使用Docker在本地系统上运行。
+查看[运行OpenHands](https://docs.all-hands.dev/usage/installation)指南了解
+系统要求和更多信息。
+
+> [!WARNING]
+> 在公共网络上？请参阅我们的[强化Docker安装指南](https://docs.all-hands.dev/usage/runtimes/docker#hardened-docker-installation)
+> 通过限制网络绑定和实施其他安全措施来保护您的部署。
+
+
+```bash
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.56-nikolaik
+
+docker run -it --rm --pull=always \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.56-nikolaik \
+    -e LOG_ALL_EVENTS=true \
+    -v /var/run/docker.sock:/var/run/docker.sock \
+    -v ~/.openhands:/.openhands \
+    -p 3000:3000 \
+    --add-host host.docker.internal:host-gateway \
+    --name openhands-app \
+    docker.all-hands.dev/all-hands-ai/openhands:0.56
+```
+
+> **注意**: 如果您在0.44版本之前使用过OpenHands，您可能需要运行 `mv ~/.openhands-state ~/.openhands` 来将对话历史迁移到新位置。
+
+您将在[http://localhost:3000](http://localhost:3000)找到运行中的OpenHands！
+
+打开应用程序时，您将被要求选择一个LLM提供商并添加API密钥。
+[Anthropic的Claude Sonnet 4](https://www.anthropic.com/api)（`anthropic/claude-sonnet-4-20250514`）
+效果最佳，但您还有[许多选择](https://docs.all-hands.dev/usage/llms)。
+
+## 💡 运行OpenHands的其他方式
+
+> [!CAUTION]
+> OpenHands旨在由单个用户在其本地工作站上运行。
+> 它不适合多租户部署，即多个用户共享同一实例。没有内置的身份验证、隔离或可扩展性。
+>
+> 如果您有兴趣在多租户环境中运行OpenHands，请
+> [与我们联系](https://docs.google.com/forms/d/e/1FAIpQLSet3VbGaz8z32gW9Wm-Grl4jpt5WgMXPgJ4EDPVmCETCBpJtQ/viewform)
+> 了解高级部署选项。
+
+您还可以[将OpenHands连接到本地文件系统](https://docs.all-hands.dev/usage/runtimes/docker#connecting-to-your-filesystem)，
+以可编程的[无头模式](https://docs.all-hands.dev/usage/how-to/headless-mode)运行OpenHands，
+通过[友好的CLI](https://docs.all-hands.dev/usage/how-to/cli-mode)与其交互，
+或使用[GitHub Action](https://docs.all-hands.dev/usage/how-to/github-action)在标记的问题上运行它。
+
+访问[运行OpenHands](https://docs.all-hands.dev/usage/installation)获取更多信息和设置说明。
+
+如果您想修改OpenHands源代码，请查看[Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md)。
+
+遇到问题？[故障排除指南](https://docs.all-hands.dev/usage/troubleshooting)可以提供帮助。
+
+## 📖 文档
+  <a href="https://deepwiki.com/All-Hands-AI/OpenHands"><img src="https://deepwiki.com/badge.svg" alt="Ask DeepWiki" title="DeepWiki自动生成文档"></a>
+
+要了解有关项目的更多信息，以及使用OpenHands的技巧，
+请查看我们的[文档](https://docs.all-hands.dev/usage/getting-started)。
+
+在那里，您将找到有关如何使用不同LLM提供商、
+故障排除资源和高级配置选项的资源。
+
+## 🤝 如何加入社区
+
+OpenHands是一个社区驱动的项目，我们欢迎每个人的贡献。我们大部分沟通
+通过Slack进行，因此这是开始的最佳场所，但我们也很乐意您通过Discord或Github与我们联系：
+
+- [加入我们的Slack工作空间](https://dub.sh/openhands) - 这里我们讨论研究、架构和未来发展。
+- [加入我们的Discord服务器](https://discord.gg/ESHStjSjD4) - 这是一个社区运营的服务器，用于一般讨论、问题和反馈。
+- [阅读或发布Github问题](https://github.com/All-Hands-AI/OpenHands/issues) - 查看我们正在处理的问题，或添加您自己的想法。
+
+在[COMMUNITY.md](./COMMUNITY.md)中了解更多关于社区的信息，或在[CONTRIBUTING.md](./CONTRIBUTING.md)中找到有关贡献的详细信息。
+
+## 📈 进展
+
+在[这里](https://github.com/orgs/All-Hands-AI/projects/1)查看OpenHands月度路线图（每月月底在维护者会议上更新）。
+
+<p align="center">
+  <a href="https://star-history.com/#All-Hands-AI/OpenHands&Date">
+    <img src="https://api.star-history.com/svg?repos=All-Hands-AI/OpenHands&type=Date" width="500" alt="Star History Chart">
+  </a>
+</p>
+
+## 📜 许可证
+
+根据MIT许可证分发。有关更多信息，请参阅[`LICENSE`](./LICENSE)。
+
+## 🙏 致谢
+
+OpenHands由大量贡献者构建，每一份贡献都备受感谢！我们还借鉴了其他开源项目，对他们的工作深表感谢。
+
+有关OpenHands中使用的开源项目和许可证列表，请参阅我们的[CREDITS.md](./CREDITS.md)文件。
+
+## 📚 引用
+
+```
+@misc{openhands,
+      title={{OpenHands: An Open Platform for AI Software Developers as Generalist Agents}},
+      author={Xingyao Wang and Boxuan Li and Yufan Song and Frank F. Xu and Xiangru Tang and Mingchen Zhuge and Jiayi Pan and Yueqi Song and Bowen Li and Jaskirat Singh and Hoang H. Tran and Fuqiang Li and Ren Ma and Mingzhang Zheng and Bill Qian and Yanjun Shao and Niklas Muennighoff and Yizhe Zhang and Binyuan Hui and Junyang Lin and Robert Brennan and Hao Peng and Heng Ji and Graham Neubig},
+      year={2024},
+      eprint={2407.16741},
+      archivePrefix={arXiv},
+      primaryClass={cs.SE},
+      url={https://arxiv.org/abs/2407.16741},
+}
+```
--- a/README_JA.md
+++ b/README_JA.md
@@ -0,0 +1,60 @@
+<a name="readme-top"></a>
+
+<div align="center">
+  <img src="./docs/static/img/logo.png" alt="Logo" width="200">
+  <h1 align="center">OpenHands: コードを減らして、もっと作ろう</h1>
+</div>
+
+<div align="center">
+  <a href="https://github.com/All-Hands-AI/OpenHands/graphs/contributors"><img src="https://img.shields.io/github/contributors/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Contributors"></a>
+  <a href="https://github.com/All-Hands-AI/OpenHands/stargazers"><img src="https://img.shields.io/github/stars/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Stargazers"></a>
+  <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE"><img src="https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="MIT License"></a>
+  <br/>
+  <a href="https://dub.sh/openhands"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Slackコミュニティに参加"></a>
+  <a href="https://discord.gg/ESHStjSjD4"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Discordコミュニティに参加"></a>
+  <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/CREDITS.md"><img src="https://img.shields.io/badge/Project-Credits-blue?style=for-the-badge&color=FFE165&logo=github&logoColor=white" alt="クレジット"></a>
+  <br/>
+  <a href="https://docs.all-hands.dev/usage/getting-started"><img src="https://img.shields.io/badge/Documentation-000?logo=googledocs&logoColor=FFE165&style=for-the-badge" alt="ドキュメントを見る"></a>
+  <a href="https://arxiv.org/abs/2407.16741"><img src="https://img.shields.io/badge/Paper%20on%20Arxiv-000?logoColor=FFE165&logo=arxiv&style=for-the-badge" alt="Arxiv論文"></a>
+  <a href="https://docs.google.com/spreadsheets/d/1wOUdFCMyY6Nt0AIqF705KN4JKOWgeI4wUGUP60krXXs/edit?gid=0#gid=0"><img src="https://img.shields.io/badge/Benchmark%20score-000?logoColor=FFE165&logo=huggingface&style=for-the-badge" alt="評価ベンチマークスコア"></a>
+  <hr>
+</div>
+
+OpenHands（旧OpenDevin）へようこそ。これはAIが駆動するソフトウェア開発エージェントのプラットフォームです。
+
+OpenHandsのエージェントは人間の開発者ができることは何でもこなします。コードを修正し、コマンドを実行し、ウェブを閲覧し、APIを呼び出し、StackOverflowからコードスニペットをコピーすることさえできます。
+
+詳細は[docs.all-hands.dev](https://docs.all-hands.dev)をご覧いただくか、[OpenHands Cloud](https://app.all-hands.dev)に登録して始めましょう。
+
+> [!IMPORTANT]
+> 仕事でOpenHandsを使っていますか？ぜひお話を聞かせてください。[こちらの短いフォーム](https://docs.google.com/forms/d/e/1FAIpQLSet3VbGaz8z32gW9Wm-Grl4jpt5WgMXPgJ4EDPVmCETCBpJtQ/viewform)にご記入いただき、Design Partnerプログラムにご参加ください。商用機能の早期アクセスや製品ロードマップへのフィードバックの機会を提供します。
+
+![アプリのスクリーンショット](./docs/static/img/screenshot.png)
+
+## ☁️ OpenHands Cloud
+OpenHandsを始める最も簡単な方法は[OpenHands Cloud](https://app.all-hands.dev)を利用することです。新規ユーザーには20ドル分の無料クレジットが付与されます。
+
+## 💻 OpenHandsをローカルで実行する
+
+OpenHandsはDockerを利用してローカル環境でも実行できます。システム要件や詳細については[Running OpenHands](https://docs.all-hands.dev/usage/installation)ガイドをご覧ください。
+
+> [!WARNING]
+> 公共ネットワークで実行していますか？[Hardened Docker Installation Guide](https://docs.all-hands.dev/usage/runtimes/docker#hardened-docker-installation)を参照して、ネットワークバインディングの制限や追加のセキュリティ対策を実施してください。
+
+```bash
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.56-nikolaik
+
+docker run -it --rm --pull=always \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.56-nikolaik \
+    -e LOG_ALL_EVENTS=true \
+    -v /var/run/docker.sock:/var/run/docker.sock \
+    -v ~/.openhands:/.openhands \
+    -p 3000:3000 \
+    --add-host host.docker.internal:host-gateway \
+    --name openhands-app \
+    docker.all-hands.dev/all-hands-ai/openhands:0.56
+```
+
+**注**: バージョン0.44以前のOpenHandsを使用していた場合は、会話履歴を移行するために `mv ~/.openhands-state ~/.openhands` を実行してください。
+
+OpenHandsは[http://localhost:3000](http://localhost:3000)で起動します！
--- a/config.template.toml
+++ b/config.template.toml
@@ -489,47 +489,6 @@ type = "noop"
 # Run the runtime sandbox container in privileged mode for use with docker-in-docker
 #privileged = false

-#################################### MCP #####################################
-# Configuration for Model Context Protocol (MCP) servers
-# MCP allows OpenHands to communicate with external tool servers
-##############################################################################
-[mcp]
-# SSE servers - Server-Sent Events transport (legacy)
-#sse_servers = [
-#    # Basic SSE server with just a URL
-#    "http://localhost:8080/mcp/sse",
-#
-#    # SSE server with authentication
-#    {url = "https://api.example.com/mcp/sse", api_key = "your-api-key"}
-#]
-
-# SHTTP servers - Streamable HTTP transport (recommended)
-#shttp_servers = [
-#    # Basic SHTTP server with default 60s timeout
-#    "https://api.example.com/mcp/shttp",
-#
-#    # SHTTP server with custom timeout for long-running tools
-#    {
-#        url = "https://api.example.com/mcp/shttp",
-#        api_key = "your-api-key",
-#        timeout = 180  # 3 minutes for processing-heavy tools (1-3600 seconds)
-#    }
-#]
-
-# Stdio servers - Direct process communication (development only)
-#stdio_servers = [
-#    # Basic stdio server
-#    {name = "filesystem", command = "npx", args = ["@modelcontextprotocol/server-filesystem", "/"]},
-#
-#    # Stdio server with environment variables
-#    {
-#        name = "fetch",
-#        command = "uvx",
-#        args = ["mcp-server-fetch"],
-#        env = {DEBUG = "true"}
-#    }
-#]
-
 #################################### Model Routing ############################
 # Configuration for experimental model routing feature
 # Enables intelligent switching between different LLM models for specific purposes
--- a/containers/app/Dockerfile
+++ b/containers/app/Dockerfile
@@ -1,5 +1,5 @@
 ARG OPENHANDS_BUILD_VERSION=dev
-FROM node:24.8-trixie-slim AS frontend-builder
+FROM node:24.3.0-bookworm-slim AS frontend-builder

 WORKDIR /app

@@ -9,7 +9,7 @@ RUN npm ci
 COPY frontend ./
 RUN npm run build

-FROM python:3.13.7-slim-trixie AS base
+FROM python:3.12.10-slim AS base
 FROM base AS backend-builder

 WORKDIR /app
--- a/containers/dev/compose.yml
+++ b/containers/dev/compose.yml
@@ -12,7 +12,7 @@ services:
      - SANDBOX_API_HOSTNAME=host.docker.internal
      - DOCKER_HOST_ADDR=host.docker.internal
      #
-      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.58-nikolaik}
+      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.56-nikolaik}
      - SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
      - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
    ports:
--- a/dev_config/python/.pre-commit-config.yaml
+++ b/dev_config/python/.pre-commit-config.yaml
@@ -3,9 +3,9 @@ repos:
    rev: v5.0.0
    hooks:
      - id: trailing-whitespace
-        exclude: ^(docs/|modules/|python/|openhands-ui/|third_party/|enterprise/|openhands-cli/)
+        exclude: ^(docs/|modules/|python/|openhands-ui/|third_party/|enterprise/)
      - id: end-of-file-fixer
-        exclude: ^(docs/|modules/|python/|openhands-ui/|third_party/|enterprise/|openhands-cli/)
+        exclude: ^(docs/|modules/|python/|openhands-ui/|third_party/|enterprise/)
      - id: check-yaml
        args: ["--allow-multiple-documents"]
      - id: debug-statements
@@ -28,12 +28,12 @@ repos:
        entry: ruff check --config dev_config/python/ruff.toml
        types_or: [python, pyi, jupyter]
        args: [--fix, --unsafe-fixes]
-        exclude: ^(third_party/|enterprise/|openhands-cli/)
+        exclude: ^(third_party/|enterprise/)
      # Run the formatter.
      - id: ruff-format
        entry: ruff format --config dev_config/python/ruff.toml
        types_or: [python, pyi, jupyter]
-        exclude: ^(third_party/|enterprise/|openhands-cli/)
+        exclude: ^(third_party/|enterprise/)

  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.15.0
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -7,7 +7,7 @@ services:
    image: openhands:latest
    container_name: openhands-app-${DATE:-}
    environment:
-      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-docker.all-hands.dev/all-hands-ai/runtime:0.58-nikolaik}
+      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-docker.all-hands.dev/all-hands-ai/runtime:0.56-nikolaik}
      #- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234} # enable this only if you want a specific non-root sandbox user but you will have to manually adjust permissions of ~/.openhands for this user
      - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
    ports:
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -3,16 +3,18 @@
  "theme": "mint",
  "name": "All Hands Docs",
  "colors": {
-    "primary": "#99873c"
+    "primary": "#99873c",
+    "light": "#ffe165",
+    "dark": "#ffe165"
  },
  "background": {
    "color": {
-      "light": "#f7f3ee"
+      "light": "#f7f3ee",
+      "dark": "#0B0D0E"
    }
  },
  "appearance": {
-    "default": "light",
-    "strict": true
+    "default": "light"
  },
  "favicon": "/logo-square.png",
  "navigation": {
@@ -29,7 +31,6 @@
            "group": "OpenHands Cloud",
            "pages": [
              "usage/cloud/openhands-cloud",
-              "usage/cloud/pro-subscription",
              {
                "group": "Integrations",
                "pages": [
@@ -108,7 +109,8 @@
                },
                "usage/configuration-options",
                "usage/how-to/custom-sandbox-guide",
-                "usage/search-engine-setup"
+                "usage/search-engine-setup",
+                "usage/mcp"
                ]
              }
            ]
@@ -116,13 +118,7 @@
          {
            "group": "Customizations & Settings",
            "pages": [
-              {
-                "group": "OpenHands Settings",
-                "pages": [
-                  "usage/settings/secrets-settings",
-                  "usage/settings/mcp-settings"
-                ]
-              },
+              "usage/common-settings",
              "usage/prompting/repository",
              {
                "group": "Microagents",
@@ -212,8 +208,9 @@
  },
  "footer": {
    "socials": {
-      "slack": "https://all-hands.dev/joinslack",
-      "github": "https://github.com/All-Hands-AI/OpenHands"
+      "slack": "https://dub.sh/openhands",
+      "github": "https://github.com/All-Hands-AI/OpenHands",
+      "discord": "https://discord.gg/ESHStjSjD4"
    }
  },
  "contextual": {
--- a/docs/index.mdx
+++ b/docs/index.mdx
@@ -4,8 +4,7 @@ description: OpenHands - Code Less, Make More
 icon: book-open
 mode: wide
 ---
-Use AI to tackle the toil in your backlog. Our agents have all the same tools as a human developer:
-they can modify code, run commands, browse the web, call APIs, and yes-even copy code snippets from StackOverflow.
+Use AI to tackle the toil in your backlog. Our agents have all the same tools as a human developer: they can modify code, run commands, browse the web, call APIs, and yes-even copy code snippets from StackOverflow.

 <iframe
  className="w-full aspect-video"
--- a/docs/reo-init.js
+++ b/docs/reo-init.js
@@ -1,14 +0,0 @@
-// Reo.dev tracking initialization
-(function() {
-  var e, t, n;
-  e = "6bac7145b4ee6ec";
-  t = function() {
-    Reo.init({clientID: "6bac7145b4ee6ec"});
-  };
-  n = document.createElement("script");
-  n.src = "https://static.reo.dev/" + e + "/reo.js";
-  n.defer = true;
-  n.onload = t;
-  document.head.appendChild(n);
-})();
-
--- a/docs/static/img/api-key-generation.png
+++ b/docs/static/img/api-key-generation.png
--- a/docs/static/img/connect-repo-no-github.png
+++ b/docs/static/img/connect-repo-no-github.png
--- a/docs/static/img/connect-repo.png
+++ b/docs/static/img/connect-repo.png
--- a/docs/static/img/oh-features.png
+++ b/docs/static/img/oh-features.png
--- a/docs/static/img/screenshot.png
+++ b/docs/static/img/screenshot.png
--- a/docs/usage/cloud/bitbucket-installation.mdx
+++ b/docs/usage/cloud/bitbucket-installation.mdx
@@ -8,21 +8,9 @@ description: This guide walks you through the process of installing OpenHands Cl

 - Signed in to [OpenHands Cloud](https://app.all-hands.dev) with [a Bitbucket account](/usage/cloud/openhands-cloud).

-## Adding Bitbucket Repository Access
-
-Upon signing into OpenHands Cloud with a Bitbucket account, OpenHands will have access to your repositories.
-
-## Working With Bitbucket Repos in Openhands Cloud
-
-After signing in with a Bitbucket account, use the `Open Repository` section to select the appropriate repository and
-branch you'd like OpenHands to work on. Then click on `Launch` to start the conversation!
-
-![Connect Repo](/static/img/connect-repo.png)
-
 ## IP Whitelisting

-If your Bitbucket Cloud instance has IP restrictions, you'll need to whitelist the following IP addresses to allow
-OpenHands to access your repositories:
+If your Bitbucket Cloud instance has IP restrictions, you'll need to whitelist the following IP addresses to allow OpenHands to access your repositories:

 ### Core App IP
 ```
@@ -43,6 +31,17 @@ OpenHands to access your repositories:
 34.60.55.59
 ```

+## Adding Bitbucket Repository Access
+
+Upon signing into OpenHands Cloud with a Bitbucket account, OpenHands will have access to your repositories.
+
+## Working With Bitbucket Repos in OpenHands Cloud
+
+After signing in with a Bitbucket account, use the `select a repo` and `select a branch` dropdowns to select the
+appropriate repository and branch you'd like OpenHands to work on. Then click on `Launch` to start the conversation!
+
+![Connect Repo](/static/img/connect-repo-no-github.png)
+
 ## Next Steps

 - [Learn about the Cloud UI](/usage/cloud/cloud-ui).
--- a/docs/usage/cloud/cloud-api.mdx
+++ b/docs/usage/cloud/cloud-api.mdx
@@ -12,92 +12,104 @@ For the available API endpoints, refer to the
 To use the OpenHands Cloud API, you'll need to generate an API key:

 1. Log in to your [OpenHands Cloud](https://app.all-hands.dev) account.
-2. Navigate to the [Settings > API Keys](https://app.all-hands.dev/settings/api-keys) page.
-3. Click `Create API Key`.
-4. Give your key a descriptive name (Example: "Development" or "Production") and select `Create`.
-5. Copy the generated API key and store it securely. It will only be shown once.
+2. Navigate to the [Settings page](https://app.all-hands.dev/settings).
+3. Select the `API Keys` tab.
+4. Click `Create API Key`.
+5. Give your key a descriptive name (Example: "Development" or "Production") and select `Create`.
+6. Copy the generated API key and store it securely. It will only be shown once.

-## API Usage Example
+![API Key Generation](/static/img/api-key-generation.png)
+
+## API Usage

 ### Starting a New Conversation

-To start a new conversation with OpenHands to perform a task,
-[you'll need to make a POST request to the conversation endpoint](/api-reference/new-conversation).
+To start a new conversation with OpenHands to perform a task, you'll need to make a POST request to the conversation endpoint.

-<Tabs>
-  <Tab title="cURL">
-    ```bash
-    curl -X POST "https://app.all-hands.dev/api/conversations" \
-      -H "Authorization: Bearer YOUR_API_KEY" \
-      -H "Content-Type: application/json" \
-      -d '{
-        "initial_user_msg": "Check whether there is any incorrect information in the README.md file and send a PR to fix it if so.",
-        "repository": "yourusername/your-repo"
-      }'
-    ```
-  </Tab>
-  <Tab title="Python (with requests)">
-    ```python
-    import requests
+#### Request Parameters

-    api_key = "YOUR_API_KEY"
-    url = "https://app.all-hands.dev/api/conversations"
+| Parameter          | Type     | Required | Description                                                                                          |
+|--------------------|----------|----------|------------------------------------------------------------------------------------------------------|
+| `initial_user_msg` | string   | Yes      | The initial message to start the conversation.                                                       |
+| `repository`       | string   | No       | Git repository name to provide context in the format `owner/repo`. You must have access to the repo. |

-    headers = {
-        "Authorization": f"Bearer {api_key}",
-        "Content-Type": "application/json"
-    }
+#### Examples

-    data = {
-        "initial_user_msg": "Check whether there is any incorrect information in the README.md file and send a PR to fix it if so.",
-        "repository": "yourusername/your-repo"
-    }

-    response = requests.post(url, headers=headers, json=data)
-    conversation = response.json()
+<Accordion title="cURL">
+  ```bash
+  curl -X POST "https://app.all-hands.dev/api/conversations" \
+    -H "Authorization: Bearer YOUR_API_KEY" \
+    -H "Content-Type: application/json" \
+    -d '{
+      "initial_user_msg": "Check whether there is any incorrect information in the README.md file and send a PR to fix it if so.",
+      "repository": "yourusername/your-repo"
+    }'
+  ```
+</Accordion>

-    print(f"Conversation Link: https://app.all-hands.dev/conversations/{conversation['conversation_id']}")
-    print(f"Status: {conversation['status']}")
-    ```
-    </Tab>
-    <Tab title="TypeScript/JavaScript (with fetch)">
-        ```typescript
-    const apiKey = "YOUR_API_KEY";
-    const url = "https://app.all-hands.dev/api/conversations";
+<Accordion title="Python (with requests)">
+  ```python
+  import requests

-    const headers = {
-      "Authorization": `Bearer ${apiKey}`,
+  api_key = "YOUR_API_KEY"
+  url = "https://app.all-hands.dev/api/conversations"
+
+  headers = {
+      "Authorization": f"Bearer {api_key}",
      "Content-Type": "application/json"
-    };
+  }

-    const data = {
-      initial_user_msg: "Check whether there is any incorrect information in the README.md file and send a PR to fix it if so.",
-      repository: "yourusername/your-repo"
-    };
+  data = {
+      "initial_user_msg": "Check whether there is any incorrect information in the README.md file and send a PR to fix it if so.",
+      "repository": "yourusername/your-repo"
+  }

-    async function startConversation() {
-      try {
-        const response = await fetch(url, {
-          method: "POST",
-          headers: headers,
-          body: JSON.stringify(data)
-        });
+  response = requests.post(url, headers=headers, json=data)
+  conversation = response.json()

-        const conversation = await response.json();
+  print(f"Conversation Link: https://app.all-hands.dev/conversations/{conversation['conversation_id']}")
+  print(f"Status: {conversation['status']}")
+  ```
+</Accordion>

-        console.log(`Conversation Link: https://app.all-hands.dev/conversations/${conversation.id}`);
-        console.log(`Status: ${conversation.status}`);
+<Accordion title="TypeScript/JavaScript (with fetch)">
+  ```typescript
+  const apiKey = "YOUR_API_KEY";
+  const url = "https://app.all-hands.dev/api/conversations";

-        return conversation;
-      } catch (error) {
-        console.error("Error starting conversation:", error);
-      }
+  const headers = {
+    "Authorization": `Bearer ${apiKey}`,
+    "Content-Type": "application/json"
+  };
+
+  const data = {
+    initial_user_msg: "Check whether there is any incorrect information in the README.md file and send a PR to fix it if so.",
+    repository: "yourusername/your-repo"
+  };
+
+  async function startConversation() {
+    try {
+      const response = await fetch(url, {
+        method: "POST",
+        headers: headers,
+        body: JSON.stringify(data)
+      });
+
+      const conversation = await response.json();
+
+      console.log(`Conversation Link: https://app.all-hands.dev/conversations/${conversation.conversation_id}`);
+      console.log(`Status: ${conversation.status}`);
+
+      return conversation;
+    } catch (error) {
+      console.error("Error starting conversation:", error);
    }
+  }

-    startConversation();
-    ```
-  </Tab>
-</Tabs>
+  startConversation();
+  ```
+</Accordion>

 #### Response

@@ -116,6 +128,42 @@ You may receive an `AuthenticationError` if:
 - You provided the wrong repository name.
 - You don't have access to the repository.

+
+### Retrieving Conversation Status
+
+You can check the status of a conversation by making a GET request to the conversation endpoint.
+
+#### Endpoint
+
+```
+GET https://app.all-hands.dev/api/conversations/{conversation_id}
+```
+
+#### Example
+
+<Accordion title="cURL">
+  ```bash
+  curl -X GET "https://app.all-hands.dev/api/conversations/{conversation_id}" \
+    -H "Authorization: Bearer YOUR_API_KEY"
+  ```
+</Accordion>
+
+#### Response
+
+The response is formatted as follows:
+
+```json
+{
+  "conversation_id":"abc1234",
+  "title":"Update README.md",
+  "created_at":"2025-04-29T15:13:51.370706Z",
+  "last_updated_at":"2025-04-29T15:13:57.199210Z",
+  "status":"RUNNING",
+  "selected_repository":"yourusername/your-repo",
+  "trigger":"gui"
+}
+```
+
 ## Rate Limits

 If you have too many conversations running at once, older conversations will be paused to limit the number of concurrent conversations.
--- a/docs/usage/cloud/cloud-ui.mdx
+++ b/docs/usage/cloud/cloud-ui.mdx
@@ -8,39 +8,24 @@ description: The Cloud UI provides a web interface for interacting with OpenHand

 The landing page is where you can:

+- [Add GitHub repository access](/usage/cloud/github-installation#adding-github-repository-access) to OpenHands.
 - [Select a GitHub repo](/usage/cloud/github-installation#working-with-github-repos-in-openhands-cloud),
  [a GitLab repo](/usage/cloud/gitlab-installation#working-with-gitlab-repos-in-openhands-cloud) or
  [a Bitbucket repo](/usage/cloud/bitbucket-installation#working-with-bitbucket-repos-in-openhands-cloud) to start working on.
- Launch an empty conversation using `New Conversation`.
 - See `Suggested Tasks` for repositories that OpenHands has access to.
- See your `Recent Conversations`.
+- Launch an empty conversation using `Launch from Scratch`.

 ## Settings

-Settings are divided across tabs, with each tab focusing on a specific area of configuration.
+The Settings page allows you to:

- `User`
-  - Change your email address.
- `Integrations`
-  - [Configure GitHub repository access](/usage/cloud/github-installation#modifying-repository-access) for OpenHands.
-  - [Install the OpenHands Slack app](/usage/cloud/slack-installation).
- `Application`
-  - Set your preferred language, notifications and other preferences.
-  - Toggle task suggestions on GitHub.
-  - Toggle Solvability Analysis.
-  - Set a maximum budget per conversation.
-  - Configure the username and email that OpenHands uses for commits.
- `LLM` (Available for [Pro subscription users](/usage/cloud/pro-subscription))
-  - Choose to use another LLM or use different models from the OpenHands provider.
- `Billing`
-  - Add credits for using the OpenHands provider.
-  - Cancel your `Pro subscription`.
- `Secrets`
-  - [Manage secrets](/usage/settings/secrets-settings).
- `API Keys`
-  - [Create API keys to work with OpenHands programmatically](/usage/cloud/cloud-api).
- `MCP`
-  - [Setup an MCP server](/usage/settings/mcp-settings)
+- [Configure GitHub repository access](/usage/cloud/github-installation#modifying-repository-access) for OpenHands.
+- [Install the OpenHands Slack app](/usage/cloud/slack-installation).
+- Set application settings like your preferred language, notifications and other preferences.
+- Add credits to your account.
+- [Generate custom secrets](/usage/common-settings#secrets-management).
+- [Create API keys to work with OpenHands programmatically](/usage/cloud/cloud-api).
+- Change your email address.

 ## Key Features

--- a/docs/usage/cloud/github-installation.mdx
+++ b/docs/usage/cloud/github-installation.mdx
@@ -12,7 +12,7 @@ description: This guide walks you through the process of installing OpenHands Cl

 You can grant OpenHands access to specific GitHub repositories:

-1. Click on `+ Add GitHub Repos` in the repository selection dropdown.
+1. Click on `Add GitHub repos` on the landing page.
 2. Select your organization and choose the specific repositories to grant OpenHands access to.
 <Accordion title="OpenHands permissions">
  - OpenHands requests short-lived tokens (8-hour expiration) with these permissions:
@@ -34,22 +34,20 @@ You can grant OpenHands access to specific GitHub repositories:
 ## Modifying Repository Access

 You can modify GitHub repository access at any time by:
- Selecting `+ Add GitHub Repos` in the repository selection dropdown or
- Visiting the `Settings > Integrations` page and selecting `Configure GitHub Repositories`
+- Selecting `Add GitHub repos` on the landing page or
+- Visiting the Settings page and selecting `Configure GitHub Repositories` under the `Integrations` tab

 ## Working With GitHub Repos in Openhands Cloud

-Once you've granted GitHub repository access, you can start working with your GitHub repository. Use the
-`Open Repository` section to select the appropriate repository and branch you'd like OpenHands to work on. Then click
-on `Launch` to start the conversation!
+Once you've granted GitHub repository access, you can start working with your GitHub repository. Use the `select a repo`
+and `select a branch` dropdowns to select the appropriate repository and branch you'd like OpenHands to work on. Then
+click on `Launch` to start the conversation!

 ![Connect Repo](/static/img/connect-repo.png)

-## Working on GitHub Issues and Pull Requests Using Openhands
+## Working on GitHub Issues and Pull Requests Using OpenHands

-To allow OpenHands to work directly from GitHub directly, you must
-[give OpenHands access to your repository](/usage/cloud/github-installation#modifying-repository-access). Once access is
-given, you can use OpenHands by labeling the issue or by tagging `@openhands`.
+Giving GitHub repository access to OpenHands also allows you to work on GitHub issues and pull requests directly.

 ### Working with Issues

@@ -66,12 +64,7 @@ To get OpenHands to work on pull requests, mention `@openhands` in the comments
 - Request updates
 - Get code explanations

-<Note>
-The `@openhands` mention functionality in pull requests only works if the pull request is both
-*to* and *from* a repository that you have added through the interface. This is because OpenHands needs appropriate
-permissions to access both repositories.
-</Note>
-
+**Important Note**: The `@openhands` mention functionality in pull requests only works if the pull request is both *to* and *from* a repository that you have added through the interface. This is because OpenHands needs appropriate permissions to access both repositories.

 ## Next Steps

--- a/docs/usage/cloud/gitlab-installation.mdx
+++ b/docs/usage/cloud/gitlab-installation.mdx
@@ -14,17 +14,16 @@ Upon signing into OpenHands Cloud with a GitLab account, OpenHands will have acc

 ## Working With GitLab Repos in Openhands Cloud

-After signing in with a Gitlab account, use the `Open Repository` section to select the appropriate repository and
-branch you'd like OpenHands to work on. Then click on `Launch` to start the conversation!
+After signing in with a GitLab account, use the `select a repo` and `select a branch` dropdowns to select the
+appropriate repository and branch you'd like OpenHands to work on. Then click on `Launch` to start the conversation!

-![Connect Repo](/static/img/connect-repo.png)
+![Connect Repo](/static/img/connect-repo-no-github.png)

 ## Using Tokens with Reduced Scopes

 OpenHands requests an API-scoped token during OAuth authentication. By default, this token is provided to the agent.
-To restrict the agent's permissions, [you can define a custom secret](/usage/settings/secrets-settings) `GITLAB_TOKEN`,
-which will override the default token assigned to the agent. While the high-permission API token is still requested
-and used for other components of the application (e.g. opening merge requests), the agent will not have access to it.
+To restrict the agent's permissions, you can define a custom secret `GITLAB_TOKEN`, which will override the default token assigned to the agent.
+While the high-permission API token is still requested and used for other components of the application (e.g. opening merge requests), the agent will not have access to it.

 ## Working on GitLab Issues and Merge Requests Using Openhands

@@ -33,8 +32,7 @@ This feature works for personal projects and is available for group projects wit
 [Premium or Ultimate tier subscription](https://docs.gitlab.com/user/project/integrations/webhooks/#group-webhooks).

 A webhook is automatically installed within a few minutes after the owner/maintainer of the project or group logs into
-OpenHands Cloud. If you decide to delete the webhook, then re-installing will require the support of All Hands AI but
-we are planning to improve this in a future release.
+OpenHands Cloud. If you decide to delete the webhook, then re-installing will require the support of All Hands AI but we are planning to improve this in a future release.
 </Note>

 Giving GitLab repository access to OpenHands also allows you to work on GitLab issues and merge requests directly.
--- a/docs/usage/cloud/pro-subscription.mdx
+++ b/docs/usage/cloud/pro-subscription.mdx
@@ -1,50 +0,0 @@
---
-title: "Pro Subscription"
-description: "Learn about OpenHands Cloud Pro Subscription features and pricing."
---
-
-## Overview
-
-The OpenHands Pro Subscription unlocks additional features and better pricing when you run OpenHands conversations in
-OpenHands Cloud.
-
-## Base Features
-
-All users start on the Pay-as-you-go plan and have access to these base features when they sign up:
-
-* **Run multiple OpenHands conversations on OpenHands Cloud runtimes.**
-* **API keys to the OpenHands LLM provider for use in OpenHands CLI or when running OpenHands on your own.**
-* **$20 in initial OpenHands Cloud credits to get started.**
-* **Support for GitHub, GitLab, Bitbucket, Slack, and more.**
-
-## What you get with a Pro Subscription
-
-The $20/month Pro subscription covers the cost of runtime compute in OpenHands Cloud, plus enables the following
-features:
-
-* **Bring Your Own LLM Keys:** Bring your own API keys from OpenAI, Anthropic, Mistral, and other providers.
-* **Model Choice:** Unlocks access to OpenHands LLM provider models for use within OpenHands Cloud.
-* **No Markup Pricing on LLM usage:** When you use the OpenHands LLM provider in OpenHands Cloud, you pay for
-LLM usage at-cost (zero markup) based on API prices.
-
-## Plan Comparison
-
-Here are the key differences between Pay-as-you-go and Pro subscriptions:
-
-### When running OpenHands conversations in OpenHands Cloud
-|  | Pay-as-you-go | Pro Subscription |
-| :---- | ----- | ----- |
-| Monthly price | None \- no commitment | $20/month |
-| Can I bring my own LLM key? | No | ✅ Yes |
-| Do I pay for LLM usage? | ✅ Yes | ✅ Yes |
-| Can I select from different LLMs without bringing my own LLM key? | No \- defaults to Claude Sonnet 4 | ✅ Yes \- via OpenHands LLM provider <br/><br/>[*See models and pricing*](https://docs.all-hands.dev/usage/llms/openhands-llms#pricing) |
-| How much am I charged for LLM usage? | **Marked up pricing** \- 2x Claude Sonnet 4 API prices. *This markup helps cover the cost of runtime compute.* | **No markup** \- 1x API prices. *The $20 monthly subscription covers the cost of runtime compute.* |
-
-
-### When using the OpenHands LLM Provider outside of OpenHands Cloud
-The following applies to **both** the Pay-as-you-go and Pro subscription:
-|  | Pay-as-you-go or Pro Subscription |
-| :---- | :---- |
-| Do I have access to multiple models via the OpenHands LLM provider? | ✅ Yes <br/><br/> [*See models and pricing*](https://docs.all-hands.dev/usage/llms/openhands-llms#pricing) |
-| Can I generate and refresh OpenHands LLM API keys? | ✅ Yes |
-| How much am I charged for LLM usage when I use the OpenHands LLM provider in other AI coding tools?  | **No markup** \- pay 1x API prices <br/> [*See models and pricing*](https://docs.all-hands.dev/usage/llms/openhands-llms#pricing) <br/><br/> *Usage is deducted from your OpenHands Cloud credit balance.*  <br/><br/> *The OpenHands LLM provider is available to all OpenHands Cloud users, and LLM usage is billed at-cost (zero markup). Use these models with OpenHands CLI, running OpenHands on your own, or even other AI coding agents\! [Learn more.](https://www.all-hands.dev/blog/access-state-of-the-art-llm-models-at-cost-via-openhands-gui-and-cli)* |
--- a/docs/usage/cloud/slack-installation.mdx
+++ b/docs/usage/cloud/slack-installation.mdx
@@ -13,9 +13,7 @@ description: This guide walks you through installing the OpenHands Slack app.
 </iframe>

 <Info>
-OpenHands utilizes a large language model (LLM), which may generate responses that are inaccurate or incomplete.
-While we strive for accuracy, OpenHands' outputs are not guaranteed to be correct, and we encourage users to
-validate critical information independently.
+OpenHands utilizes a large language model (LLM), which may generate responses that are inaccurate or incomplete. While we strive for accuracy, OpenHands' outputs are not guaranteed to be correct, and we encourage users to validate critical information independently.
 </Info>

 ## Prerequisites
@@ -41,7 +39,7 @@ validate critical information independently.
  **Make sure your Slack workspace admin/owner has installed OpenHands Slack App first.**

  Every user in the Slack workspace (including admins/owners) must link their OpenHands Cloud account to the OpenHands Slack App. To do this:
-  1. Visit the [Settings > Integrations](https://app.all-hands.dev/settings/integrations) page in OpenHands Cloud.
+  1. Visit [integrations settings](https://app.all-hands.dev/settings/integrations) in OpenHands Cloud.
  2. Click `Install OpenHands Slack App`.
  3. In the top right corner, select the workspace to install the OpenHands Slack app.
  4. Review permissions and click allow.
@@ -59,8 +57,7 @@ To start a new conversation, you can mention `@openhands` in a new message or a

 Once a conversation is started, all thread messages underneath it will be follow-up messages to OpenHands.

-To send follow-up messages for the same conversation, mention `@openhands` in a thread reply to the original message.
-You must be the user who started the conversation.
+To send follow-up messages for the same conversation, mention `@openhands` in a thread reply to the original message. You must be the user who started the conversation.

 ## Example conversation

--- a/docs/usage/settings/secrets-settings.mdx
+++ b/docs/usage/settings/secrets-settings.mdx
@@ -1,19 +1,28 @@
 ---
-title: Secrets Management
-description: How to manage secrets in OpenHands.
+title: OpenHands Settings
+description: Overview of some of the settings available in OpenHands.
 ---

-## Overview
+## OpenHands Cloud vs Running on Your Own
+
+There are some differences between the settings available in OpenHands Cloud and those available when running OpenHands
+on your own:
+* [OpenHands Cloud settings](/usage/cloud/cloud-ui#settings)
+* [Settings available when running on your own](/usage/how-to/gui-mode#settings)
+
+Refer to these pages for more detailed information.
+
+## Secrets Management

 OpenHands provides a secrets manager that allows you to securely store and manage sensitive information that can be
 accessed by the agent during runtime, such as API keys. These secrets are automatically exported as environment
 variables in the agent's runtime environment.

-## Accessing the Secrets Manager
+### Accessing the Secrets Manager

-Navigate to the `Settings > Secrets` page. Here, you'll see a list of all your existing custom secrets.
+In the Settings page, navigate to the `Secrets` tab. Here, you'll see a list of all your existing custom secrets.

-## Adding a New Secret
+### Adding a New Secret
 1. Click `Add a new secret`.
 2. Fill in the following fields:
   - **Name**: A unique identifier for your secret (e.g., `AWS_ACCESS_KEY`). This will be the environment variable name.
@@ -21,7 +30,7 @@ Navigate to the `Settings > Secrets` page. Here, you'll see a list of all your e
   - **Description** (optional): A brief description of what the secret is used for, which is also provided to the agent.
 3. Click `Add secret` to save.

-## Editing a Secret
+### Editing a Secret

 1. Click the `Edit` button next to the secret you want to modify.
 2. You can update the name and description of the secret.
@@ -30,13 +39,14 @@ Navigate to the `Settings > Secrets` page. Here, you'll see a list of all your e
  value, delete the secret and create a new one.
 </Note>

-## Deleting a Secret
+### Deleting a Secret

 1. Click the `Delete` button next to the secret you want to remove.
 2. Select `Confirm` to delete the secret.

-## Using Secrets in the Agent
+### Using Secrets in the Agent
 - All custom secrets are automatically exported as environment variables in the agent's runtime environment.
- - You can access them in your code using standard environment variable access methods. For example, if you create a
-  secret named `OPENAI_API_KEY`, you can access it in your code as `process.env.OPENAI_API_KEY` in JavaScript or
-  `os.environ['OPENAI_API_KEY']` in Python.
+ - You can access them in your code using standard environment variable access methods
+   (e.g., `os.environ['SECRET_NAME']` in Python).
+ - Example: If you create a secret named `OPENAI_API_KEY`, you can access it in your code as
+   `process.env.OPENAI_API_KEY` in JavaScript or `os.environ['OPENAI_API_KEY']` in Python.
--- a/docs/usage/faqs.mdx
+++ b/docs/usage/faqs.mdx
@@ -90,6 +90,7 @@ If you would like to set things up more systematically, you can:
  others have encountered the same problem.
 2. **Join our community**: Get help from other users and developers:
   - [Slack community](https://dub.sh/openhands)
+   - [Discord server](https://discord.gg/ESHStjSjD4)
 3. **Check our troubleshooting guide**: Common issues and solutions are documented in
  [Troubleshooting](/usage/troubleshooting/troubleshooting).
 4. **Report bugs**: If you've found a bug, please [create an issue](https://github.com/All-Hands-AI/OpenHands/issues/new)
--- a/docs/usage/how-to/cli-mode.mdx
+++ b/docs/usage/how-to/cli-mode.mdx
@@ -105,7 +105,7 @@ The conversation history will be saved in `~/.openhands/sessions`.

 1. Set the following environment variables in your terminal:
   - `SANDBOX_VOLUMES` to specify the directory you want OpenHands to access ([See using SANDBOX_VOLUMES for more info](../runtimes/docker#using-sandbox_volumes))
-   - `LLM_MODEL` - the LLM model to use (e.g. `export LLM_MODEL="anthropic/claude-sonnet-4-20250514"` or `export LLM_MODEL="anthropic/claude-sonnet-4-5-20250929"`)
+   - `LLM_MODEL` - the LLM model to use (e.g. `export LLM_MODEL="anthropic/claude-sonnet-4-20250514"`)
   - `LLM_API_KEY` - your API key (e.g. `export LLM_API_KEY="sk_test_12345"`)

 2. Run the following command:
@@ -113,7 +113,7 @@ The conversation history will be saved in `~/.openhands/sessions`.
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.58-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.56-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e SANDBOX_VOLUMES=$SANDBOX_VOLUMES \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -122,7 +122,7 @@ docker run -it \
    -v ~/.openhands:/.openhands \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.58 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.56 \
    python -m openhands.cli.entry --override-cli-mode true
 ```

--- a/docs/usage/how-to/gui-mode.mdx
+++ b/docs/usage/how-to/gui-mode.mdx
@@ -85,11 +85,11 @@ You can use the Settings page at any time to:

 - Setup the LLM provider and model for OpenHands.
 - [Setup the search engine](/usage/search-engine-setup).
- [Configure MCP servers](/usage/settings/mcp-settings).
+- [Configure MCP servers](/usage/mcp).
 - [Connect to GitHub](/usage/how-to/gui-mode#github-setup), [connect to GitLab](/usage/how-to/gui-mode#gitlab-setup)
  and [connect to Bitbucket](/usage/how-to/gui-mode#bitbucket-setup).
 - Set application settings like your preferred language, notifications and other preferences.
- [Manage custom secrets](/usage/settings/secrets-settings).
+- [Manage custom secrets](/usage/common-settings#secrets-management).

 #### GitHub Setup

--- a/docs/usage/how-to/headless-mode.mdx
+++ b/docs/usage/how-to/headless-mode.mdx
@@ -53,7 +53,7 @@ Set environment variables and run the Docker command:
 ```bash
 # Set required environment variables
 export SANDBOX_VOLUMES="/path/to/workspace:/workspace:rw"  # Format: host_path:container_path:mode
-export LLM_MODEL="anthropic/claude-sonnet-4-20250514"  # or "anthropic/claude-sonnet-4-5-20250929"
+export LLM_MODEL="anthropic/claude-sonnet-4-20250514"
 export LLM_API_KEY="your-api-key"
 export SANDBOX_SELECTED_REPO="owner/repo-name"  # Optional: requires GITHUB_TOKEN
 export GITHUB_TOKEN="your-token"  # Required for repository operations
@@ -61,7 +61,7 @@ export GITHUB_TOKEN="your-token"  # Required for repository operations
 # Run OpenHands
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.58-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.56-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e SANDBOX_VOLUMES=$SANDBOX_VOLUMES \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -73,7 +73,7 @@ docker run -it \
    -v ~/.openhands:/.openhands \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.58 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.56 \
    python -m openhands.core.main -t "write a bash script that prints hi"
 ```

--- a/docs/usage/installation.mdx
+++ b/docs/usage/installation.mdx
@@ -4,17 +4,16 @@ description: Running OpenHands Cloud or running on your own.
 icon: rocket
 ---

-<Tabs>
-  <Tab title="OpenHands Cloud">
-    The easiest way to get started with OpenHands is on OpenHands Cloud, which comes with $20 in free credits for new users.
+## OpenHands Cloud

-    To get started with OpenHands Cloud, visit [app.all-hands.dev](https://app.all-hands.dev).
+The easiest way to get started with OpenHands is on OpenHands Cloud, which comes with $20 in free credits for new users.

-    For more information see [getting started with OpenHands Cloud.](/usage/cloud/openhands-cloud)
-  </Tab>
-  <Tab title="Running OpenHands on Your Own">
-    Run OpenHands on your local system and bring your own LLM and API key.
+To get started with OpenHands Cloud, visit [app.all-hands.dev](https://app.all-hands.dev).

-    For more information see [running OpenHands on your own.](/usage/local-setup)
-  </Tab>
-</Tabs>
+For more information see [getting started with OpenHands Cloud.](/usage/cloud/openhands-cloud)
+
+## Running OpenHands on Your Own
+
+Run OpenHands on your local system and bring your own LLM and API key.
+
+For more information see [running OpenHands on your own.](/usage/local-setup)
--- a/docs/usage/llms/llms.mdx
+++ b/docs/usage/llms/llms.mdx
@@ -18,7 +18,6 @@ Based on these findings and community feedback, these are the latest models that
 ### Cloud / API-Based Models

 - [anthropic/claude-sonnet-4-20250514](https://www.anthropic.com/api) (recommended)
- [anthropic/claude-sonnet-4-5-20250929](https://www.anthropic.com/api) (recommended)
 - [openai/gpt-5-2025-08-07](https://openai.com/api/) (recommended)
 - [gemini/gemini-2.5-pro](https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/)
 - [deepseek/deepseek-chat](https://api-docs.deepseek.com/)
--- a/docs/usage/llms/local-llms.mdx
+++ b/docs/usage/llms/local-llms.mdx
@@ -68,23 +68,23 @@ Download and install the LM Studio desktop app from [lmstudio.ai](https://lmstud
 1. Check [the installation guide](/usage/local-setup) and ensure all prerequisites are met before running OpenHands, then run:

 ```bash
-docker pull docker.all-hands.dev/all-hands-ai/runtime:0.58-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.56-nikolaik

 docker run -it --rm --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.58-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.56-nikolaik \
    -e LOG_ALL_EVENTS=true \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -v ~/.openhands:/.openhands \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app \
-    docker.all-hands.dev/all-hands-ai/openhands:0.58
+    docker.all-hands.dev/all-hands-ai/openhands:0.56
 ```

 2. Wait until the server is running (see log below):
 ```
 Digest: sha256:e72f9baecb458aedb9afc2cd5bc935118d1868719e55d50da73190d3a85c674f
-Status: Image is up to date for docker.all-hands.dev/all-hands-ai/openhands:0.58
+Status: Image is up to date for docker.all-hands.dev/all-hands-ai/openhands:0.56
 Starting OpenHands...
 Running OpenHands as root
 14:22:13 - openhands:INFO: server_config.py:50 - Using config class None
@@ -119,7 +119,7 @@ When started for the first time, OpenHands will prompt you to set up the LLM pro

 That's it! You can now start using OpenHands with the local LLM server.

-If you encounter any issues, let us know on [Slack](https://dub.sh/openhands).
+If you encounter any issues, let us know on [Slack](https://dub.sh/openhands) or [Discord](https://discord.gg/ESHStjSjD4).

 ## Advanced: Alternative LLM Backends

--- a/docs/usage/llms/openhands-llms.mdx
+++ b/docs/usage/llms/openhands-llms.mdx
@@ -15,7 +15,7 @@ description: OpenHands LLM provider with access to state-of-the-art (SOTA) agent

 When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings under the `LLM` tab:
 - `LLM Provider` to `OpenHands`
- `LLM Model` to the model you will be using (e.g. claude-sonnet-4-20250514 or claude-sonnet-4-5-20250929)
+- `LLM Model` to the model you will be using (e.g. claude-sonnet-4-20250514)
 - `API Key` to your OpenHands LLM API key copied from above

 ## Using OpenHands LLM Provider in the CLI
@@ -30,21 +30,6 @@ When running OpenHands, you'll need to set the following in the OpenHands UI thr

 ## Pricing

-Pricing follows official API provider rates. Below are the current pricing details for OpenHands models:
+Pricing follows official API provider rates. [You can view model prices here.](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)

-| Model | Input Cost (per 1M tokens) | Cached Input Cost (per 1M tokens) | Output Cost (per 1M tokens) | Max Input Tokens | Max Output Tokens |
-|-------|----------------------------|-----------------------------------|------------------------------|------------------|-------------------|
-| claude-opus-4-20250514 | $15.00 | $1.50 | $75.00 | 200,000 | 32,000 |
-| claude-sonnet-4-20250514 | $3.00 | $0.30 | $15.00 | 200,000 | 64,000 |
-| claude-sonnet-4-5-20250929 | $3.00 | $0.30 | $15.00 | 200,000 | 64,000 |
-| devstral-medium-2507 | $0.40 | N/A | $2.00 | 128,000 | 128,000 |
-| devstral-small-2505 | $0.10 | N/A | $0.30 | 128,000 | 128,000 |
-| devstral-small-2507 | $0.10 | N/A | $0.30 | 128,000 | 128,000 |
-| gemini-2.5-pro | $1.25 | $0.31 | $10.00 | 1,048,576 | 65,535 |
-| gpt-5-2025-08-07 | $1.25 | $0.125 | $10.00 | 400,000 | 128,000 |
-| gpt-5-mini-2025-08-07 | $0.25 | $0.025 | $2.00 | 400,000 | 128,000 |
-| o3 | $2.00 | $0.50 | $8.00 | 200,000 | 100,000 |
-| o4-mini | $1.10 | $0.28 | $4.40 | 200,000 | 100,000 |
-| qwen3-coder-480b | $0.40 | N/A | $1.60 | N/A | N/A |
-
-**Note:** Cached input tokens are charged at a reduced rate when the same content is reused across requests. Models that don't support prompt caching show "N/A" for cached input cost.
+For `qwen3-coder-480b`, we charge the cheapest FP8 rate available on OpenRouter: \$0.4 per million input tokens and \$1.6 per million output tokens.
--- a/docs/usage/local-setup.mdx
+++ b/docs/usage/local-setup.mdx
@@ -116,17 +116,17 @@ Note that you'll still need `uv` installed for the default MCP servers to work p
 <Accordion title="Docker Command (Click to expand)">

 ```bash
-docker pull docker.all-hands.dev/all-hands-ai/runtime:0.58-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.56-nikolaik

 docker run -it --rm --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.58-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.56-nikolaik \
    -e LOG_ALL_EVENTS=true \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -v ~/.openhands:/.openhands \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app \
-    docker.all-hands.dev/all-hands-ai/openhands:0.58
+    docker.all-hands.dev/all-hands-ai/openhands:0.56
 ```

 </Accordion>
--- a/docs/usage/mcp.mdx
+++ b/docs/usage/mcp.mdx
@@ -0,0 +1,196 @@
+---
+title: Model Context Protocol (MCP)
+description: This page outlines how to configure and use the Model Context Protocol (MCP) in OpenHands, allowing you
+  to extend the agent's capabilities with custom tools.
+---
+
+## Overview
+
+Model Context Protocol (MCP) is a mechanism that allows OpenHands to communicate with external tool servers. These
+servers can provide additional functionality to the agent, such as specialized data processing, external API access,
+or custom tools. MCP is based on the open standard defined at [modelcontextprotocol.io](https://modelcontextprotocol.io).
+
+
+<Note>
+MCP is currently not available on OpenHands Cloud. This feature is only available when running OpenHands locally.
+</Note>
+
+### How MCP Works
+
+When OpenHands starts, it:
+
+1. Reads the MCP configuration.
+2. Connects to any configured SSE and SHTTP servers.
+3. Starts any configured stdio servers.
+4. Registers the tools provided by these servers with the agent.
+
+The agent can then use these tools just like any built-in tool. When the agent calls an MCP tool:
+
+1. OpenHands routes the call to the appropriate MCP server.
+2. The server processes the request and returns a response.
+3. OpenHands converts the response to an observation and presents it to the agent.
+
+## Configuration
+
+MCP configuration can be defined in:
+* The OpenHands UI through the Settings under the `MCP` tab.
+* The `config.toml` file under the `[mcp]` section if not using the UI.
+
+### Configuration Examples
+
+#### Recommended: Using Proxy Servers (SSE/HTTP)
+
+For stdio-based MCP servers, we recommend using MCP proxy tools like [`supergateway`](https://github.com/supercorp-ai/supergateway) instead of direct stdio connections.
+[SuperGateway](https://github.com/supercorp-ai/supergateway) is a popular MCP proxy that converts stdio MCP servers to HTTP/SSE endpoints.
+
+Start the proxy servers separately:
+```bash
+# Terminal 1: Filesystem server proxy
+supergateway --stdio "npx @modelcontextprotocol/server-filesystem /" --port 8080
+
+# Terminal 2: Fetch server proxy
+supergateway --stdio "uvx mcp-server-fetch" --port 8081
+```
+
+Then configure OpenHands to use the HTTP endpoint:
+
+```toml
+[mcp]
+# SSE Servers - Recommended approach using proxy tools
+sse_servers = [
+    # Basic SSE server with just a URL
+    "http://example.com:8080/mcp",
+
+    # SuperGateway proxy for fetch server
+    "http://localhost:8081/sse",
+
+    # External MCP service with authentication
+    {url="https://api.example.com/mcp/sse", api_key="your-api-key"}
+]
+```
+
+
+
+#### Alternative: Direct Stdio Servers (Not Recommended for Production)
+
+```toml
+[mcp]
+# Direct stdio servers - use only for development/testing
+stdio_servers = [
+    # Basic stdio server
+    {name="fetch", command="uvx", args=["mcp-server-fetch"]},
+
+    # Stdio server with environment variables
+    {
+        name="filesystem",
+        command="npx",
+        args=["@modelcontextprotocol/server-filesystem", "/"],
+        env={
+            "DEBUG": "true"
+        }
+    }
+]
+```
+
+## Configuration Options
+
+### SSE Servers
+
+SSE servers are configured using either a string URL or an object with the following properties:
+
+- `url` (required)
+  - Type: `str`
+  - Description: The URL of the SSE server
+
+- `api_key` (optional)
+  - Type: `str`
+  - Description: API key for authentication
+
+### SHTTP Servers
+
+SHTTP (Streamable HTTP) servers are configured using either a string URL or an object with the following properties:
+
+- `url` (required)
+  - Type: `str`
+  - Description: The URL of the SHTTP server
+
+- `api_key` (optional)
+  - Type: `str`
+  - Description: API key for authentication
+
+### Stdio Servers
+
+**Note**: While stdio servers are supported, we recommend using MCP proxies (see above) for better reliability and performance.
+
+Stdio servers are configured using an object with the following properties:
+
+- `name` (required)
+  - Type: `str`
+  - Description: A unique name for the server
+
+- `command` (required)
+  - Type: `str`
+  - Description: The command to run the server
+
+- `args` (optional)
+  - Type: `list of str`
+  - Default: `[]`
+  - Description: Command-line arguments to pass to the server
+
+- `env` (optional)
+  - Type: `dict of str to str`
+  - Default: `{}`
+  - Description: Environment variables to set for the server process
+
+
+#### When to Use Direct Stdio
+
+Direct stdio connections may still be appropriate in these scenarios:
+- **Development and testing**: Quick prototyping of MCP servers
+- **Simple, single-use tools**: Tools that don't require high reliability or concurrent access
+- **Local-only environments**: When you don't want to manage additional proxy processes
+
+For production use, we recommend using proxy tools like SuperGateway.
+
+### Other Proxy Tools
+
+Other options include:
+
+- **Custom FastAPI/Express servers**: Build your own HTTP wrapper around stdio MCP servers
+- **Docker-based proxies**: Containerized solutions for better isolation
+- **Cloud-hosted MCP services**: Third-party services that provide MCP endpoints
+
+### Troubleshooting MCP Connections
+
+#### Common Issues with Stdio Servers
+- **Process crashes**: Stdio processes may crash without proper error handling
+- **Deadlocks**: Stdio communication can deadlock under high load
+- **Resource leaks**: Zombie processes if not properly managed
+- **Debugging difficulty**: Hard to inspect stdio communication
+
+#### Benefits of Using Proxies
+- **HTTP status codes**: Clear error reporting via standard HTTP responses
+- **Request logging**: Easy to log and monitor HTTP requests
+- **Load balancing**: Can distribute requests across multiple server instances
+- **Health checks**: HTTP endpoints can provide health status
+- **CORS support**: Better integration with web-based tools
+
+## Transport Protocols
+
+OpenHands supports three different MCP transport protocols:
+
+### Server-Sent Events (SSE)
+SSE is a legacy HTTP-based transport that uses Server-Sent Events for server-to-client communication and HTTP POST requests for client-to-server communication. This transport is suitable for basic streaming scenarios but has limitations in session management and connection resumability.
+
+### Streamable HTTP (SHTTP)
+SHTTP is the modern HTTP-based transport protocol that provides enhanced features over SSE:
+
+- **Improved Session Management**: Supports stateful sessions with session IDs for maintaining context across requests
+- **Connection Resumability**: Can resume broken connections and replay missed messages using event IDs
+- **Bidirectional Communication**: Uses HTTP POST for client-to-server and optional SSE streams for server-to-client communication
+- **Better Error Handling**: Enhanced error reporting and recovery mechanisms
+
+SHTTP is the recommended transport for HTTP-based MCP servers as it provides better reliability and features compared to the legacy SSE transport.
+
+### Standard Input/Output (stdio)
+Stdio transport enables communication through standard input and output streams, making it ideal for local integrations and command-line tools. This transport is used for locally executed MCP servers that run as separate processes.
--- a/docs/usage/settings/mcp-settings.mdx
+++ b/docs/usage/settings/mcp-settings.mdx
@@ -1,194 +0,0 @@
---
-title: Model Context Protocol (MCP)
-description: This page outlines how to configure and use the Model Context Protocol (MCP) in OpenHands, allowing you
-  to extend the agent's capabilities with custom tools.
---
-
-## Overview
-
-Model Context Protocol (MCP) is a mechanism that allows OpenHands to communicate with external tool servers. These
-servers can provide additional functionality to the agent, such as specialized data processing, external API access,
-or custom tools. MCP is based on the open standard defined at [modelcontextprotocol.io](https://modelcontextprotocol.io).
-
-## Supported MCPs
-
-OpenHands supports the following MCP transport protocols:
-
-* [Server-Sent Events (SSE)](https://modelcontextprotocol.io/specification/2024-11-05/basic/transports#http-with-sse)
-* [Streamable HTTP (SHTTP)](https://modelcontextprotocol.io/specification/2025-06-18/basic/transports#streamable-http)
-* [Standard Input/Output (stdio)](https://modelcontextprotocol.io/specification/2025-06-18/basic/transports#stdio)
-
-## How MCP Works
-
-When OpenHands starts, it:
-
-1. Reads the MCP configuration.
-2. Connects to any configured SSE and SHTTP servers.
-3. Starts any configured stdio servers.
-4. Registers the tools provided by these servers with the agent.
-
-The agent can then use these tools just like any built-in tool. When the agent calls an MCP tool:
-
-1. OpenHands routes the call to the appropriate MCP server.
-2. The server processes the request and returns a response.
-3. OpenHands converts the response to an observation and presents it to the agent.
-
-## Configuration
-
-MCP configuration can be defined in:
-* The OpenHands UI in the `Settings > MCP` page.
-* The `config.toml` file under the `[mcp]` section if not using the UI.
-
-### Configuration Options
-
-<Tabs>
-  <Tab title="SSE Servers">
-    SSE servers are configured using either a string URL or an object with the following properties:
-
-    - `url` (required)
-      - Type: `str`
-      - Description: The URL of the SSE server.
-
-    - `api_key` (optional)
-      - Type: `str`
-      - Description: API key for authentication.
-  </Tab>
-  <Tab title="SHTTP Servers">
-    SHTTP (Streamable HTTP) servers are configured using either a string URL or an object with the following properties:
-
-  - `url` (required)
-    - Type: `str`
-    - Description: The URL of the SHTTP server.
-
-  - `api_key` (optional)
-    - Type: `str`
-    - Description: API key for authentication.
-
-  - `timeout` (optional)
-    - Type: `int`
-    - Default: `60`
-    - Range: `1-3600` seconds (1 hour maximum)
-    - Description: Timeout in seconds for tool execution. This prevents tool calls from hanging indefinitely.
-    - **Use Cases:**
-      - **Short timeout (1-30s)**: For lightweight operations like status checks or simple queries.
-      - **Medium timeout (30-300s)**: For standard processing tasks like data analysis or API calls.
-      - **Long timeout (300-3600s)**: For heavy operations like file processing, complex calculations, or batch operations.
-    <Note>
-      This timeout only applies to individual tool calls, not server connection establishment.
-    </Note>
-  </Tab>
-  <Tab title="Stdio Servers">
-    <Note>
-      While stdio servers are supported, [we recommend using MCP proxies](/usage/settings/mcp-settings#configuration-examples) for
-      better reliability and performance.
-    </Note>
-
-    Stdio servers are configured using an object with the following properties:
-
-    - `name` (required)
-      - Type: `str`
-      - Description: A unique name for the server.
-
-    - `command` (required)
-      - Type: `str`
-      - Description: The command to run the server.
-
-    - `args` (optional)
-      - Type: `list of str`
-      - Default: `[]`
-      - Description: Command-line arguments to pass to the server.
-
-    - `env` (optional)
-      - Type: `dict of str to str`
-      - Default: `{}`
-      - Description: Environment variables to set for the server process.
-  </Tab>
-</Tabs>
-
-#### When to Use Direct Stdio
-
-Direct stdio connections may still be appropriate in these scenarios:
- **Development and testing**: Quick prototyping of MCP servers.
- **Simple, single-use tools**: Tools that don't require high reliability or concurrent access.
- **Local-only environments**: When you don't want to manage additional proxy processes.
-
-### Configuration Examples
-
-<Tabs>
-  <Tab title="Proxy Servers (SSE/HTTP) - Recommended">
-    For stdio-based MCP servers, we recommend using MCP proxy tools like
-    [`supergateway`](https://github.com/supercorp-ai/supergateway) instead of direct stdio connections.
-    [SuperGateway](https://github.com/supercorp-ai/supergateway) is a popular MCP proxy that converts stdio MCP servers to
-    HTTP/SSE endpoints.
-
-    Start the proxy servers separately:
-    ```bash
-    # Terminal 1: Filesystem server proxy
-    supergateway --stdio "npx @modelcontextprotocol/server-filesystem /" --port 8080
-
-    # Terminal 2: Fetch server proxy
-    supergateway --stdio "uvx mcp-server-fetch" --port 8081
-    ```
-
-    Then configure OpenHands to use the HTTP endpoint:
-
-    ```toml
-    [mcp]
-    # SSE Servers - Recommended approach using proxy tools
-    sse_servers = [
-        # Basic SSE server with just a URL
-        "http://example.com:8080/mcp",
-
-        # SuperGateway proxy for fetch server
-        "http://localhost:8081/sse",
-
-        # External MCP service with authentication
-        {url="https://api.example.com/mcp/sse", api_key="your-api-key"}
-    ]
-
-    # SHTTP Servers - Modern streamable HTTP transport (recommended)
-    shttp_servers = [
-        # Basic SHTTP server with default 60s timeout
-        "https://api.example.com/mcp/shttp",
-
-        # Server with custom timeout for heavy operations
-        {
-            url = "https://files.example.com/mcp/shttp",
-            api_key = "your-api-key",
-            timeout = 1800  # 30 minutes for large file processing
-        }
-    ]
-    ```
-  </Tab>
-  <Tab title="Direct Stdio Servers">
-    <Note>
-      This setup is not Recommended for production.
-    </Note>
-    ```toml
-    [mcp]
-    # Direct stdio servers - use only for development/testing
-    stdio_servers = [
-        # Basic stdio server
-        {name="fetch", command="uvx", args=["mcp-server-fetch"]},
-
-        # Stdio server with environment variables
-        {
-            name="filesystem",
-            command="npx",
-            args=["@modelcontextprotocol/server-filesystem", "/"],
-            env={
-                "DEBUG": "true"
-            }
-        }
-    ]
-    ```
-
-    For production use, we recommend using proxy tools like SuperGateway.
-  </Tab>
-</Tabs>
-
-Other options include:
-
- **Custom FastAPI/Express servers**: Build your own HTTP wrapper around stdio MCP servers.
- **Docker-based proxies**: Containerized solutions for better isolation.
- **Cloud-hosted MCP services**: Third-party services that provide MCP endpoints.
--- a/enterprise/Dockerfile
+++ b/enterprise/Dockerfile
@@ -7,28 +7,14 @@ LABEL com.datadoghq.tags.service="deploy"
 LABEL com.datadoghq.tags.env="${DD_ENV}"

 # Install Node.js v20+ and npm (which includes npx)
-# Apply security updates to fix CVEs
 RUN apt-get update && \
    apt-get install -y curl && \
    curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
    apt-get install -y nodejs && \
    apt-get install -y jq gettext && \
-    # Apply security updates for packages with available fixes
-    apt-get upgrade -y \
-        libc-bin \
-        libc6 \
-        libgnutls30 \
-        libsqlite3-0 \
-        perl-base && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
+    apt-get clean

-# Install Python packages with security fixes
-RUN pip install alembic psycopg2-binary cloud-sql-python-connector pg8000 gspread stripe python-keycloak asyncpg sqlalchemy[asyncio] resend tenacity slack-sdk ddtrace "posthog>=6.0.0" "limits==5.2.0" coredis prometheus-client shap scikit-learn pandas numpy && \
-    # Update packages with known CVE fixes
-    pip install --upgrade \
-        "mcp>=1.10.0" \
-        "pillow>=11.3.0"
+RUN pip install alembic psycopg2-binary cloud-sql-python-connector pg8000 gspread stripe python-keycloak asyncpg sqlalchemy[asyncio] resend tenacity slack-sdk ddtrace posthog "limits==5.2.0" coredis prometheus-client shap scikit-learn pandas numpy

 WORKDIR /app
 COPY enterprise .
--- a/enterprise/experiments/experiment_manager.py
+++ b/enterprise/experiments/experiment_manager.py
@@ -2,6 +2,7 @@ from experiments.constants import (
    ENABLE_EXPERIMENT_MANAGER,
 )
 from experiments.experiment_versions import (
+    handle_claude4_vs_gpt5_experiment,
    handle_condenser_max_step_experiment,
    handle_system_prompt_experiment,
 )
@@ -43,6 +44,9 @@ class SaaSExperimentManager(ExperimentManager):
            return conversation_settings

        # Apply conversation-scoped experiments
+        conversation_settings = handle_claude4_vs_gpt5_experiment(
+            user_id, conversation_id, conversation_settings
+        )
        conversation_settings = handle_condenser_max_step_experiment(
            user_id, conversation_id, conversation_settings
        )
--- a/enterprise/poetry.lock
+++ b/enterprise/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.1.4 and should not be changed by hand.

 [[package]]
 name = "aiofiles"
@@ -766,7 +766,7 @@ version = "1.17.1"
 description = "Foreign Function Interface for Python calling C code."
 optional = false
 python-versions = ">=3.8"
-groups = ["main", "test"]
+groups = ["main"]
 files = [
    {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"},
    {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"},
@@ -836,7 +836,6 @@ files = [
    {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"},
    {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"},
 ]
-markers = {test = "platform_python_implementation == \"CPython\" and sys_platform == \"win32\""}

 [package.dependencies]
 pycparser = "*"
@@ -1061,7 +1060,7 @@ files = [
    {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
    {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
 ]
-markers = {main = "platform_system == \"Windows\" or sys_platform == \"win32\" or os_name == \"nt\"", dev = "os_name == \"nt\"", test = "platform_system == \"Windows\" or sys_platform == \"win32\""}
+markers = {main = "platform_system == \"Windows\" or os_name == \"nt\" or sys_platform == \"win32\"", dev = "os_name == \"nt\"", test = "platform_system == \"Windows\" or sys_platform == \"win32\""}

 [[package]]
 name = "comm"
@@ -1902,25 +1901,25 @@ files = [

 [[package]]
 name = "fastapi"
-version = "0.117.1"
+version = "0.116.1"
 description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production"
 optional = false
 python-versions = ">=3.8"
 groups = ["main"]
 files = [
-    {file = "fastapi-0.117.1-py3-none-any.whl", hash = "sha256:33c51a0d21cab2b9722d4e56dbb9316f3687155be6b276191790d8da03507552"},
-    {file = "fastapi-0.117.1.tar.gz", hash = "sha256:fb2d42082d22b185f904ca0ecad2e195b851030bd6c5e4c032d1c981240c631a"},
+    {file = "fastapi-0.116.1-py3-none-any.whl", hash = "sha256:c46ac7c312df840f0c9e220f7964bada936781bc4e2e6eb71f1c4d7553786565"},
+    {file = "fastapi-0.116.1.tar.gz", hash = "sha256:ed52cbf946abfd70c5a0dccb24673f0670deeb517a88b3544d03c2a6bf283143"},
 ]

 [package.dependencies]
 pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<2.1.0 || >2.1.0,<3.0.0"
-starlette = ">=0.40.0,<0.49.0"
+starlette = ">=0.40.0,<0.48.0"
 typing-extensions = ">=4.8.0"

 [package.extras]
-all = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.8)", "httpx (>=0.23.0,<1.0.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=3.1.5)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.18)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"]
-standard = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.8)", "httpx (>=0.23.0,<1.0.0)", "jinja2 (>=3.1.5)", "python-multipart (>=0.0.18)", "uvicorn[standard] (>=0.12.0)"]
-standard-no-fastapi-cloud-cli = ["email-validator (>=2.0.0)", "fastapi-cli[standard-no-fastapi-cloud-cli] (>=0.0.8)", "httpx (>=0.23.0,<1.0.0)", "jinja2 (>=3.1.5)", "python-multipart (>=0.0.18)", "uvicorn[standard] (>=0.12.0)"]
+all = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.8)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=3.1.5)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.18)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"]
+standard = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.8)", "httpx (>=0.23.0)", "jinja2 (>=3.1.5)", "python-multipart (>=0.0.18)", "uvicorn[standard] (>=0.12.0)"]
+standard-no-fastapi-cloud-cli = ["email-validator (>=2.0.0)", "fastapi-cli[standard-no-fastapi-cloud-cli] (>=0.0.8)", "httpx (>=0.23.0)", "jinja2 (>=3.1.5)", "python-multipart (>=0.0.18)", "uvicorn[standard] (>=0.12.0)"]

 [[package]]
 name = "fastjsonschema"
@@ -1990,7 +1989,6 @@ files = [
    {file = "fastuuid-0.12.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9b31dd488d0778c36f8279b306dc92a42f16904cba54acca71e107d65b60b0c"},
    {file = "fastuuid-0.12.0-cp313-cp313-manylinux_2_34_x86_64.whl", hash = "sha256:b19361ee649365eefc717ec08005972d3d1eb9ee39908022d98e3bfa9da59e37"},
    {file = "fastuuid-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:8fc66b11423e6f3e1937385f655bedd67aebe56a3dcec0cb835351cfe7d358c9"},
-    {file = "fastuuid-0.12.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:2925f67b88d47cb16aa3eb1ab20fdcf21b94d74490e0818c91ea41434b987493"},
    {file = "fastuuid-0.12.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7b15c54d300279ab20a9cc0579ada9c9f80d1bc92997fc61fb7bf3103d7cb26b"},
    {file = "fastuuid-0.12.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:458f1bc3ebbd76fdb89ad83e6b81ccd3b2a99fa6707cd3650b27606745cfb170"},
    {file = "fastuuid-0.12.0-cp38-cp38-manylinux_2_34_x86_64.whl", hash = "sha256:a8f0f83fbba6dc44271a11b22e15838641b8c45612cdf541b4822a5930f6893c"},
@@ -2293,72 +2291,6 @@ test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask[dataframe,test]", "moto
 test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard ; python_version < \"3.14\""]
 tqdm = ["tqdm"]

-[[package]]
-name = "gevent"
-version = "25.9.1"
-description = "Coroutine-based network library"
-optional = false
-python-versions = ">=3.9"
-groups = ["test"]
-files = [
-    {file = "gevent-25.9.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:856b990be5590e44c3a3dc6c8d48a40eaccbb42e99d2b791d11d1e7711a4297e"},
-    {file = "gevent-25.9.1-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:fe1599d0b30e6093eb3213551751b24feeb43db79f07e89d98dd2f3330c9063e"},
-    {file = "gevent-25.9.1-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:f0d8b64057b4bf1529b9ef9bd2259495747fba93d1f836c77bfeaacfec373fd0"},
-    {file = "gevent-25.9.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b56cbc820e3136ba52cd690bdf77e47a4c239964d5f80dc657c1068e0fe9521c"},
-    {file = "gevent-25.9.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:c5fa9ce5122c085983e33e0dc058f81f5264cebe746de5c401654ab96dddfca8"},
-    {file = "gevent-25.9.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:03c74fec58eda4b4edc043311fca8ba4f8744ad1632eb0a41d5ec25413581975"},
-    {file = "gevent-25.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:a8ae9f895e8651d10b0a8328a61c9c53da11ea51b666388aa99b0ce90f9fdc27"},
-    {file = "gevent-25.9.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:18e5aff9e8342dc954adb9c9c524db56c2f3557999463445ba3d9cbe3dada7b7"},
-    {file = "gevent-25.9.1-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:1cdf6db28f050ee103441caa8b0448ace545364f775059d5e2de089da975c457"},
-    {file = "gevent-25.9.1-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:812debe235a8295be3b2a63b136c2474241fa5c58af55e6a0f8cfc29d4936235"},
-    {file = "gevent-25.9.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b28b61ff9216a3d73fe8f35669eefcafa957f143ac534faf77e8a19eb9e6883a"},
-    {file = "gevent-25.9.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5e4b6278b37373306fc6b1e5f0f1cf56339a1377f67c35972775143d8d7776ff"},
-    {file = "gevent-25.9.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d99f0cb2ce43c2e8305bf75bee61a8bde06619d21b9d0316ea190fc7a0620a56"},
-    {file = "gevent-25.9.1-cp311-cp311-win_amd64.whl", hash = "sha256:72152517ecf548e2f838c61b4be76637d99279dbaa7e01b3924df040aa996586"},
-    {file = "gevent-25.9.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:46b188248c84ffdec18a686fcac5dbb32365d76912e14fda350db5dc0bfd4f86"},
-    {file = "gevent-25.9.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f2b54ea3ca6f0c763281cd3f96010ac7e98c2e267feb1221b5a26e2ca0b9a692"},
-    {file = "gevent-25.9.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:7a834804ac00ed8a92a69d3826342c677be651b1c3cd66cc35df8bc711057aa2"},
-    {file = "gevent-25.9.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:323a27192ec4da6b22a9e51c3d9d896ff20bc53fdc9e45e56eaab76d1c39dd74"},
-    {file = "gevent-25.9.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6ea78b39a2c51d47ff0f130f4c755a9a4bbb2dd9721149420ad4712743911a51"},
-    {file = "gevent-25.9.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:dc45cd3e1cc07514a419960af932a62eb8515552ed004e56755e4bf20bad30c5"},
-    {file = "gevent-25.9.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:34e01e50c71eaf67e92c186ee0196a039d6e4f4b35670396baed4a2d8f1b347f"},
-    {file = "gevent-25.9.1-cp312-cp312-win_amd64.whl", hash = "sha256:4acd6bcd5feabf22c7c5174bd3b9535ee9f088d2bbce789f740ad8d6554b18f3"},
-    {file = "gevent-25.9.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:4f84591d13845ee31c13f44bdf6bd6c3dbf385b5af98b2f25ec328213775f2ed"},
-    {file = "gevent-25.9.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9cdbb24c276a2d0110ad5c978e49daf620b153719ac8a548ce1250a7eb1b9245"},
-    {file = "gevent-25.9.1-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:88b6c07169468af631dcf0fdd3658f9246d6822cc51461d43f7c44f28b0abb82"},
-    {file = "gevent-25.9.1-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b7bb0e29a7b3e6ca9bed2394aa820244069982c36dc30b70eb1004dd67851a48"},
-    {file = "gevent-25.9.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2951bb070c0ee37b632ac9134e4fdaad70d2e660c931bb792983a0837fe5b7d7"},
-    {file = "gevent-25.9.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e4e17c2d57e9a42e25f2a73d297b22b60b2470a74be5a515b36c984e1a246d47"},
-    {file = "gevent-25.9.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8d94936f8f8b23d9de2251798fcb603b84f083fdf0d7f427183c1828fb64f117"},
-    {file = "gevent-25.9.1-cp313-cp313-win_amd64.whl", hash = "sha256:eb51c5f9537b07da673258b4832f6635014fee31690c3f0944d34741b69f92fa"},
-    {file = "gevent-25.9.1-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:1a3fe4ea1c312dbf6b375b416925036fe79a40054e6bf6248ee46526ea628be1"},
-    {file = "gevent-25.9.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0adb937f13e5fb90cca2edf66d8d7e99d62a299687400ce2edee3f3504009356"},
-    {file = "gevent-25.9.1-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:427f869a2050a4202d93cf7fd6ab5cffb06d3e9113c10c967b6e2a0d45237cb8"},
-    {file = "gevent-25.9.1-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c049880175e8c93124188f9d926af0a62826a3b81aa6d3074928345f8238279e"},
-    {file = "gevent-25.9.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b5a67a0974ad9f24721034d1e008856111e0535f1541499f72a733a73d658d1c"},
-    {file = "gevent-25.9.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:1d0f5d8d73f97e24ea8d24d8be0f51e0cf7c54b8021c1fddb580bf239474690f"},
-    {file = "gevent-25.9.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ddd3ff26e5c4240d3fbf5516c2d9d5f2a998ef87cfb73e1429cfaeaaec860fa6"},
-    {file = "gevent-25.9.1-cp314-cp314-win_amd64.whl", hash = "sha256:bb63c0d6cb9950cc94036a4995b9cc4667b8915366613449236970f4394f94d7"},
-    {file = "gevent-25.9.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f18f80aef6b1f6907219affe15b36677904f7cfeed1f6a6bc198616e507ae2d7"},
-    {file = "gevent-25.9.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b274a53e818124a281540ebb4e7a2c524778f745b7a99b01bdecf0ca3ac0ddb0"},
-    {file = "gevent-25.9.1-cp39-cp39-win32.whl", hash = "sha256:c6c91f7e33c7f01237755884316110ee7ea076f5bdb9aa0982b6dc63243c0a38"},
-    {file = "gevent-25.9.1-cp39-cp39-win_amd64.whl", hash = "sha256:012a44b0121f3d7c800740ff80351c897e85e76a7e4764690f35c5ad9ec17de5"},
-    {file = "gevent-25.9.1.tar.gz", hash = "sha256:adf9cd552de44a4e6754c51ff2e78d9193b7fa6eab123db9578a210e657235dd"},
-]
-
-[package.dependencies]
-cffi = {version = ">=1.17.1", markers = "platform_python_implementation == \"CPython\" and sys_platform == \"win32\""}
-greenlet = {version = ">=3.2.2", markers = "platform_python_implementation == \"CPython\""}
-"zope.event" = "*"
-"zope.interface" = "*"
-
-[package.extras]
-dnspython = ["dnspython (>=1.16.0,<2.0) ; python_version < \"3.10\"", "idna ; python_version < \"3.10\""]
-docs = ["furo", "repoze.sphinx.autointerface", "sphinx", "sphinxcontrib-programoutput", "zope.schema"]
-monitor = ["psutil (>=5.7.0) ; sys_platform != \"win32\" or platform_python_implementation == \"CPython\""]
-recommended = ["cffi (>=1.17.1) ; platform_python_implementation == \"CPython\"", "dnspython (>=1.16.0,<2.0) ; python_version < \"3.10\"", "idna ; python_version < \"3.10\"", "psutil (>=5.7.0) ; sys_platform != \"win32\" or platform_python_implementation == \"CPython\""]
-test = ["cffi (>=1.17.1) ; platform_python_implementation == \"CPython\"", "coverage (>=5.0) ; sys_platform != \"win32\"", "dnspython (>=1.16.0,<2.0) ; python_version < \"3.10\"", "idna ; python_version < \"3.10\"", "objgraph", "psutil (>=5.7.0) ; sys_platform != \"win32\" or platform_python_implementation == \"CPython\"", "requests"]
-
 [[package]]
 name = "gitdb"
 version = "4.0.12"
@@ -2775,7 +2707,7 @@ version = "3.2.4"
 description = "Lightweight in-process concurrent programming"
 optional = false
 python-versions = ">=3.9"
-groups = ["main", "test"]
+groups = ["main"]
 files = [
    {file = "greenlet-3.2.4-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:8c68325b0d0acf8d91dde4e6f930967dd52a5302cd4062932a6b2e7c2969f47c"},
    {file = "greenlet-3.2.4-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:94385f101946790ae13da500603491f04a76b6e4c059dab271b3ce2e283b2590"},
@@ -2832,7 +2764,6 @@ files = [
    {file = "greenlet-3.2.4-cp39-cp39-win_amd64.whl", hash = "sha256:d2e685ade4dafd447ede19c31277a224a239a0a1a4eca4e6390efedf20260cfb"},
    {file = "greenlet-3.2.4.tar.gz", hash = "sha256:0dca0d95ff849f9a364385f36ab49f50065d76964944638be9691e1832e9f86d"},
 ]
-markers = {test = "platform_python_implementation == \"CPython\""}

 [package.extras]
 docs = ["Sphinx", "furo"]
@@ -5432,7 +5363,7 @@ llama = ["llama-index (>=0.12.29,<0.13.0)", "llama-index-core (>=0.12.29,<0.13.0

 [[package]]
 name = "openhands-ai"
-version = "0.57.0"
+version = "0.55.0"
 description = "OpenHands: Code Less, Make More"
 optional = false
 python-versions = "^3.12,<3.14"
@@ -5466,7 +5397,7 @@ json-repair = "*"
 jupyter_kernel_gateway = "*"
 kubernetes = "^33.1.0"
 libtmux = ">=0.37,<0.40"
-litellm = ">=1.74.3, <1.77.2, !=1.64.4, !=1.67.*"
+litellm = "^1.74.3, !=1.64.4, !=1.67.*"
 memory-profiler = "^0.61.0"
 numpy = "*"
 openai = "1.99.9"
@@ -5475,7 +5406,6 @@ opentelemetry-api = "^1.33.1"
 opentelemetry-exporter-otlp-proto-grpc = "^1.33.1"
 pathspec = "^0.12.1"
 pexpect = "*"
-pillow = "^11.3.0"
 poetry = "^2.1.2"
 prompt-toolkit = "^3.0.50"
 protobuf = "^5.0.0,<6.0.0"
@@ -5483,7 +5413,6 @@ psutil = "*"
 pygithub = "^2.5.0"
 pyjwt = "^2.9.0"
 pylatexenc = "*"
-pypdf = "^6.0.0"
 PyPDF2 = "*"
 python-docx = "*"
 python-dotenv = "*"
@@ -5497,17 +5426,13 @@ pyyaml = "^6.0.2"
 qtconsole = "^5.6.1"
 rapidfuzz = "^3.9.0"
 redis = ">=5.2,<7.0"
-requests = "^2.32.5"
-setuptools = ">=78.1.1"
 shellingham = "^1.5.4"
-sse-starlette = "^3.0.2"
-starlette = "^0.48.0"
+sse-starlette = "^2.1.3"
 tenacity = ">=8.5,<10.0"
 termcolor = "*"
 toml = "*"
 tornado = "*"
 types-toml = "*"
-urllib3 = "^2.5.0"
 uvicorn = "*"
 whatthepatch = "^1.0.6"
 zope-interface = "7.2"
@@ -6113,14 +6038,14 @@ files = [

 [[package]]
 name = "posthog"
-version = "6.7.6"
+version = "4.10.0"
 description = "Integrate PostHog into any python application."
 optional = false
 python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "posthog-6.7.6-py3-none-any.whl", hash = "sha256:b09a7e65a042ec416c28874b397d3accae412a80a8b0ef3fa686fbffc99e4d4b"},
-    {file = "posthog-6.7.6.tar.gz", hash = "sha256:ee5c5ad04b857d96d9b7a4f715e23916a2f206bfcf25e5a9d328a3d27664b0d3"},
+    {file = "posthog-4.10.0-py3-none-any.whl", hash = "sha256:b693d3d8209d000d8c5f4d6ea19096bfdfb83047fa8a14c937ae50a3394809a1"},
+    {file = "posthog-4.10.0.tar.gz", hash = "sha256:513bfbb21344013294abc046b1142173189c5422a3906cf2280d1389b0c2e28b"},
 ]

 [package.dependencies]
@@ -6129,11 +6054,11 @@ distro = ">=1.5.0"
 python-dateutil = ">=2.2"
 requests = ">=2.7,<3.0"
 six = ">=1.5"
-typing-extensions = ">=4.2.0"

 [package.extras]
 dev = ["django-stubs", "lxml", "mypy", "mypy-baseline", "packaging", "pre-commit", "pydantic", "ruff", "setuptools", "tomli", "tomli_w", "twine", "types-mock", "types-python-dateutil", "types-requests", "types-setuptools", "types-six", "wheel"]
 langchain = ["langchain (>=0.2.0)"]
+sentry = ["django", "sentry-sdk"]
 test = ["anthropic", "coverage", "django", "freezegun (==1.5.1)", "google-genai", "langchain-anthropic (>=0.3.15)", "langchain-community (>=0.3.25)", "langchain-core (>=0.3.65)", "langchain-openai (>=0.3.22)", "langgraph (>=0.4.8)", "mock (>=2.0.0)", "openai", "parameterized (>=0.8.1)", "pydantic", "pytest", "pytest-asyncio", "pytest-timeout"]

 [[package]]
@@ -6546,12 +6471,11 @@ version = "2.22"
 description = "C parser in Python"
 optional = false
 python-versions = ">=3.8"
-groups = ["main", "test"]
+groups = ["main"]
 files = [
    {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"},
    {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"},
 ]
-markers = {test = "platform_python_implementation == \"CPython\" and sys_platform == \"win32\""}

 [[package]]
 name = "pydantic"
@@ -8341,7 +8265,7 @@ version = "80.9.0"
 description = "Easily download, build, install, upgrade, and uninstall Python packages"
 optional = false
 python-versions = ">=3.9"
-groups = ["main", "test"]
+groups = ["main"]
 files = [
    {file = "setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922"},
    {file = "setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c"},
@@ -8707,14 +8631,14 @@ sqlcipher = ["sqlcipher3_binary"]

 [[package]]
 name = "sse-starlette"
-version = "3.0.2"
+version = "2.4.1"
 description = "SSE plugin for Starlette"
 optional = false
 python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "sse_starlette-3.0.2-py3-none-any.whl", hash = "sha256:16b7cbfddbcd4eaca11f7b586f3b8a080f1afe952c15813455b162edea619e5a"},
-    {file = "sse_starlette-3.0.2.tar.gz", hash = "sha256:ccd60b5765ebb3584d0de2d7a6e4f745672581de4f5005ab31c3a25d10b52b3a"},
+    {file = "sse_starlette-2.4.1-py3-none-any.whl", hash = "sha256:08b77ea898ab1a13a428b2b6f73cfe6d0e607a7b4e15b9bb23e4a37b087fd39a"},
+    {file = "sse_starlette-2.4.1.tar.gz", hash = "sha256:7c8a800a1ca343e9165fc06bbda45c78e4c6166320707ae30b416c42da070926"},
 ]

 [package.dependencies]
@@ -8722,7 +8646,7 @@ anyio = ">=4.7.0"

 [package.extras]
 daphne = ["daphne (>=4.2.0)"]
-examples = ["aiosqlite (>=0.21.0)", "fastapi (>=0.115.12)", "sqlalchemy[asyncio] (>=2.0.41)", "starlette (>=0.41.3)", "uvicorn (>=0.34.0)"]
+examples = ["aiosqlite (>=0.21.0)", "fastapi (>=0.115.12)", "sqlalchemy[asyncio,examples] (>=2.0.41)", "starlette (>=0.41.3)", "uvicorn (>=0.34.0)"]
 granian = ["granian (>=2.3.1)"]
 uvicorn = ["uvicorn (>=0.34.0)"]

@@ -8778,14 +8702,14 @@ files = [

 [[package]]
 name = "starlette"
-version = "0.48.0"
+version = "0.47.3"
 description = "The little ASGI library that shines."
 optional = false
 python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "starlette-0.48.0-py3-none-any.whl", hash = "sha256:0764ca97b097582558ecb498132ed0c7d942f233f365b86ba37770e026510659"},
-    {file = "starlette-0.48.0.tar.gz", hash = "sha256:7e8cee469a8ab2352911528110ce9088fdc6a37d9876926e73da7ce4aa4c7a46"},
+    {file = "starlette-0.47.3-py3-none-any.whl", hash = "sha256:89c0778ca62a76b826101e7c709e70680a1699ca7da6b44d38eb0a7e61fe4b51"},
+    {file = "starlette-0.47.3.tar.gz", hash = "sha256:6bc94f839cc176c4858894f1f8908f0ab79dfec1a6b8402f6da9be26ebea52e9"},
 ]

 [package.dependencies]
@@ -9914,32 +9838,13 @@ enabler = ["pytest-enabler (>=2.2)"]
 test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more_itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"]
 type = ["pytest-mypy"]

-[[package]]
-name = "zope-event"
-version = "6.0"
-description = "Very basic event publishing system"
-optional = false
-python-versions = ">=3.9"
-groups = ["test"]
-files = [
-    {file = "zope_event-6.0-py3-none-any.whl", hash = "sha256:6f0922593407cc673e7d8766b492c519f91bdc99f3080fe43dcec0a800d682a3"},
-    {file = "zope_event-6.0.tar.gz", hash = "sha256:0ebac894fa7c5f8b7a89141c272133d8c1de6ddc75ea4b1f327f00d1f890df92"},
-]
-
-[package.dependencies]
-setuptools = ">=75.8.2"
-
-[package.extras]
-docs = ["Sphinx"]
-test = ["zope.testrunner (>=6.4)"]
-
 [[package]]
 name = "zope-interface"
 version = "7.2"
 description = "Interfaces for Python"
 optional = false
 python-versions = ">=3.8"
-groups = ["main", "test"]
+groups = ["main"]
 files = [
    {file = "zope.interface-7.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ce290e62229964715f1011c3dbeab7a4a1e4971fd6f31324c4519464473ef9f2"},
    {file = "zope.interface-7.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:05b910a5afe03256b58ab2ba6288960a2892dfeef01336dc4be6f1b9ed02ab0a"},
@@ -10103,4 +10008,4 @@ cffi = ["cffi (>=1.17) ; python_version >= \"3.13\" and platform_python_implemen
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.12,<3.14"
-content-hash = "fac67a8991a3e2c840a23702dc90f99e98d381f3537ad50b4c4739cdbde941ca"
+content-hash = "5771671ef2acc36e7b0931c73fa035ca1d329e8dac6827f7a349e1a569c3fd23"
--- a/enterprise/pyproject.toml
+++ b/enterprise/pyproject.toml
@@ -38,7 +38,7 @@ resend = "^2.7.0"
 tenacity = "^9.1.2"
 slack-sdk = "^3.35.0"
 ddtrace = "3.13.0"                                           #pin to avoid yanked version 3.12.4
-posthog = "^6.0.0"
+posthog = "^4.2.0"
 limits = "^5.2.0"
 coredis = "^4.22.0"
 httpx = "*"
@@ -63,7 +63,6 @@ openai = "*"
 opencv-python = "*"
 pandas = "*"
 reportlab = "*"
-gevent = ">=24.2.1,<26.0.0"

 [tool.poetry-dynamic-versioning]
 enable = true
@@ -86,7 +85,3 @@ lint.pydocstyle.convention = "google"
 [tool.pytest.ini_options]
 asyncio_mode = "auto"
 asyncio_default_fixture_loop_scope = "function"
-
-[tool.coverage.run]
-relative_files = true
-omit = [ "tests/*" ]
--- a/enterprise/server/clustered_conversation_manager.py
+++ b/enterprise/server/clustered_conversation_manager.py
@@ -21,7 +21,6 @@ from openhands.events.event_store_abc import EventStoreABC
 from openhands.events.observation import AgentStateChangedObservation
 from openhands.events.stream import EventStreamSubscriber
 from openhands.llm.llm_registry import LLMRegistry
-from openhands.runtime.runtime_status import RuntimeStatus
 from openhands.server.config.server_config import ServerConfig
 from openhands.server.conversation_manager.conversation_manager import (
    ConversationManager,
@@ -687,7 +686,6 @@ class ClusteredConversationManager(StandaloneConversationManager):
                        url=self._get_conversation_url(conversation_id),
                        session_api_key=None,
                        event_store=EventStore(conversation_id, self.file_store, uid),
-                        runtime_status=RuntimeStatus.READY,
                    )
                )
        return results
--- a/enterprise/server/routes/auth.py
+++ b/enterprise/server/routes/auth.py
@@ -174,17 +174,19 @@ async def keycloak_callback(
    posthog_user_id = f'FEATURE_{user_id}' if IS_FEATURE_ENV else user_id

    try:
-        posthog.set(
-            distinct_id=posthog_user_id,
-            properties={
-                'user_id': posthog_user_id,
-                'original_user_id': user_id,
-                'is_feature_env': IS_FEATURE_ENV,
+        posthog.identify(
+            posthog_user_id,
+            {
+                '$set': {
+                    'user_id': posthog_user_id,  # Explicitly set as property
+                    'original_user_id': user_id,  # Store the original user_id
+                    'is_feature_env': IS_FEATURE_ENV,  # Track if this is a feature environment
+                }
            },
        )
    except Exception as e:
        logger.error(
-            'auth:posthog_set:failed',
+            'auth:posthog_identify:failed',
            extra={
                'user_id': user_id,
                'error': str(e),
--- a/enterprise/server/routes/user.py
+++ b/enterprise/server/routes/user.py
@@ -138,7 +138,6 @@ async def saas_search_repositories(
    per_page: int = 5,
    sort: str = 'stars',
    order: str = 'desc',
-    selected_provider: ProviderType | None = None,
    provider_tokens: PROVIDER_TOKEN_TYPE | None = Depends(get_provider_tokens),
    access_token: SecretStr | None = Depends(get_access_token),
    user_id: str | None = Depends(get_user_id),
@@ -156,7 +155,6 @@ async def saas_search_repositories(
        per_page=per_page,
        sort=sort,
        order=order,
-        selected_provider=selected_provider,
        provider_tokens=provider_tokens,
        access_token=access_token,
        user_id=user_id,
--- a/enterprise/server/saas_nested_conversation_manager.py
+++ b/enterprise/server/saas_nested_conversation_manager.py
@@ -60,14 +60,9 @@ from openhands.utils.utils import create_registry_and_conversation_stats
 RUNTIME_URL_PATTERN = os.getenv(
    'RUNTIME_URL_PATTERN', 'https://{runtime_id}.prod-runtime.all-hands.dev'
 )
-RUNTIME_ROUTING_MODE = os.getenv('RUNTIME_ROUTING_MODE', 'subdomain').lower()

 # Pattern for base URL for the runtime
-RUNTIME_CONVERSATION_URL = RUNTIME_URL_PATTERN + (
-    '/runtime/api/conversations/{conversation_id}'
-    if RUNTIME_ROUTING_MODE == 'path'
-    else '/api/conversations/{conversation_id}'
-)
+RUNTIME_CONVERSATION_URL = RUNTIME_URL_PATTERN + '/api/conversations/{conversation_id}'

 # Time in seconds before a Redis entry is considered expired if not refreshed
 _REDIS_ENTRY_TIMEOUT_SECONDS = 300
@@ -349,48 +344,18 @@ class SaasNestedConversationManager(ConversationManager):
        api_url: str,
        custom_secrets: MappingProxyType[str, Any] | None,
    ):
-        """Setup custom secrets for the nested conversation.
-
-        Note: When resuming conversations, secrets may already exist in the runtime.
-        We check for specific duplicate error messages to handle this case gracefully.
-        """
+        """Setup custom secrets for the nested conversation."""
        if custom_secrets:
            for key, secret in custom_secrets.items():
-                try:
-                    response = await client.post(
-                        f'{api_url}/api/secrets',
-                        json={
-                            'name': key,
-                            'description': secret.description,
-                            'value': secret.secret.get_secret_value(),
-                        },
-                    )
-                    response.raise_for_status()
-                    logger.debug(f'Successfully created secret: {key}')
-                except httpx.HTTPStatusError as e:
-                    if e.response.status_code == 400:
-                        # Only ignore if it's actually a duplicate error
-                        try:
-                            error_data = e.response.json()
-                            error_msg = error_data.get('message', '')
-                            # The API returns: "Secret {secret_name} already exists"
-                            if 'already exists' in error_msg:
-                                logger.info(
-                                    f'Secret "{key}" already exists, continuing - ignoring duplicate',
-                                    extra={'api_url': api_url},
-                                )
-                                continue
-                        except (KeyError, ValueError, TypeError):
-                            pass  # If we can't parse JSON, fall through to re-raise
-                    # Re-raise all other errors (including non-duplicate 400s)
-                    logger.error(
-                        f'Failed to setup secret "{key}": HTTP {e.response.status_code}',
-                        extra={
-                            'api_url': api_url,
-                            'response_text': e.response.text[:200],
-                        },
-                    )
-                    raise
+                response = await client.post(
+                    f'{api_url}/api/secrets',
+                    json={
+                        'name': key,
+                        'description': secret.description,
+                        'value': secret.secret.get_secret_value(),
+                    },
+                )
+                response.raise_for_status()

    def _get_mcp_config(self, user_id: str) -> MCPConfig | None:
        api_key_store = ApiKeyStore.get_instance()
--- a/enterprise/tests/unit/test_auth_routes.py
+++ b/enterprise/tests/unit/test_auth_routes.py
@@ -211,7 +211,7 @@ async def test_keycloak_callback_success_with_valid_offline_token(mock_request):
            secure=False,
            accepted_tos=True,
        )
-        mock_posthog.set.assert_called_once()
+        mock_posthog.identify.assert_called_once()


@pytest.mark.asyncio
@@ -278,7 +278,7 @@ async def test_keycloak_callback_success_without_offline_token(mock_request):
            secure=False,
            accepted_tos=True,
        )
-        mock_posthog.set.assert_called_once()
+        mock_posthog.identify.assert_called_once()


@pytest.mark.asyncio
--- a/enterprise/tests/unit/test_saas_conversation_manager_secrets.py
+++ b/enterprise/tests/unit/test_saas_conversation_manager_secrets.py
@@ -1,176 +0,0 @@
-"""Tests for SaasNestedConversationManager custom secrets handling during resume."""
-
-from types import MappingProxyType
-from unittest.mock import AsyncMock, MagicMock
-
-import httpx
-import pytest
-from pydantic import SecretStr
-from server.saas_nested_conversation_manager import SaasNestedConversationManager
-
-from openhands.core.config.openhands_config import OpenHandsConfig
-from openhands.integrations.provider import CustomSecret
-from openhands.server.config.server_config import ServerConfig
-from openhands.storage.memory import InMemoryFileStore
-
-
-class MockHTTPXResponse:
-    """Mock httpx.Response that behaves realistically."""
-
-    def __init__(self, status_code: int, json_data: dict | None = None):
-        self.status_code = status_code
-        self._json_data = json_data or {}
-        self.text = str(json_data) if json_data else ''
-
-    def json(self):
-        """Return JSON data."""
-        if self._json_data:
-            return self._json_data
-        raise ValueError('No JSON data')
-
-    def raise_for_status(self):
-        """Raise an exception for 4xx/5xx status codes."""
-        if self.status_code >= 400:
-            # Create a proper mock response for the exception
-            mock_response = MagicMock()
-            mock_response.status_code = self.status_code
-            mock_response.json = self.json
-            mock_response.text = self.text
-
-            error = httpx.HTTPStatusError(
-                f"Client error '{self.status_code}' for url 'test'",
-                request=MagicMock(),
-                response=mock_response,
-            )
-            raise error
-
-
-@pytest.fixture
-def saas_manager():
-    """Create a SaasNestedConversationManager instance for testing."""
-    manager = SaasNestedConversationManager(
-        sio=MagicMock(),
-        config=MagicMock(spec=OpenHandsConfig),
-        server_config=MagicMock(spec=ServerConfig),
-        file_store=MagicMock(spec=InMemoryFileStore),
-        event_retrieval=MagicMock(),
-    )
-    return manager
-
-
-@pytest.mark.asyncio
-async def test_duplicate_secrets_dont_crash_resume(saas_manager):
-    """Test that duplicate secrets during resume are handled gracefully."""
-    mock_client = AsyncMock(spec=httpx.AsyncClient)
-
-    # Simulate resume scenario: secret already exists (400)
-    mock_response = MockHTTPXResponse(
-        400, {'message': 'Secret MY_API_KEY already exists'}
-    )
-
-    async def mock_post(*args, **kwargs):
-        return mock_response
-
-    mock_client.post = AsyncMock(side_effect=mock_post)
-
-    custom_secrets = MappingProxyType(
-        {
-            'MY_API_KEY': CustomSecret(
-                secret=SecretStr('api_key_value'),
-                description='API Key that already exists on resume',
-            ),
-        }
-    )
-
-    # Should not raise despite 400 "already exists" error
-    await saas_manager._setup_custom_secrets(
-        client=mock_client,
-        api_url='https://runtime.example.com',
-        custom_secrets=custom_secrets,
-    )
-
-    assert mock_client.post.call_count == 1
-
-
-@pytest.mark.asyncio
-async def test_other_400_errors_still_fail(saas_manager):
-    """Test that non-duplicate 400 errors are still raised."""
-    mock_client = AsyncMock(spec=httpx.AsyncClient)
-
-    # 400 error but NOT a duplicate
-    mock_response = MockHTTPXResponse(400, {'message': 'Invalid secret name format'})
-
-    async def mock_post(*args, **kwargs):
-        return mock_response
-
-    mock_client.post = AsyncMock(side_effect=mock_post)
-
-    custom_secrets = MappingProxyType(
-        {
-            'INVALID!NAME': CustomSecret(
-                secret=SecretStr('value'), description='Secret with invalid name'
-            ),
-        }
-    )
-
-    with pytest.raises(httpx.HTTPStatusError) as exc_info:
-        await saas_manager._setup_custom_secrets(
-            client=mock_client,
-            api_url='https://runtime.example.com',
-            custom_secrets=custom_secrets,
-        )
-
-    assert exc_info.value.response.status_code == 400
-
-
-@pytest.mark.asyncio
-async def test_normal_secret_creation_still_works(saas_manager):
-    """Test that normal secret creation works correctly."""
-    mock_client = AsyncMock(spec=httpx.AsyncClient)
-
-    # Successful creation
-    mock_response = MockHTTPXResponse(200, {'message': 'Secret created'})
-
-    async def mock_post(*args, **kwargs):
-        return mock_response
-
-    mock_client.post = AsyncMock(side_effect=mock_post)
-
-    custom_secrets = MappingProxyType(
-        {
-            'NEW_SECRET': CustomSecret(
-                secret=SecretStr('new_value'), description='A new secret'
-            ),
-        }
-    )
-
-    await saas_manager._setup_custom_secrets(
-        client=mock_client,
-        api_url='https://runtime.example.com',
-        custom_secrets=custom_secrets,
-    )
-
-    assert mock_client.post.call_count == 1
-    call_args = mock_client.post.call_args_list[0]
-    assert call_args[1]['json']['name'] == 'NEW_SECRET'
-    assert call_args[1]['json']['value'] == 'new_value'
-
-
-@pytest.mark.asyncio
-async def test_handles_empty_secrets_gracefully(saas_manager):
-    """Test that empty or missing secrets are handled correctly."""
-    mock_client = AsyncMock(spec=httpx.AsyncClient)
-
-    # Test with None
-    await saas_manager._setup_custom_secrets(
-        client=mock_client, api_url='https://runtime.example.com', custom_secrets=None
-    )
-    assert mock_client.post.call_count == 0
-
-    # Test with empty dict
-    await saas_manager._setup_custom_secrets(
-        client=mock_client,
-        api_url='https://runtime.example.com',
-        custom_secrets=MappingProxyType({}),
-    )
-    assert mock_client.post.call_count == 0
--- a/evaluation/benchmarks/multi_swe_bench/SWE-Gym.md
+++ b/evaluation/benchmarks/multi_swe_bench/SWE-Gym.md
@@ -1,152 +0,0 @@
-<h1 align="center"> Training Software Engineering Agents and Verifiers with SWE-Gym </h1>
-
-A Multi-SWE-bench implementation of SWE-Gym.
-
-<p align="center">
-  <a href="https://www.jiayipan.com/" style="text-decoration: none;">Jiayi Pan<sup>*,1</sup></a>,
-  <a href="https://xwang.dev/" style="text-decoration: none;">Xingyao Wang<sup>*,2</sup></a>,
-  <a href="https://www.phontron.com/" style="text-decoration: none;">Graham Neubig<sup>3</sup></a>,
-  <a href="https://www.cs.toronto.edu/~ndjaitly/" style="text-decoration: none;">Navdeep Jaitly<sup>4</sup></a>,
-  <a href="https://blender.cs.illinois.edu/hengji.html" style="text-decoration: none;">Heng Ji<sup>2</sup></a>,
-  <a href="https://www.alanesuhr.com/" style="text-decoration: none;">Alane Suhr<sup>^,1</sup></a>,
-  <a href="https://dreasysnail.github.io/" style="text-decoration: none;">Yizhe Zhang<sup>^,4</sup></a>
-</p>
-
-<p align="center">
-  <sup>1</sup>UC Berkeley, <sup>2</sup>UIUC, <sup>3</sup>CMU, <sup>4</sup>Apple </br>
-  <sub><sup>*</sup>Equal contribution, <sup>^</sup>Equal supervision</sub>
-</p>
-
-<p align="center">
-<a href="https://arxiv.org/abs/2412.21139">📃 Paper</a>
-•
-<a href="https://huggingface.co/SWE-Gym" >🤗 Data & Models</a>
-</p>
-
-We present **SWE-Gym**, the first environment for training real-world software engineering agents.
-We use it to train strong LM agents that achieve state-of-the-art open results on SWE-Bench, with early, promising scaling characteristics as we increase training and inference-time compute.
-
-<p align="center">
-  <img src="https://github.com/SWE-Gym/SWE-Gym/blob/main/assets/images/teaser.jpg?raw=true" width="100%" alt="teaser">
-</p>
-
---
-# Run SWE-Gym with OpenHands
-
-The process of running SWE-Gym is very similar to how you'd run SWE-Bench evaluation.
-
-
-1. First, clone OpenHands repo `git clone https://github.com/All-Hands-AI/OpenHands.git`
-2. Then setup the repo following [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md)
-3. Then you can simply serve your own model as an OpenAI compatible endpoint, put those info in config.toml. You can do this by following instruction [here](../../README.md#setup).
-4. And then simply do the following to sample for 16x parallelism:
-
-```bash
-export ALLHANDS_API_KEY=ah-yourkey  # You don't need to set this when running these in local docker container
-./evaluation/benchmarks/multi_swe_bench/scripts/rollout_swegym.sh llm.mymodel-temp05 'train-t05' 16
-```
-
-NOTE: SWE-Gym sampling with parallelism is currently only tested with AllHands RemoteRuntime (limited beta). Fill [this form](https://docs.google.com/forms/d/e/1FAIpQLSckVz_JFwg2_mOxNZjCtr7aoBFI2Mwdan3f75J_TrdMS1JV2g/viewform) to apply for access.
-
-
-5. When `rollout_swegym.sh` finishes, you will get a file called `output.with_completions.jsonl.gz`. Then you can use [`./scripts/swegym/convert_data.ipynb`](./scripts/swegym/convert_data.ipynb) to convert them into SFT data format.
-
-## Running the Jupyter Notebook
-
-To run the data conversion notebook, follow these steps:
-
-1. Navigate to the OpenHands repository root:
-```bash
-cd openhands_repo
-```
-
-2. Set the PYTHONPATH and start Jupyter notebook:
-```bash
-PYTHONPATH=$(pwd) jupyter notebook
-```
-
-3. In the Jupyter interface, navigate to `evaluation/benchmarks/swe_bench/scripts/swegym/convert_data.ipynb`
-
-4. Update the file paths in the notebook:
-   - Set `FILE_PATHS` to point to your `output.with_completions.jsonl.gz` files
-   - Set `YOUR_OUTPUT_FOLDER` to your desired output directory
-
-5. Run the notebook cells sequentially to process your data and generate the SFT training format.
-
---
-# More info about SWE-Gym
-
-Progress in agents for software engineering has been limited by the lack of training environments that both include rigorous verification for reinforcement learning and cover the expansive tasks encountered in real-world repository-level engineering.
-
-We introduce SWE-Gym: An Open Environment for Training Software Engineering Agents & Verifiers.
-Our baselines achieve new open SOTA - 32%/26% on SWE-Bench Verified/Lite, with promising scaling trends.
-
-![SWE-Gym Scaling](https://github.com/SWE-Gym/SWE-Gym/blob/main/assets/images/scaling.jpg?raw=true)
-*SWE-Gym enables scalable improvements for software engineering agents at both training and inference time. Our current results is primarily bottlenecked by training and inference compute, rather than the size of our environment.*
-
-## SWE-Gym Environment
-
-We create SWE-Gym, the first environment for training SWE agents, with **2.4K real tasks from 11 Python repos** & a Lite split of 234 instances. SWE-Gym combines real-world Python tasks, repository context, executable environments, and test verification to train agents for solving software engineering problems.
-
-![SWE-Gym Repo Distribution](https://github.com/SWE-Gym/SWE-Gym/blob/main/assets/images/swe-gym.jpg?raw=true)
-
-
-## SWE-Gym trains LMs as agents
-
-When fine-tuned on less than 500 agent-environment interaction trajectories sampled from it from GPT-4o and Claude 3.5 Sonnet, we achieve **+14%** absolute gains on SWE-Bench Verified with an 32B LM-powered OpenHands agent.
-
-![OpenHands Performance diff before and after training](https://github.com/SWE-Gym/SWE-Gym/blob/main/assets/images/oh-agent.jpg?raw=true)
-
-
-## SWE-Gym enables self-improvement
-
-SWE-Gym is also effective across agent scaffolds. With rejection sampling fine-tuning and MoatlessTools scaffold, our 32B and 7B models achieve 20% and 10% respectively on SWE-Bench Lite through self-improvement.
-
-<p align="center">
-  <img src="https://github.com/SWE-Gym/SWE-Gym/blob/main/assets/images/ml-agent.jpg?raw=true" width="80%" alt="Moatless self-improvement">
-</p>
-
-
-
-## SWE-Gym enables inference-time scaling
-
-SWE-Gym enables inference-time scaling through verifiers trained on agent trajectories.
-These verifiers identify most promising solutions via best-of-n selection, together with our learned agents, they achieve 32%/26% on SWE-Bench Verified/Lite, a new open SoTA.
-
-
-![Inference Time Scaling for Moatless Agent](https://github.com/SWE-Gym/SWE-Gym/blob/main/assets/images/inference-ml.jpg?raw=true)
-*Inference Time Scaling for Moatless Agent*
-
-![Inference Time Scaling for OpenHands Agent](https://github.com/SWE-Gym/SWE-Gym/blob/main/assets/images/inference-oh.jpg?raw=true)
-*Inference Time Scaling for OpenHands Agent*
-
-
-## Our baselines on SWE-Gym shows strong scaling trends
-
-Lastly, our ablations reveal strong scaling trends - performance is now bottlenecked by train and inference compute, rather than the size of our dataset. Pushing and improving these scaling trends further is an exciting direction for future work.
-
-![](https://github.com/SWE-Gym/SWE-Gym/blob/main/assets/images/scaling.jpg?raw=true)
-
-## Reproducing Results
-**The Dataset**
-
-To access SWE-Gym dataset, checkout our huggingface hub page [SWE-Gym](https://huggingface.co/SWE-Gym)
-
-The environment constants are currently saved at [SWE-Bench-Fork](https://github.com/SWE-Gym/SWE-Bench-Fork)
-
-We also have pre-built docker images for each instance under [xingyaoww/sweb.eval.x86_64](https://hub.docker.com/search?q=xingyaoww%2Fsweb.eval.x86_64.) prefix at docker hub.
-
-
-## 📚 Citation
-
-```bibtex
-@misc{pan2024trainingsoftwareengineeringagents,
-      title={Training Software Engineering Agents and Verifiers with SWE-Gym},
-      author={Jiayi Pan and Xingyao Wang and Graham Neubig and Navdeep Jaitly and Heng Ji and Alane Suhr and Yizhe Zhang},
-      year={2024},
-      eprint={2412.21139},
-      archivePrefix={arXiv},
-      primaryClass={cs.SE},
-      url={https://arxiv.org/abs/2412.21139},
-}
-```
--- a/evaluation/benchmarks/multi_swe_bench/run_infer.py
+++ b/evaluation/benchmarks/multi_swe_bench/run_infer.py
@@ -51,8 +51,8 @@ RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'tru

 # TODO: migrate all swe-bench docker to ghcr.io/openhands
 # TODO: 适应所有的语言
-DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'mswebench')
-LANGUAGE = os.environ.get('LANGUAGE', 'java')
+DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', '')
+LANGUAGE = os.environ.get('LANGUAGE', 'python')
 logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')


@@ -305,19 +305,31 @@ def get_instance_docker_image(instance: pd.Series):
        instance_id = instance.get('instance_id', '')
        tag_suffix = instance_id.split('-')[-1] if instance_id else ''
        container_tag = f'pr-{tag_suffix}'
-        return f'{DOCKER_IMAGE_PREFIX}/{container_name}:{container_tag}'
+        # pdb.set_trace()
+        return f'mswebench/{container_name}:{container_tag}'
+        # return "kong/insomnia:pr-8284"
+        # return "'sweb.eval.x86_64.local_insomnia"
+        # return "local_insomnia_why"
+        # return "local/kong-insomnia:pr-8117"


 def get_config(
    instance: pd.Series,
    metadata: EvalMetadata,
 ) -> OpenHandsConfig:
-    base_container_image = get_instance_docker_image(instance)
-    logger.info(
-        f'Using instance container image: {base_container_image}. '
-        f'Please make sure this image exists. '
-        f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
-    )
+    SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.2.1'
+    if USE_INSTANCE_IMAGE:
+        # We use a different instance image for each instance of swe-bench eval
+        # base_container_image = get_instance_docker_image(instance['instance_id'])
+        base_container_image = get_instance_docker_image(instance)
+        logger.info(
+            f'Using instance container image: {base_container_image}. '
+            f'Please make sure this image exists. '
+            f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
+        )
+    else:
+        base_container_image = SWE_BENCH_CONTAINER_IMAGE
+        logger.info(f'Using swe-bench container image: {base_container_image}')

    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = base_container_image
@@ -760,6 +772,7 @@ if __name__ == '__main__':
    parser.add_argument(
        '--dataset',
        type=str,
+        default='princeton-nlp/SWE-bench',
        help='data set to evaluate on, either full-test or lite-test',
    )
    parser.add_argument(
@@ -774,7 +787,6 @@ if __name__ == '__main__':
    # so we don't need to manage file uploading to OpenHands's repo
    # dataset = load_dataset(args.dataset, split=args.split)
    # dataset = load_dataset(args.dataset)
-    logger.info(f'Loading dataset {args.dataset} with split {args.split} ')
    dataset = load_dataset('json', data_files=args.dataset)
    dataset = dataset[args.split]
    swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
@@ -827,7 +839,7 @@ if __name__ == '__main__':
        args.eval_num_workers,
        process_instance,
        timeout_seconds=120 * 60,  # 2 hour PER instance should be more than enough
-        max_retries=3,
+        max_retries=5,
    )
    # Check if any instances reached maximum retries
    check_maximum_retries_exceeded(metadata.eval_output_dir)
--- a/evaluation/benchmarks/multi_swe_bench/scripts/data/data_change.py
+++ b/evaluation/benchmarks/multi_swe_bench/scripts/data/data_change.py
@@ -1,54 +1,37 @@
-import argparse
 import json

+input_file = 'XXX.jsonl'
+output_file = 'YYY.jsonl'

-def main(input_file, output_file):
-    with (
-        open(input_file, 'r', encoding='utf-8') as fin,
-        open(output_file, 'w', encoding='utf-8') as fout,
-    ):
-        for line in fin:
-            line = line.strip()
-            if not line:
-                continue
+with (
+    open(input_file, 'r', encoding='utf-8') as fin,
+    open(output_file, 'w', encoding='utf-8') as fout,
+):
+    for line in fin:
+        line = line.strip()
+        if not line:
+            continue

-            data = json.loads(line)
-            item = data
+        data = json.loads(line)
+        item = data

-            # Skip instances that don't have resolved_issues or have empty resolved_issues
-            if not item.get('resolved_issues') or len(item['resolved_issues']) == 0:
-                print(
-                    f'Skipping instance {item.get("org", "")}/{item.get("repo", "")}-{item.get("number", "")} - no resolved_issues'
-                )
-                continue
+        # Extract the original data
+        org = item.get('org', '')
+        repo = item.get('repo', '')
+        number = str(item.get('number', ''))

-            # 提取原始数据
-            org = item.get('org', '')
-            repo = item.get('repo', '')
-            number = str(item.get('number', ''))
+        new_item = {}
+        new_item['repo'] = f'{org}/{repo}'
+        new_item['instance_id'] = f'{org}__{repo}-{number}'
+        new_item['problem_statement'] = (
+            item['resolved_issues'][0].get('title', '')
+            + '\n'
+            + item['resolved_issues'][0].get('body', '')
+        )
+        new_item['FAIL_TO_PASS'] = []
+        new_item['PASS_TO_PASS'] = []
+        new_item['base_commit'] = item['base'].get('sha', '')
+        new_item['version'] = '0.1'  # depends

-            new_item = {}
-            new_item['repo'] = f'{org}/{repo}'
-            new_item['instance_id'] = f'{org}__{repo}-{number}'
-
-            # Get the first resolved issue
-            resolved_issue = item['resolved_issues'][0]
-            title = resolved_issue.get('title') or ''
-            body = resolved_issue.get('body') or ''
-
-            new_item['problem_statement'] = title + '\n' + body
-            new_item['FAIL_TO_PASS'] = []
-            new_item['PASS_TO_PASS'] = []
-            new_item['base_commit'] = item['base'].get('sha', '')
-            new_item['version'] = '0.1'  # depends
-
-            output_data = new_item
-            fout.write(json.dumps(output_data, ensure_ascii=False) + '\n')
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--input', required=True, help='Input .jsonl file path')
-    parser.add_argument('--output', required=True, help='Output .jsonl file path')
-    args = parser.parse_args()
-    main(args.input, args.output)
+        output_data = new_item
+        fout.write(json.dumps(output_data, ensure_ascii=False) + '\n')
--- a/evaluation/benchmarks/multi_swe_bench/scripts/eval/combine_final_completions.py
+++ b/evaluation/benchmarks/multi_swe_bench/scripts/eval/combine_final_completions.py
@@ -1,69 +0,0 @@
-import argparse
-import gzip
-import json
-import os
-from glob import glob
-
-from tqdm import tqdm
-
-tqdm.pandas()
-
-
-# Load trajectories for resolved instances
-def load_completions(output_dir: str, instance_id: str):
-    glob_path = os.path.join(output_dir, 'llm_completions', instance_id, '*.json')
-    files = sorted(glob(glob_path))  # this is ascending order
-    # pick the last file (last turn)
-    try:
-        file_path = files[-1]
-    except IndexError:
-        # print(f'No files found for instance {instance_id}: files={files}')
-        return None
-    with open(file_path, 'r') as f:
-        result = json.load(f)
-    # create messages
-    messages = result['messages']
-    messages.append(result['response']['choices'][0]['message'])
-    tools = result['kwargs'].get('tools', [])
-    return {
-        'messages': messages,
-        'tools': tools,
-    }
-
-
-parser = argparse.ArgumentParser()
-parser.add_argument('jsonl_path', type=str)
-args = parser.parse_args()
-
-output_dir = os.path.dirname(args.jsonl_path)
-output_path = os.path.join(output_dir, 'output.with_completions.jsonl.gz')
-
-# Check if output would be different from input
-needs_update = False
-with open(args.jsonl_path, 'r') as f_in:
-    for line in tqdm(f_in, desc='Checking for changes'):
-        data = json.loads(line)
-        new_completions = load_completions(output_dir, data['instance_id'])
-        current_completions = data.get('raw_completions')
-        if current_completions != new_completions:
-            needs_update = True
-            break
-
-if not needs_update:
-    print('No updates required. Skipping file update.')
-    exit(0)
-
-if os.path.exists(output_path):
-    print(f'Output file already exists at {output_path}, overwriting? (y/n)')
-    if input() != 'y':
-        print('Exiting...')
-        exit(0)
-
-# Process line by line
-with open(args.jsonl_path, 'r') as f_in, gzip.open(output_path, 'wt') as f_out:
-    for line in tqdm(f_in):
-        data = json.loads(line)
-        data['raw_completions'] = load_completions(output_dir, data['instance_id'])
-        f_out.write(json.dumps(data) + '\n')
-
-print(f'Saved compressed output to {output_path}')
--- a/evaluation/benchmarks/multi_swe_bench/scripts/eval/convert.py
+++ b/evaluation/benchmarks/multi_swe_bench/scripts/eval/convert.py
@@ -1,11 +1,13 @@
-import argparse
 import json
 import re

+IN_FILE = 'output.jsonl'
+OUT_FILE = 'patch.jsonl'

-def main(input_file, output_file):
-    with open(input_file, 'r') as fin:
-        with open(output_file, 'w') as fout:
+
+def main():
+    with open(IN_FILE, 'r') as fin:
+        with open(OUT_FILE, 'w') as fout:
            for line in fin:
                data = json.loads(line)
                groups = re.match(r'(.*)__(.*)-(.*)', data['instance_id'])
@@ -13,14 +15,10 @@ def main(input_file, output_file):
                    'org': groups.group(1),
                    'repo': groups.group(2),
                    'number': groups.group(3),
-                    'fix_patch': data.get('test_result', {}).get('git_patch', '') or '',
+                    'fix_patch': data['test_result']['git_patch'],
                }
                fout.write(json.dumps(patch) + '\n')


 if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--input', required=True, help='Input .jsonl file path')
-    parser.add_argument('--output', required=True, help='Output .jsonl file path')
-    args = parser.parse_args()
-    main(args.input, args.output)
+    main()
--- a/evaluation/benchmarks/multi_swe_bench/scripts/eval/update_multi_swe_bench_config.py
+++ b/evaluation/benchmarks/multi_swe_bench/scripts/eval/update_multi_swe_bench_config.py
@@ -1,70 +0,0 @@
-import argparse
-import json
-import os
-import subprocess
-
-
-def update_multi_swe_config(output_jsonl_path, config_path, dataset):
-    path_to_parent = os.path.dirname(os.path.abspath(output_jsonl_path))
-    converted_path = os.path.join(path_to_parent, 'output_converted.jsonl')
-
-    # Run the conversion script
-    subprocess.run(
-        [
-            'python3',
-            './evaluation/benchmarks/multi_swe_bench/scripts/eval/convert.py',
-            '--input',
-            output_jsonl_path,
-            '--output',
-            converted_path,
-        ],
-        check=True,
-    )
-
-    # Create required directories
-    os.makedirs(os.path.join(path_to_parent, 'eval_files', 'dataset'), exist_ok=True)
-    os.makedirs(os.path.join(path_to_parent, 'eval_files', 'workdir'), exist_ok=True)
-    os.makedirs(os.path.join(path_to_parent, 'eval_files', 'repos'), exist_ok=True)
-    os.makedirs(os.path.join(path_to_parent, 'eval_files', 'logs'), exist_ok=True)
-
-    # Prepare config dict
-    config = {
-        'mode': 'evaluation',
-        'workdir': os.path.join(path_to_parent, 'eval_files', 'workdir'),
-        'patch_files': [converted_path],
-        'dataset_files': [dataset],
-        'force_build': True,
-        'output_dir': os.path.join(path_to_parent, 'eval_files', 'dataset'),
-        'specifics': [],
-        'skips': [],
-        'repo_dir': os.path.join(path_to_parent, 'eval_files', 'repos'),
-        'need_clone': True,
-        'global_env': [],
-        'clear_env': True,
-        'stop_on_error': False,
-        'max_workers': 5,
-        'max_workers_build_image': 5,
-        'max_workers_run_instance': 5,
-        'log_dir': os.path.join(path_to_parent, 'eval_files', 'logs'),
-        'log_level': 'DEBUG',
-        'fix_patch_run_cmd': (
-            'bash -c "apt update ; apt install -y patch ; '
-            "sed -i 's@git apply.*@patch --batch --fuzz=5 -p1 -i /home/test.patch;"
-            'patch --batch --fuzz=5 -p1 -i /home/fix.patch@g\' /home/fix-run.sh ; chmod +x /home/*.sh  ; /home/fix-run.sh"'
-        ),
-    }
-
-    # Save to multibench.config
-    os.makedirs(os.path.dirname(config_path), exist_ok=True)
-    with open(config_path, 'w') as f:
-        json.dump(config, f, indent=4)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--input', required=True, help='Path to input file')
-    parser.add_argument('--output', required=True, help='Path to create config')
-    parser.add_argument('--dataset', required=True, help='Path to dataset')
-    args = parser.parse_args()
-
-    update_multi_swe_config(args.input, args.output, args.dataset)
--- a/evaluation/benchmarks/multi_swe_bench/scripts/eval/update_output_with_eval.py
+++ b/evaluation/benchmarks/multi_swe_bench/scripts/eval/update_output_with_eval.py
@@ -1,176 +0,0 @@
-import argparse
-import json
-import os
-from collections import defaultdict
-
-from tqdm import tqdm
-
-parser = argparse.ArgumentParser()
-parser.add_argument('input_file', type=str)
-parser.add_argument(
-    '--force',
-    action='store_true',
-    help='Force update all reports even if no changes are detected',
-)
-parser.add_argument(
-    '--overwrite-backup',
-    action='store_true',
-    help='Automatically overwrite existing backup files without prompting',
-)
-args = parser.parse_args()
-
-dirname = os.path.dirname(args.input_file)
-
-# Initialize counters and data structures
-instance_id_to_status = defaultdict(
-    lambda: {
-        'empty_generation': False,
-        'resolved': False,
-        'failed_apply_patch': False,
-        'error_eval': False,
-        'test_timeout': False,
-    }
-)
-
-# Process official report if it exists
-swebench_official_report_json = os.path.join(
-    dirname, 'eval_files/dataset/final_report.json'
-)
-openhands_remote_report_jsonl = args.input_file.replace(
-    '.jsonl', '.swebench_eval.jsonl'
-)
-
-if os.path.exists(swebench_official_report_json):
-    output_md_filepath = os.path.join(dirname, 'README.md')
-    with open(swebench_official_report_json, 'r') as f:
-        report = json.load(f)
-
-    # Convert instance IDs from "repo/name:pr-123" format to "repo__name-123" format
-    def convert_instance_id(instance_id):
-        """Convert instance ID from slash/colon-pr format to double underscore/dash format."""
-        if '/' in instance_id and ':pr-' in instance_id:
-            # Split on '/' and ':pr-'
-            parts = instance_id.split('/')
-            if len(parts) == 2:
-                repo_part = parts[0]
-                name_and_pr = parts[1]
-                if ':pr-' in name_and_pr:
-                    name, pr_number = name_and_pr.split(':pr-')
-                    return f'{repo_part}__{name}-{pr_number}'
-        return instance_id
-
-    # Convert all instance ID lists in the report
-    for key in [
-        'resolved_ids',
-        'unresolved_ids',
-        'error_ids',
-        'empty_patch_ids',
-        'incomplete_ids',
-    ]:
-        if key in report:
-            report[key] = [
-                convert_instance_id(instance_id) for instance_id in report[key]
-            ]
-
-    output_md = (
-        '# Multi-SWE-bench Report\n'
-        'This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n'
-        '## Summary\n'
-        f'- total instances: {report["total_instances"]}\n'
-        f'- submitted instances: {report["submitted_instances"]}\n'
-        f'- completed instances: {report["completed_instances"]}\n'
-        f'- empty patch instances: {report["empty_patch_instances"]}\n'
-        f'- resolved instances: {report["resolved_instances"]}\n'
-        f'- unresolved instances: {report["unresolved_instances"]}\n'
-        f'- error instances: {report["error_instances"]}\n'
-    )
-
-    output_md += '\n## Resolved Instances\n'
-    # instance_id to status
-    for instance_id in report['resolved_ids']:
-        instance_id_to_status[instance_id]['resolved'] = True
-        output_md += (
-            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
-        )
-
-    output_md += '\n## Unresolved Instances\n'
-    for instance_id in report['unresolved_ids']:
-        output_md += (
-            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
-        )
-
-    output_md += '\n## Error Instances\n'
-    for instance_id in report['error_ids']:
-        instance_id_to_status[instance_id]['error_eval'] = True
-        output_md += (
-            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
-        )
-
-    output_md += '\n## Empty Patch Instances\n'
-    for instance_id in report['empty_patch_ids']:
-        instance_id_to_status[instance_id]['empty_generation'] = True
-        output_md += (
-            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
-        )
-
-    output_md += '\n## Incomplete Instances\n'
-    for instance_id in report['incomplete_ids']:
-        output_md += (
-            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
-        )
-
-    with open(output_md_filepath, 'w') as f:
-        f.write(output_md)
-
-else:
-    print(
-        f'No report file found: Both {swebench_official_report_json} and {openhands_remote_report_jsonl} do not exist.'
-    )
-    exit()
-
-# Before backup and update, check if any changes would be made (unless --force is used)
-if not args.force:
-    needs_update = False
-    with open(args.input_file, 'r') as infile:
-        for line in tqdm(infile, desc='Checking for changes'):
-            data = json.loads(line)
-            instance_id = data['instance_id']
-            current_report = data.get('report', {})
-            new_report = instance_id_to_status[
-                instance_id
-            ]  # if no report, it's not resolved
-            if current_report != new_report:
-                needs_update = True
-                break
-
-    if not needs_update:
-        print('No updates detected. Skipping file update.')
-        exit()
-else:
-    print('Force flag enabled. Updating all reports regardless of changes.')
-
-# Backup and update the original file row by row
-if os.path.exists(args.input_file + '.bak'):
-    if args.overwrite_backup:
-        print(
-            'Existing backup file found. Overwriting automatically due to --overwrite-backup flag.'
-        )
-        os.remove(args.input_file + '.bak')
-    else:
-        conf = input('Existing backup file found. Do you want to overwrite it? (y/n)')
-        if conf != 'y':
-            exit()
-        os.remove(args.input_file + '.bak')
-
-os.rename(args.input_file, args.input_file + '.bak')
-
-# Process and write file row by row
-with (
-    open(args.input_file + '.bak', 'r') as infile,
-    open(args.input_file, 'w') as outfile,
-):
-    for line in tqdm(infile, desc='Updating output file'):
-        data = json.loads(line)
-        instance_id = data['instance_id']
-        data['report'] = instance_id_to_status[instance_id]
-        outfile.write(json.dumps(data) + '\n')
--- a/evaluation/benchmarks/multi_swe_bench/scripts/rollout_multi_swegym.sh
+++ b/evaluation/benchmarks/multi_swe_bench/scripts/rollout_multi_swegym.sh
@@ -1,146 +0,0 @@
-#!/bin/bash
-
-# NOTE: this script is for rolling out the Multi-SWE-Gym dataset for **TRAINING**
-# For more information, please refer to
-# 1. the Github Repo: https://github.com/SWE-Gym/SWE-Gym
-# 2. the paper: https://arxiv.org/abs/2412.21139
-
-MODEL=$1  # eg your llm config name in config.toml (eg: "llm.claude-3-5-sonnet-20241022-t05")
-EXP_NAME=$2 # "train-t05"
-EVAL_DATASET=$3  # path to original dataset (jsonl file)
-N_WORKERS=${4:-64}
-N_RUNS=${5:-1}
-
-export EXP_NAME=$EXP_NAME
-# use 2x resources for rollout since some codebases are pretty resource-intensive
-export DEFAULT_RUNTIME_RESOURCE_FACTOR=2
-echo "MODEL: $MODEL"
-echo "EXP_NAME: $EXP_NAME"
-echo "EVAL_DATASET: $EVAL_DATASET"
-# Generate DATASET path by adding _with_runtime_ before .jsonl extension
-DATASET="${EVAL_DATASET%.jsonl}_with_runtime_.jsonl"  # path to converted dataset
-
-# Create the converted dataset file
-echo "Creating converted dataset at: $DATASET"
-poetry run python ./evaluation/benchmarks/multi_swe_bench/scripts/data/data_change.py --input "$EVAL_DATASET" --output "$DATASET"
-
-SPLIT="train"
-export LANGUAGE=java
-
-if [ -z "$ALLHANDS_API_KEY" ] || [ "$RUNTIME" != "remote" ]; then
-    echo "ALLHANDS_API_KEY is not set or RUNTIME is not set to remote. Will rollout and evaluate locally using Docker. WARNING: A large value of N_WORKERS will result in a large number of Docker containers being spun up and may crash your machine."
-    export RUNTIME=docker
-else
-    echo "ALLHANDS_API_KEY is set and RUNTIME is set to remote. Continuing rollout and evaluation with remote runtime..."
-    export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
-fi
-
-#EVAL_LIMIT=3000
-MAX_ITER=100
-
-
-# ===== Run inference =====
-source "evaluation/utils/version_control.sh"
-get_openhands_version
-
-echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
-echo "MODEL_CONFIG: $MODEL_CONFIG"
-echo "DATASET: $DATASET"
-echo "EVAL_DOCKER_IMAGE_PREFIX: $EVAL_DOCKER_IMAGE_PREFIX"
-
-# Default to NOT use Hint
-export USE_INSTANCE_IMAGE=true
-export USE_HINT_TEXT=false
-export RUN_WITH_BROWSING=false
-echo "USE_HINT_TEXT: $USE_HINT_TEXT"
-EVAL_NOTE="$OPENHANDS_VERSION-no-hint-$EXP_NAME"
-
-function run_eval() {
-  local eval_note=$1
-  export LANGUAGE=java
-  echo "About to run command"
-  COMMAND="EVAL_DOCKER_IMAGE_PREFIX=$EVAL_DOCKER_IMAGE_PREFIX; LANGUAGE=java;
-    poetry run python evaluation/benchmarks/multi_swe_bench/run_infer.py \
-    --agent-cls CodeActAgent \
-    --llm-config $MODEL \
-    --max-iterations $MAX_ITER \
-    --eval-num-workers $N_WORKERS \
-    --eval-note $eval_note \
-    --dataset $DATASET \
-    --split $SPLIT"
-
-  echo "Running command: $COMMAND"
-  if [ -n "$EVAL_LIMIT" ]; then
-    echo "EVAL_LIMIT: $EVAL_LIMIT"
-    COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
-  fi
-
-  # Run the command
-  eval $COMMAND
-}
-
-for run_idx in $(seq 1 $N_RUNS); do
-
-    while true; do
-        echo "### Running inference... ###"
-        unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
-        current_eval_note="$EVAL_NOTE-run_$run_idx"
-        echo "EVAL_NOTE: $current_eval_note"
-        echo "DATASET command: $DATASET"
-        #INFER_OUTPUT=$(run_eval $current_eval_note)
-        INFER_OUTPUT=$(run_eval $current_eval_note | tee /dev/stderr)
-        INFER_STATUS=$?  # Capture the exit status of run_infer.sh
-        echo "INFER_STATUS: $INFER_STATUS"
-
-        echo "### Cleaning up remote runtime... ###"
-        ./evaluation/utils/scripts/cleanup_remote_runtime.sh
-
-        if [ $INFER_STATUS -eq 0 ]; then
-            echo "### Inference completed successfully. ###"
-            break
-        else
-            echo "### Inference failed with exit code $INFER_STATUS. Retrying... ###"
-        fi
-    done
-
-    # Extract the output directory using the special delimiters
-    OUTPUT_FILE=$(echo "$INFER_OUTPUT" | grep -o '### OUTPUT FILE:.* ###' | sed 's/### OUTPUT FILE: \(.*\) ###/\1/')
-    echo "Got OUTPUT_FILE: $OUTPUT_FILE"
-
-    while true; do
-        echo "### Evaluating on $OUTPUT_FILE ... ###"
-        OUTPUT_CONFIG_FILE="${OUTPUT_FILE%.jsonl}_config.json"
-        export EVAL_SKIP_BUILD_ERRORS=true
-        pip install multi-swe-bench --quiet --disable-pip-version-check > /dev/null 2>&1
-        COMMAND="poetry run python ./evaluation/benchmarks/multi_swe_bench/scripts/eval/update_multi_swe_bench_config.py --input $OUTPUT_FILE --output $OUTPUT_CONFIG_FILE --dataset $EVAL_DATASET;
-        python -m multi_swe_bench.harness.run_evaluation --config $OUTPUT_CONFIG_FILE
-        "
-
-        if [ -n "$EVAL_LIMIT" ]; then
-        echo "EVAL_LIMIT: $EVAL_LIMIT"
-        COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
-        fi
-        echo "Running command: $COMMAND"
-        # Run the command
-        eval $COMMAND
-        EVAL_STATUS=$?
-        if [ $EVAL_STATUS -eq 0 ]; then
-            echo "### Evaluation completed successfully. ###"
-            break
-        else
-            echo "### Evaluation failed with exit code $EVAL_STATUS. Retrying... ###"
-        fi
-
-        ./evaluation/utils/scripts/cleanup_remote_runtime.sh
-    done
-
-    # update the output with evaluation results
-    echo "### Updating the output with evaluation results... ###"
-    poetry run python evaluation/benchmarks/multi_swe_bench/scripts/eval/update_output_with_eval.py $OUTPUT_FILE
-
-    echo "### Combining the final completions... ###"
-    poetry run python evaluation/benchmarks/multi_swe_bench/scripts/eval/combine_final_completions.py $OUTPUT_FILE
-
-    echo "### DONE for run $run_idx! ###"
-    echo "You can find the final output at $(dirname $OUTPUT_FILE)/$FINAL_OUTPUT_FILE"
-done
--- a/evaluation/benchmarks/multi_swe_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/multi_swe_bench/scripts/run_infer.sh
@@ -47,8 +47,8 @@ if [ -z "$DATASET" ]; then
 fi

 if [ -z "$LANGUAGE" ]; then
-  echo "LANGUAGE not specified, use default python"
-  LANGUAGE="java"
+  echo "LANGUAGE not specified, use default python"
+  LANGUAGE="python"
 fi

 if [ -z "$SPLIT" ]; then
@@ -69,10 +69,10 @@ fi

 if [ -z "$EVAL_DOCKER_IMAGE_PREFIX" ]; then
  if [ "$LANGUAGE" = "python" ]; then
-  echo "EVAL_DOCKER_IMAGE_PREFIX is docker.io/xingyaoww/ as default as LANGUAGE is python"
+  echo "EVAL_DOCKER_IMAGE_PREFIX is docker.io/xingyaoww/ as default as LANGUAGE is python"
    EVAL_DOCKER_IMAGE_PREFIX="docker.io/xingyaoww/"
  elif [ "$LANGUAGE" = "java" ]; then
-  echo "EVAL_DOCKER_IMAGE_PREFIX is empty as LANGUAGE is java"
+  echo "EVAL_DOCKER_IMAGE_PREFIX is empty as LANGUAGE is java"
    EVAL_DOCKER_IMAGE_PREFIX=""
  fi
 fi
--- a/evaluation/benchmarks/multi_swe_bench/scripts/swegym/convert_data.ipynb
+++ b/evaluation/benchmarks/multi_swe_bench/scripts/swegym/convert_data.ipynb
@@ -1,344 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "\n",
-    "import pandas as pd\n",
-    "from tqdm import tqdm\n",
-    "\n",
-    "tqdm.pandas()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# 1. Load raw data and convert to training data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import gzip\n",
-    "import json\n",
-    "\n",
-    "from tqdm import tqdm\n",
-    "\n",
-    "FILE_PATHS = [\n",
-    "    'YOURPATH-no-hint-train-t05-run_1/output.with_completions.jsonl.gz',\n",
-    "    'YOURPATH-no-hint-train-t05-run_2/output.with_completions.jsonl.gz',\n",
-    "]\n",
-    "\n",
-    "# More memory efficient for large files\n",
-    "# Initialize lists to store the data\n",
-    "data = []\n",
-    "\n",
-    "\n",
-    "# Read file line by line\n",
-    "for FILE_PATH in FILE_PATHS:\n",
-    "    with gzip.open(FILE_PATH, 'rb') as f:  # Use 'rb' for gzipped files\n",
-    "        for i, line in tqdm(\n",
-    "            enumerate(f), desc=f'Processing {FILE_PATH.split(\"/\")[-1]}'\n",
-    "        ):\n",
-    "            # Parse only the fields we need\n",
-    "            raw_data = json.loads(line)\n",
-    "            data.append(\n",
-    "                {\n",
-    "                    'resolved': raw_data['report']['resolved'],\n",
-    "                    'messages': raw_data['raw_completions']['messages']\n",
-    "                    if raw_data['raw_completions'] is not None\n",
-    "                    else None,\n",
-    "                    'git_patch': raw_data['test_result'].get('git_patch', ''),\n",
-    "                    'tools': raw_data['raw_completions']['tools']\n",
-    "                    if raw_data['raw_completions'] is not None\n",
-    "                    and 'tools' in raw_data['raw_completions']\n",
-    "                    else None,\n",
-    "                }\n",
-    "            )\n",
-    "\n",
-    "# Convert to DataFrame after collecting all data\n",
-    "df = pd.DataFrame(data)\n",
-    "print(f'#total amount of data={len(df)}')\n",
-    "df = df[~df['messages'].isna()]\n",
-    "print(f'#total amount of data after removing nan={len(df)}')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Filter"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def _contains_multiple_tool_calls(messages: list[dict]) -> bool:\n",
-    "    return any(\n",
-    "        message.get('tool_calls') and len(message['tool_calls']) > 1\n",
-    "        for message in messages\n",
-    "    )\n",
-    "\n",
-    "\n",
-    "df['contains_multiple_tool_calls'] = df['messages'].apply(_contains_multiple_tool_calls)\n",
-    "display(df.groupby(['contains_multiple_tool_calls'])['resolved'].sum())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "import copy\n",
-    "\n",
-    "# Convert function calling messages to non-function calling messages\n",
-    "from openhands.llm.fn_call_converter import (\n",
-    "    FunctionCallConversionError,\n",
-    "    convert_fncall_messages_to_non_fncall_messages,\n",
-    "    convert_from_multiple_tool_calls_to_single_tool_call_messages,\n",
-    ")\n",
-    "\n",
-    "total_failed = 0\n",
-    "\n",
-    "\n",
-    "def _convert_messages(messages: list[dict], tools: list[dict]) -> list[dict]:\n",
-    "    global total_failed\n",
-    "    message_copy = copy.deepcopy(messages)\n",
-    "    for message in message_copy:\n",
-    "        if message['content'] is None:\n",
-    "            message['content'] = ''\n",
-    "    try:\n",
-    "        return convert_fncall_messages_to_non_fncall_messages(\n",
-    "            message_copy, tools, add_in_context_learning_example=False\n",
-    "        )\n",
-    "    except FunctionCallConversionError:\n",
-    "        total_failed += 1\n",
-    "        # print(f'Failed to convert messages: {messages}\\nTools: {tools}')\n",
-    "        # traceback.print_exc()\n",
-    "        return None\n",
-    "\n",
-    "\n",
-    "df['converted_messages'] = df.apply(\n",
-    "    lambda row: convert_from_multiple_tool_calls_to_single_tool_call_messages(\n",
-    "        row['messages'], ignore_final_tool_result=True\n",
-    "    ),\n",
-    "    axis=1,\n",
-    ")\n",
-    "df['nonfncall_messages'] = df.apply(\n",
-    "    lambda row: _convert_messages(row['converted_messages'], row['tools']), axis=1\n",
-    ")\n",
-    "print('total nan', df['nonfncall_messages'].isna().sum())\n",
-    "df = df[~df['nonfncall_messages'].isna()]\n",
-    "print(df['nonfncall_messages'].iloc[0])\n",
-    "\n",
-    "print(f'Total failed: {total_failed}')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Tokenization"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from pandarallel import pandarallel\n",
-    "from transformers import AutoTokenizer\n",
-    "\n",
-    "os.environ['TOKENIZERS_PARALLELISM'] = 'false'\n",
-    "pandarallel.initialize(progress_bar=True, verbose=1, nb_workers=16)\n",
-    "tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-7B-Instruct')\n",
-    "\n",
-    "\n",
-    "def clean_messages(messages):\n",
-    "    clean = []\n",
-    "    for msg in messages:\n",
-    "        if not isinstance(msg, dict):\n",
-    "            continue\n",
-    "        role = msg.get('role')\n",
-    "        content = msg.get('content')\n",
-    "        if isinstance(content, str):\n",
-    "            text = content\n",
-    "        elif isinstance(content, dict):\n",
-    "            text = content.get('text')\n",
-    "        elif (\n",
-    "            isinstance(content, list)\n",
-    "            and len(content) == 1\n",
-    "            and isinstance(content[0], dict)\n",
-    "        ):\n",
-    "            text = content[0].get('text')\n",
-    "        else:\n",
-    "            print(f'Format not accepted {content}')\n",
-    "        clean.append({'role': role, 'content': text})\n",
-    "    return clean\n",
-    "\n",
-    "\n",
-    "# Step 1: Clean the messages\n",
-    "df['nonfncall_messages'] = df['nonfncall_messages'].apply(clean_messages)\n",
-    "\n",
-    "# Step 2: Compute token count\n",
-    "df['n_tokens'] = df['nonfncall_messages'].parallel_apply(\n",
-    "    lambda x: len(tokenizer.apply_chat_template(x))\n",
-    ")\n",
-    "\n",
-    "# print(df['nonfncall_messages'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(f'BEFORE: #total={len(df)}')\n",
-    "df_selected = df[df['n_tokens'] < 131072]\n",
-    "print(f'AFTER(truncated to 128k): #total={len(df_selected)}')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_selected['n_tokens'].describe()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ecdf of n_tokens\n",
-    "import matplotlib.pyplot as plt\n",
-    "import seaborn as sns\n",
-    "\n",
-    "display(df.groupby(['resolved'])['n_tokens'].describe())\n",
-    "sns.ecdfplot(x='n_tokens', data=df, hue='resolved')\n",
-    "plt.show()\n",
-    "\n",
-    "print(f'#total={len(df)}')\n",
-    "df_selected = df[df['n_tokens'] < 131072]\n",
-    "print(f'#selected={len(df_selected)}')\n",
-    "display(df_selected.groupby(['resolved'])['n_tokens'].describe())\n",
-    "sns.ecdfplot(x='n_tokens', data=df_selected, hue='resolved')\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_selected[~df_selected['resolved']]['n_tokens'].describe()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_selected['resolved'].value_counts()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_selected.groupby(['resolved'])['n_tokens'].describe()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Save Resolved Messages for SFT"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Flatten messages and change format to {\"content\": \"\", \"role\": \"\"}\n",
-    "df_selected[df_selected['resolved']][['nonfncall_messages']].rename(\n",
-    "    columns={'nonfncall_messages': 'messages'}\n",
-    ").to_json(\n",
-    "    os.path.join(\n",
-    "        'PATH_TO_FILE',\n",
-    "        f'policy_traj_128k_swegym_{df_selected[\"resolved\"].value_counts()[True]}i.jsonl',\n",
-    "    ),\n",
-    "    lines=True,\n",
-    "    orient='records',\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.11"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
--- a/evaluation/benchmarks/swe_perf/README.md
+++ b/evaluation/benchmarks/swe_perf/README.md
@@ -1,81 +0,0 @@
-# SWE-Perf Evaluation
-
-This folder contains the OpenHands inference generation of the [SWE-Perf benchmark](https://swe-perf.github.io/) ([paper](https://arxiv.org/pdf/2507.12415v1)).
-
-The evaluation consists of three steps:
-
-1. Environment setup: [install python environment](../../README.md#development-environment) and [configure LLM config](../../README.md#configure-openhands-and-your-llm).
-2. [Run inference](#running-inference-locally-with-docker): Generate a edit patch for each Github issue
-3. [Evaluate patches](#evaluate-generated-patches)
-
-## Setup Environment and LLM Configuration
-
-Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
-
-## Running inference Locally with Docker
-
-Make sure your Docker daemon is running, and you have ample disk space (at least 200-500GB, depends on the SWE-PErf set you are running on) for the instance-level docker image.
-
-When the `run_infer.sh` script is started, it will automatically pull the relevant SWE-Perf images.
-For example, for instance ID `scikit-learn_scikit-learn-11674`, it will try to pull our pre-build docker image `betty1202/sweb.eval.x86_64.scikit-learn_s_scikit-learn-11674` from DockerHub.
-This image will be used create an OpenHands runtime image where the agent will operate on.
-
-```bash
-./evaluation/benchmarks/swe_perf/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] [n_runs] [mode]
-
-# Example
-./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 500 100 1 SWE-Perf/SWE-Perf test
-```
-
-where `model_config` is mandatory, and the rest are optional.
-
- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
-LLM settings, as defined in your `config.toml`.
- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version you would
-like to evaluate. It could also be a release tag like `0.6.2`.
- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
-to `CodeActAgent`.
- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By
-default, the script evaluates the entire SWE-Perf test set (140 issues). Note:
-in order to use `eval_limit`, you must also set `agent`.
- `max_iter`, e.g. `20`, is the maximum number of iterations for the agent to run. By
-default, it is set to 100.
- `num_workers`, e.g. `3`, is the number of parallel workers to run the evaluation. By
-default, it is set to 1.
- `dataset`, a huggingface dataset name. e.g. `SWE-Perf/SWE-Perf`, specifies which dataset to evaluate on.
- `dataset_split`, split for the huggingface dataset. e.g., `test`, `dev`. Default to `test`.
-
- `n_runs`, e.g. `3`, is the number of times to run the evaluation. Default is 1.
- `mode`, e.g. `swt`, `swt-ci`, or `swe`, specifies the evaluation mode. Default is `swe`.
-
-> [!CAUTION]
-> Setting `num_workers` larger than 1 is not officially tested, YMMV.
-
-
-Let's say you'd like to run 10 instances using `llm.eval_gpt4_1106_preview` and CodeActAgent,
-
-then your command would be:
-
-```bash
-./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10
-```
-
-## Evaluate Generated Patches
-
-
-To evaluate the generated patch, follow these steps:
-
-### 1. Convert output to the evaluation standard format
-Run the following command:
-```bash
-python -m evaluation.benchmarks.swe_perf.format_conversion \
-    --input_path [input_path] \
-    --output_path [output_path]
-```
-
-* `input_path`: Path to the raw generated patch file.
-* `output_path`: Path where the converted file will be saved.
-
-### 2. Run the SWE-Perf benchmark official evaluation
-
-Once the output is converted, use the [official SWE-Perf benchmark evaluation](https://github.com/SWE-Perf/SWE-Perf/tree/main/evaluation) to evaluate it.
--- a/evaluation/benchmarks/swe_perf/__init__.py
+++ b/evaluation/benchmarks/swe_perf/__init__.py
--- a/evaluation/benchmarks/swe_perf/binary_patch_utils.py
+++ b/evaluation/benchmarks/swe_perf/binary_patch_utils.py
@@ -1,52 +0,0 @@
-"""
-Utilities for handling binary files and patch generation in SWE-Perf evaluation.
-"""
-
-
-def remove_binary_diffs(patch_text):
-    """
-    Remove binary file diffs from a git patch.
-
-    Args:
-        patch_text (str): The git patch text
-
-    Returns:
-        str: The cleaned patch text with binary diffs removed
-    """
-    lines = patch_text.splitlines()
-    cleaned_lines = []
-    block = []
-    is_binary_block = False
-
-    for line in lines:
-        if line.startswith('diff --git '):
-            if block and not is_binary_block:
-                cleaned_lines.extend(block)
-            block = [line]
-            is_binary_block = False
-        elif 'Binary files' in line:
-            is_binary_block = True
-            block.append(line)
-        else:
-            block.append(line)
-
-    if block and not is_binary_block:
-        cleaned_lines.extend(block)
-    return '\n'.join(cleaned_lines)
-
-
-def remove_binary_files_from_git():
-    """
-    Generate a bash command to remove binary files from git staging.
-
-    Returns:
-        str: A bash command that removes binary files from git staging
-    """
-    return """
-    for file in $(git status --porcelain | grep -E "^(M| M|\\?\\?|A| A)" | cut -c4-); do
-        if [ -f "$file" ] && (file "$file" | grep -q "executable" || git check-attr binary "$file" | grep -q "binary: set"); then
-            git rm -f "$file" 2>/dev/null || rm -f "$file"
-            echo "Removed: $file"
-        fi
-    done
-    """.strip()
--- a/evaluation/benchmarks/swe_perf/format_conversion.py
+++ b/evaluation/benchmarks/swe_perf/format_conversion.py
@@ -1,45 +0,0 @@
-import json
-import os
-from argparse import ArgumentParser
-
-parser = ArgumentParser()
-parser.add_argument('--input_path', type=str, help='Name of input path to JSON file.')
-parser.add_argument('--output_path', type=str, help='Name of output path to JSON file.')
-args = parser.parse_args()
-
-input_path = args.input_path
-output_path = args.output_path
-os.makedirs(output_path, exist_ok=True)
-
-
-def load_jsonl(file_path):
-    """Load JSONL file into a list of dictionaries."""
-    data = []
-    with open(file_path, 'r') as f:
-        for line in f:
-            data.append(json.loads(line))
-    return data
-
-
-dataset = load_jsonl(input_path)
-ooutput_dataset = []
-for data in dataset:
-    instance_id = data['instance_id']
-    model_name_or_path = 'openhands'
-    model_patch = (
-        data['test_result']['git_patch']
-        if 'test_result' in data and 'git_patch' in data['test_result']
-        else None
-    )
-    ooutput_dataset.append(
-        {
-            'instance_id': instance_id,
-            'model_name_or_path': model_name_or_path,
-            'model_patch': model_patch,
-        }
-    )
-
-with open(os.path.join(output_path, 'output.jsonl'), 'w') as f:
-    for item in ooutput_dataset:
-        json_line = json.dumps(item, ensure_ascii=False)
-        f.write(json_line + '\n')
--- a/evaluation/benchmarks/swe_perf/resource/mapping.py
+++ b/evaluation/benchmarks/swe_perf/resource/mapping.py
@@ -1,39 +0,0 @@
-"""Mapping instance_id to resource_factor.
-
-Different instances may have different resource requirements.
-e.g., some instances may require more memory/CPU to run inference.
-This file tracks the resource requirements of different instances.
-"""
-
-import json
-import os
-
-from openhands.core.logger import openhands_logger as logger
-
-CUR_DIR = os.path.dirname(os.path.abspath(__file__))
-DEFAULT_RUNTIME_RESOURCE_FACTOR = int(
-    os.environ.get('DEFAULT_RUNTIME_RESOURCE_FACTOR', 1)
-)
-
-# dataset to resource mapping
-_global_resource_mapping: dict[str, dict[str, float]] = {}
-
-
-def get_resource_mapping(dataset_name: str) -> dict[str, float]:
-    if dataset_name not in _global_resource_mapping:
-        file_path = os.path.join(CUR_DIR, f'{dataset_name}.json')
-        if not os.path.exists(file_path):
-            logger.info(f'Resource mapping for {dataset_name} not found.')
-            return None
-
-        with open(file_path, 'r') as f:
-            _global_resource_mapping[dataset_name] = json.load(f)
-        logger.debug(f'Loaded resource mapping for {dataset_name}')
-    return _global_resource_mapping[dataset_name]
-
-
-def get_instance_resource_factor(dataset_name: str, instance_id: str) -> int:
-    resource_mapping = get_resource_mapping(dataset_name)
-    if resource_mapping is None:
-        return DEFAULT_RUNTIME_RESOURCE_FACTOR
-    return int(resource_mapping.get(instance_id, DEFAULT_RUNTIME_RESOURCE_FACTOR))
--- a/evaluation/benchmarks/swe_perf/resource/swt_bench_constants.py
+++ b/evaluation/benchmarks/swe_perf/resource/swt_bench_constants.py
@@ -1,842 +0,0 @@
-# Based on https://github.com/logic-star-ai/swt-bench/blob/master/src/constants.py
-
-# Constants - Installation Specifications
-MAP_VERSION_TO_INSTALL_SKLEARN = {
-    k: {
-        'python': '3.6',
-        'packages': 'numpy scipy cython pytest pandas matplotlib',
-        'install': 'python -m pip install -v --no-use-pep517 --no-build-isolation -e .',
-        'pip_packages': [
-            'cython',
-            'numpy==1.19.2',
-            'setuptools',
-            'scipy==1.5.2',
-        ],
-    }
-    for k in ['0.20', '0.21', '0.22']
-}
-MAP_VERSION_TO_INSTALL_SKLEARN.update(
-    {
-        k: {
-            'python': '3.9',
-            'packages': "'numpy==1.19.2' 'scipy==1.5.2' 'cython==3.0.10' pytest 'pandas<2.0.0' 'matplotlib<3.9.0' setuptools pytest joblib threadpoolctl",
-            'install': 'python -m pip install -v --no-use-pep517 --no-build-isolation -e .',
-            'pip_packages': ['cython', 'setuptools', 'numpy', 'scipy'],
-        }
-        for k in ['1.3', '1.4']
-    }
-)
-MAP_VERSION_TO_INSTALL_FLASK = {
-    '2.0': {
-        'python': '3.9',
-        'packages': 'requirements.txt',
-        'install': 'python -m pip install -e .',
-        'pip_packages': [
-            'setuptools==70.0.0',
-            'Werkzeug==2.3.7',
-            'Jinja2==3.0.1',
-            'itsdangerous==2.1.2',
-            'click==8.0.1',
-            'MarkupSafe==2.1.3',
-        ],
-    },
-    '2.1': {
-        'python': '3.10',
-        'packages': 'requirements.txt',
-        'install': 'python -m pip install -e .',
-        'pip_packages': [
-            'click==8.1.3',
-            'itsdangerous==2.1.2',
-            'Jinja2==3.1.2',
-            'MarkupSafe==2.1.1',
-            'Werkzeug==2.3.7',
-        ],
-    },
-}
-MAP_VERSION_TO_INSTALL_FLASK.update(
-    {
-        k: {
-            'python': '3.11',
-            'packages': 'requirements.txt',
-            'install': 'python -m pip install -e .',
-            'pip_packages': [
-                'click==8.1.3',
-                'itsdangerous==2.1.2',
-                'Jinja2==3.1.2',
-                'MarkupSafe==2.1.1',
-                'Werkzeug==2.3.7',
-            ],
-        }
-        for k in ['2.2', '2.3']
-    }
-)
-MAP_VERSION_TO_INSTALL_DJANGO = {
-    k: {
-        'python': '3.5',
-        'packages': 'requirements.txt',
-        'pre_install': [
-            'apt-get update && apt-get install -y locales',
-            "echo 'en_US UTF-8' > /etc/locale.gen",
-            'locale-gen en_US.UTF-8',
-        ],
-        'install': 'python setup.py install',
-        'pip_packages': ['setuptools'],
-        'eval_commands': [
-            'export LANG=en_US.UTF-8',
-            'export LC_ALL=en_US.UTF-8',
-            'export PYTHONIOENCODING=utf8',
-            'export LANGUAGE=en_US:en',
-        ],
-    }
-    for k in ['1.7', '1.8', '1.9', '1.10', '1.11', '2.0', '2.1', '2.2']
-}
-MAP_VERSION_TO_INSTALL_DJANGO.update(
-    {
-        k: {'python': '3.5', 'install': 'python setup.py install'}
-        for k in ['1.4', '1.5', '1.6']
-    }
-)
-MAP_VERSION_TO_INSTALL_DJANGO.update(
-    {
-        k: {
-            'python': '3.6',
-            'packages': 'requirements.txt',
-            'install': 'python -m pip install -e .',
-            'eval_commands': [
-                "sed -i '/en_US.UTF-8/s/^# //g' /etc/locale.gen && locale-gen",
-                'export LANG=en_US.UTF-8',
-                'export LANGUAGE=en_US:en',
-                'export LC_ALL=en_US.UTF-8',
-            ],
-        }
-        for k in ['3.0', '3.1', '3.2']
-    }
-)
-MAP_VERSION_TO_INSTALL_DJANGO.update(
-    {
-        k: {
-            'python': '3.8',
-            'packages': 'requirements.txt',
-            'install': 'python -m pip install -e .',
-        }
-        for k in ['4.0']
-    }
-)
-MAP_VERSION_TO_INSTALL_DJANGO.update(
-    {
-        k: {
-            'python': '3.9',
-            'packages': 'requirements.txt',
-            'install': 'python -m pip install -e .',
-        }
-        for k in ['4.1', '4.2']
-    }
-)
-MAP_VERSION_TO_INSTALL_DJANGO.update(
-    {
-        k: {
-            'python': '3.11',
-            'packages': 'requirements.txt',
-            'install': 'python -m pip install -e .',
-        }
-        for k in ['5.0']
-    }
-)
-MAP_VERSION_TO_INSTALL_REQUESTS = {
-    k: {'python': '3.9', 'packages': 'pytest', 'install': 'python -m pip install .'}
-    for k in ['0.7', '0.8', '0.9', '0.11', '0.13', '0.14', '1.1', '1.2', '2.0', '2.2']
-    + ['2.3', '2.4', '2.5', '2.7', '2.8', '2.9', '2.10', '2.11', '2.12', '2.17']
-    + ['2.18', '2.19', '2.22', '2.26', '2.25', '2.27', '3.0']
-}
-# mwaskom/seaborn: all versions share Python 3.9 and the same pinned pip
-# packages; only the install command and the pandas pin vary per version.
-# Every version gets a fresh dict and a fresh pip_packages list.
-MAP_VERSION_TO_INSTALL_SEABORN = {
-    version: {
-        'python': '3.9',
-        'install': install_cmd,
-        'pip_packages': [
-            'contourpy==1.1.0',
-            'cycler==0.11.0',
-            'fonttools==4.42.1',
-            'importlib-resources==6.0.1',
-            'kiwisolver==1.4.5',
-            'matplotlib==3.7.2',
-            'numpy==1.25.2',
-            'packaging==23.1',
-            pandas_pin,
-            'pillow==10.0.0',
-            'pyparsing==3.0.9',
-            'pytest',
-            'python-dateutil==2.8.2',
-            'pytz==2023.3.post1',
-            'scipy==1.11.2',
-            'six==1.16.0',
-            'tzdata==2023.1',
-            'zipp==3.16.2',
-        ],
-    }
-    for version, install_cmd, pandas_pin in (
-        ('0.11', 'python -m pip install -e .', 'pandas==1.3.5'),  # 2.0.3
-        ('0.12', 'python -m pip install -e .[dev]', 'pandas==2.0.0'),
-        ('0.13', 'python -m pip install -e .[dev]', 'pandas==2.0.0'),
-    )
-}
-# pytest-dev/pytest: every version uses Python 3.9 and an editable install;
-# the pinned pip packages depend on the version. The table below lists each
-# group of versions next to its pins; every version receives its own list.
-MAP_VERSION_TO_INSTALL_PYTEST = {
-    version: {
-        'python': '3.9',
-        'install': 'python -m pip install -e .',
-        'pip_packages': list(pins),
-    }
-    for versions, pins in (
-        (
-            ('4.4',),
-            (
-                'atomicwrites==1.4.1',
-                'attrs==23.1.0',
-                'more-itertools==10.1.0',
-                'pluggy==0.13.1',
-                'py==1.11.0',
-                'setuptools==68.0.0',
-                'six==1.16.0',
-            ),
-        ),
-        (
-            ('4.5',),
-            (
-                'atomicwrites==1.4.1',
-                'attrs==23.1.0',
-                'more-itertools==10.1.0',
-                'pluggy==0.11.0',
-                'py==1.11.0',
-                'setuptools==68.0.0',
-                'six==1.16.0',
-                'wcwidth==0.2.6',
-            ),
-        ),
-        (
-            ('4.6',),
-            (
-                'atomicwrites==1.4.1',
-                'attrs==23.1.0',
-                'more-itertools==10.1.0',
-                'packaging==23.1',
-                'pluggy==0.13.1',
-                'py==1.11.0',
-                'six==1.16.0',
-                'wcwidth==0.2.6',
-            ),
-        ),
-        (
-            ('5.0', '5.1', '5.2'),
-            (
-                'atomicwrites==1.4.1',
-                'attrs==23.1.0',
-                'more-itertools==10.1.0',
-                'packaging==23.1',
-                'pluggy==0.13.1',
-                'py==1.11.0',
-                'wcwidth==0.2.6',
-            ),
-        ),
-        (
-            ('5.3',),
-            (
-                'attrs==23.1.0',
-                'more-itertools==10.1.0',
-                'packaging==23.1',
-                'pluggy==0.13.1',
-                'py==1.11.0',
-                'wcwidth==0.2.6',
-            ),
-        ),
-        (
-            ('5.4',),
-            (
-                'py==1.11.0',
-                'packaging==23.1',
-                'attrs==23.1.0',
-                'more-itertools==10.1.0',
-                'pluggy==0.13.1',
-            ),
-        ),
-        (
-            ('6.0',),
-            (
-                'attrs==23.1.0',
-                'iniconfig==2.0.0',
-                'more-itertools==10.1.0',
-                'packaging==23.1',
-                'pluggy==0.13.1',
-                'py==1.11.0',
-                'toml==0.10.2',
-            ),
-        ),
-        (
-            ('6.2', '6.3'),
-            (
-                'attrs==23.1.0',
-                'iniconfig==2.0.0',
-                'packaging==23.1',
-                'pluggy==0.13.1',
-                'py==1.11.0',
-                'toml==0.10.2',
-            ),
-        ),
-        (
-            ('7.0',),
-            (
-                'attrs==23.1.0',
-                'iniconfig==2.0.0',
-                'packaging==23.1',
-                'pluggy==0.13.1',
-                'py==1.11.0',
-            ),
-        ),
-        (
-            ('7.1', '7.2'),
-            (
-                'attrs==23.1.0',
-                'iniconfig==2.0.0',
-                'packaging==23.1',
-                'pluggy==0.13.1',
-                'py==1.11.0',
-                'tomli==2.0.1',
-            ),
-        ),
-        (
-            ('7.4', '8.0'),
-            (
-                'iniconfig==2.0.0',
-                'packaging==23.1',
-                'pluggy==1.3.0',
-                'exceptiongroup==1.1.3',
-                'tomli==2.0.1',
-            ),
-        ),
-    )
-    for version in versions
-}
-# matplotlib 3.5-3.7: Python 3.11, 'packages' set to environment.yml, plus
-# pinned pip packages and an apt-get pre-install step.
-MAP_VERSION_TO_INSTALL_MATPLOTLIB = {
-    version: dict(
-        python='3.11',
-        packages='environment.yml',
-        install='python -m pip install -e .',
-        pre_install=[
-            'apt-get -y update && apt-get -y upgrade && apt-get install -y imagemagick ffmpeg texlive texlive-latex-extra texlive-fonts-recommended texlive-xetex texlive-luatex cm-super dvipng'
-        ],
-        pip_packages=[
-            'contourpy==1.1.0',
-            'cycler==0.11.0',
-            'fonttools==4.42.1',
-            'ghostscript',
-            'kiwisolver==1.4.5',
-            'numpy==1.25.2',
-            'packaging==23.1',
-            'pillow==10.0.0',
-            'pikepdf',
-            'pyparsing==3.0.9',
-            'python-dateutil==2.8.2',
-            'six==1.16.0',
-            'setuptools==68.1.2',
-            'setuptools-scm==7.1.0',
-            'typing-extensions==4.7.1',
-        ],
-    )
-    for version in ('3.5', '3.6', '3.7')
-}
-# matplotlib 3.1-3.4: Python 3.8 with 'packages' set to requirements.txt.
-MAP_VERSION_TO_INSTALL_MATPLOTLIB.update(
-    (
-        version,
-        dict(
-            python='3.8',
-            packages='requirements.txt',
-            install='python -m pip install -e .',
-            pre_install=[
-                'apt-get -y update && apt-get -y upgrade && apt-get install -y imagemagick ffmpeg libfreetype6-dev pkg-config texlive texlive-latex-extra texlive-fonts-recommended texlive-xetex texlive-luatex cm-super'
-            ],
-            pip_packages=['pytest', 'ipython'],
-        ),
-    )
-    for version in ('3.1', '3.2', '3.3', '3.4')
-)
-# matplotlib 3.0: Python 3.7 with 'packages' set to requirements.txt.
-MAP_VERSION_TO_INSTALL_MATPLOTLIB['3.0'] = dict(
-    python='3.7',
-    packages='requirements.txt',
-    install='python -m pip install -e .',
-    pre_install=[
-        'apt-get -y update && apt-get -y upgrade && apt-get install -y imagemagick ffmpeg libfreetype6-dev pkg-config'
-    ],
-    pip_packages=['pytest'],
-)
-# Legacy matplotlib (1.x and 2.x): Python 3.5, built and installed through
-# setup.py, with 'execute_test_as_nonroot' set.
-MAP_VERSION_TO_INSTALL_MATPLOTLIB.update(
-    {
-        k: {
-            'python': '3.5',
-            'install': 'python setup.py build; python setup.py install',
-            'pre_install': [
-                # Fixed: the command previously contained '&& &&', which is a
-                # shell syntax error and would abort the pre-install step.
-                'apt-get -y update && apt-get -y upgrade && apt-get install -y imagemagick ffmpeg'
-            ],
-            'pip_packages': ['pytest'],
-            'execute_test_as_nonroot': True,
-        }
-        for k in ['2.0', '2.1', '2.2', '1.0', '1.1', '1.2', '1.3', '1.4', '1.5']
-    }
-)
-# sphinx-doc/sphinx: every version starts from the same spec (Python 3.9, tox
-# pins, '[test]' extras install) and a pre_install list holding one sed command
-# that rewrites 'pytest' to 'pytest -rA' in tox.ini. The comprehension builds a
-# separate pre_install list per version, so the extensions below only affect
-# the versions they target.
-MAP_VERSION_TO_INSTALL_SPHINX = {
-    k: {
-        'python': '3.9',
-        'pip_packages': ['tox==4.16.0', 'tox-current-env==0.0.11'],
-        'install': 'python -m pip install -e .[test]',
-        'pre_install': ["sed -i 's/pytest/pytest -rA/' tox.ini"],
-    }
-    for k in ['1.5', '1.6', '1.7', '1.8', '2.0', '2.1', '2.2', '2.3', '2.4', '3.0']
-    + ['3.1', '3.2', '3.3', '3.4', '3.5', '4.0', '4.1', '4.2', '4.3', '4.4']
-    + ['4.5', '5.0', '5.1', '5.2', '5.3', '6.0', '6.2', '7.0', '7.1', '7.2']
-}
-# Versions 3.0-4.4 get extra sed commands that edit dependency specifiers in
-# setup.py (Jinja2, the sphinxcontrib-* helpers, alabaster, markupsafe).
-for k in ['3.0', '3.1', '3.2', '3.3', '3.4', '3.5', '4.0', '4.1', '4.2', '4.3', '4.4']:
-    MAP_VERSION_TO_INSTALL_SPHINX[k]['pre_install'].extend(
-        [
-            "sed -i 's/Jinja2>=2.3/Jinja2<3.0/' setup.py",
-            "sed -i 's/sphinxcontrib-applehelp/sphinxcontrib-applehelp<=1.0.7/' setup.py",
-            "sed -i 's/sphinxcontrib-devhelp/sphinxcontrib-devhelp<=1.0.5/' setup.py",
-            "sed -i 's/sphinxcontrib-qthelp/sphinxcontrib-qthelp<=1.0.6/' setup.py",
-            "sed -i 's/alabaster>=0.7,<0.8/alabaster>=0.7,<0.7.12/' setup.py",
-            "sed -i \"s/'packaging',/'packaging', 'markupsafe<=2.0.1',/\" setup.py",
-        ]
-    )
-    # The htmlhelp / serializinghtml commands depend on how the requirement is
-    # matched in setup.py: 4.2-4.4 match the '>=' form, 4.1 tries the '>=' form
-    # and falls back to the bare name, and the remaining versions match the
-    # bare name only.
-    if k in ['4.2', '4.3', '4.4']:
-        MAP_VERSION_TO_INSTALL_SPHINX[k]['pre_install'].extend(
-            [
-                "sed -i 's/sphinxcontrib-htmlhelp>=2.0.0/sphinxcontrib-htmlhelp>=2.0.0,<=2.0.4/' setup.py",
-                "sed -i 's/sphinxcontrib-serializinghtml>=1.1.5/sphinxcontrib-serializinghtml>=1.1.5,<=1.1.9/' setup.py",
-            ]
-        )
-    elif k == '4.1':
-        MAP_VERSION_TO_INSTALL_SPHINX[k]['pre_install'].extend(
-            [
-                # Each entry is one shell command: grep && sed || sed.
-                (
-                    "grep -q 'sphinxcontrib-htmlhelp>=2.0.0' setup.py && "
-                    "sed -i 's/sphinxcontrib-htmlhelp>=2.0.0/sphinxcontrib-htmlhelp>=2.0.0,<=2.0.4/' setup.py || "
-                    "sed -i 's/sphinxcontrib-htmlhelp/sphinxcontrib-htmlhelp<=2.0.4/' setup.py"
-                ),
-                (
-                    "grep -q 'sphinxcontrib-serializinghtml>=1.1.5' setup.py && "
-                    "sed -i 's/sphinxcontrib-serializinghtml>=1.1.5/sphinxcontrib-serializinghtml>=1.1.5,<=1.1.9/' setup.py || "
-                    "sed -i 's/sphinxcontrib-serializinghtml/sphinxcontrib-serializinghtml<=1.1.9/' setup.py"
-                ),
-            ]
-        )
-    else:
-        MAP_VERSION_TO_INSTALL_SPHINX[k]['pre_install'].extend(
-            [
-                "sed -i 's/sphinxcontrib-htmlhelp/sphinxcontrib-htmlhelp<=2.0.4/' setup.py",
-                "sed -i 's/sphinxcontrib-serializinghtml/sphinxcontrib-serializinghtml<=1.1.9/' setup.py",
-            ]
-        )
-# 7.2 additionally installs graphviz through apt-get.
-MAP_VERSION_TO_INSTALL_SPHINX['7.2']['pre_install'] += [
-    'apt-get update && apt-get install -y graphviz'
-]
-# astropy/astropy: one shared spec (Python 3.9, '[test]' extras, pinned pip
-# packages) for every listed version; each version owns its own dict and list.
-MAP_VERSION_TO_INSTALL_ASTROPY = {
-    version: dict(
-        python='3.9',
-        install='python -m pip install -e .[test] --verbose',
-        pip_packages=[
-            'attrs==23.1.0',
-            'exceptiongroup==1.1.3',
-            'execnet==2.0.2',
-            'hypothesis==6.82.6',
-            'iniconfig==2.0.0',
-            'numpy==1.25.2',
-            'packaging==23.1',
-            'pluggy==1.3.0',
-            'psutil==5.9.5',
-            'pyerfa==2.0.0.3',
-            'pytest-arraydiff==0.5.0',
-            'pytest-astropy-header==0.2.2',
-            'pytest-astropy==0.10.0',
-            'pytest-cov==4.1.0',
-            'pytest-doctestplus==1.0.0',
-            'pytest-filter-subpackage==0.1.2',
-            'pytest-mock==3.11.1',
-            'pytest-openfiles==0.5.0',
-            'pytest-remotedata==0.4.0',
-            'pytest-xdist==3.3.1',
-            'pytest==7.4.0',
-            'PyYAML==6.0.1',
-            'setuptools==68.0.0',
-            'sortedcontainers==2.4.0',
-            'tomli==2.0.1',
-        ],
-    )
-    for version in (
-        '0.1', '0.2', '0.3', '0.4', '1.1', '1.2', '1.3', '3.0', '3.1', '3.2',
-        '4.1', '4.2', '4.3', '5.0', '5.1', '5.2',
-    )
-}
-# Versions 4.1 and later also get a sed command that rewrites the setuptools
-# requirement in pyproject.toml to 'setuptools==68.0.0'.
-for k in ('4.1', '4.2', '4.3', '5.0', '5.1', '5.2'):
-    MAP_VERSION_TO_INSTALL_ASTROPY[k].update(
-        pre_install=[
-            'sed -i \'s/requires = \\["setuptools",/requires = \\["setuptools==68.0.0",/\' pyproject.toml'
-        ]
-    )
-# sympy/sympy: versions up to 1.12 share one spec ('packages' is the literal
-# 'mpmath flake8'); 1.13 uses requirements.txt and a shorter pip list.
-MAP_VERSION_TO_INSTALL_SYMPY = {
-    version: dict(
-        python='3.9',
-        packages='mpmath flake8',
-        pip_packages=['mpmath==1.3.0', 'flake8-comprehensions'],
-        install='python -m pip install -e .',
-    )
-    for version in (
-        '0.7', '1.0', '1.1', '1.10', '1.11', '1.12', '1.2', '1.4', '1.5', '1.6',
-        '1.7', '1.8', '1.9',
-    )
-}
-MAP_VERSION_TO_INSTALL_SYMPY['1.13'] = dict(
-    python='3.9',
-    packages='requirements.txt',
-    install='python -m pip install -e .',
-    pip_packages=['mpmath==1.3.0'],
-)
-# pylint-dev/pylint: shared base spec for all versions, then per-version
-# additions for 2.8 (pyenchant plus its apt packages) and 3.0 (astroid pin).
-MAP_VERSION_TO_INSTALL_PYLINT = {
-    version: dict(
-        python='3.9',
-        packages='requirements.txt',
-        install='python -m pip install -e .',
-    )
-    for version in (
-        '2.10', '2.11', '2.13', '2.14', '2.15', '2.16', '2.17', '2.8', '2.9', '3.0',
-    )
-}
-MAP_VERSION_TO_INSTALL_PYLINT['2.8'].update(
-    pip_packages=['pyenchant==3.2'],
-    pre_install=[
-        'apt-get update && apt-get install -y libenchant-2-dev hunspell-en-us'
-    ],
-)
-# The 3.0 entry is replaced by a new dict that copies the base spec and adds
-# pip_packages.
-MAP_VERSION_TO_INSTALL_PYLINT['3.0'] = dict(
-    MAP_VERSION_TO_INSTALL_PYLINT['3.0'],
-    pip_packages=['astroid==3.0.0a6', 'setuptools'],
-)
-
-# pydata/xarray: one shared spec (Python 3.10, 'packages' set to
-# environment.yml, pinned pip packages, 'no_use_env' enabled) per version.
-MAP_VERSION_TO_INSTALL_XARRAY = {
-    version: dict(
-        python='3.10',
-        packages='environment.yml',
-        install='python -m pip install -e .',
-        pip_packages=[
-            'numpy==1.23.0',
-            'packaging==23.1',
-            'pandas==1.5.3',
-            'pytest==7.4.0',
-            'python-dateutil==2.8.2',
-            'pytz==2023.3',
-            'six==1.16.0',
-            'scipy==1.11.1',
-            'setuptools==68.0.0',
-        ],
-        no_use_env=True,
-    )
-    for version in ('0.12', '0.18', '0.19', '0.20', '2022.03', '2022.06', '2022.09')
-}
-
-# sqlfluff/sqlfluff: identical spec for every listed version.
-MAP_VERSION_TO_INSTALL_SQLFLUFF = {
-    version: dict(
-        python='3.9',
-        packages='requirements.txt',
-        install='python -m pip install -e .',
-    )
-    for version in (
-        '0.10', '0.11', '0.12', '0.13', '0.4', '0.5', '0.6', '0.8', '0.9',
-        '1.0', '1.1', '1.2', '1.3', '1.4', '2.0', '2.1', '2.2',
-    )
-}
-# dbt-labs/dbt-core: identical spec for every listed version.
-MAP_VERSION_TO_INSTALL_DBT_CORE = {
-    version: dict(
-        python='3.9',
-        packages='requirements.txt',
-        install='python -m pip install -e .',
-    )
-    for version in (
-        '0.13', '0.14', '0.15', '0.16', '0.17', '0.18', '0.19', '0.20', '0.21',
-        '1.0', '1.1', '1.2', '1.3', '1.4', '1.5', '1.6', '1.7',
-    )
-}
-# pyvista/pyvista: 0.20-0.23 have no 'packages' entry; 0.24-0.43 add
-# 'packages' set to requirements.txt.
-MAP_VERSION_TO_INSTALL_PYVISTA = {
-    version: dict(
-        python='3.9',
-        install='python -m pip install -e .',
-        pip_packages=['pytest'],
-    )
-    for version in ('0.20', '0.21', '0.22', '0.23')
-}
-MAP_VERSION_TO_INSTALL_PYVISTA.update(
-    (
-        f'0.{minor}',
-        dict(
-            python='3.9',
-            packages='requirements.txt',
-            install='python -m pip install -e .',
-            pip_packages=['pytest'],
-        ),
-    )
-    # minors 24 through 43 inclusive, i.e. '0.24' ... '0.43'
-    for minor in range(24, 44)
-)
-# pylint-dev/astroid: identical spec for every listed version.
-MAP_VERSION_TO_INSTALL_ASTROID = {
-    version: dict(
-        python='3.9',
-        install='python -m pip install -e .',
-        pip_packages=['pytest'],
-    )
-    for version in (
-        '2.10', '2.12', '2.13', '2.14', '2.15', '2.16',
-        '2.5', '2.6', '2.7', '2.8', '2.9', '3.0',
-    )
-}
-# marshmallow-code/marshmallow: identical spec for every listed version; the
-# install command quotes the '.[dev]' extras specifier.
-MAP_VERSION_TO_INSTALL_MARSHMALLOW = {
-    version: dict(
-        python='3.9',
-        install="python -m pip install -e '.[dev]'",
-    )
-    for version in (
-        '2.18', '2.19', '2.20', '3.0', '3.1', '3.10', '3.11', '3.12',
-        '3.13', '3.15', '3.16', '3.19', '3.2', '3.4', '3.8', '3.9',
-    )
-}
-# pvlib/pvlib-python: identical spec for versions 0.1 through 0.9.
-MAP_VERSION_TO_INSTALL_PVLIB = {
-    f'0.{minor}': dict(
-        python='3.9',
-        install='python -m pip install -e .[all]',
-        packages='pandas scipy',
-        pip_packages=['jupyter', 'ipython', 'matplotlib', 'pytest', 'flake8'],
-    )
-    for minor in range(1, 10)
-}
-# pydicom/pydicom: install command and 'packages' are the same everywhere;
-# only the Python version differs, so versions are grouped by interpreter.
-MAP_VERSION_TO_INSTALL_PYDICOM = {
-    version: {
-        'python': python_version,
-        'install': 'python -m pip install -e .',
-        'packages': 'numpy',
-    }
-    for python_version, versions in (
-        ('3.6', ('1.0', '1.1', '1.2', '1.3')),
-        ('3.8', ('1.4', '2.0')),
-        ('3.9', ('2.1', '2.2')),
-        ('3.10', ('2.3',)),
-        ('3.11', ('2.4', '3.0')),
-    )
-    for version in versions
-}
-# The humaneval tables each hold a single version.
-MAP_VERSION_TO_INSTALL_HUMANEVAL = {'1.0': {'python': '3.9'}}
-MAP_VERSION_TO_INSTALL_HUMANEVAL_FIX = {
-    '0.0.1': {'python': '3.10', 'packages': 'pytest'},
-}
-
-# Constants - Task Instance Installation Environment
-# Repository name ('owner/repo') -> that repository's per-version install table.
-MAP_VERSION_TO_INSTALL = dict(
-    (
-        ('astropy/astropy', MAP_VERSION_TO_INSTALL_ASTROPY),
-        ('dbt-labs/dbt-core', MAP_VERSION_TO_INSTALL_DBT_CORE),
-        ('django/django', MAP_VERSION_TO_INSTALL_DJANGO),
-        ('matplotlib/matplotlib', MAP_VERSION_TO_INSTALL_MATPLOTLIB),
-        ('marshmallow-code/marshmallow', MAP_VERSION_TO_INSTALL_MARSHMALLOW),
-        ('mwaskom/seaborn', MAP_VERSION_TO_INSTALL_SEABORN),
-        ('pallets/flask', MAP_VERSION_TO_INSTALL_FLASK),
-        ('psf/requests', MAP_VERSION_TO_INSTALL_REQUESTS),
-        ('pvlib/pvlib-python', MAP_VERSION_TO_INSTALL_PVLIB),
-        ('pydata/xarray', MAP_VERSION_TO_INSTALL_XARRAY),
-        ('pydicom/pydicom', MAP_VERSION_TO_INSTALL_PYDICOM),
-        ('pylint-dev/astroid', MAP_VERSION_TO_INSTALL_ASTROID),
-        ('pylint-dev/pylint', MAP_VERSION_TO_INSTALL_PYLINT),
-        ('pytest-dev/pytest', MAP_VERSION_TO_INSTALL_PYTEST),
-        ('pyvista/pyvista', MAP_VERSION_TO_INSTALL_PYVISTA),
-        ('scikit-learn/scikit-learn', MAP_VERSION_TO_INSTALL_SKLEARN),
-        ('sphinx-doc/sphinx', MAP_VERSION_TO_INSTALL_SPHINX),
-        ('sqlfluff/sqlfluff', MAP_VERSION_TO_INSTALL_SQLFLUFF),
-        ('swe-bench/humaneval', MAP_VERSION_TO_INSTALL_HUMANEVAL),
-        ('nielstron/humaneval_fix', MAP_VERSION_TO_INSTALL_HUMANEVAL_FIX),
-        ('sympy/sympy', MAP_VERSION_TO_INSTALL_SYMPY),
-    )
-)
-
-# Constants - Repository Specific Installation Instructions
-# No repository-specific installation instructions are defined here.
-MAP_REPO_TO_INSTALL = dict()
-
-# Constants - Task Instance Test Frameworks
-# Default verbose pytest invocation shared by most repositories.
-TEST_PYTEST_VERBOSE = 'pytest -rA --tb=long -p no:cacheprovider'
-# Repository name -> {version: test command}. Each inner mapping is keyed by
-# the versions of that repository's MAP_VERSION_TO_INSTALL_* table, so every
-# installable version has a test command.
-MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE = {
-    'astropy/astropy': {
-        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_ASTROPY
-    },
-    'django/django': {
-        k: './tests/runtests.py --verbosity 2 --settings=test_sqlite --parallel 1'
-        for k in MAP_VERSION_TO_INSTALL_DJANGO
-    },
-    'marshmallow-code/marshmallow': {
-        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_MARSHMALLOW
-    },
-    'matplotlib/matplotlib': {
-        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_MATPLOTLIB
-    },
-    'mwaskom/seaborn': {
-        k: 'pytest -rA --tb=long' for k in MAP_VERSION_TO_INSTALL_SEABORN
-    },
-    'pallets/flask': {
-        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_FLASK
-    },
-    'psf/requests': {
-        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_REQUESTS
-    },
-    'pvlib/pvlib-python': {
-        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_PVLIB
-    },
-    'pydata/xarray': {
-        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_XARRAY
-    },
-    'pydicom/pydicom': {
-        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_PYDICOM
-    },
-    'pylint-dev/astroid': {
-        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_ASTROID
-    },
-    'pylint-dev/pylint': {
-        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_PYLINT
-    },
-    'pytest-dev/pytest': {
-        k: 'pytest -rA --tb=long' for k in MAP_VERSION_TO_INSTALL_PYTEST
-    },
-    'pyvista/pyvista': {
-        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_PYVISTA
-    },
-    'scikit-learn/scikit-learn': {
-        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_SKLEARN
-    },
-    'sphinx-doc/sphinx': {
-        k: 'tox -epy39 -v --' for k in MAP_VERSION_TO_INSTALL_SPHINX
-    },
-    'sqlfluff/sqlfluff': {
-        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_SQLFLUFF
-    },
-    'swe-bench/humaneval': {
-        k: 'python' for k in MAP_VERSION_TO_INSTALL_HUMANEVAL
-    },
-    # Fixed: this entry was keyed by MAP_VERSION_TO_INSTALL_HUMANEVAL ('1.0'),
-    # so the only humaneval_fix install version ('0.0.1') had no test command.
-    'nielstron/humaneval_fix': {
-        k: TEST_PYTEST_VERBOSE for k in MAP_VERSION_TO_INSTALL_HUMANEVAL_FIX
-    },
-    'sympy/sympy': {
-        k: 'bin/test -C --verbose' for k in MAP_VERSION_TO_INSTALL_SYMPY
-    },
-}
-# Django 1.9 gets its own command without the --settings/--parallel options.
-MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE['django/django']['1.9'] = (
-    './tests/runtests.py --verbosity 2'
-)
--- a/evaluation/benchmarks/swe_perf/run_infer.py
+++ b/evaluation/benchmarks/swe_perf/run_infer.py
@@ -1,978 +0,0 @@
-import asyncio
-import copy
-import json
-import os
-import tempfile
-from typing import Any, Literal
-
-import pandas as pd
-import toml
-from datasets import load_dataset
-
-import openhands.agenthub
-from evaluation.benchmarks.swe_perf.binary_patch_utils import (
-    remove_binary_diffs,
-    remove_binary_files_from_git,
-)
-from evaluation.benchmarks.swe_perf.resource.mapping import (
-    get_instance_resource_factor,
-)
-from evaluation.benchmarks.swe_perf.resource.swt_bench_constants import (
-    MAP_REPO_TO_INSTALL,
-    MAP_VERSION_TO_INSTALL,
-)
-from evaluation.utils.shared import (
-    EvalException,
-    EvalMetadata,
-    EvalOutput,
-    assert_and_raise,
-    check_maximum_retries_exceeded,
-    codeact_user_response,
-    get_default_sandbox_config_for_eval,
-    get_metrics,
-    is_fatal_evaluation_error,
-    make_metadata,
-    prepare_dataset,
-    reset_logger_for_multiprocessing,
-    run_evaluation,
-    update_llm_config_for_completions_logging,
-)
-from openhands.controller.state.state import State
-from openhands.core.config import (
-    AgentConfig,
-    OpenHandsConfig,
-    get_evaluation_parser,
-    get_llm_config_arg,
-)
-from openhands.core.config.condenser_config import NoOpCondenserConfig
-from openhands.core.config.utils import get_condenser_config_arg
-from openhands.core.logger import openhands_logger as logger
-from openhands.core.main import create_runtime, run_controller
-from openhands.critic import AgentFinishedCritic
-from openhands.events.action import CmdRunAction, FileReadAction, MessageAction
-from openhands.events.observation import (
-    CmdOutputObservation,
-    ErrorObservation,
-    FileReadObservation,
-)
-from openhands.events.serialization.event import event_from_dict, event_to_dict
-from openhands.runtime.base import Runtime
-from openhands.utils.async_utils import call_async_from_sync
-from openhands.utils.shutdown_listener import sleep_if_should_continue
-
-# Feature flags read from the environment. Each defaults to 'false' and is
-# enabled only when the variable equals 'true', compared case-insensitively.
-USE_HINT_TEXT = 'true' == os.getenv('USE_HINT_TEXT', 'false').lower()
-RUN_WITH_BROWSING = 'true' == os.getenv('RUN_WITH_BROWSING', 'false').lower()
-ENABLE_LLM_EDITOR = 'true' == os.getenv('ENABLE_LLM_EDITOR', 'false').lower()
-
-# Allowed benchmark mode names.
-BenchMode = Literal['swe', 'swt', 'swt-ci']
-
-# Global variable to track dataset type
-DATASET_TYPE = 'SWE-Perf'
-
-
-# Agent class name -> function used as the simulated user response.
-AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = dict(
-    CodeActAgent=codeact_user_response,
-)
-
-
-def _get_sweperf_workspace_dir_name(instance: pd.Series) -> str:
-    """Return the workspace directory name for an instance.
-
-    The name is '<repo>__<version>' with every '/' replaced by '__'.
-    """
-    combined = '{}__{}'.format(instance.repo, instance.version)
-    return '__'.join(combined.split('/'))
-
-
-def get_instruction(instance: pd.Series, metadata: EvalMetadata) -> MessageAction:
-    """Build the initial task prompt for one SWE-Perf instance.
-
-    The prompt embeds the workspace path derived from the instance's repo and
-    version, the instance's `problem_statement_realistic` text, and its
-    `base_commit`. When RUN_WITH_BROWSING is set, a notice telling the agent
-    not to browse the web is appended.
-
-    Args:
-        instance: Dataset row; the body reads `repo`, `version`,
-            `problem_statement_realistic`, `base_commit` and, when present,
-            `image_assets` (a JSON string).
-        metadata: Evaluation metadata; not read by this function's body.
-
-    Returns:
-        A MessageAction carrying the prompt. If the instance has
-        `image_assets`, its 'problem_statement' entry is passed as
-        `image_urls`.
-
-    Raises:
-        AssertionError: if `image_assets` lacks a 'problem_statement' key.
-    """
-    workspace_dir_name = _get_sweperf_workspace_dir_name(instance)
-
-    # The instruction
-    instruction = f"""
-<uploaded_files>
-/workspace/{workspace_dir_name}
-</uploaded_files>
-
-I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following issue description:
-
-
-<issue_description>
-{instance.problem_statement_realistic}
-</issue_description>
-
-Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?
-I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!
-Also the development Python environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.
-Your task is to make the minimal changes to non-test files in the /workspace/{workspace_dir_name} directory to ensure the <issue_description> is satisfied.
-
-Follow these phases to resolve the issue:
-
-## ⚙️ Phase 1: Understand the Problem & Test Reuse
-
-**1.1. Install the package locally:**
-
-```bash
-python -m pip install pyinstrument
-python -m pip install -e .
-```
-
-> Only proceed to README-based install if the above fails.
-
-**1.2. Identify relevant modules and logic:**
-
-* Use test cases mentioned in `<issue_description>` to locate the functions and files involved.
-* Focus on potential performance bottlenecks: loops, I/O, locks, cache access, data structures, etc.
-
-**1.3. Run initial benchmark:**
-
-```bash
-pytest -rA --durations=0 --disable-warnings -p no:warnings --tb=no <test_case>
-```
-
-## 📊 Phase 2: Localization (Hierarchical Bottleneck Detection)
-
-**2.1. Global profiling using `pyinstrument`:**
-
-```bash
-pyinstrument -m pytest -rA --durations=0 --disable-warnings --tb=no --continue-on-collection-errors -p no:warnings <test_case>
-```
-
-**2.2. Analyze performance stack if necessary:**
-
-* 🔍 **Module level**: Identify hot files and methods.
-* 🔬 **Function level**: Focus on top-consuming classes/functions.
-* 🧬 **Line level**: Add fine-grained sampling/logging if needed.
-
-**2.3. Output a layered summary** showing where time is spent and why.
-
-
-## 🧠 Phase 3: Repair (Design Candidate Fixes)
-
-**3.1. Propose multiple optimization ideas:**
-
-* Algorithm refinement
-* Data structure improvement
-* Parallelism / async
-* Caching / batching
-
-**3.2. For each candidate:**
-
-* Describe the idea using pseudocode or `diff`
-* Evaluate expected gain vs implementation complexity
-
---
-
-## 🔬 Phase 4: Patch Validation (Quantitative Benchmarking)
-
-**4.1. Apply each patch separately**
-
-**4.2. Re-run benchmark using the same test case**
-
-**4.3. Record improvements in pytest `call` phase**
-
-**4.4. Build a comparison table:**
-
-| Patch | Call Time (s) | % Improvement | Notes |
-| ----- | ------------- | ------------- | ----- |
-
-**4.5. Select the best-performing, low-risk patch**
-
---
-
-## 📌 Phase 5: Fix Analysis (Explain the Fix)
-
-**5.1.** Clearly describe the **problem**
-
-**5.2.** Identify the **exact location** in the code
-
-**5.3.** Describe **how the test reveals** the problem
-
-**5.4.** State **best practices** relevant to the fix
-
-**5.5.** Explain **how the fix resolves** the issue
-
---
-
-## 🧩 Phase 6: Fix Implementation
-
-**6.1.** Implement the selected fix in the non-test source files
-**6.2.** Ensure the changes are minimal and performance-focused
-
---
-
-## ✅ Phase 7: Final Verification & Review
-
-**7.1.** Run **full regression tests** to ensure correctness
-
-**7.2.** Re-benchmark performance and compare with base commit `{instance['base_commit']}`
-
-**7.3.** Ensure `call` time is significantly improved in `pytest` output
-
-**7.4.** If gain is limited, consider deeper optimization
-
-**7.5.** Write a concise final report including:
-
-* Bottleneck path
-* Fix alternatives considered
-* Final solution and metrics
-* Code diff summary
-
-## Important ##
-1. Please do not optimize specifically for certain test cases, as this may be considered cheating.
-2. Delete all files unrelated to optimization, including temporary files, auxiliary test code, and so on.
-3. Avoid excessive optimization and unnecessary divergence; if the improvement is not significant, stop promptly to maintain efficiency and focus.
-
-Be thorough in your exploration, testing, and reasoning. It's fine if your thinking process is lengthy - quality and completeness are more important than brevity.
-"""
-
-    # NOTE(review): the "never browse" notice is appended when
-    # RUN_WITH_BROWSING is true, which looks inverted - confirm the intent.
-    if RUN_WITH_BROWSING:
-        instruction += (
-            '<IMPORTANT!>\nYou SHOULD NEVER attempt to browse the web. </IMPORTANT!>\n'
-        )
-
-    # Instances that carry image assets return a message with image URLs.
-    if 'image_assets' in instance:
-        assets = json.loads(instance['image_assets'])
-        assert 'problem_statement' in assets, (
-            'problem_statement is required in image_assets'
-        )
-        image_urls = assets['problem_statement']
-        return MessageAction(content=instruction, image_urls=image_urls)
-    return MessageAction(content=instruction)
-
-
-def get_instance_docker_image(
-    instance_id: str,
-) -> str:
-    """Return the lower-cased docker image reference for an instance id.
-
-    The image name is 'sweb.eval.x86_64.<instance_id>' with every '__'
-    replaced by '_s_', placed under the 'docker.io/betty1202' prefix.
-    """
-    registry_namespace = 'docker.io/betty1202/'.rstrip('/')
-    # '__' is rewritten to comply with docker image naming convention
-    repository = ('sweb.eval.x86_64.' + instance_id).replace('__', '_s_')
-    return '/'.join((registry_namespace, repository)).lower()
-
-
-def get_config(
-    instance: pd.Series,
-    metadata: EvalMetadata,
-) -> OpenHandsConfig:
-    """Assemble the OpenHandsConfig used to run the agent on one instance.
-
-    Args:
-        instance: Dataset row; only `instance_id` is read here.
-        metadata: Evaluation metadata supplying the agent class, iteration
-            limit, dataset name, LLM config, output directory and condenser
-            config.
-
-    Returns:
-        A config whose sandbox uses the instance's docker image, with the
-        completions-logging LLM config, the 'draft_editor' LLM config and the
-        agent config already applied.
-    """
-    instance_id = instance['instance_id']
-    container_image = get_instance_docker_image(
-        instance_id,
-    )
-    logger.info(
-        f'Using instance container image: {container_image}. '
-        f'Please make sure this image exists. '
-        f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
-    )
-
-    sandbox = get_default_sandbox_config_for_eval()
-    sandbox.base_container_image = container_image
-    sandbox.enable_auto_lint = True
-    sandbox.use_host_network = False
-    # Add platform to the sandbox config to solve issue 4401
-    sandbox.platform = 'linux/amd64'
-    sandbox.remote_runtime_resource_factor = get_instance_resource_factor(
-        dataset_name=metadata.dataset,
-        instance_id=instance_id,
-    )
-
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        max_iterations=metadata.max_iterations,
-        enable_browser=RUN_WITH_BROWSING,
-        runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=sandbox,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
-    )
-
-    # Main LLM config, adjusted for per-instance completions logging.
-    logging_llm_config = update_llm_config_for_completions_logging(
-        metadata.llm_config, metadata.eval_output_dir, instance_id
-    )
-    config.set_llm_config(logging_llm_config)
-    # get 'draft_editor' config if exists
-    draft_editor_config = get_llm_config_arg('draft_editor')
-    config.set_llm_config(draft_editor_config, 'draft_editor')
-
-    config.set_agent_config(
-        AgentConfig(
-            enable_jupyter=False,
-            enable_browsing=RUN_WITH_BROWSING,
-            enable_llm_editor=ENABLE_LLM_EDITOR,
-            enable_mcp=False,
-            condenser=metadata.condenser_config,
-            enable_prompt_extensions=False,
-        )
-    )
-    return config
-
-
-def initialize_runtime(
-    runtime: Runtime,
-    instance: pd.Series,  # this argument is not required
-    metadata: EvalMetadata,
-):
-    """Initialize the runtime for the agent.
-
-    This function is called before the runtime is used to run the agent.
-    """
-    logger.info('-' * 30)
-    logger.info('BEGIN Runtime Initialization Fn')
-    logger.info('-' * 30)
-    workspace_dir_name = _get_sweperf_workspace_dir_name(instance)
-    obs: CmdOutputObservation
-
-    # Set instance id and git configuration
-    action = CmdRunAction(
-        command=f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc && git config --global core.pager "" && git config --global diff.binary false"""
-    )
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        obs.exit_code == 0,
-        f'Failed to export SWE_INSTANCE_ID and configure git: {str(obs)}',
-    )
-
-    action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}')
-
-    # inject the init script
-    script_dir = os.path.dirname(__file__)
-
-    # inject the instance info
-    action = CmdRunAction(command='mkdir -p /swe_util/eval_data/instances')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        obs.exit_code == 0,
-        f'Failed to create /swe_util/eval_data/instances: {str(obs)}',
-    )
-
-    swe_instance_json_name = 'swe-perf-instance.json'
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Construct the full path for the desired file name within the temporary directory
-        temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
-        # Write to the file with the desired name within the temporary directory
-        with open(temp_file_path, 'w') as f:
-            if not isinstance(instance, dict):
-                json.dump([instance.to_dict()], f)
-            else:
-                json.dump([instance], f)
-
-        # Copy the file to the desired location
-        runtime.copy_to(temp_file_path, '/swe_util/eval_data/instances/')
-
-        # inject the instance swe entry
-        entry_script_path = 'instance_swe_entry.sh'
-        runtime.copy_to(
-            str(os.path.join(script_dir, f'scripts/setup/{entry_script_path}')),
-            '/swe_util/',
-        )
-
-    action = CmdRunAction(command='cat ~/.bashrc')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(obs.exit_code == 0, f'Failed to cat ~/.bashrc: {str(obs)}')
-
-    action = CmdRunAction(command='source ~/.bashrc')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    if isinstance(obs, ErrorObservation):
-        logger.error(f'Failed to source ~/.bashrc: {str(obs)}')
-    assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}')
-
-    action = CmdRunAction(command=f'source /swe_util/{entry_script_path}')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        obs.exit_code == 0,
-        f'Failed to source /swe_util/{entry_script_path}: {str(obs)}',
-    )
-
-    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        obs.exit_code == 0,
-        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
-    )
-
-    action = CmdRunAction(command='git reset --hard')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(obs.exit_code == 0, f'Failed to git reset --hard: {str(obs)}')
-
-    action = CmdRunAction(
-        command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
-    )
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')
-
-    if metadata.details['mode'] == 'swt-ci':
-        # set up repo
-        setup_commands = []
-        if instance['repo'] in MAP_REPO_TO_INSTALL:
-            setup_commands.append(MAP_REPO_TO_INSTALL[instance['repo']])
-
-        # Run pre-install set up if provided
-        install = MAP_VERSION_TO_INSTALL.get(instance['repo'], {}).get(
-            instance['version'], []
-        )
-        if 'pre_install' in install:
-            for pre_install in install['pre_install']:
-                setup_commands.append(pre_install)
-
-        if 'install' in install:
-            setup_commands.append(install['install'])
-
-        for command in setup_commands:
-            action = CmdRunAction(command=command)
-            action.set_hard_timeout(600)
-            logger.info(action, extra={'msg_type': 'ACTION'})
-            obs = runtime.run_action(action)
-            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-    action = CmdRunAction(command='which python')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        obs.exit_code == 0 and 'testbed' in obs.content,
-        f'Expected to find python interpreter from testbed, but got: {str(obs)}',
-    )
-
-    logger.info('-' * 30)
-    logger.info('END Runtime Initialization Fn')
-    logger.info('-' * 30)
-
-
-def complete_runtime(
-    runtime: Runtime,
-    instance: pd.Series,  # this argument is not required, but it is used to get the workspace_dir_name
-) -> dict[str, Any]:
-    """Complete the runtime for the agent.
-
-    This function is called before the runtime is used to run the agent.
-    If you need to do something in the sandbox to get the correctness metric after
-    the agent has run, modify this function.
-    """
-    logger.info('-' * 30)
-    logger.info('BEGIN Runtime Completion Fn')
-    logger.info('-' * 30)
-    obs: CmdOutputObservation
-    workspace_dir_name = _get_sweperf_workspace_dir_name(instance)
-
-    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-    if obs.exit_code == -1:
-        # The previous command is still running
-        # We need to kill previous command
-        logger.info('The previous command is still running, trying to kill it...')
-        action = CmdRunAction(command='C-c')
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-        # Then run the command again
-        action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
-        action.set_hard_timeout(600)
-        logger.info(action, extra={'msg_type': 'ACTION'})
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-    if obs.exit_code == -1:
-        # The previous command is still running
-        # We need to kill previous command
-        logger.info('The previous command is still running, trying to ctrl+z it...')
-        action = CmdRunAction(command='C-z')
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-        # Then run the command again
-        action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
-        action.set_hard_timeout(600)
-        logger.info(action, extra={'msg_type': 'ACTION'})
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-    assert_and_raise(
-        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
-        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
-    )
-
-    action = CmdRunAction(command='git config --global core.pager ""')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
-        f'Failed to git config --global core.pager "": {str(obs)}',
-    )
-
-    # First check for any git repositories in subdirectories
-    action = CmdRunAction(command='find . -type d -name .git -not -path "./.git"')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
-        f'Failed to find git repositories: {str(obs)}',
-    )
-
-    git_dirs = [p for p in obs.content.strip().split('\n') if p]
-    if git_dirs:
-        # Remove all .git directories in subdirectories
-        for git_dir in git_dirs:
-            action = CmdRunAction(command=f'rm -rf "{git_dir}"')
-            action.set_hard_timeout(600)
-            logger.info(action, extra={'msg_type': 'ACTION'})
-            obs = runtime.run_action(action)
-            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-            assert_and_raise(
-                isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
-                f'Failed to remove git directory {git_dir}: {str(obs)}',
-            )
-
-    # add all files
-    action = CmdRunAction(command='git add -A')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
-        f'Failed to git add -A: {str(obs)}',
-    )
-
-    # Remove binary files from git staging
-    action = CmdRunAction(command=remove_binary_files_from_git())
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
-        f'Failed to remove binary files: {str(obs)}',
-    )
-
-    n_retries = 0
-    git_patch = None
-    while n_retries < 5:
-        action = CmdRunAction(
-            command=f'git diff --no-color --cached {instance["base_commit"]} > patch.diff'
-        )
-        action.set_hard_timeout(max(300 + 100 * n_retries, 600))
-        logger.info(action, extra={'msg_type': 'ACTION'})
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        n_retries += 1
-        if isinstance(obs, CmdOutputObservation):
-            if obs.exit_code == 0:
-                # Read the patch file
-                action = FileReadAction(path='patch.diff')
-                action.set_hard_timeout(max(300 + 100 * n_retries, 600))
-                logger.info(action, extra={'msg_type': 'ACTION'})
-                obs = runtime.run_action(action)
-                logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-                if isinstance(obs, FileReadObservation):
-                    git_patch = obs.content
-                    break
-                elif isinstance(obs, ErrorObservation):
-                    # Fall back to cat "patch.diff" to get the patch
-                    assert 'File could not be decoded as utf-8' in obs.content
-                    action = CmdRunAction(command='cat patch.diff')
-                    action.set_hard_timeout(max(300 + 100 * n_retries, 600))
-                    logger.info(action, extra={'msg_type': 'ACTION'})
-                    obs = runtime.run_action(action)
-                    assert isinstance(obs, CmdOutputObservation) and obs.exit_code == 0
-                    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-                    git_patch = obs.content
-                    break
-                else:
-                    assert_and_raise(False, f'Unexpected observation type: {str(obs)}')
-            else:
-                logger.info('Failed to get git diff, retrying...')
-                sleep_if_should_continue(10)
-        elif isinstance(obs, ErrorObservation):
-            logger.error(f'Error occurred: {obs.content}. Retrying...')
-            sleep_if_should_continue(10)
-        else:
-            assert_and_raise(False, f'Unexpected observation type: {str(obs)}')
-
-    assert_and_raise(git_patch is not None, 'Failed to get git diff (None)')
-
-    # Remove binary diffs from the patch
-    git_patch = remove_binary_diffs(git_patch)
-
-    logger.info('-' * 30)
-    logger.info('END Runtime Completion Fn')
-    logger.info('-' * 30)
-    return {'git_patch': git_patch}
-
-
-def process_instance(
-    instance: pd.Series,
-    metadata: EvalMetadata,
-    reset_logger: bool = True,
-    runtime_failure_count: int = 0,
-) -> EvalOutput:
-    config = get_config(instance, metadata)
-
-    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
-    if reset_logger:
-        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
-        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
-    else:
-        logger.info(f'Starting evaluation for instance {instance.instance_id}.')
-
-    # Increase resource_factor with increasing attempt_id
-    if runtime_failure_count > 0:
-        config.sandbox.remote_runtime_resource_factor = min(
-            config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
-            8,
-        )
-        logger.warning(
-            f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
-        )
-
-    metadata = copy.deepcopy(metadata)
-    metadata.details['runtime_failure_count'] = runtime_failure_count
-    metadata.details['remote_runtime_resource_factor'] = (
-        config.sandbox.remote_runtime_resource_factor
-    )
-
-    runtime = create_runtime(config)
-    call_async_from_sync(runtime.connect)
-
-    try:
-        initialize_runtime(runtime, instance, metadata)
-
-        message_action = get_instruction(instance, metadata)
-
-        # Here's how you can run the agent (similar to the `main` function) and get the final task state
-        state: State | None = asyncio.run(
-            run_controller(
-                config=config,
-                initial_user_action=message_action,
-                runtime=runtime,
-                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
-                    metadata.agent_class
-                ],
-            )
-        )
-
-        # if fatal error, throw EvalError to trigger re-run
-        if is_fatal_evaluation_error(state.last_error):
-            raise EvalException('Fatal error detected: ' + state.last_error)
-
-        # Get git patch
-        complete_runtime_fn = complete_runtime
-        return_val = complete_runtime_fn(runtime, instance)
-        git_patch = return_val['git_patch']
-        logger.info(
-            f'Got git diff for instance {instance.instance_id}:\n--------\n{git_patch}\n--------'
-        )
-    finally:
-        runtime.close()
-    # ==========================================
-
-    # ======= Attempt to evaluate the agent's edits =======
-    # we use eval_infer.sh to evaluate the agent's edits, not here
-    # because the agent may alter the environment / testcases
-    test_result = {
-        'git_patch': git_patch,
-    }
-
-    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
-    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
-    if state is None:
-        raise ValueError('State should not be None.')
-
-    # NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events
-    histories = [event_to_dict(event) for event in state.history]
-    metrics = get_metrics(state)
-
-    # Save the output
-    instruction = message_action.content
-    if message_action.image_urls:
-        instruction += (
-            '\n\n<image_urls>' + '\n'.join(message_action.image_urls) + '</image_urls>'
-        )
-    output = EvalOutput(
-        instance_id=instance.instance_id,
-        instruction=instruction,
-        instance=instance.to_dict(),  # SWE Bench specific
-        test_result=test_result,
-        metadata=metadata,
-        history=histories,
-        metrics=metrics,
-        error=state.last_error if state and state.last_error else None,
-    )
-    return output
-
-
-def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
-    file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.toml')
-    if os.path.exists(file_path):
-        with open(file_path, 'r') as file:
-            data = toml.load(file)
-            if 'selected_ids' in data:
-                selected_ids = data['selected_ids']
-                logger.info(
-                    f'Filtering {len(selected_ids)} tasks from "selected_ids"...'
-                )
-                subset = dataset[dataset[filter_column].isin(selected_ids)]
-                logger.info(f'Retained {subset.shape[0]} tasks after filtering')
-                return subset
-            if 'selected_repos' in data:
-                selected_repos = data['selected_repos']
-                if isinstance(selected_repos, str):
-                    selected_repos = [selected_repos]
-                assert isinstance(selected_repos, list)
-                logger.info(
-                    f'Filtering {selected_repos} tasks from "selected_repos"...'
-                )
-                subset = dataset[dataset['repo'].isin(selected_repos)]
-                logger.info(f'Retained {subset.shape[0]} tasks after filtering')
-                return subset
-
-    skip_ids = os.environ.get('SKIP_IDS', '').split(',')
-    if len(skip_ids) > 0:
-        logger.info(f'Filtering {len(skip_ids)} tasks from "SKIP_IDS"...')
-        return dataset[~dataset[filter_column].isin(skip_ids)]
-    return dataset
-
-
-if __name__ == '__main__':
-    parser = get_evaluation_parser()
-    parser.add_argument(
-        '--dataset',
-        type=str,
-        default='SWE-Perf/SWE-Perf',
-        help='data set to evaluate on, either full-test or lite-test',
-    )
-    parser.add_argument(
-        '--split',
-        type=str,
-        default='test',
-        help='split to evaluate on',
-    )
-    parser.add_argument(
-        '--mode',
-        type=str,
-        default='swe',
-        choices=['swe', 'swt', 'swt-ci'],
-        help="mode to run the evaluation, either 'swe', 'swt', or 'swt-ci'",
-    )
-
-    args, _ = parser.parse_known_args()
-
-    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
-    # so we don't need to manage file uploading to OpenHands's repo
-    dataset = load_dataset(args.dataset, split=args.split)
-
-    swe_perf_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
-    logger.info(
-        f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_perf_tests)} tasks'
-    )
-
-    llm_config = None
-    if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
-        llm_config.log_completions = True
-        # modify_params must be False for evaluation purpose, for reproducibility and accurancy of results
-        llm_config.modify_params = False
-
-    if llm_config is None:
-        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
-
-    # Get condenser config from environment variable
-    condenser_name = os.environ.get('EVAL_CONDENSER')
-    if condenser_name:
-        condenser_config = get_condenser_config_arg(condenser_name)
-        if condenser_config is None:
-            raise ValueError(
-                f'Could not find Condenser config: EVAL_CONDENSER={condenser_name}'
-            )
-    else:
-        # If no specific condenser config is provided via env var, default to NoOpCondenser
-        condenser_config = NoOpCondenserConfig()
-        logger.debug(
-            'No Condenser config provided via EVAL_CONDENSER, using NoOpCondenser.'
-        )
-
-    details = {'mode': args.mode}
-    _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)
-
-    dataset_descrption = (
-        args.dataset.replace('/', '__') + '-' + args.split.replace('/', '__')
-    )
-    metadata = make_metadata(
-        llm_config,
-        dataset_descrption,
-        args.agent_cls,
-        args.max_iterations,
-        args.eval_note,
-        args.eval_output_dir,
-        details=details,
-        condenser_config=condenser_config,
-    )
-
-    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
-    print(f'### OUTPUT FILE: {output_file} ###')
-
-    # Run evaluation in iterative mode:
-    # If a rollout fails to output AgentFinishAction, we will try again until it succeeds OR total 3 attempts have been made.
-    ITERATIVE_EVAL_MODE = (
-        os.environ.get('ITERATIVE_EVAL_MODE', 'false').lower() == 'true'
-    )
-    ITERATIVE_EVAL_MODE_MAX_ATTEMPTS = int(
-        os.environ.get('ITERATIVE_EVAL_MODE_MAX_ATTEMPTS', '3')
-    )
-
-    if not ITERATIVE_EVAL_MODE:
-        # load the dataset
-        instances = prepare_dataset(swe_perf_tests, output_file, args.eval_n_limit)
-
-        run_evaluation(
-            instances,
-            metadata,
-            output_file,
-            args.eval_num_workers,
-            process_instance,
-            timeout_seconds=8
-            * 60
-            * 60,  # 8 hour PER instance should be more than enough
-            max_retries=5,
-        )
-    else:
-        critic = AgentFinishedCritic()
-
-        def get_cur_output_file_path(attempt: int) -> str:
-            return (
-                f'{output_file.removesuffix(".jsonl")}.critic_attempt_{attempt}.jsonl'
-            )
-
-        eval_ids = None
-        for attempt in range(1, ITERATIVE_EVAL_MODE_MAX_ATTEMPTS + 1):
-            cur_output_file = get_cur_output_file_path(attempt)
-            logger.info(
-                f'Running evaluation with critic {critic.__class__.__name__} for attempt {attempt} of {ITERATIVE_EVAL_MODE_MAX_ATTEMPTS}.'
-            )
-
-            # For deterministic eval, we set temperature to 0.1 for (>1) attempt
-            # so hopefully we get slightly different results
-            if attempt > 1 and metadata.llm_config.temperature == 0:
-                logger.info(
-                    f'Detected temperature is 0 for (>1) attempt {attempt}. Setting temperature to 0.1...'
-                )
-                metadata.llm_config.temperature = 0.1
-
-            # Load instances - at first attempt, we evaluate all instances
-            # On subsequent attempts, we only evaluate the instances that failed the previous attempt determined by critic
-            instances = prepare_dataset(
-                swe_perf_tests, cur_output_file, args.eval_n_limit, eval_ids=eval_ids
-            )
-
-            # Run evaluation - but save them to cur_output_file
-            logger.info(
-                f'Evaluating {len(instances)} instances for attempt {attempt}...'
-            )
-            run_evaluation(
-                instances,
-                metadata,
-                cur_output_file,
-                args.eval_num_workers,
-                process_instance,
-                timeout_seconds=8
-                * 60
-                * 60,  # 8 hour PER instance should be more than enough
-                max_retries=5,
-            )
-
-            # When eval is done, we update eval_ids to the instances that failed the current attempt
-            instances_failed = []
-            logger.info(
-                f'Use critic {critic.__class__.__name__} to check {len(instances)} instances for attempt {attempt}...'
-            )
-            with open(cur_output_file, 'r') as f:
-                for line in f:
-                    instance = json.loads(line)
-                    try:
-                        history = [
-                            event_from_dict(event) for event in instance['history']
-                        ]
-                        critic_result = critic.evaluate(
-                            history, instance['test_result'].get('git_patch', '')
-                        )
-                        if not critic_result.success:
-                            instances_failed.append(instance['instance_id'])
-                    except Exception as e:
-                        logger.error(
-                            f'Error loading history for instance {instance["instance_id"]}: {e}'
-                        )
-                        instances_failed.append(instance['instance_id'])
-            logger.info(
-                f'{len(instances_failed)} instances failed the current attempt {attempt}: {instances_failed}'
-            )
-            eval_ids = instances_failed
-
-            # If no instances failed, we break
-            if len(instances_failed) == 0:
-                break
-
-        # Then we should aggregate the results from all attempts into the original output file
-        # and remove the intermediate files
-        logger.info(
-            'Aggregating results from all attempts into the original output file...'
-        )
-        fout = open(output_file, 'w')
-        added_instance_ids = set()
-        for attempt in reversed(range(1, ITERATIVE_EVAL_MODE_MAX_ATTEMPTS + 1)):
-            cur_output_file = get_cur_output_file_path(attempt)
-            if not os.path.exists(cur_output_file):
-                logger.warning(
-                    f'Intermediate output file {cur_output_file} does not exist. Skipping...'
-                )
-                continue
-
-            with open(cur_output_file, 'r') as f:
-                for line in f:
-                    instance = json.loads(line)
-                    # Also make sure git_patch is not empty - otherwise we fall back to previous attempt (empty patch is worse than anything else)
-                    if (
-                        instance['instance_id'] not in added_instance_ids
-                        and instance['test_result'].get('git_patch', '').strip()
-                    ):
-                        fout.write(line)
-                        added_instance_ids.add(instance['instance_id'])
-            logger.info(
-                f'Aggregated instances from {cur_output_file}. Total instances added so far: {len(added_instance_ids)}'
-            )
-        fout.close()
-        logger.info(
-            f'Done! Total {len(added_instance_ids)} instances added to {output_file}'
-        )
-        # Check if any instances reached maximum retries
-        check_maximum_retries_exceeded(metadata.eval_output_dir)
--- a/evaluation/benchmarks/swe_perf/scripts/run_infer.sh
+++ b/evaluation/benchmarks/swe_perf/scripts/run_infer.sh
@@ -1,146 +0,0 @@
-#!/usr/bin/env bash
-set -eo pipefail
-
-source "evaluation/utils/version_control.sh"
-
-MODEL_CONFIG=$1
-COMMIT_HASH=$2
-AGENT=$3
-EVAL_LIMIT=$4
-MAX_ITER=$5
-NUM_WORKERS=$6
-DATASET=$7
-SPLIT=$8
-N_RUNS=$9
-MODE=${10}
-
-
-if [ -z "$NUM_WORKERS" ]; then
-  NUM_WORKERS=1
-  echo "Number of workers not specified, use default $NUM_WORKERS"
-fi
-checkout_eval_branch
-
-if [ -z "$AGENT" ]; then
-  echo "Agent not specified, use default CodeActAgent"
-  AGENT="CodeActAgent"
-fi
-
-if [ -z "$MAX_ITER" ]; then
-  echo "MAX_ITER not specified, use default 100"
-  MAX_ITER=100
-fi
-
-if [ -z "$RUN_WITH_BROWSING" ]; then
-  echo "RUN_WITH_BROWSING not specified, use default false"
-  RUN_WITH_BROWSING=false
-fi
-
-
-if [ -z "$DATASET" ]; then
-  echo "DATASET not specified, use default SWE-Perf/SWE-Perf"
-  DATASET="SWE-Perf/SWE-Perf"
-fi
-
-if [ -z "$SPLIT" ]; then
-  echo "SPLIT not specified, use default test"
-  SPLIT="test"
-fi
-
-if [ -z "$MODE" ]; then
-  MODE="swe"
-  echo "MODE not specified, use default $MODE"
-fi
-
-if [ -n "$EVAL_CONDENSER" ]; then
-  echo "Using Condenser Config: $EVAL_CONDENSER"
-else
-  echo "No Condenser Config provided via EVAL_CONDENSER, use default (NoOpCondenser)."
-fi
-
-export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
-echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
-
-get_openhands_version
-
-echo "AGENT: $AGENT"
-echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
-echo "MODEL_CONFIG: $MODEL_CONFIG"
-echo "DATASET: $DATASET"
-echo "SPLIT: $SPLIT"
-echo "MAX_ITER: $MAX_ITER"
-echo "NUM_WORKERS: $NUM_WORKERS"
-echo "COMMIT_HASH: $COMMIT_HASH"
-echo "MODE: $MODE"
-echo "EVAL_CONDENSER: $EVAL_CONDENSER"
-
-# Default to NOT use Hint
-if [ -z "$USE_HINT_TEXT" ]; then
-  export USE_HINT_TEXT=false
-fi
-echo "USE_HINT_TEXT: $USE_HINT_TEXT"
-EVAL_NOTE="$OPENHANDS_VERSION"
-# if not using Hint, add -no-hint to the eval note
-if [ "$USE_HINT_TEXT" = false ]; then
-  EVAL_NOTE="$EVAL_NOTE-no-hint"
-fi
-
-if [ "$RUN_WITH_BROWSING" = true ]; then
-  EVAL_NOTE="$EVAL_NOTE-with-browsing"
-fi
-
-if [ -n "$EXP_NAME" ]; then
-  EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
-fi
-# if mode != swe, add mode to the eval note
-if [ "$MODE" != "swe" ]; then
-  EVAL_NOTE="${EVAL_NOTE}-${MODE}"
-fi
-# Add condenser config to eval note if provided
-if [ -n "$EVAL_CONDENSER" ]; then
-  EVAL_NOTE="${EVAL_NOTE}-${EVAL_CONDENSER}"
-fi
-
-function run_eval() {
-  local eval_note="${1}"
-  COMMAND="poetry run python evaluation/benchmarks/swe_perf/run_infer.py \
-    --agent-cls $AGENT \
-    --llm-config $MODEL_CONFIG \
-    --max-iterations $MAX_ITER \
-    --eval-num-workers $NUM_WORKERS \
-    --eval-note $eval_note \
-    --dataset $DATASET \
-    --split $SPLIT \
-    --mode $MODE"
-
-
-
-  if [ -n "$EVAL_LIMIT" ]; then
-    echo "EVAL_LIMIT: $EVAL_LIMIT"
-    COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
-  fi
-
-  # Run the command
-  eval $COMMAND
-}
-
-unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
-if [ -z "$N_RUNS" ]; then
-  N_RUNS=1
-  echo "N_RUNS not specified, use default $N_RUNS"
-fi
-
-# Skip runs if the run number is in the SKIP_RUNS list
-# read from env variable SKIP_RUNS as a comma separated list of run numbers
-SKIP_RUNS=(${SKIP_RUNS//,/ })
-for i in $(seq 1 $N_RUNS); do
-  if [[ " ${SKIP_RUNS[@]} " =~ " $i " ]]; then
-    echo "Skipping run $i"
-    continue
-  fi
-  current_eval_note="$EVAL_NOTE-run_$i"
-  echo "EVAL_NOTE: $current_eval_note"
-  run_eval $current_eval_note
-done
-
-checkout_original_branch
--- a/evaluation/benchmarks/swe_perf/scripts/setup/compare_patch_filename.py
+++ b/evaluation/benchmarks/swe_perf/scripts/setup/compare_patch_filename.py
@@ -1,54 +0,0 @@
-"""This script compares gold patches with OpenHands-generated patches and check whether
-OpenHands found the right (set of) files to modify.
-"""
-
-import argparse
-import json
-import re
-
-
-def extract_modified_files(patch):
-    modified_files = set()
-    file_pattern = re.compile(r'^diff --git a/(.*?) b/')
-
-    for line in patch.split('\n'):
-        match = file_pattern.match(line)
-        if match:
-            modified_files.add(match.group(1))
-
-    return modified_files
-
-
-def process_report(oh_output_file):
-    succ = 0
-    fail = 0
-    for line in open(oh_output_file):
-        line = json.loads(line)
-        instance_id = line['instance_id']
-        gold_patch = line['swe_instance']['patch']
-        generated_patch = line['git_patch']
-        gold_modified_files = extract_modified_files(gold_patch)
-        # swe-bench lite only: a gold patch always contains exactly one file
-        assert len(gold_modified_files) == 1
-        generated_modified_files = extract_modified_files(generated_patch)
-
-        # Check if all files in gold_patch are also in generated_patch
-        all_files_in_generated = gold_modified_files.issubset(generated_modified_files)
-        if all_files_in_generated:
-            succ += 1
-        else:
-            fail += 1
-            print(
-                f'{instance_id}: file mismatch, gold = {gold_modified_files}, generated = {generated_modified_files}'
-            )
-    print(
-        f'\nSUMMARY: {succ} out of {succ + fail} instances found correct files to edit, success rate = {succ / float(succ + fail)}'
-    )
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--oh_output_file', help='Path to the OH output file')
-    args = parser.parse_args()
-
-    process_report(args.oh_output_file)
--- a/evaluation/benchmarks/swe_perf/scripts/setup/instance_swe_entry.sh
+++ b/evaluation/benchmarks/swe_perf/scripts/setup/instance_swe_entry.sh
@@ -1,43 +0,0 @@
-#!/usr/bin/env bash
-
-source ~/.bashrc
-SWEUTIL_DIR=/swe_util
-
-# FIXME: Cannot read SWE_INSTANCE_ID from the environment variable
-# SWE_INSTANCE_ID=django__django-11099
-if [ -z "$SWE_INSTANCE_ID" ]; then
-    echo "Error: SWE_INSTANCE_ID is not set." >&2
-    exit 1
-fi
-
-# Read the swe-bench-test-lite.json file and extract the required item based on instance_id
-item=$(jq --arg INSTANCE_ID "$SWE_INSTANCE_ID" '.[] | select(.instance_id == $INSTANCE_ID)' $SWEUTIL_DIR/eval_data/instances/swe-bench-instance.json)
-
-if [[ -z "$item" ]]; then
-  echo "No item found for the provided instance ID."
-  exit 1
-fi
-
-
-WORKSPACE_NAME=$(echo "$item" | jq -r '(.repo | tostring) + "__" + (.version | tostring) | gsub("/"; "__")')
-
-echo "WORKSPACE_NAME: $WORKSPACE_NAME"
-
-# Clear the workspace
-if [ -d /workspace ]; then
-    rm -rf /workspace/*
-else
-    mkdir /workspace
-fi
-# Copy repo to workspace
-if [ -d /workspace/$WORKSPACE_NAME ]; then
-    rm -rf /workspace/$WORKSPACE_NAME
-fi
-mkdir -p /workspace
-cp -r /testbed /workspace/$WORKSPACE_NAME
-
-# Activate instance-specific environment
-if [ -d /opt/miniconda3 ]; then
-    . /opt/miniconda3/etc/profile.d/conda.sh
-    conda activate testbed
-fi
--- a/frontend/tests/components/browser.test.tsx
+++ b/frontend/tests/components/browser.test.tsx
@@ -13,8 +13,7 @@ vi.mock("react-router", async () => {

 vi.mock("#/context/conversation-context", () => ({
  useConversation: () => ({ conversationId: "test-conversation-id" }),
-  ConversationProvider: ({ children }: { children: React.ReactNode }) =>
-    children,
+  ConversationProvider: ({ children }: { children: React.ReactNode }) => children,
 }));

 vi.mock("react-i18next", async () => {
@@ -30,18 +29,21 @@ vi.mock("react-i18next", async () => {
  };
 });

-// Mock Zustand browser store
+// Mock redux
+const mockDispatch = vi.fn();
 let mockBrowserState = {
  url: "https://example.com",
  screenshotSrc: "",
-  setUrl: vi.fn(),
-  setScreenshotSrc: vi.fn(),
-  reset: vi.fn(),
 };

-vi.mock("#/stores/browser-store", () => ({
-  useBrowserStore: () => mockBrowserState,
-}));
+vi.mock("react-redux", async () => {
+  const actual = await vi.importActual("react-redux");
+  return {
+    ...actual,
+    useDispatch: () => mockDispatch,
+    useSelector: () => mockBrowserState,
+  };
+});

 // Import the component after all mocks are set up
 import { BrowserPanel } from "#/components/features/browser/browser";
@@ -53,9 +55,6 @@ describe("Browser", () => {
    mockBrowserState = {
      url: "https://example.com",
      screenshotSrc: "",
-      setUrl: vi.fn(),
-      setScreenshotSrc: vi.fn(),
-      reset: vi.fn(),
    };
  });

@@ -64,9 +63,6 @@ describe("Browser", () => {
    mockBrowserState = {
      url: "https://example.com",
      screenshotSrc: "",
-      setUrl: vi.fn(),
-      setScreenshotSrc: vi.fn(),
-      reset: vi.fn(),
    };

    render(<BrowserPanel />);
@@ -79,11 +75,7 @@ describe("Browser", () => {
    // Set the mock state for this test
    mockBrowserState = {
      url: "https://example.com",
-      screenshotSrc:
-        "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mN0uGvyHwAFCAJS091fQwAAAABJRU5ErkJggg==",
-      setUrl: vi.fn(),
-      setScreenshotSrc: vi.fn(),
-      reset: vi.fn(),
+      screenshotSrc: "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mN0uGvyHwAFCAJS091fQwAAAABJRU5ErkJggg==",
    };

    render(<BrowserPanel />);
--- a/frontend/tests/components/chat/chat-interface.test.tsx
+++ b/frontend/tests/components/chat/chat-interface.test.tsx
@@ -17,8 +17,8 @@ import type { Message } from "#/message";
 import { SUGGESTIONS } from "#/utils/suggestions";
 import { ChatInterface } from "#/components/features/chat/chat-interface";
 import { useWsClient } from "#/context/ws-client-provider";
-import { useErrorMessageStore } from "#/stores/error-message-store";
-import { useOptimisticUserMessageStore } from "#/stores/optimistic-user-message-store";
+import { useOptimisticUserMessage } from "#/hooks/use-optimistic-user-message";
+import { useWSErrorMessage } from "#/hooks/use-ws-error-message";
 import { useConfig } from "#/hooks/query/use-config";
 import { useGetTrajectory } from "#/hooks/mutation/use-get-trajectory";
 import { useUploadFiles } from "#/hooks/mutation/use-upload-files";
@@ -26,8 +26,8 @@ import { OpenHandsAction } from "#/types/core/actions";

 // Mock the hooks
 vi.mock("#/context/ws-client-provider");
-vi.mock("#/stores/error-message-store");
-vi.mock("#/stores/optimistic-user-message-store");
+vi.mock("#/hooks/use-optimistic-user-message");
+vi.mock("#/hooks/use-ws-error-message");
 vi.mock("#/hooks/query/use-config");
 vi.mock("#/hooks/mutation/use-get-trajectory");
 vi.mock("#/hooks/mutation/use-upload-files");
@@ -61,6 +61,39 @@ vi.mock("#/hooks/use-conversation-name-context-menu", () => ({
  }),
 }));

+vi.mock("react-redux", async () => {
+  const actual = await vi.importActual("react-redux");
+  return {
+    ...actual,
+    useSelector: vi.fn((selector) => {
+      // Create a mock state object
+      const mockState = {
+        agent: {
+          curAgentState: "AWAITING_USER_INPUT",
+        },
+        initialQuery: {
+          selectedRepository: null,
+          replayJson: null,
+        },
+        conversation: {
+          messageToSend: null,
+          files: [],
+          images: [],
+          loadingFiles: [],
+          loadingImages: [],
+        },
+        status: {
+          curStatusMessage: null,
+        },
+      };
+
+      // Execute the selector function with our mock state
+      return selector(mockState);
+    }),
+    useDispatch: vi.fn(() => vi.fn()),
+  };
+});
+
 // Helper function to render with Router context
 const renderChatInterfaceWithRouter = () =>
  renderWithProviders(
@@ -108,14 +141,13 @@ describe("ChatInterface - Chat Suggestions", () => {
      parsedEvents: [],
    });
    (
-      useOptimisticUserMessageStore as unknown as ReturnType<typeof vi.fn>
+      useOptimisticUserMessage as unknown as ReturnType<typeof vi.fn>
    ).mockReturnValue({
      setOptimisticUserMessage: vi.fn(),
      getOptimisticUserMessage: vi.fn(() => null),
    });
-    (
-      useErrorMessageStore as unknown as ReturnType<typeof vi.fn>
-    ).mockReturnValue({
+    (useWSErrorMessage as unknown as ReturnType<typeof vi.fn>).mockReturnValue({
+      getErrorMessage: vi.fn(() => null),
      setErrorMessage: vi.fn(),
      removeErrorMessage: vi.fn(),
    });
@@ -203,7 +235,7 @@ describe("ChatInterface - Chat Suggestions", () => {

  test("should hide chat suggestions when there is an optimistic user message", () => {
    (
-      useOptimisticUserMessageStore as unknown as ReturnType<typeof vi.fn>
+      useOptimisticUserMessage as unknown as ReturnType<typeof vi.fn>
    ).mockReturnValue({
      setOptimisticUserMessage: vi.fn(),
      getOptimisticUserMessage: vi.fn(() => "Optimistic message"),
@@ -246,14 +278,13 @@ describe("ChatInterface - Empty state", () => {
      parsedEvents: [],
    });
    (
-      useOptimisticUserMessageStore as unknown as ReturnType<typeof vi.fn>
+      useOptimisticUserMessage as unknown as ReturnType<typeof vi.fn>
    ).mockReturnValue({
      setOptimisticUserMessage: vi.fn(),
      getOptimisticUserMessage: vi.fn(() => null),
    });
-    (
-      useErrorMessageStore as unknown as ReturnType<typeof vi.fn>
-    ).mockReturnValue({
+    (useWSErrorMessage as unknown as ReturnType<typeof vi.fn>).mockReturnValue({
+      getErrorMessage: vi.fn(() => null),
      setErrorMessage: vi.fn(),
      removeErrorMessage: vi.fn(),
    });
--- a/frontend/tests/components/features/conversation-panel/conversation-card.test.tsx
+++ b/frontend/tests/components/features/conversation-panel/conversation-card.test.tsx
@@ -357,6 +357,69 @@ describe("ConversationCard", () => {
    expect(onClick).not.toHaveBeenCalled();
  });

+  it("should show display cost button only when showOptions is true", async () => {
+    const onContextMenuToggle = vi.fn();
+    const { rerender } = renderWithProviders(
+      <ConversationCard
+        onDelete={onDelete}
+        onChangeTitle={onChangeTitle}
+        title="Conversation 1"
+        selectedRepository={null}
+        lastUpdatedAt="2021-10-01T12:00:00Z"
+        contextMenuOpen
+        onContextMenuToggle={onContextMenuToggle}
+      />,
+    );
+
+    // Wait for context menu to appear
+    const menu = await screen.findByTestId("context-menu");
+    expect(
+      within(menu).queryByTestId("display-cost-button"),
+    ).not.toBeInTheDocument();
+
+    rerender(
+      <ConversationCard
+        onDelete={onDelete}
+        onChangeTitle={onChangeTitle}
+        showOptions
+        title="Conversation 1"
+        selectedRepository={null}
+        lastUpdatedAt="2021-10-01T12:00:00Z"
+        contextMenuOpen
+        onContextMenuToggle={onContextMenuToggle}
+      />,
+    );
+
+    // Wait for context menu to appear and check for display cost button
+    const newMenu = await screen.findByTestId("context-menu");
+    within(newMenu).getByTestId("display-cost-button");
+  });
+
+  it("should show metrics modal when clicking the display cost button", async () => {
+    const user = userEvent.setup();
+    const onContextMenuToggle = vi.fn();
+    renderWithProviders(
+      <ConversationCard
+        onDelete={onDelete}
+        onChangeTitle={onChangeTitle}
+        title="Conversation 1"
+        selectedRepository={null}
+        lastUpdatedAt="2021-10-01T12:00:00Z"
+        showOptions
+        contextMenuOpen
+        onContextMenuToggle={onContextMenuToggle}
+      />,
+    );
+
+    const menu = screen.getByTestId("context-menu");
+    const displayCostButton = within(menu).getByTestId("display-cost-button");
+
+    await user.click(displayCostButton);
+
+    // Verify if metrics modal is displayed by checking for the modal content
+    expect(screen.getByTestId("metrics-modal")).toBeInTheDocument();
+  });
+
  it("should not display the edit or delete options if the handler is not provided", async () => {
    const onContextMenuToggle = vi.fn();
    const { rerender } = renderWithProviders(
--- a/frontend/tests/components/features/conversation-panel/conversation-panel.test.tsx
+++ b/frontend/tests/components/features/conversation-panel/conversation-panel.test.tsx
@@ -1,5 +1,6 @@
 import { screen, waitFor, within } from "@testing-library/react";
 import { beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
+import { QueryClientConfig } from "@tanstack/react-query";
 import userEvent from "@testing-library/user-event";
 import { createRoutesStub } from "react-router";
 import React from "react";
@@ -17,7 +18,16 @@ describe("ConversationPanel", () => {
    },
  ]);

-  const renderConversationPanel = () => renderWithProviders(<RouterStub />);
+  const renderConversationPanel = (config?: QueryClientConfig) =>
+    renderWithProviders(<RouterStub />, {
+      preloadedState: {
+        metrics: {
+          cost: null,
+          max_budget_per_task: null,
+          usage: null,
+        },
+      },
+    });

  beforeAll(() => {
    vi.mock("react-router", async (importOriginal) => ({
@@ -287,7 +297,15 @@ describe("ConversationPanel", () => {
      },
    ]);

-    renderWithProviders(<MyRouterStub />);
+    renderWithProviders(<MyRouterStub />, {
+      preloadedState: {
+        metrics: {
+          cost: null,
+          max_budget_per_task: null,
+          usage: null,
+        },
+      },
+    });

    const toggleButton = screen.getByText("Toggle");

--- a/frontend/tests/components/features/conversation/server-status.test.tsx
+++ b/frontend/tests/components/features/conversation/server-status.test.tsx
@@ -6,11 +6,31 @@ import { ServerStatus } from "#/components/features/controls/server-status";
 import { ServerStatusContextMenu } from "#/components/features/controls/server-status-context-menu";
 import { ConversationStatus } from "#/types/conversation-status";
 import { AgentState } from "#/types/agent-state";
-import { useAgentStore } from "#/stores/agent-store";

-// Mock the agent store
-vi.mock("#/stores/agent-store", () => ({
-  useAgentStore: vi.fn(),
+// Mock the conversation slice actions
+vi.mock("#/state/conversation-slice", () => ({
+  setShouldStopConversation: vi.fn(),
+  setShouldStartConversation: vi.fn(),
+  default: {
+    name: "conversation",
+    initialState: {
+      isRightPanelShown: true,
+      shouldStopConversation: false,
+      shouldStartConversation: false,
+    },
+    reducers: {},
+  },
+}));
+
+// Mock react-redux
+vi.mock("react-redux", () => ({
+  useSelector: vi.fn((selector) => {
+    // Mock the selector to return different agent states based on test needs
+    return {
+      curAgentState: AgentState.RUNNING,
+    };
+  }),
+  Provider: ({ children }: { children: React.ReactNode }) => children,
 }));

 // Mock the custom hooks
@@ -66,23 +86,11 @@ vi.mock("react-i18next", async () => {
 });

 describe("ServerStatus", () => {
-  // Helper function to mock agent store with specific state
-  const mockAgentStore = (agentState: AgentState) => {
-    vi.mocked(useAgentStore).mockReturnValue({
-      curAgentState: agentState,
-      setCurrentAgentState: vi.fn(),
-      reset: vi.fn(),
-    });
-  };
-
  afterEach(() => {
    vi.clearAllMocks();
  });

  it("should render server status with different conversation statuses", () => {
-    // Mock agent store to return RUNNING state
-    mockAgentStore(AgentState.RUNNING);
-
    // Test RUNNING status
    const { rerender } = renderWithProviders(
      <ServerStatus conversationStatus="RUNNING" />,
@@ -104,10 +112,6 @@ describe("ServerStatus", () => {

  it("should show context menu when clicked with RUNNING status", async () => {
    const user = userEvent.setup();
-
-    // Mock agent store to return RUNNING state
-    mockAgentStore(AgentState.RUNNING);
-
    renderWithProviders(<ServerStatus conversationStatus="RUNNING" />);

    const statusContainer = screen.getByText("Running").closest("div");
@@ -124,10 +128,6 @@ describe("ServerStatus", () => {

  it("should show context menu when clicked with STOPPED status", async () => {
    const user = userEvent.setup();
-
-    // Mock agent store to return STOPPED state
-    mockAgentStore(AgentState.STOPPED);
-
    renderWithProviders(<ServerStatus conversationStatus="STOPPED" />);

    const statusContainer = screen.getByText("Server Stopped").closest("div");
@@ -144,10 +144,6 @@ describe("ServerStatus", () => {

  it("should not show context menu when clicked with other statuses", async () => {
    const user = userEvent.setup();
-
-    // Mock agent store to return RUNNING state
-    mockAgentStore(AgentState.RUNNING);
-
    renderWithProviders(<ServerStatus conversationStatus="STARTING" />);

    const statusContainer = screen.getByText("Running").closest("div");
@@ -167,9 +163,6 @@ describe("ServerStatus", () => {
    // Clear previous calls
    mockStopConversationMutate.mockClear();

-    // Mock agent store to return RUNNING state
-    mockAgentStore(AgentState.RUNNING);
-
    renderWithProviders(<ServerStatus conversationStatus="RUNNING" />);

    const statusContainer = screen.getByText("Running").closest("div");
@@ -189,9 +182,6 @@ describe("ServerStatus", () => {
    // Clear previous calls
    mockStartConversationMutate.mockClear();

-    // Mock agent store to return STOPPED state
-    mockAgentStore(AgentState.STOPPED);
-
    renderWithProviders(<ServerStatus conversationStatus="STOPPED" />);

    const statusContainer = screen.getByText("Server Stopped").closest("div");
@@ -208,10 +198,6 @@ describe("ServerStatus", () => {

  it("should close context menu after stop server action", async () => {
    const user = userEvent.setup();
-
-    // Mock agent store to return RUNNING state
-    mockAgentStore(AgentState.RUNNING);
-
    renderWithProviders(<ServerStatus conversationStatus="RUNNING" />);

    const statusContainer = screen.getByText("Running").closest("div");
@@ -228,10 +214,6 @@ describe("ServerStatus", () => {

  it("should close context menu after start server action", async () => {
    const user = userEvent.setup();
-
-    // Mock agent store to return STOPPED state
-    mockAgentStore(AgentState.STOPPED);
-
    renderWithProviders(<ServerStatus conversationStatus="STOPPED" />);

    const statusContainer = screen.getByText("Server Stopped").closest("div");
@@ -247,9 +229,6 @@ describe("ServerStatus", () => {
  });

  it("should handle null conversation status", () => {
-    // Mock agent store to return RUNNING state
-    mockAgentStore(AgentState.RUNNING);
-
    renderWithProviders(<ServerStatus conversationStatus={null} />);

    const statusText = screen.getByText("Running");
--- a/frontend/tests/components/features/home/home-header.test.tsx
+++ b/frontend/tests/components/features/home/home-header.test.tsx
@@ -1,5 +1,7 @@
 import { QueryClientProvider, QueryClient } from "@tanstack/react-query";
 import { render, screen } from "@testing-library/react";
+import { Provider } from "react-redux";
+import { setupStore } from "test-utils";
 import { describe, expect, it, vi } from "vitest";
 import { HomeHeader } from "#/components/features/home/home-header/home-header";

@@ -24,9 +26,11 @@ vi.mock("react-i18next", async () => {
 const renderHomeHeader = () => {
  return render(<HomeHeader />, {
    wrapper: ({ children }) => (
-      <QueryClientProvider client={new QueryClient()}>
-        {children}
-      </QueryClientProvider>
+      <Provider store={setupStore()}>
+        <QueryClientProvider client={new QueryClient()}>
+          {children}
+        </QueryClientProvider>
+      </Provider>
    ),
  });
 };
--- a/frontend/tests/components/features/home/new-conversation.test.tsx
+++ b/frontend/tests/components/features/home/new-conversation.test.tsx
@@ -1,6 +1,8 @@
 import { QueryClientProvider, QueryClient } from "@tanstack/react-query";
 import { render, screen } from "@testing-library/react";
+import { Provider } from "react-redux";
 import { createRoutesStub } from "react-router";
+import { setupStore } from "test-utils";
 import { describe, expect, it, vi } from "vitest";
 import userEvent from "@testing-library/user-event";
 import ConversationService from "#/api/conversation-service/conversation-service.api";
@@ -41,9 +43,11 @@ const renderNewConversation = () => {

  return render(<RouterStub />, {
    wrapper: ({ children }) => (
-      <QueryClientProvider client={new QueryClient()}>
-        {children}
-      </QueryClientProvider>
+      <Provider store={setupStore()}>
+        <QueryClientProvider client={new QueryClient()}>
+          {children}
+        </QueryClientProvider>
+      </Provider>
    ),
  });
 };
--- a/frontend/tests/components/features/home/repo-connector.test.tsx
+++ b/frontend/tests/components/features/home/repo-connector.test.tsx
@@ -2,6 +2,8 @@ import { render, screen, waitFor, within } from "@testing-library/react";
 import { beforeEach, describe, expect, it, vi } from "vitest";
 import userEvent from "@testing-library/user-event";
 import { QueryClientProvider, QueryClient } from "@tanstack/react-query";
+import { setupStore } from "test-utils";
+import { Provider } from "react-redux";
 import { createRoutesStub, Outlet } from "react-router";
 import SettingsService from "#/settings-service/settings-service.api";
 import ConversationService from "#/api/conversation-service/conversation-service.api";
@@ -40,9 +42,11 @@ const renderRepoConnector = () => {

  return render(<RouterStub />, {
    wrapper: ({ children }) => (
-      <QueryClientProvider client={new QueryClient()}>
-        {children}
-      </QueryClientProvider>
+      <Provider store={setupStore()}>
+        <QueryClientProvider client={new QueryClient()}>
+          {children}
+        </QueryClientProvider>
+      </Provider>
    ),
  });
 };
--- a/frontend/tests/components/features/home/task-card.test.tsx
+++ b/frontend/tests/components/features/home/task-card.test.tsx
@@ -2,7 +2,9 @@ import { render, screen } from "@testing-library/react";
 import { beforeEach, describe, expect, it, vi } from "vitest";
 import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
 import userEvent from "@testing-library/user-event";
+import { Provider } from "react-redux";
 import { createRoutesStub } from "react-router";
+import { setupStore } from "test-utils";
 import ConversationService from "#/api/conversation-service/conversation-service.api";
 import UserService from "#/api/user-service/user-service.api";
 import GitService from "#/api/git-service/git-service.api";
@@ -39,9 +41,11 @@ const renderTaskCard = (task = MOCK_TASK_1) => {

  return render(<RouterStub />, {
    wrapper: ({ children }) => (
-      <QueryClientProvider client={new QueryClient()}>
-        {children}
-      </QueryClientProvider>
+      <Provider store={setupStore()}>
+        <QueryClientProvider client={new QueryClient()}>
+          {children}
+        </QueryClientProvider>
+      </Provider>
    ),
  });
 };
--- a/frontend/tests/components/features/home/task-suggestions.test.tsx
+++ b/frontend/tests/components/features/home/task-suggestions.test.tsx
@@ -1,7 +1,9 @@
 import { render, screen, waitFor } from "@testing-library/react";
 import { afterEach, describe, expect, it, vi } from "vitest";
 import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
+import { Provider } from "react-redux";
 import { createRoutesStub } from "react-router";
+import { setupStore } from "test-utils";
 import { TaskSuggestions } from "#/components/features/home/tasks/task-suggestions";
 import { SuggestionsService } from "#/api/suggestions-service/suggestions-service.api";
 import { MOCK_TASKS } from "#/mocks/task-suggestions-handlers";
@@ -60,9 +62,11 @@ const renderTaskSuggestions = () => {

  return render(<RouterStub />, {
    wrapper: ({ children }) => (
-      <QueryClientProvider client={new QueryClient()}>
-        {children}
-      </QueryClientProvider>
+      <Provider store={setupStore()}>
+        <QueryClientProvider client={new QueryClient()}>
+          {children}
+        </QueryClientProvider>
+      </Provider>
    ),
  });
 };
--- a/frontend/tests/components/features/microagent-management/microagent-management.test.tsx
+++ b/frontend/tests/components/features/microagent-management/microagent-management.test.tsx
@@ -12,7 +12,6 @@ import GitService from "#/api/git-service/git-service.api";
 import { GitRepository } from "#/types/git";
 import { RepositoryMicroagent } from "#/types/microagent-management";
 import { Conversation } from "#/api/open-hands.types";
-import { useMicroagentManagementStore } from "#/state/microagent-management-store";

 // Mock hooks
 const mockUseUserProviders = vi.fn();
@@ -56,47 +55,25 @@ describe("MicroagentManagement", () => {
  ]);

  const renderMicroagentManagement = (config?: QueryClientConfig) =>
-    renderWithProviders(<RouterStub />);
-
-  // Common test data
-  const testRepository = {
-    id: "1",
-    full_name: "user/test-repo",
-    git_provider: "github" as const,
-    is_public: true,
-    owner_type: "user" as const,
-    pushed_at: "2021-10-01T12:00:00Z",
-  };
-
-  // Helper function to render with custom Zustand store state
-  const renderWithCustomStore = (storeOverrides: Partial<any>) => {
-    useMicroagentManagementStore.setState(storeOverrides);
-    return renderWithProviders(<RouterStub />);
-  };
-
-  // Helper function to render with update modal visible
-  const renderWithUpdateModal = (additionalState: Partial<any> = {}) => {
-    return renderWithCustomStore({
-      updateMicroagentModalVisible: true,
-      selectedRepository: testRepository,
-      ...additionalState,
-    });
-  };
-
-  // Helper function to render with selected microagent
-  const renderWithSelectedMicroagent = (
-    microagent: any,
-    additionalState: Partial<any> = {},
-  ) => {
-    return renderWithCustomStore({
-      selectedRepository: testRepository,
-      selectedMicroagentItem: {
-        microagent,
-        conversation: null,
+    renderWithProviders(<RouterStub />, {
+      preloadedState: {
+        metrics: {
+          cost: null,
+          max_budget_per_task: null,
+          usage: null,
+        },
+        microagentManagement: {
+          addMicroagentModalVisible: false,
+          updateMicroagentModalVisible: false,
+          selectedRepository: null,
+          personalRepositories: [],
+          organizationRepositories: [],
+          repositories: [],
+          selectedMicroagentItem: null,
+          learnThisRepoModalVisible: false,
+        },
      },
-      ...additionalState,
    });
-  };

  beforeAll(() => {
    vi.mock("react-router", async (importOriginal) => ({
@@ -157,52 +134,8 @@ describe("MicroagentManagement", () => {
      owner_type: "organization",
      pushed_at: "2021-10-06T12:00:00Z",
    },
-    {
-      id: "7",
-      full_name: "user/gitlab-repo/openhands-config",
-      git_provider: "gitlab",
-      is_public: true,
-      owner_type: "user",
-      pushed_at: "2021-10-07T12:00:00Z",
-    },
-    {
-      id: "8",
-      full_name: "org/gitlab-org-repo/openhands-config",
-      git_provider: "gitlab",
-      is_public: true,
-      owner_type: "organization",
-      pushed_at: "2021-10-08T12:00:00Z",
-    },
  ];

-  // Helper function to filter repositories with OpenHands suffixes
-  const getRepositoriesWithOpenHandsSuffix = (
-    repositories: GitRepository[],
-  ) => {
-    return repositories.filter(
-      (repo) =>
-        repo.full_name.endsWith("/.openhands") ||
-        repo.full_name.endsWith("/openhands-config"),
-    );
-  };
-
-  // Helper functions for mocking search repositories
-  const mockSearchRepositoriesWithData = (data: GitRepository[]) => {
-    mockUseSearchRepositories.mockReturnValue({
-      data,
-      isLoading: false,
-      isError: false,
-    });
-  };
-
-  const mockSearchRepositoriesEmpty = () => {
-    mockUseSearchRepositories.mockReturnValue({
-      data: [],
-      isLoading: false,
-      isError: false,
-    });
-  };
-
  const mockMicroagents: RepositoryMicroagent[] = [
    {
      name: "test-microagent-1",
@@ -253,23 +186,6 @@ describe("MicroagentManagement", () => {
    vi.clearAllMocks();
    vi.restoreAllMocks();

-    // Reset Zustand store to default state
-    useMicroagentManagementStore.setState({
-      // Modal visibility states
-      addMicroagentModalVisible: false,
-      updateMicroagentModalVisible: false,
-      learnThisRepoModalVisible: false,
-
-      // Repository states
-      selectedRepository: null,
-      personalRepositories: [],
-      organizationRepositories: [],
-      repositories: [],
-
-      // Microagent states
-      selectedMicroagentItem: null,
-    });
-
    // Setup default hook mocks
    mockUseUserProviders.mockReturnValue({
      providers: ["github"],
@@ -309,11 +225,11 @@ describe("MicroagentManagement", () => {
      isError: false,
    });

-    // Mock the search repositories hook to return repositories with OpenHands suffixes
-    const mockSearchResults =
-      getRepositoriesWithOpenHandsSuffix(mockRepositories);
-
-    mockSearchRepositoriesWithData(mockSearchResults);
+    mockUseSearchRepositories.mockReturnValue({
+      data: [],
+      isLoading: false,
+      isError: false,
+    });

    // Setup default mock for retrieveUserGitRepositories
    vi.spyOn(GitService, "retrieveUserGitRepositories").mockResolvedValue({
@@ -638,9 +554,6 @@ describe("MicroagentManagement", () => {
      onLoadMore: vi.fn(),
    });

-    // Mock empty search results
-    mockSearchRepositoriesEmpty();
-
    renderMicroagentManagement();

    // Wait for repositories to be loaded
@@ -829,10 +742,6 @@ describe("MicroagentManagement", () => {

    it("should handle empty search results", async () => {
      const user = userEvent.setup();
-
-      // Mock empty search results for this test
-      mockSearchRepositoriesEmpty();
-
      renderMicroagentManagement();

      // Wait for repositories to be loaded
@@ -1438,10 +1347,33 @@ describe("MicroagentManagement", () => {
      });
    });

-    it("should render modal when Zustand state is set to visible", async () => {
-      // Render with modal already visible in Zustand state
-      renderWithCustomStore({
-        addMicroagentModalVisible: true,
+    it("should render modal when Redux state is set to visible", async () => {
+      // Render with modal already visible in Redux state
+      renderWithProviders(<RouterStub />, {
+        preloadedState: {
+          metrics: {
+            cost: null,
+            max_budget_per_task: null,
+            usage: null,
+          },
+          microagentManagement: {
+            selectedMicroagentItem: null,
+            addMicroagentModalVisible: true, // Start with modal visible
+            selectedRepository: {
+              id: "1",
+              full_name: "user/test-repo",
+              git_provider: "github",
+              is_public: true,
+              owner_type: "user",
+              pushed_at: "2021-10-01T12:00:00Z",
+            },
+            personalRepositories: [],
+            organizationRepositories: [],
+            repositories: [],
+            updateMicroagentModalVisible: false,
+            learnThisRepoModalVisible: false,
+          },
+        },
      });

      // Check that modal is rendered
@@ -1711,16 +1643,34 @@ describe("MicroagentManagement", () => {
      pr_number: null,
    };

-    const renderMicroagentManagementMain = (selectedMicroagentItem: any) => {
-      // Set the store with the selected microagent item and a repository
-      useMicroagentManagementStore.setState({
-        selectedMicroagentItem,
-        selectedRepository: testRepository,
+    const renderMicroagentManagementMain = (selectedMicroagentItem: any) =>
+      renderWithProviders(<MicroagentManagementMain />, {
+        preloadedState: {
+          metrics: {
+            cost: null,
+            max_budget_per_task: null,
+            usage: null,
+          },
+          microagentManagement: {
+            addMicroagentModalVisible: false,
+            selectedRepository: {
+              id: "1",
+              full_name: "user/test-repo",
+              git_provider: "github",
+              is_public: true,
+              owner_type: "user",
+              pushed_at: "2021-10-01T12:00:00Z",
+            },
+            personalRepositories: [],
+            organizationRepositories: [],
+            repositories: [],
+            selectedMicroagentItem,
+            updateMicroagentModalVisible: false,
+            learnThisRepoModalVisible: false,
+          },
+        },
      });

-      return renderWithProviders(<MicroagentManagementMain />);
-    };
-
    it("should render MicroagentManagementDefault when no microagent or conversation is selected", async () => {
      renderMicroagentManagementMain(null);

@@ -2045,8 +1995,36 @@ describe("MicroagentManagement", () => {
    });

    it("should render update microagent modal when updateMicroagentModalVisible is true", async () => {
-      // Render with update modal visible in Zustand state
-      renderWithUpdateModal();
+      // Render with update modal visible in Redux state
+      renderWithProviders(<RouterStub />, {
+        preloadedState: {
+          metrics: {
+            cost: null,
+            max_budget_per_task: null,
+            usage: null,
+          },
+          microagentManagement: {
+            selectedMicroagentItem: {
+              microagent: mockMicroagentForUpdate,
+              conversation: undefined,
+            },
+            addMicroagentModalVisible: false,
+            updateMicroagentModalVisible: true, // Start with update modal visible
+            selectedRepository: {
+              id: "1",
+              full_name: "user/test-repo",
+              git_provider: "github",
+              is_public: true,
+              owner_type: "user",
+              pushed_at: "2021-10-01T12:00:00Z",
+            },
+            personalRepositories: [],
+            organizationRepositories: [],
+            repositories: [],
+            learnThisRepoModalVisible: false,
+          },
+        },
+      });

      // Check that update modal is rendered
      expect(screen.getByTestId("add-microagent-modal")).toBeInTheDocument();
@@ -2057,7 +2035,35 @@ describe("MicroagentManagement", () => {

    it("should display update microagent title when isUpdate is true", async () => {
      // Render with update modal visible and selected microagent
-      renderWithUpdateModal();
+      renderWithProviders(<RouterStub />, {
+        preloadedState: {
+          metrics: {
+            cost: null,
+            max_budget_per_task: null,
+            usage: null,
+          },
+          microagentManagement: {
+            selectedMicroagentItem: {
+              microagent: mockMicroagentForUpdate,
+              conversation: undefined,
+            },
+            addMicroagentModalVisible: false,
+            updateMicroagentModalVisible: true,
+            selectedRepository: {
+              id: "1",
+              full_name: "user/test-repo",
+              git_provider: "github",
+              is_public: true,
+              owner_type: "user",
+              pushed_at: "2021-10-01T12:00:00Z",
+            },
+            personalRepositories: [],
+            organizationRepositories: [],
+            repositories: [],
+            learnThisRepoModalVisible: false,
+          },
+        },
+      });

      // Check that the update title is displayed
      expect(
@@ -2067,10 +2073,33 @@ describe("MicroagentManagement", () => {

    it("should populate form fields with existing microagent data when updating", async () => {
      // Render with update modal visible and selected microagent
-      renderWithUpdateModal({
-        selectedMicroagentItem: {
-          microagent: mockMicroagentForUpdate,
-          conversation: null,
+      renderWithProviders(<RouterStub />, {
+        preloadedState: {
+          metrics: {
+            cost: null,
+            max_budget_per_task: null,
+            usage: null,
+          },
+          microagentManagement: {
+            selectedMicroagentItem: {
+              microagent: mockMicroagentForUpdate,
+              conversation: undefined,
+            },
+            addMicroagentModalVisible: false,
+            updateMicroagentModalVisible: true,
+            selectedRepository: {
+              id: "1",
+              full_name: "user/test-repo",
+              git_provider: "github",
+              is_public: true,
+              owner_type: "user",
+              pushed_at: "2021-10-01T12:00:00Z",
+            },
+            personalRepositories: [],
+            organizationRepositories: [],
+            repositories: [],
+            learnThisRepoModalVisible: false,
+          },
        },
      });

@@ -2087,7 +2116,35 @@ describe("MicroagentManagement", () => {
      const user = userEvent.setup();

      // Render with update modal visible and selected microagent
-      renderWithUpdateModal();
+      renderWithProviders(<RouterStub />, {
+        preloadedState: {
+          metrics: {
+            cost: null,
+            max_budget_per_task: null,
+            usage: null,
+          },
+          microagentManagement: {
+            selectedMicroagentItem: {
+              microagent: mockMicroagentForUpdate,
+              conversation: undefined,
+            },
+            addMicroagentModalVisible: false,
+            updateMicroagentModalVisible: true,
+            selectedRepository: {
+              id: "1",
+              full_name: "user/test-repo",
+              git_provider: "github",
+              is_public: true,
+              owner_type: "user",
+              pushed_at: "2021-10-01T12:00:00Z",
+            },
+            personalRepositories: [],
+            organizationRepositories: [],
+            repositories: [],
+            learnThisRepoModalVisible: false,
+          },
+        },
+      });

      // Wait for modal to be rendered
      await waitFor(() => {
@@ -2115,7 +2172,35 @@ describe("MicroagentManagement", () => {
      const user = userEvent.setup();

      // Render with update modal visible
-      renderWithUpdateModal();
+      renderWithProviders(<RouterStub />, {
+        preloadedState: {
+          metrics: {
+            cost: null,
+            max_budget_per_task: null,
+            usage: null,
+          },
+          microagentManagement: {
+            selectedMicroagentItem: {
+              microagent: mockMicroagentForUpdate,
+              conversation: undefined,
+            },
+            addMicroagentModalVisible: false,
+            updateMicroagentModalVisible: true,
+            selectedRepository: {
+              id: "1",
+              full_name: "user/test-repo",
+              git_provider: "github",
+              is_public: true,
+              owner_type: "user",
+              pushed_at: "2021-10-01T12:00:00Z",
+            },
+            personalRepositories: [],
+            organizationRepositories: [],
+            repositories: [],
+            learnThisRepoModalVisible: false,
+          },
+        },
+      });

      // Wait for modal to be rendered
      await waitFor(() => {
@@ -2138,7 +2223,35 @@ describe("MicroagentManagement", () => {
      const user = userEvent.setup();

      // Render with update modal visible
-      renderWithUpdateModal();
+      renderWithProviders(<RouterStub />, {
+        preloadedState: {
+          metrics: {
+            cost: null,
+            max_budget_per_task: null,
+            usage: null,
+          },
+          microagentManagement: {
+            selectedMicroagentItem: {
+              microagent: mockMicroagentForUpdate,
+              conversation: undefined,
+            },
+            addMicroagentModalVisible: false,
+            updateMicroagentModalVisible: true,
+            selectedRepository: {
+              id: "1",
+              full_name: "user/test-repo",
+              git_provider: "github",
+              is_public: true,
+              owner_type: "user",
+              pushed_at: "2021-10-01T12:00:00Z",
+            },
+            personalRepositories: [],
+            organizationRepositories: [],
+            repositories: [],
+            learnThisRepoModalVisible: false,
+          },
+        },
+      });

      // Wait for modal to be rendered
      await waitFor(() => {
@@ -2164,7 +2277,32 @@ describe("MicroagentManagement", () => {

    it("should handle update modal with empty microagent data", async () => {
      // Render with update modal visible but no microagent data
-      renderWithUpdateModal();
+      renderWithProviders(<RouterStub />, {
+        preloadedState: {
+          metrics: {
+            cost: null,
+            max_budget_per_task: null,
+            usage: null,
+          },
+          microagentManagement: {
+            selectedMicroagentItem: null,
+            addMicroagentModalVisible: false,
+            updateMicroagentModalVisible: true,
+            selectedRepository: {
+              id: "1",
+              full_name: "user/test-repo",
+              git_provider: "github",
+              is_public: true,
+              owner_type: "user",
+              pushed_at: "2021-10-01T12:00:00Z",
+            },
+            personalRepositories: [],
+            organizationRepositories: [],
+            repositories: [],
+            learnThisRepoModalVisible: false,
+          },
+        },
+      });

      // Check that update modal is still rendered
      expect(screen.getByTestId("add-microagent-modal")).toBeInTheDocument();
@@ -2185,7 +2323,35 @@ describe("MicroagentManagement", () => {
      });

      // Render with update modal visible and microagent
-      renderWithUpdateModal();
+      renderWithProviders(<RouterStub />, {
+        preloadedState: {
+          metrics: {
+            cost: null,
+            max_budget_per_task: null,
+            usage: null,
+          },
+          microagentManagement: {
+            selectedMicroagentItem: {
+              microagent: mockMicroagentForUpdate,
+              conversation: undefined,
+            },
+            addMicroagentModalVisible: false,
+            updateMicroagentModalVisible: true,
+            selectedRepository: {
+              id: "1",
+              full_name: "user/test-repo",
+              git_provider: "github",
+              is_public: true,
+              owner_type: "user",
+              pushed_at: "2021-10-01T12:00:00Z",
+            },
+            personalRepositories: [],
+            organizationRepositories: [],
+            repositories: [],
+            learnThisRepoModalVisible: false,
+          },
+        },
+      });

      // Wait for the content to be loaded and check that the form field is empty
      await waitFor(() => {
@@ -2206,7 +2372,35 @@ describe("MicroagentManagement", () => {
      });

      // Render with update modal visible and microagent
-      renderWithUpdateModal();
+      renderWithProviders(<RouterStub />, {
+        preloadedState: {
+          metrics: {
+            cost: null,
+            max_budget_per_task: null,
+            usage: null,
+          },
+          microagentManagement: {
+            selectedMicroagentItem: {
+              microagent: mockMicroagentForUpdate,
+              conversation: undefined,
+            },
+            addMicroagentModalVisible: false,
+            updateMicroagentModalVisible: true,
+            selectedRepository: {
+              id: "1",
+              full_name: "user/test-repo",
+              git_provider: "github",
+              is_public: true,
+              owner_type: "user",
+              pushed_at: "2021-10-01T12:00:00Z",
+            },
+            personalRepositories: [],
+            organizationRepositories: [],
+            repositories: [],
+            learnThisRepoModalVisible: false,
+          },
+        },
+      });

      // Check that the modal is rendered correctly
      expect(screen.getByTestId("add-microagent-modal")).toBeInTheDocument();
@@ -2365,7 +2559,35 @@ describe("MicroagentManagement", () => {

    it("should render learn something new button in microagent view", async () => {
      // Render with selected microagent
-      renderWithSelectedMicroagent(mockMicroagentForLearn);
+      renderWithProviders(<RouterStub />, {
+        preloadedState: {
+          metrics: {
+            cost: null,
+            max_budget_per_task: null,
+            usage: null,
+          },
+          microagentManagement: {
+            selectedMicroagentItem: {
+              microagent: mockMicroagentForLearn,
+              conversation: undefined,
+            },
+            addMicroagentModalVisible: false,
+            updateMicroagentModalVisible: false,
+            selectedRepository: {
+              id: "1",
+              full_name: "user/test-repo",
+              git_provider: "github",
+              is_public: true,
+              owner_type: "user",
+              pushed_at: "2021-10-01T12:00:00Z",
+            },
+            personalRepositories: [],
+            organizationRepositories: [],
+            repositories: [],
+            learnThisRepoModalVisible: false,
+          },
+        },
+      });

      // Check that the learn something new button is displayed
      expect(
@@ -2377,7 +2599,35 @@ describe("MicroagentManagement", () => {
      const user = userEvent.setup();

      // Render with selected microagent
-      renderWithSelectedMicroagent(mockMicroagentForLearn);
+      renderWithProviders(<RouterStub />, {
+        preloadedState: {
+          metrics: {
+            cost: null,
+            max_budget_per_task: null,
+            usage: null,
+          },
+          microagentManagement: {
+            selectedMicroagentItem: {
+              microagent: mockMicroagentForLearn,
+              conversation: undefined,
+            },
+            addMicroagentModalVisible: false,
+            updateMicroagentModalVisible: false,
+            selectedRepository: {
+              id: "1",
+              full_name: "user/test-repo",
+              git_provider: "github",
+              is_public: true,
+              owner_type: "user",
+              pushed_at: "2021-10-01T12:00:00Z",
+            },
+            personalRepositories: [],
+            organizationRepositories: [],
+            repositories: [],
+            learnThisRepoModalVisible: false,
+          },
+        },
+      });

      // Find and click the learn something new button
      const learnButton = screen.getByText("COMMON$LEARN_SOMETHING_NEW");
@@ -2406,7 +2656,35 @@ describe("MicroagentManagement", () => {
      });

      // Render with selected microagent
-      renderWithSelectedMicroagent(mockMicroagentForLearn);
+      renderWithProviders(<RouterStub />, {
+        preloadedState: {
+          metrics: {
+            cost: null,
+            max_budget_per_task: null,
+            usage: null,
+          },
+          microagentManagement: {
+            selectedMicroagentItem: {
+              microagent: mockMicroagentForLearn,
+              conversation: undefined,
+            },
+            addMicroagentModalVisible: false,
+            updateMicroagentModalVisible: false,
+            selectedRepository: {
+              id: "1",
+              full_name: "user/test-repo",
+              git_provider: "github",
+              is_public: true,
+              owner_type: "user",
+              pushed_at: "2021-10-01T12:00:00Z",
+            },
+            personalRepositories: [],
+            organizationRepositories: [],
+            repositories: [],
+            learnThisRepoModalVisible: false,
+          },
+        },
+      });

      // Find and click the learn something new button
      const learnButton = screen.getByText("COMMON$LEARN_SOMETHING_NEW");
@@ -2438,7 +2716,35 @@ describe("MicroagentManagement", () => {
      });

      // Render with selected microagent
-      renderWithSelectedMicroagent(mockMicroagentForLearn);
+      renderWithProviders(<RouterStub />, {
+        preloadedState: {
+          metrics: {
+            cost: null,
+            max_budget_per_task: null,
+            usage: null,
+          },
+          microagentManagement: {
+            selectedMicroagentItem: {
+              microagent: mockMicroagentForLearn,
+              conversation: undefined,
+            },
+            addMicroagentModalVisible: false,
+            updateMicroagentModalVisible: false,
+            selectedRepository: {
+              id: "1",
+              full_name: "user/test-repo",
+              git_provider: "github",
+              is_public: true,
+              owner_type: "user",
+              pushed_at: "2021-10-01T12:00:00Z",
+            },
+            personalRepositories: [],
+            organizationRepositories: [],
+            repositories: [],
+            learnThisRepoModalVisible: false,
+          },
+        },
+      });

      // Find and click the learn something new button
      const learnButton = screen.getByText("COMMON$LEARN_SOMETHING_NEW");
@@ -2468,7 +2774,35 @@ describe("MicroagentManagement", () => {
      });

      // Render with selected microagent
-      renderWithSelectedMicroagent(mockMicroagentForLearn);
+      renderWithProviders(<RouterStub />, {
+        preloadedState: {
+          metrics: {
+            cost: null,
+            max_budget_per_task: null,
+            usage: null,
+          },
+          microagentManagement: {
+            selectedMicroagentItem: {
+              microagent: mockMicroagentForLearn,
+              conversation: undefined,
+            },
+            addMicroagentModalVisible: false,
+            updateMicroagentModalVisible: false,
+            selectedRepository: {
+              id: "1",
+              full_name: "user/test-repo",
+              git_provider: "github",
+              is_public: true,
+              owner_type: "user",
+              pushed_at: "2021-10-01T12:00:00Z",
+            },
+            personalRepositories: [],
+            organizationRepositories: [],
+            repositories: [],
+            learnThisRepoModalVisible: false,
+          },
+        },
+      });

      // Find and click the learn something new button
      const learnButton = screen.getByText("COMMON$LEARN_SOMETHING_NEW");
--- a/frontend/tests/components/interactive-chat-box.test.tsx
+++ b/frontend/tests/components/interactive-chat-box.test.tsx
@@ -5,18 +5,6 @@ import { MemoryRouter } from "react-router";
 import { InteractiveChatBox } from "#/components/features/chat/interactive-chat-box";
 import { renderWithProviders } from "../../test-utils";
 import { AgentState } from "#/types/agent-state";
-import { useAgentStore } from "#/stores/agent-store";
-import { useConversationStore } from "#/state/conversation-store";
-
-// Mock the agent store
-vi.mock("#/stores/agent-store", () => ({
-  useAgentStore: vi.fn(),
-}));
-
-// Mock the conversation store
-vi.mock("#/state/conversation-store", () => ({
-  useConversationStore: vi.fn(),
-}));

 // Mock React Router hooks
 vi.mock("react-router", async () => {
@@ -59,49 +47,6 @@ describe("InteractiveChatBox", () => {
  const onSubmitMock = vi.fn();
  const onStopMock = vi.fn();

-  // Helper function to mock stores
-  const mockStores = (agentState: AgentState = AgentState.INIT) => {
-    vi.mocked(useAgentStore).mockReturnValue({
-      curAgentState: agentState,
-      setCurrentAgentState: vi.fn(),
-      reset: vi.fn(),
-    });
-
-    vi.mocked(useConversationStore).mockReturnValue({
-      images: [],
-      files: [],
-      addImages: vi.fn(),
-      addFiles: vi.fn(),
-      clearAllFiles: vi.fn(),
-      addFileLoading: vi.fn(),
-      removeFileLoading: vi.fn(),
-      addImageLoading: vi.fn(),
-      removeImageLoading: vi.fn(),
-      submittedMessage: null,
-      setShouldHideSuggestions: vi.fn(),
-      setSubmittedMessage: vi.fn(),
-      isRightPanelShown: true,
-      selectedTab: "editor" as const,
-      loadingFiles: [],
-      loadingImages: [],
-      messageToSend: null,
-      shouldShownAgentLoading: false,
-      shouldHideSuggestions: false,
-      hasRightPanelToggled: true,
-      setIsRightPanelShown: vi.fn(),
-      setSelectedTab: vi.fn(),
-      setShouldShownAgentLoading: vi.fn(),
-      removeImage: vi.fn(),
-      removeFile: vi.fn(),
-      clearImages: vi.fn(),
-      clearFiles: vi.fn(),
-      clearAllLoading: vi.fn(),
-      setMessageToSend: vi.fn(),
-      resetConversationState: vi.fn(),
-      setHasRightPanelToggled: vi.fn(),
-    });
-  };
-
  // Helper function to render with Router context
  const renderInteractiveChatBox = (props: any, options: any = {}) => {
    return renderWithProviders(
@@ -123,12 +68,22 @@ describe("InteractiveChatBox", () => {
  });

  it("should render", () => {
-    mockStores(AgentState.INIT);
-
-    renderInteractiveChatBox({
-      onSubmit: onSubmitMock,
-      onStop: onStopMock,
-    });
+    renderInteractiveChatBox(
+      {
+        onSubmit: onSubmitMock,
+        onStop: onStopMock,
+        isWaitingForUserInput: false,
+        hasSubstantiveAgentActions: false,
+        optimisticUserMessage: false,
+      },
+      {
+        preloadedState: {
+          agent: {
+            curAgentState: AgentState.INIT,
+          },
+        },
+      },
+    );

    const chatBox = screen.getByTestId("interactive-chat-box");
    expect(chatBox).toBeInTheDocument();
@@ -136,12 +91,33 @@ describe("InteractiveChatBox", () => {

  it("should set custom values", async () => {
    const user = userEvent.setup();
-    mockStores(AgentState.AWAITING_USER_INPUT);
-
-    renderInteractiveChatBox({
-      onSubmit: onSubmitMock,
-      onStop: onStopMock,
-    });
+    renderInteractiveChatBox(
+      {
+        onSubmit: onSubmitMock,
+        onStop: onStopMock,
+        isWaitingForUserInput: true,
+        hasSubstantiveAgentActions: true,
+        optimisticUserMessage: false,
+      },
+      {
+        preloadedState: {
+          agent: {
+            curAgentState: AgentState.AWAITING_USER_INPUT,
+          },
+          conversation: {
+            isRightPanelShown: true,
+            shouldStopConversation: false,
+            shouldStartConversation: false,
+            images: [],
+            files: [],
+            loadingFiles: [],
+            loadingImages: [],
+            messageToSend: null,
+            shouldShownAgentLoading: false,
+          },
+        },
+      },
+    );

    const textbox = screen.getByTestId("chat-input");

@@ -153,12 +129,22 @@ describe("InteractiveChatBox", () => {

  it("should display the image previews when images are uploaded", async () => {
    const user = userEvent.setup();
-    mockStores(AgentState.INIT);
-
-    renderInteractiveChatBox({
-      onSubmit: onSubmitMock,
-      onStop: onStopMock,
-    });
+    renderInteractiveChatBox(
+      {
+        onSubmit: onSubmitMock,
+        onStop: onStopMock,
+        isWaitingForUserInput: false,
+        hasSubstantiveAgentActions: false,
+        optimisticUserMessage: false,
+      },
+      {
+        preloadedState: {
+          agent: {
+            curAgentState: AgentState.INIT,
+          },
+        },
+      },
+    );

    // Create a larger file to ensure it passes validation
    const fileContent = new Array(1024).fill("a").join(""); // 1KB file
@@ -180,12 +166,22 @@ describe("InteractiveChatBox", () => {

  it("should remove the image preview when the close button is clicked", async () => {
    const user = userEvent.setup();
-    mockStores(AgentState.INIT);
-
-    renderInteractiveChatBox({
-      onSubmit: onSubmitMock,
-      onStop: onStopMock,
-    });
+    renderInteractiveChatBox(
+      {
+        onSubmit: onSubmitMock,
+        onStop: onStopMock,
+        isWaitingForUserInput: false,
+        hasSubstantiveAgentActions: false,
+        optimisticUserMessage: false,
+      },
+      {
+        preloadedState: {
+          agent: {
+            curAgentState: AgentState.INIT,
+          },
+        },
+      },
+    );

    const fileContent = new Array(1024).fill("a").join(""); // 1KB file
    const file = new File([fileContent], "chucknorris.png", {
@@ -205,12 +201,22 @@ describe("InteractiveChatBox", () => {

  it("should call onSubmit with the message and images", async () => {
    const user = userEvent.setup();
-    mockStores(AgentState.INIT);
-
-    renderInteractiveChatBox({
-      onSubmit: onSubmitMock,
-      onStop: onStopMock,
-    });
+    renderInteractiveChatBox(
+      {
+        onSubmit: onSubmitMock,
+        onStop: onStopMock,
+        isWaitingForUserInput: false,
+        hasSubstantiveAgentActions: false,
+        optimisticUserMessage: false,
+      },
+      {
+        preloadedState: {
+          agent: {
+            curAgentState: AgentState.INIT,
+          },
+        },
+      },
+    );

    const textarea = screen.getByTestId("chat-input");

@@ -236,12 +242,22 @@ describe("InteractiveChatBox", () => {

  it("should disable the submit button when agent is loading", async () => {
    const user = userEvent.setup();
-    mockStores(AgentState.LOADING);
-
-    renderInteractiveChatBox({
-      onSubmit: onSubmitMock,
-      onStop: onStopMock,
-    });
+    renderInteractiveChatBox(
+      {
+        onSubmit: onSubmitMock,
+        onStop: onStopMock,
+        isWaitingForUserInput: false,
+        hasSubstantiveAgentActions: false,
+        optimisticUserMessage: false,
+      },
+      {
+        preloadedState: {
+          agent: {
+            curAgentState: AgentState.LOADING,
+          },
+        },
+      },
+    );

    const button = screen.getByTestId("submit-button");
    expect(button).toBeDisabled();
@@ -252,14 +268,23 @@ describe("InteractiveChatBox", () => {

  it("should display the stop button when agent is running and call onStop when clicked", async () => {
    const user = userEvent.setup();
-    mockStores(AgentState.RUNNING);
+    renderInteractiveChatBox(
+      {
+        onSubmit: onSubmitMock,
+        onStop: onStopMock,
+        isWaitingForUserInput: false,
+        hasSubstantiveAgentActions: true,
+        optimisticUserMessage: false,
+      },
+      {
+        preloadedState: {
+          agent: {
+            curAgentState: AgentState.RUNNING,
+          },
+        },
+      },
+    );

-    renderInteractiveChatBox({
-      onSubmit: onSubmitMock,
-      onStop: onStopMock,
-    });
-
-    // The stop button should be available when agent is running
    const stopButton = screen.getByTestId("stop-button");
    expect(stopButton).toBeInTheDocument();

@@ -272,12 +297,33 @@ describe("InteractiveChatBox", () => {
    const onSubmit = vi.fn();
    const onStop = vi.fn();

-    mockStores(AgentState.AWAITING_USER_INPUT);
-
-    const { rerender } = renderInteractiveChatBox({
-      onSubmit: onSubmit,
-      onStop: onStop,
-    });
+    const { rerender } = renderInteractiveChatBox(
+      {
+        onSubmit: onSubmit,
+        onStop: onStop,
+        isWaitingForUserInput: true,
+        hasSubstantiveAgentActions: true,
+        optimisticUserMessage: false,
+      },
+      {
+        preloadedState: {
+          agent: {
+            curAgentState: AgentState.AWAITING_USER_INPUT,
+          },
+          conversation: {
+            isRightPanelShown: true,
+            shouldStopConversation: false,
+            shouldStartConversation: false,
+            images: [],
+            files: [],
+            loadingFiles: [],
+            loadingImages: [],
+            messageToSend: null,
+            shouldShownAgentLoading: false,
+          },
+        },
+      },
+    );

    // Verify text input has the initial value
    const textarea = screen.getByTestId("chat-input");
@@ -296,7 +342,13 @@ describe("InteractiveChatBox", () => {
    // Simulate parent component updating the value prop
    rerender(
      <MemoryRouter>
-        <InteractiveChatBox onSubmit={onSubmit} onStop={onStop} />
+        <InteractiveChatBox
+          onSubmit={onSubmit}
+          onStop={onStop}
+          isWaitingForUserInput={true}
+          hasSubstantiveAgentActions={true}
+          optimisticUserMessage={false}
+        />
      </MemoryRouter>,
    );

--- a/frontend/tests/components/jupyter/jupyter.test.tsx
+++ b/frontend/tests/components/jupyter/jupyter.test.tsx
@@ -1,46 +1,42 @@
 import { render, screen } from "@testing-library/react";
+import { Provider } from "react-redux";
+import { configureStore } from "@reduxjs/toolkit";
 import { JupyterEditor } from "#/components/features/jupyter/jupyter";
-import { vi, describe, it, expect, beforeEach } from "vitest";
-import { AgentState } from "#/types/agent-state";
-import { useAgentStore } from "#/stores/agent-store";
-import { useJupyterStore } from "#/state/jupyter-store";
-
-// Mock the agent store
-vi.mock("#/stores/agent-store", () => ({
-  useAgentStore: vi.fn(),
-}));
-
-// Mock react-i18next
-vi.mock("react-i18next", () => ({
-  useTranslation: () => ({
-    t: (key: string) => key,
-  }),
-}));
+import { jupyterReducer } from "#/state/jupyter-slice";
+import { vi, describe, it, expect } from "vitest";

 describe("JupyterEditor", () => {
-  beforeEach(() => {
-    // Reset the Zustand store before each test
-    useJupyterStore.setState({
-      cells: Array(20).fill({
-        content: "Test cell content",
-        type: "input",
-        imageUrls: undefined,
-      }),
-    });
+  const mockStore = configureStore({
+    reducer: {
+      fileState: () => ({}),
+      initalQuery: () => ({}),
+      browser: () => ({}),
+      chat: () => ({}),
+      code: () => ({}),
+      cmd: () => ({}),
+      agent: () => ({}),
+      jupyter: jupyterReducer,
+      securityAnalyzer: () => ({}),
+      status: () => ({}),
+    },
+    preloadedState: {
+      jupyter: {
+        cells: Array(20).fill({
+          content: "Test cell content",
+          type: "input",
+          output: "Test output",
+        }),
+      },
+    },
  });

  it("should have a scrollable container", () => {
-    // Mock agent store to return RUNNING state (not in RUNTIME_INACTIVE_STATES)
-    vi.mocked(useAgentStore).mockReturnValue({
-      curAgentState: AgentState.RUNNING,
-      setCurrentAgentState: vi.fn(),
-      reset: vi.fn(),
-    });
-
    render(
-      <div style={{ height: "100vh" }}>
-        <JupyterEditor maxWidth={800} />
-      </div>,
+      <Provider store={mockStore}>
+        <div style={{ height: "100vh" }}>
+          <JupyterEditor maxWidth={800} />
+        </div>
+      </Provider>
    );

    const container = screen.getByTestId("jupyter-container");
--- a/frontend/tests/components/modals/microagents/microagent-modal.test.tsx
+++ b/frontend/tests/components/modals/microagents/microagent-modal.test.tsx
@@ -5,17 +5,19 @@ import { renderWithProviders } from "test-utils";
 import { MicroagentsModal } from "#/components/features/conversation-panel/microagents-modal";
 import ConversationService from "#/api/conversation-service/conversation-service.api";
 import { AgentState } from "#/types/agent-state";
-import { useAgentStore } from "#/stores/agent-store";

-// Mock the agent store
-vi.mock("#/stores/agent-store", () => ({
-  useAgentStore: vi.fn(),
-}));
-
-// Mock the conversation ID hook
-vi.mock("#/hooks/use-conversation-id", () => ({
-  useConversationId: () => ({ conversationId: "test-conversation-id" }),
-}));
+vi.mock("react-redux", async () => {
+  const actual = await vi.importActual("react-redux");
+  return {
+    ...actual,
+    useDispatch: () => vi.fn(),
+    useSelector: () => ({
+      agent: {
+        curAgentState: AgentState.AWAITING_USER_INPUT,
+      },
+    }),
+  };
+});

 describe("MicroagentsModal - Refresh Button", () => {
  const mockOnClose = vi.fn();
@@ -45,17 +47,10 @@ describe("MicroagentsModal - Refresh Button", () => {
    // Reset all mocks before each test
    vi.clearAllMocks();

-    // Setup default mock for getMicroagents
+    // Set up the default mock for ConversationService.getMicroagents
    vi.spyOn(ConversationService, "getMicroagents").mockResolvedValue({
      microagents: mockMicroagents,
    });
-
-    // Mock the agent store to return a ready state
-    vi.mocked(useAgentStore).mockReturnValue({
-      curAgentState: AgentState.AWAITING_USER_INPUT,
-      setCurrentAgentState: vi.fn(),
-      reset: vi.fn(),
-    });
  });

  afterEach(() => {
@@ -63,11 +58,10 @@ describe("MicroagentsModal - Refresh Button", () => {
  });

  describe("Refresh Button Rendering", () => {
-    it("should render the refresh button with correct text and test ID", async () => {
+    it("should render the refresh button with correct text and test ID", () => {
      renderWithProviders(<MicroagentsModal {...defaultProps} />);

-      // Wait for the component to load and render the refresh button
-      const refreshButton = await screen.findByTestId("refresh-microagents");
+      const refreshButton = screen.getByTestId("refresh-microagents");
      expect(refreshButton).toBeInTheDocument();
      expect(refreshButton).toHaveTextContent("BUTTON$REFRESH");
    });
@@ -81,8 +75,7 @@ describe("MicroagentsModal - Refresh Button", () => {

      const refreshSpy = vi.spyOn(ConversationService, "getMicroagents");

-      // Wait for the component to load and render the refresh button
-      const refreshButton = await screen.findByTestId("refresh-microagents");
+      const refreshButton = screen.getByTestId("refresh-microagents");
      await user.click(refreshButton);

      expect(refreshSpy).toHaveBeenCalledTimes(1);
--- a/frontend/tests/components/terminal/terminal.test.tsx
+++ b/frontend/tests/components/terminal/terminal.test.tsx
@@ -1,14 +1,17 @@
 import { act, screen } from "@testing-library/react";
 import { renderWithProviders } from "test-utils";
 import { vi, describe, afterEach, it, expect } from "vitest";
-import { Command, useCommandStore } from "#/state/command-store";
+import { Command, appendInput, appendOutput } from "#/state/command-slice";
 import Terminal from "#/components/features/terminal/terminal";

-const renderTerminal = (commands: Command[] = []) => {
-  // Set initial commands in Zustand store
-  useCommandStore.setState({ commands });
-  return renderWithProviders(<Terminal />);
-};
+const renderTerminal = (commands: Command[] = []) =>
+  renderWithProviders(<Terminal />, {
+    preloadedState: {
+      cmd: {
+        commands,
+      },
+    },
+  });

 describe.skip("Terminal", () => {
  global.ResizeObserver = vi.fn().mockImplementation(() => ({
@@ -55,25 +58,25 @@ describe.skip("Terminal", () => {
  });

  it("should write commands to the terminal", () => {
-    renderTerminal();
+    const { store } = renderTerminal();

    act(() => {
-      useCommandStore.getState().appendInput("echo Hello");
-      useCommandStore.getState().appendOutput("Hello");
+      store.dispatch(appendInput("echo Hello"));
+      store.dispatch(appendOutput("Hello"));
    });

    expect(mockTerminal.writeln).toHaveBeenNthCalledWith(1, "echo Hello");
    expect(mockTerminal.writeln).toHaveBeenNthCalledWith(2, "Hello");

    act(() => {
-      useCommandStore.getState().appendInput("echo World");
+      store.dispatch(appendInput("echo World"));
    });

    expect(mockTerminal.writeln).toHaveBeenNthCalledWith(3, "echo World");
  });

  it("should load and write commands to the terminal", () => {
-    renderTerminal([
+    const { store } = renderTerminal([
      { type: "input", content: "echo Hello" },
      { type: "output", content: "Hello" },
    ]);
@@ -82,17 +85,17 @@ describe.skip("Terminal", () => {
    expect(mockTerminal.writeln).toHaveBeenNthCalledWith(2, "Hello");

    act(() => {
-      useCommandStore.getState().appendInput("echo Hello");
+      store.dispatch(appendInput("echo Hello"));
    });

    expect(mockTerminal.writeln).toHaveBeenNthCalledWith(3, "echo Hello");
  });

  it("should end the line with a dollar sign after writing a command", () => {
-    renderTerminal();
+    const { store } = renderTerminal();

    act(() => {
-      useCommandStore.getState().appendInput("echo Hello");
+      store.dispatch(appendInput("echo Hello"));
    });

    expect(mockTerminal.writeln).toHaveBeenCalledWith("echo Hello");
--- a/frontend/tests/hooks/use-terminal.test.tsx
+++ b/frontend/tests/hooks/use-terminal.test.tsx
@@ -1,10 +1,9 @@
 import { beforeAll, describe, expect, it, vi } from "vitest";
 import { afterEach } from "node:test";
 import { useTerminal } from "#/hooks/use-terminal";
-import { Command, useCommandStore } from "#/state/command-store";
+import { Command } from "#/state/command-slice";
 import { AgentState } from "#/types/agent-state";
 import { renderWithProviders } from "../../test-utils";
-import { useAgentStore } from "#/stores/agent-store";

 // Mock the WsClient context
 vi.mock("#/context/ws-client-provider", () => ({
@@ -20,12 +19,10 @@ interface TestTerminalComponentProps {
  commands: Command[];
 }

-function TestTerminalComponent({ commands }: TestTerminalComponentProps) {
-  // Set commands in Zustand store
-  useCommandStore.setState({ commands });
-  // Set agent state in Zustand store
-  useAgentStore.setState({ curAgentState: AgentState.RUNNING });
-  const ref = useTerminal();
+function TestTerminalComponent({
+  commands,
+}: TestTerminalComponentProps) {
+  const ref = useTerminal({ commands });
  return <div ref={ref} />;
 }

@@ -60,7 +57,12 @@ describe("useTerminal", () => {
  });

  it("should render", () => {
-    renderWithProviders(<TestTerminalComponent commands={[]} />);
+    renderWithProviders(<TestTerminalComponent commands={[]} />, {
+      preloadedState: {
+        agent: { curAgentState: AgentState.RUNNING },
+        cmd: { commands: [] },
+      },
+    });
  });

  it("should render the commands in the terminal", () => {
@@ -69,7 +71,12 @@ describe("useTerminal", () => {
      { content: "hello", type: "output" },
    ];

-    renderWithProviders(<TestTerminalComponent commands={commands} />);
+    renderWithProviders(<TestTerminalComponent commands={commands} />, {
+      preloadedState: {
+        agent: { curAgentState: AgentState.RUNNING },
+        cmd: { commands },
+      },
+    });

    expect(mockTerminal.writeln).toHaveBeenNthCalledWith(1, "echo hello");
    expect(mockTerminal.writeln).toHaveBeenNthCalledWith(2, "hello");
@@ -87,7 +94,17 @@ describe("useTerminal", () => {
      { content: secret, type: "output" },
    ];

-    renderWithProviders(<TestTerminalComponent commands={commands} />);
+    renderWithProviders(
+      <TestTerminalComponent
+        commands={commands}
+      />,
+      {
+        preloadedState: {
+          agent: { curAgentState: AgentState.RUNNING },
+          cmd: { commands },
+        },
+      },
+    );

    // This test is no longer relevant as secrets filtering has been removed
  });
--- a/frontend/tests/initial-query.test.tsx
+++ b/frontend/tests/initial-query.test.tsx
@@ -1,24 +1,20 @@
-import { describe, it, expect, beforeEach } from "vitest";
-import { useInitialQueryStore } from "../src/stores/initial-query-store";
+import { describe, it, expect } from "vitest";
+import store from "../src/store";
+import {
+  setInitialPrompt,
+  clearInitialPrompt,
+} from "../src/state/initial-query-slice";

 describe("Initial Query Behavior", () => {
-  beforeEach(() => {
-    // Reset the store before each test
-    useInitialQueryStore.getState().reset();
-  });
-
-  it("should clear initial query when clearInitialPrompt is called", () => {
-    const { setInitialPrompt, clearInitialPrompt, initialPrompt } =
-      useInitialQueryStore.getState();
-
+  it("should clear initial query when clearInitialPrompt is dispatched", () => {
    // Set up initial query in the store
-    setInitialPrompt("test query");
-    expect(useInitialQueryStore.getState().initialPrompt).toBe("test query");
+    store.dispatch(setInitialPrompt("test query"));
+    expect(store.getState().initialQuery.initialPrompt).toBe("test query");

    // Clear the initial query
-    clearInitialPrompt();
+    store.dispatch(clearInitialPrompt());

    // Verify initial query is cleared
-    expect(useInitialQueryStore.getState().initialPrompt).toBeNull();
+    expect(store.getState().initialQuery.initialPrompt).toBeNull();
  });
 });
--- a/frontend/tests/routes/home-screen.test.tsx
+++ b/frontend/tests/routes/home-screen.test.tsx
@@ -3,7 +3,8 @@ import { beforeEach, describe, expect, it, vi } from "vitest";
 import { QueryClientProvider, QueryClient } from "@tanstack/react-query";
 import userEvent from "@testing-library/user-event";
 import { createRoutesStub } from "react-router";
-import { createAxiosNotFoundErrorObject } from "test-utils";
+import { Provider } from "react-redux";
+import { createAxiosNotFoundErrorObject, setupStore } from "test-utils";
 import HomeScreen from "#/routes/home";
 import { GitRepository } from "#/types/git";
 import SettingsService from "#/settings-service/settings-service.api";
@@ -65,9 +66,11 @@ const selectRepository = async (repoName: string) => {
 const renderHomeScreen = () =>
  render(<RouterStub />, {
    wrapper: ({ children }) => (
-      <QueryClientProvider client={new QueryClient()}>
-        {children}
-      </QueryClientProvider>
+      <Provider store={setupStore()}>
+        <QueryClientProvider client={new QueryClient()}>
+          {children}
+        </QueryClientProvider>
+      </Provider>
    ),
  });

--- a/frontend/tests/routes/settings.test.tsx
+++ b/frontend/tests/routes/settings.test.tsx
@@ -133,7 +133,7 @@ describe("Settings Screen", () => {
      "user",
      "integrations",
      "application",
-      "billing", // The nav item shows "Billing" text and routes to /billing
+      "billing", // The nav item shows "billing" text and routes to /billing
      "secrets",
      "api keys",
    ];
--- a/frontend/tests/services/actions.test.tsx
+++ b/frontend/tests/services/actions.test.tsx
@@ -13,28 +13,12 @@ vi.mock("#/store", () => ({
  },
 }));

-vi.mock("#/state/command-store", () => ({
-  useCommandStore: {
-    getState: () => ({
-      appendInput: mockAppendInput,
-    }),
-  },
+vi.mock("#/state/command-slice", () => ({
+  appendInput: mockAppendInput,
 }));

-vi.mock("#/state/jupyter-store", () => ({
-  useJupyterStore: {
-    getState: () => ({
-      appendJupyterInput: mockAppendJupyterInput,
-    }),
-  },
-}));
-
-vi.mock("#/state/metrics-slice", () => ({
-  setMetrics: vi.fn(),
-}));
-
-vi.mock("#/state/security-analyzer-slice", () => ({
-  appendSecurityAnalyzerInput: vi.fn(),
+vi.mock("#/state/jupyter-slice", () => ({
+  appendJupyterInput: mockAppendJupyterInput,
 }));

 describe("handleActionMessage", () => {
@@ -61,8 +45,7 @@ describe("handleActionMessage", () => {
    handleActionMessage(runAction);

    // Check that appendInput was called with the command
-    expect(mockAppendInput).toHaveBeenCalledWith("ls -la");
-    expect(mockDispatch).not.toHaveBeenCalled();
+    expect(mockDispatch).toHaveBeenCalledWith(mockAppendInput("ls -la"));
    expect(mockAppendJupyterInput).not.toHaveBeenCalled();
  });

@@ -76,8 +59,7 @@ describe("handleActionMessage", () => {
      args: {
        code: "print('Hello from Jupyter!')",
      },
-      message:
-        "Running Python code interactively: print('Hello from Jupyter!')",
+      message: "Running Python code interactively: print('Hello from Jupyter!')",
      timestamp: "2023-01-01T00:00:00Z",
    };

@@ -85,9 +67,7 @@ describe("handleActionMessage", () => {
    handleActionMessage(ipythonAction);

    // Check that appendJupyterInput was called with the code
-    expect(mockAppendJupyterInput).toHaveBeenCalledWith(
-      "print('Hello from Jupyter!')",
-    );
+    expect(mockDispatch).toHaveBeenCalledWith(mockAppendJupyterInput("print('Hello from Jupyter!')"));
    expect(mockAppendInput).not.toHaveBeenCalled();
  });

@@ -109,9 +89,7 @@ describe("handleActionMessage", () => {
    // Handle the action
    handleActionMessage(hiddenAction);

-    // Check that nothing was dispatched or called
+    // Check that nothing was dispatched
    expect(mockDispatch).not.toHaveBeenCalled();
-    expect(mockAppendInput).not.toHaveBeenCalled();
-    expect(mockAppendJupyterInput).not.toHaveBeenCalled();
  });
 });
--- a/frontend/tests/utils/check-hardcoded-strings.test.tsx
+++ b/frontend/tests/utils/check-hardcoded-strings.test.tsx
@@ -60,7 +60,13 @@ describe("Check for hardcoded English strings", () => {
  test("InteractiveChatBox should not have hardcoded English strings", () => {
    const { container } = renderWithProviders(
      <MemoryRouter>
-        <InteractiveChatBox onSubmit={() => {}} onStop={() => {}} />
+        <InteractiveChatBox
+          onSubmit={() => {}}
+          onStop={() => {}}
+          isWaitingForUserInput={false}
+          hasSubstantiveAgentActions={false}
+          optimisticUserMessage={false}
+        />
      </MemoryRouter>,
    );

--- a/frontend/global.d.ts
+++ b/frontend/global.d.ts
@@ -1,18 +1,4 @@
 interface Window {
  __APP_MODE__?: "saas" | "oss";
  __GITHUB_CLIENT_ID__?: string | null;
-  Reo?: {
-    init: (config: { clientID: string }) => void;
-    identify: (identity: {
-      username: string;
-      type: "github" |"email";
-      other_identities?: Array<{
-        username: string;
-        type: "github" | "email";
-      }>;
-      firstname?: string;
-      lastname?: string;
-      company?: string;
-    }) => void;
-  };
 }
--- a/frontend/package-lock.json
+++ b/frontend/package-lock.json
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -1,51 +1,54 @@
 {
  "name": "openhands-frontend",
-  "version": "0.58.0",
+  "version": "0.56.0",
  "private": true,
  "type": "module",
  "engines": {
    "node": ">=22.0.0"
  },
  "dependencies": {
-    "@heroui/react": "^2.8.4",
+    "@heroui/react": "^2.8.3",
    "@heroui/use-infinite-scroll": "^2.2.11",
    "@microlink/react-json-view": "^1.26.2",
    "@monaco-editor/react": "^4.7.0-rc.0",
-    "@react-router/node": "^7.9.3",
-    "@react-router/serve": "^7.9.3",
+    "@react-router/node": "^7.8.2",
+    "@react-router/serve": "^7.8.2",
    "@react-types/shared": "^3.32.0",
-    "@stripe/react-stripe-js": "^4.0.2",
+    "@reduxjs/toolkit": "^2.9.0",
+    "@stripe/react-stripe-js": "^4.0.0",
    "@stripe/stripe-js": "^7.9.0",
    "@tailwindcss/postcss": "^4.1.13",
    "@tailwindcss/vite": "^4.1.13",
-    "@tanstack/react-query": "^5.90.2",
+    "@tanstack/react-query": "^5.87.0",
    "@uidotdev/usehooks": "^2.4.1",
-    "@vitejs/plugin-react": "^5.0.4",
+    "@vitejs/plugin-react": "^5.0.2",
    "@xterm/addon-fit": "^0.10.0",
    "@xterm/xterm": "^5.4.0",
-    "axios": "^1.12.2",
+    "axios": "^1.11.0",
    "class-variance-authority": "^0.7.1",
    "clsx": "^2.1.1",
    "date-fns": "^4.1.0",
    "downshift": "^9.0.10",
    "eslint-config-airbnb-typescript": "^18.0.0",
-    "framer-motion": "^12.23.22",
+    "framer-motion": "^12.23.12",
    "i18next": "^25.5.2",
    "i18next-browser-languagedetector": "^8.2.0",
    "i18next-http-backend": "^3.0.2",
-    "isbot": "^5.1.31",
+    "isbot": "^5.1.30",
    "jose": "^6.1.0",
-    "lucide-react": "^0.544.0",
-    "monaco-editor": "^0.53.0",
-    "posthog-js": "^1.268.8",
+    "lucide-react": "^0.542.0",
+    "monaco-editor": "^0.52.2",
+    "posthog-js": "^1.261.7",
    "react": "^19.1.1",
    "react-dom": "^19.1.1",
    "react-highlight": "^0.15.0",
    "react-hot-toast": "^2.6.0",
-    "react-i18next": "^16.0.0",
+    "react-i18next": "^15.7.2",
    "react-icons": "^5.5.0",
    "react-markdown": "^10.1.0",
-    "react-router": "^7.9.3",
+    "react-redux": "^9.2.0",
+    "react-resizable-panels": "^3.0.5",
+    "react-router": "^7.8.2",
    "react-syntax-highlighter": "^15.6.6",
    "remark-breaks": "^4.0.0",
    "remark-gfm": "^4.0.1",
@@ -53,10 +56,9 @@
    "socket.io-client": "^4.8.1",
    "tailwind-merge": "^3.3.1",
    "tailwind-scrollbar": "^4.0.2",
-    "vite": "^7.1.7",
+    "vite": "^7.1.4",
    "web-vitals": "^5.1.0",
-    "ws": "^8.18.2",
-    "zustand": "^5.0.8"
+    "ws": "^8.18.2"
  },
  "scripts": {
    "dev": "npm run make-i18n && cross-env VITE_MOCK_API=false react-router dev",
@@ -95,16 +97,16 @@
    "@babel/traverse": "^7.28.3",
    "@babel/types": "^7.28.2",
    "@mswjs/socket.io-binding": "^0.2.0",
-    "@playwright/test": "^1.55.1",
-    "@react-router/dev": "^7.9.3",
-    "@tailwindcss/typography": "^0.5.19",
-    "@tanstack/eslint-plugin-query": "^5.90.1",
+    "@playwright/test": "^1.55.0",
+    "@react-router/dev": "^7.8.2",
+    "@tailwindcss/typography": "^0.5.16",
+    "@tanstack/eslint-plugin-query": "^5.86.0",
    "@testing-library/dom": "^10.4.1",
    "@testing-library/jest-dom": "^6.8.0",
    "@testing-library/react": "^16.3.0",
    "@testing-library/user-event": "^14.6.1",
-    "@types/node": "^24.5.2",
-    "@types/react": "^19.1.15",
+    "@types/node": "^24.3.1",
+    "@types/react": "^19.1.12",
    "@types/react-dom": "^19.1.9",
    "@types/react-highlight": "^0.12.8",
    "@types/react-syntax-highlighter": "^15.5.13",
@@ -126,8 +128,8 @@
    "eslint-plugin-react-hooks": "^4.6.2",
    "eslint-plugin-unused-imports": "^4.2.0",
    "husky": "^9.1.7",
-    "jsdom": "^27.0.0",
-    "lint-staged": "^16.2.3",
+    "jsdom": "^26.1.0",
+    "lint-staged": "^16.1.6",
    "msw": "^2.6.6",
    "prettier": "^3.6.2",
    "stripe": "^18.5.0",
@@ -139,7 +141,7 @@
  },
  "packageManager": "npm@10.5.0",
  "volta": {
-    "node": "22.0.0"
+    "node": "18.20.1"
  },
  "msw": {
    "workerDirectory": [
--- a/frontend/public/mockServiceWorker.js
+++ b/frontend/public/mockServiceWorker.js
@@ -7,7 +7,7 @@
 * - Please do NOT modify this file.
 */

-const PACKAGE_VERSION = '2.11.1'
+const PACKAGE_VERSION = '2.10.5'
 const INTEGRITY_CHECKSUM = 'f5825c521429caf22a4dd13b66e243af'
 const IS_MOCKED_RESPONSE = Symbol('isMockedResponse')
 const activeClientIds = new Set()
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
chuckbutkus	3a7df33acf	Merge branch 'main' into test-user	2025-09-17 14:02:52 -04:00
chuckbutkus	69fddecc7f	Merge branch 'main' into test-user	2025-09-07 21:55:39 -04:00
Chuck Butkus	3afe5ccee5	Add Logging	2025-09-05 20:52:48 -04:00
chuckbutkus	3d5a8dcf5a	Merge branch 'main' into test-user	2025-09-05 14:20:10 -04:00
Chuck Butkus	2ee1abe22c	Lint fix	2025-09-05 13:16:03 -04:00
Chuck Butkus	148940f553	Added logging around alive checks	2025-09-05 11:10:57 -04:00
Chuck Butkus	1f09296136	Fix username checks	2025-09-03 21:40:13 -04:00
Chuck Butkus	70e5d12ba9	Revert "Change to a non-login shell" This reverts commit `bcb3160d95`.	2025-08-29 01:48:47 -04:00
Chuck Butkus	bcb3160d95	Change to a non-login shell	2025-08-29 01:37:02 -04:00
Chuck Butkus	174c691744	Update	2025-08-28 02:25:05 -04:00
Chuck Butkus	af34d446e9	Remove vscode username restriction	2025-08-28 02:22:27 -04:00
Chuck Butkus	6604924f76	Fix bash username	2025-08-28 02:21:41 -04:00
chuckbutkus	b2def1e438	Merge branch 'main' into test-user	2025-08-27 23:33:45 -04:00
Chuck Butkus	2b8e47aca9	Add runtime user env vars	2025-08-27 23:02:39 -04:00
Chuck Butkus	dba8b28824	Logging	2025-08-27 21:30:47 -04:00