Merge branch 'main' into openhands-fix-issue-8199

specify condenser config for evals (#8177)
Co-authored-by: openhands <openhands@all-hands.dev>
2026-04-29 03:00:45 -04:00 · 2025-05-21 22:11:01 +02:00 · 2025-05-21 22:08:57 +02:00 · 2025-05-21 20:00:56 +00:00 · 2025-05-21 19:34:59 +00:00 · 2025-05-21 19:17:43 +00:00
332 changed files with 14831 additions and 6683 deletions
--- a/.github/.codecov.yml
+++ b/.github/.codecov.yml
@@ -1,19 +0,0 @@
-codecov:
-  notify:
-    wait_for_ci: true
-    # our project is large, so 6 builds are typically uploaded. this waits till 5/6
-    # See https://docs.codecov.com/docs/notifications#section-preventing-notifications-until-after-n-builds
-    after_n_builds: 5
-
-coverage:
-  status:
-    patch:
-      default:
-        threshold: 100% # allow patch coverage to be lower than project coverage by any amount
-    project:
-      default:
-        threshold: 5% # allow project coverage to drop at most 5%
-
-comment: false
-github_checks:
-    annotations: false
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -0,0 +1,11 @@
+# CODEOWNERS file for OpenHands repository
+# See https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners
+
+# Frontend code owners
+/frontend/ @rbren @amanape
+
+# Evaluation code owners
+/evaluation/ @xingyaoww @neubig 
+
+# Documentation code owners
+/docs/ @mamoodi
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -13,6 +13,10 @@ updates:
      browsergym:
        patterns:
          - "browsergym*"
+      mcp-packages:
+        patterns:
+          - "mcp"
+          - "mcpm"
      security-all:
        applies-to: "security-updates"
        patterns:
--- a/.github/workflows/fe-unit-tests.yml
+++ b/.github/workflows/fe-unit-tests.yml
@@ -42,7 +42,3 @@ jobs:
      - name: Run tests and collect coverage
        working-directory: ./frontend
        run: npm run test:coverage
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
--- a/.github/workflows/ghcr-build.yml
+++ b/.github/workflows/ghcr-build.yml
@@ -312,11 +312,7 @@ jobs:
          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
          RUN_AS_OPENHANDS=false \
-          poetry run pytest -n 7 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py --durations=10
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+          poetry run pytest -n 7 -raRs --reruns 2 --reruns-delay 5 -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py --durations=10

  # Run unit tests with the Docker runtime Docker images as openhands user
  test_runtime_oh:
@@ -381,11 +377,7 @@ jobs:
          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
          RUN_AS_OPENHANDS=true \
-          poetry run pytest -n 7 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py --durations=10
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+          poetry run pytest -n 7 -raRs --reruns 2 --reruns-delay 5 -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py --durations=10

  # The two following jobs (named identically) are to check whether all the runtime tests have passed as the
  # "All Runtime Tests Passed" is a required job for PRs to merge
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -30,11 +30,12 @@ jobs:
        run: |
          cd frontend
          npm install --frozen-lockfile
-      - name: Lint and TypeScript compilation
+      - name: Lint, TypeScript compilation, and translation checks
        run: |
          cd frontend
          npm run lint
          npm run make-i18n && tsc
+          npm run check-translation-completeness

  # Run lint on the python code
  lint-python:
--- a/.github/workflows/py-unit-tests.yml
+++ b/.github/workflows/py-unit-tests.yml
@@ -48,11 +48,7 @@ jobs:
      - name: Build Environment
        run: make build
      - name: Run Tests
-        run: poetry run pytest --forked -n auto --cov=openhands --cov-report=xml -svv ./tests/unit
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+        run: poetry run pytest --forked -n auto -svv ./tests/unit

  # Run specific Windows python tests
  test-on-windows:
--- a/.github/workflows/run-eval.yml
+++ b/.github/workflows/run-eval.yml
@@ -9,7 +9,7 @@ on:
 jobs:
  trigger-job:
    name: Trigger remote eval job
-    if: ${{ github.event.label.name == 'run-eval-xs' || github.event.label.name == 'run-eval-s' || github.event.label.name == 'run-eval-m' }}
+    if: ${{ github.event.label.name == 'run-eval-1' || github.event.label.name == 'run-eval-2' || github.event.label.name == 'run-eval-50' || github.event.label.name == 'run-eval-100' }}
    runs-on: blacksmith-4vcpu-ubuntu-2204

    steps:
@@ -26,12 +26,14 @@ jobs:
          echo "Repository URL: $REPO_URL"
          echo "PR Branch: $PR_BRANCH"

-          if [[ "${{ github.event.label.name }}" == "run-eval-xs" ]]; then
+          if [[ "${{ github.event.label.name }}" == "run-eval-1" ]]; then
            EVAL_INSTANCES="1"
-          elif [[ "${{ github.event.label.name }}" == "run-eval-s" ]]; then
-            EVAL_INSTANCES="5"
-          elif [[ "${{ github.event.label.name }}" == "run-eval-m" ]]; then
-            EVAL_INSTANCES="30"
+          elif [[ "${{ github.event.label.name }}" == "run-eval-2" ]]; then
+            EVAL_INSTANCES="2"
+          elif [[ "${{ github.event.label.name }}" == "run-eval-50" ]]; then
+            EVAL_INSTANCES="50"
+          elif [[ "${{ github.event.label.name }}" == "run-eval-100" ]]; then
+            EVAL_INSTANCES="100"
          fi

          curl -X POST \
--- a/.openhands/microagents/documentation.md
+++ b/.openhands/microagents/documentation.md
@@ -0,0 +1,33 @@
+---
+name: documentation
+type: knowledge
+version: 1.0.0
+agent: CodeActAgent
+triggers:
+- documentation
+- docs
+- document
+---
+
+# Documentation Guidelines
+
+All documentation must be grounded in fact, so you must not make anything up without proper evidence. When you have finished writing documentation, tell the user which reference sources (web pages, source code, or other documentation) you relied on for each new fact in the documentation. If you cannot cite a source for a statement, do not include it in the pull request.
+
+## Best Practices for Documentation
+
+1. **Be Factual**: Only include information that can be verified from reliable sources.
+2. **Cite Sources**: Always reference the source of information (code, web pages, official documentation).
+3. **Be Clear and Concise**: Use simple language and avoid unnecessary jargon.
+4. **Use Examples**: Include practical examples to illustrate concepts.
+5. **Structure Properly**: Use headings, lists, and code blocks to organize information.
+6. **Keep Updated**: Ensure documentation reflects the current state of the code or system.
+
+## Documentation Process
+
+1. Research and gather information from reliable sources
+2. Draft documentation based on verified facts
+3. Review for accuracy and completeness
+4. Include references for all factual statements
+5. Submit only when all information is properly sourced
+
+Remember: If you cannot verify a piece of information, it's better to exclude it than to include potentially incorrect information.
--- a/.openhands/microagents/repo.md
+++ b/.openhands/microagents/repo.md
@@ -1,8 +1,3 @@
---
-name: repo
-type: repo
-agent: CodeActAgent
---
 This repository contains the code for OpenHands, an automated AI software engineer. It has a Python backend
 (in the `openhands` directory) and React frontend (in the `frontend` directory).

@@ -14,7 +9,7 @@ IMPORTANT: Before making any changes to the codebase, ALWAYS run `make install-p

 Before pushing any changes, you MUST ensure that any lint errors or simple test errors have been fixed.

-* If you've made changes to the backend, you should run `pre-commit run --all-files --config ./dev_config/python/.pre-commit-config.yaml`
+* If you've made changes to the backend, you should run `pre-commit run --config ./dev_config/python/.pre-commit-config.yaml` (this will run on staged files).
 * If you've made changes to the frontend, you should run `cd frontend && npm run lint:fix && npm run build ; cd ..`

 The pre-commit hooks MUST pass successfully before pushing any changes to the repository. This is a mandatory requirement to maintain code quality and consistency.
--- a/Development.md
+++ b/Development.md
@@ -1,8 +1,8 @@
 # Development Guide

 This guide is for people working on OpenHands and editing the source code.
-If you wish to contribute your changes, check out the [CONTRIBUTING.md](https://github.com/All-Hands-AI/OpenHands/blob/main/CONTRIBUTING.md) on how to clone and setup the project initially before moving on.
-Otherwise, you can clone the OpenHands project directly.
+If you wish to contribute your changes, check out the [CONTRIBUTING.md](https://github.com/All-Hands-AI/OpenHands/blob/main/CONTRIBUTING.md) on how to clone and set up the project
+initially before moving on. Otherwise, you can clone the OpenHands project directly.

 ## Start the Server for Development

@@ -21,7 +21,8 @@ Make sure you have all these dependencies installed before moving on to `make bu

 #### Develop without sudo access

-If you want to develop without system admin/sudo access to upgrade/install `Python` and/or `NodeJs`, you can use `conda` or `mamba` to manage the packages for you:
+If you want to develop without system admin/sudo access to upgrade/install `Python` and/or `NodeJs`, you can use 
+`conda` or `mamba` to manage the packages for you:

 ```bash
 # Download and install Mamba (a faster version of conda)
@@ -36,7 +37,8 @@ mamba install conda-forge::poetry

 ### 2. Build and Setup The Environment

-Begin by building the project which includes setting up the environment and installing dependencies. This step ensures that OpenHands is ready to run on your system:
+Begin by building the project which includes setting up the environment and installing dependencies. This step ensures 
+that OpenHands is ready to run on your system:

 ```bash
 make build
@@ -45,8 +47,6 @@ make build
 ### 3. Configuring the Language Model

 OpenHands supports a diverse array of Language Models (LMs) through the powerful [litellm](https://docs.litellm.ai) library.
-By default, we've chosen Claude Sonnet 3.5 as our go-to model, but the world is your oyster! You can unleash the
-potential of any other LM that piques your interest.

 To configure the LM of your choice, run:

@@ -54,9 +54,12 @@ To configure the LM of your choice, run:
 make setup-config
 ```

-This command will prompt you to enter the LLM API key, model name, and other variables ensuring that OpenHands is tailored to your specific needs. Note that the model name will apply only when you run headless. If you use the UI, please set the model in the UI.
+This command will prompt you to enter the LLM API key, model name, and other variables ensuring that OpenHands is 
+tailored to your specific needs. Note that the model name will apply only when you run headless. If you use the UI, 
+please set the model in the UI.

-Note: If you have previously run OpenHands using the docker command, you may have already set some environmental variables in your terminal. The final configurations are set from highest to lowest priority:
+Note: If you have previously run OpenHands using the docker command, you may have already set some environment
+variables in your terminal. The final configurations are set from highest to lowest priority:
 Environment variables > config.toml variables > default variables

 **Note on Alternative Models:**
@@ -74,13 +77,15 @@ make run

 #### Option B: Individual Server Startup

- **Start the Backend Server:** If you prefer, you can start the backend server independently to focus on backend-related tasks or configurations.
+- **Start the Backend Server:** If you prefer, you can start the backend server independently to focus on 
+backend-related tasks or configurations.

  ```bash
  make start-backend
  ```

- **Start the Frontend Server:** Similarly, you can start the frontend server on its own to work on frontend-related components or interface enhancements.
+- **Start the Frontend Server:** Similarly, you can start the frontend server on its own to work on frontend-related 
+components or interface enhancements.
  ```bash
  make start-frontend
  ```
@@ -115,10 +120,10 @@ poetry run pytest ./tests/unit/test_*.py

 ### 9. Use existing Docker image

-To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker container image by
-setting the SANDBOX_RUNTIME_CONTAINER_IMAGE environment variable to the desired Docker image.
+To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker 
+container image by setting the SANDBOX_RUNTIME_CONTAINER_IMAGE environment variable to the desired Docker image.

-Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.37-nikolaik`
+Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.39-nikolaik`

 ## Develop inside Docker container

--- a/ISSUE_TRIAGE.md
+++ b/ISSUE_TRIAGE.md
@@ -3,17 +3,12 @@ These are the procedures and guidelines on how issues are triaged in this repo b

 ## General
 * All issues must be tagged with **enhancement**, **bug** or **troubleshooting/help**.
-* Issues may be tagged with what it relates to (**agent quality**, **frontend**, **resolver**, etc.).
+* Issues may be tagged with what it relates to (**agent quality**, **resolver**, **CLI**, etc.).

 ## Severity
-* **Low**: Minor issues or affecting single user.
-* **Medium**: Affecting multiple users.
 * **High**: High visibility issues or affecting many users.
 * **Critical**: Affecting all users or potential security issues.

-## Effort
-* Issues may be estimated with effort required (**small effort**, **medium effort**, **large effort**).
-
 ## Difficulty
 * Issues with low implementation difficulty may be tagged with **good first issue**.

--- a/Makefile
+++ b/Makefile
@@ -5,6 +5,7 @@ SHELL=/usr/bin/env bash
 BACKEND_HOST ?= "127.0.0.1"
 BACKEND_PORT = 3000
 BACKEND_HOST_PORT = "$(BACKEND_HOST):$(BACKEND_PORT)"
+FRONTEND_HOST ?= "127.0.0.1"
 FRONTEND_PORT = 3001
 DEFAULT_WORKSPACE_DIR = "./workspace"
 DEFAULT_MODEL = "gpt-4o"
@@ -288,6 +289,15 @@ setup-config-prompts:
 	@read -p "Enter your LLM base URL [mostly used for local LLMs, leave blank if not needed - example: http://localhost:5001/v1/]: " llm_base_url; \
 	 if [[ ! -z "$$llm_base_url" ]]; then echo "base_url=\"$$llm_base_url\"" >> $(CONFIG_FILE).tmp; fi

+setup-config-basic:
+	@printf '%s\n' \
+	'[core]' \
+	'workspace_base="./workspace"' \
+	> config.toml
+	@echo "$(GREEN)config.toml created.$(RESET)"
+
+openhands-cloud-run:
+	@$(MAKE) run BACKEND_HOST="0.0.0.0" BACKEND_PORT="12000" FRONTEND_HOST="0.0.0.0" FRONTEND_PORT="12001"

 # Develop in container
 docker-dev:
@@ -322,5 +332,4 @@ help:
 	@echo "  $(GREEN)help$(RESET)                - Display this help message, providing information on available targets."

 # Phony targets
-.PHONY: build check-dependencies check-python check-npm check-docker check-poetry install-python-dependencies install-frontend-dependencies install-pre-commit-hooks lint start-backend start-frontend run run-wsl setup-config setup-config-prompts help
-.PHONY: docker-dev docker-run
+.PHONY: build check-dependencies check-system check-python check-npm check-nodejs check-docker check-poetry install-python-dependencies install-frontend-dependencies install-pre-commit-hooks lint-backend lint-frontend lint test-frontend test build-frontend start-backend start-frontend _run_setup run run-wsl setup-config setup-config-prompts setup-config-basic openhands-cloud-run docker-dev docker-run clean help
--- a/README.md
+++ b/README.md
@@ -51,17 +51,17 @@ system requirements and more information.


 ```bash
-docker pull docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik

 docker run -it --rm --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
    -e LOG_ALL_EVENTS=true \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -v ~/.openhands-state:/.openhands-state \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app \
-    docker.all-hands.dev/all-hands-ai/openhands:0.37
+    docker.all-hands.dev/all-hands-ai/openhands:0.39
 ```

 You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)!
@@ -92,6 +92,7 @@ If you want to modify the OpenHands source code, check out [Development.md](http
 Having issues? The [Troubleshooting Guide](https://docs.all-hands.dev/modules/usage/troubleshooting) can help.

 ## 📖 Documentation
+  <a href="https://deepwiki.com/All-Hands-AI/OpenHands"><img src="https://deepwiki.com/badge.svg" alt="Ask DeepWiki" title="Autogenerated Documentation by DeepWiki"></a>

 To learn more about the project, and for tips on using OpenHands,
 check out our [documentation](https://docs.all-hands.dev/modules/usage/getting-started).
--- a/containers/dev/compose.yml
+++ b/containers/dev/compose.yml
@@ -11,7 +11,7 @@ services:
      - BACKEND_HOST=${BACKEND_HOST:-"0.0.0.0"}
      - SANDBOX_API_HOSTNAME=host.docker.internal
      #
-      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.37-nikolaik}
+      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.39-nikolaik}
      - SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
      - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
    ports:
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -7,7 +7,7 @@ services:
    image: openhands:latest
    container_name: openhands-app-${DATE:-}
    environment:
-      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik}
+      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik}
      #- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234} # enable this only if you want a specific non-root sandbox user but you will have to manually adjust permissions of openhands-state for this user
      - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
    ports:
--- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/cloud/cloud-api.md
+++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/cloud/cloud-api.md
@@ -69,7 +69,7 @@ data = {
 response = requests.post(url, headers=headers, json=data)
 conversation = response.json()

-print(f"Conversation Link: https://app.all-hands.dev/conversations/{conversation['id']}")
+print(f"Conversation Link: https://app.all-hands.dev/conversations/{conversation['conversation_id']}")
 print(f"Status: {conversation['status']}")
 ```
 </details>
--- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md
+++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md
@@ -37,7 +37,7 @@ Pour exécuter OpenHands en mode CLI avec Docker :
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e SANDBOX_VOLUMES=$SANDBOX_VOLUMES \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -46,7 +46,7 @@ docker run -it \
    -v ~/.openhands-state:/.openhands-state \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.37 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.39 \
    python -m openhands.core.cli
 ```

--- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md
+++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md
@@ -34,7 +34,7 @@ Pour exécuter OpenHands en mode Headless avec Docker :
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e SANDBOX_VOLUMES=$SANDBOX_VOLUMES \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -44,7 +44,7 @@ docker run -it \
    -v ~/.openhands-state:/.openhands-state \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.37 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.39 \
    python -m openhands.core.main -t "write a bash script that prints hi"
 ```

--- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/installation.mdx
+++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/installation.mdx
@@ -58,17 +58,17 @@ Un système avec un processeur moderne et un minimum de **4 Go de RAM** est reco
 La façon la plus simple d'exécuter OpenHands est dans Docker.

 ```bash
-docker pull docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik

 docker run -it --rm --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
    -e LOG_ALL_EVENTS=true \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -v ~/.openhands-state:/.openhands-state \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app \
-    docker.all-hands.dev/all-hands-ai/openhands:0.37
+    docker.all-hands.dev/all-hands-ai/openhands:0.39
 ```

 Vous trouverez OpenHands en cours d'exécution à l'adresse http://localhost:3000 !
--- a/docs/i18n/ja/docusaurus-plugin-content-docs/current/usage/cloud/cloud-api.md
+++ b/docs/i18n/ja/docusaurus-plugin-content-docs/current/usage/cloud/cloud-api.md
@@ -69,7 +69,7 @@ data = {
 response = requests.post(url, headers=headers, json=data)
 conversation = response.json()

-print(f"Conversation Link: https://app.all-hands.dev/conversations/{conversation['id']}")
+print(f"Conversation Link: https://app.all-hands.dev/conversations/{conversation['conversation_id']}")
 print(f"Status: {conversation['status']}")
 ```
 </details>
--- a/docs/i18n/ja/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md
+++ b/docs/i18n/ja/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md
@@ -36,7 +36,7 @@ DockerでOpenHandsをCLIモードで実行するには：
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e SANDBOX_VOLUMES=$SANDBOX_VOLUMES \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -45,7 +45,7 @@ docker run -it \
    -v ~/.openhands-state:/.openhands-state \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.37 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.39 \
    python -m openhands.core.cli
 ```

--- a/docs/i18n/ja/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md
+++ b/docs/i18n/ja/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md
@@ -33,7 +33,7 @@ DockerでヘッドレスモードでOpenHandsを実行するには：
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e SANDBOX_VOLUMES=$SANDBOX_VOLUMES \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -43,7 +43,7 @@ docker run -it \
    -v ~/.openhands-state:/.openhands-state \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.37 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.39 \
    python -m openhands.core.main -t "write a bash script that prints hi"
 ```

--- a/docs/i18n/ja/docusaurus-plugin-content-docs/current/usage/installation.mdx
+++ b/docs/i18n/ja/docusaurus-plugin-content-docs/current/usage/installation.mdx
@@ -58,17 +58,17 @@ OpenHandsを実行するには、最新のプロセッサと最低**4GB RAM**を
 OpenHandsを実行する最も簡単な方法はDockerを使用することです。

 ```bash
-docker pull docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik

 docker run -it --rm --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
    -e LOG_ALL_EVENTS=true \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -v ~/.openhands-state:/.openhands-state \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app \
-    docker.all-hands.dev/all-hands-ai/openhands:0.37
+    docker.all-hands.dev/all-hands-ai/openhands:0.39
 ```

 OpenHandsは http://localhost:3000 で実行されています！
--- a/docs/i18n/pt-BR/docusaurus-plugin-content-docs/current/usage/cloud/cloud-api.md
+++ b/docs/i18n/pt-BR/docusaurus-plugin-content-docs/current/usage/cloud/cloud-api.md
@@ -69,7 +69,7 @@ data = {
 response = requests.post(url, headers=headers, json=data)
 conversation = response.json()

-print(f"Conversation Link: https://app.all-hands.dev/conversations/{conversation['id']}")
+print(f"Conversation Link: https://app.all-hands.dev/conversations/{conversation['conversation_id']}")
 print(f"Status: {conversation['status']}")
 ```
 </details>
--- a/docs/i18n/pt-BR/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md
+++ b/docs/i18n/pt-BR/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md
@@ -37,7 +37,7 @@ Para executar o OpenHands no modo CLI com Docker:
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e SANDBOX_VOLUMES=$SANDBOX_VOLUMES \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -46,7 +46,7 @@ docker run -it \
    -v ~/.openhands-state:/.openhands-state \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.37 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.39 \
    python -m openhands.core.cli
 ```

--- a/docs/i18n/pt-BR/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md
+++ b/docs/i18n/pt-BR/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md
@@ -34,7 +34,7 @@ Para executar o OpenHands em modo Headless com Docker:
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e SANDBOX_VOLUMES=$SANDBOX_VOLUMES \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -44,7 +44,7 @@ docker run -it \
    -v ~/.openhands-state:/.openhands-state \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.37 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.39 \
    python -m openhands.core.main -t "write a bash script that prints hi"
 ```

--- a/docs/i18n/pt-BR/docusaurus-plugin-content-docs/current/usage/installation.mdx
+++ b/docs/i18n/pt-BR/docusaurus-plugin-content-docs/current/usage/installation.mdx
@@ -58,17 +58,17 @@
 A maneira mais fácil de executar o OpenHands é no Docker.

 ```bash
-docker pull docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik

 docker run -it --rm --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
    -e LOG_ALL_EVENTS=true \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -v ~/.openhands-state:/.openhands-state \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app \
-    docker.all-hands.dev/all-hands-ai/openhands:0.37
+    docker.all-hands.dev/all-hands-ai/openhands:0.39
 ```

 Você encontrará o OpenHands rodando em http://localhost:3000!
--- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/cloud/cloud-api.md
+++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/cloud/cloud-api.md
@@ -69,7 +69,7 @@ data = {
 response = requests.post(url, headers=headers, json=data)
 conversation = response.json()

-print(f"Conversation Link: https://app.all-hands.dev/conversations/{conversation['id']}")
+print(f"Conversation Link: https://app.all-hands.dev/conversations/{conversation['conversation_id']}")
 print(f"Status: {conversation['status']}")
 ```
 </details>
--- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md
+++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md
@@ -36,7 +36,7 @@ poetry run python -m openhands.core.cli
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e SANDBOX_VOLUMES=$SANDBOX_VOLUMES \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -45,7 +45,7 @@ docker run -it \
    -v ~/.openhands-state:/.openhands-state \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.37 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.39 \
    python -m openhands.core.cli
 ```

--- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md
+++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md
@@ -33,7 +33,7 @@ poetry run python -m openhands.core.main -t "write a bash script that prints hi"
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e SANDBOX_VOLUMES=$SANDBOX_VOLUMES \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -43,7 +43,7 @@ docker run -it \
    -v ~/.openhands-state:/.openhands-state \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.37 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.39 \
    python -m openhands.core.main -t "write a bash script that prints hi"
 ```

--- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/installation.mdx
+++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/installation.mdx
@@ -58,17 +58,17 @@
 运行 OpenHands 最简单的方法是使用 Docker。

 ```bash
-docker pull docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik

 docker run -it --rm --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
    -e LOG_ALL_EVENTS=true \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -v ~/.openhands-state:/.openhands-state \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app \
-    docker.all-hands.dev/all-hands-ai/openhands:0.37
+    docker.all-hands.dev/all-hands-ai/openhands:0.39
 ```

 OpenHands 将在 http://localhost:3000 运行！
--- a/docs/modules/usage/cloud/cloud-api.md
+++ b/docs/modules/usage/cloud/cloud-api.md
@@ -70,7 +70,7 @@ data = {
 response = requests.post(url, headers=headers, json=data)
 conversation = response.json()

-print(f"Conversation Link: https://app.all-hands.dev/conversations/{conversation['id']}")
+print(f"Conversation Link: https://app.all-hands.dev/conversations/{conversation['conversation_id']}")
 print(f"Status: {conversation['status']}")
 ```
 </details>
--- a/docs/modules/usage/how-to/cli-mode.md
+++ b/docs/modules/usage/how-to/cli-mode.md
@@ -31,7 +31,7 @@ This command opens an interactive prompt where you can type tasks or commands an
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e SANDBOX_VOLUMES=$SANDBOX_VOLUMES \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -40,7 +40,7 @@ docker run -it \
    -v ~/.openhands-state:/.openhands-state \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.37 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.39 \
    python -m openhands.cli.main
 ```

--- a/docs/modules/usage/how-to/headless-mode.md
+++ b/docs/modules/usage/how-to/headless-mode.md
@@ -31,7 +31,7 @@ To run OpenHands in Headless mode with Docker:
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e SANDBOX_VOLUMES=$SANDBOX_VOLUMES \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -41,7 +41,7 @@ docker run -it \
    -v ~/.openhands-state:/.openhands-state \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.37 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.39 \
    python -m openhands.core.main -t "write a bash script that prints hi"
 ```

--- a/docs/modules/usage/how-to/websocket-connection.md
+++ b/docs/modules/usage/how-to/websocket-connection.md
@@ -0,0 +1,181 @@
+---
+sidebar_position: 9
+---
+
+# Connecting to the WebSocket
+
+This guide explains how to connect to the OpenHands WebSocket API to receive real-time events and send actions to the agent.
+
+## Overview
+
+OpenHands uses [Socket.IO](https://socket.io/) for WebSocket communication between the client and server. The WebSocket connection allows you to:
+
+1. Receive real-time events from the agent
+2. Send user actions to the agent
+3. Maintain a persistent connection for ongoing conversations
+
+## Connecting to the WebSocket
+
+### Connection Parameters
+
+When connecting to the WebSocket, you need to provide the following query parameters:
+
+- `conversation_id`: The ID of the conversation you want to join
+- `latest_event_id`: The ID of the latest event you've received (use `-1` for a new connection)
+- `providers_set`: (Optional) A comma-separated list of provider types
+
+### Connection Example
+
+Here's a basic example of connecting to the WebSocket using JavaScript:
+
+```javascript
+import { io } from "socket.io-client";
+
+const socket = io("http://localhost:3000", {
+  transports: ["websocket"],
+  query: {
+    conversation_id: "your-conversation-id",
+    latest_event_id: -1,
+    providers_set: "github,gitlab" // Optional
+  }
+});
+
+socket.on("connect", () => {
+  console.log("Connected to OpenHands WebSocket");
+});
+
+socket.on("oh_event", (event) => {
+  console.log("Received event:", event);
+});
+
+socket.on("connect_error", (error) => {
+  console.error("Connection error:", error);
+});
+
+socket.on("disconnect", (reason) => {
+  console.log("Disconnected:", reason);
+});
+```
+
+## Sending Actions to the Agent
+
+To send an action to the agent, use the `oh_user_action` event:
+
+```javascript
+// Send a user message to the agent
+socket.emit("oh_user_action", {
+  type: "message",
+  source: "user",
+  message: "Hello, can you help me with my project?"
+});
+```
+
+## Receiving Events from the Agent
+
+The server emits events using the `oh_event` event type. Here are some common event types you might receive:
+
+- User messages (`source: "user", type: "message"`)
+- Agent messages (`source: "agent", type: "message"`)
+- File edits (`action: "edit"`)
+- File writes (`action: "write"`)
+- Command executions (`action: "run"`)
+
+Example event handler:
+
+```javascript
+socket.on("oh_event", (event) => {
+  if (event.source === "agent" && event.type === "message") {
+    console.log("Agent says:", event.message);
+  } else if (event.action === "run") {
+    console.log("Command executed:", event.args.command);
+    console.log("Result:", event.result);
+  }
+});
+```
+
+## Using Websocat for Testing
+
+[Websocat](https://github.com/vi/websocat) is a command-line tool for interacting with WebSockets. It's useful for testing your WebSocket connection without writing a full client application.
+
+### Installation
+
+```bash
+# On macOS
+brew install websocat
+
+# On Linux
+curl -L https://github.com/vi/websocat/releases/download/v1.11.0/websocat.x86_64-unknown-linux-musl > websocat
+chmod +x websocat
+sudo mv websocat /usr/local/bin/
+```
+
+### Connecting to the WebSocket
+
+```bash
+# Connect to the WebSocket and print all received messages
+echo "40{}" | \
+websocat "ws://localhost:3000/socket.io/?EIO=4&transport=websocket&conversation_id=your-conversation-id&latest_event_id=-1"
+```
+
+### Sending a Message
+
+```bash
+# Send a message to the agent
+echo '42["oh_user_action",{"type":"message","source":"user","message":"Hello, agent!"}]' | \
+websocat "ws://localhost:3000/socket.io/?EIO=4&transport=websocket&conversation_id=your-conversation-id&latest_event_id=-1"
+```
+
+### Complete Example with Websocat
+
+Here's a complete example of connecting to the WebSocket, sending a message, and receiving events:
+
+```bash
+# Start a persistent connection
+websocat -v "ws://localhost:3000/socket.io/?EIO=4&transport=websocket&conversation_id=your-conversation-id&latest_event_id=-1"
+
+# In another terminal, send a message
+echo '42["oh_user_action",{"type":"message","source":"user","message":"Can you help me with my project?"}]' | \
+websocat "ws://localhost:3000/socket.io/?EIO=4&transport=websocket&conversation_id=your-conversation-id&latest_event_id=-1"
+```
+
+## Event Structure
+
+Events sent and received through the WebSocket follow a specific structure:
+
+```typescript
+interface OpenHandsEvent {
+  id: string;           // Unique event ID
+  source: string;       // "user" or "agent"
+  timestamp: string;    // ISO timestamp
+  message?: string;     // For message events
+  type?: string;        // Event type (e.g., "message")
+  action?: string;      // Action type (e.g., "run", "edit", "write")
+  args?: any;           // Action arguments
+  result?: any;         // Action result
+}
+```
+
+## Best Practices
+
+1. **Handle Reconnection**: Implement reconnection logic in your client to handle network interruptions.
+2. **Track Event IDs**: Store the latest event ID you've received and use it when reconnecting to avoid duplicate events.
+3. **Error Handling**: Implement proper error handling for connection errors and failed actions.
+4. **Rate Limiting**: Avoid sending too many actions in a short period to prevent overloading the server.
+
+## Troubleshooting
+
+### Connection Issues
+
+- Verify that the OpenHands server is running and accessible
+- Check that you're providing the correct conversation ID
+- Ensure your WebSocket URL is correctly formatted
+
+### Authentication Issues
+
+- Make sure you have the necessary authentication cookies if required
+- Verify that you have permission to access the specified conversation
+
+### Event Handling Issues
+
+- Check that you're correctly parsing the event data
+- Verify that your event handlers are properly registered
--- a/docs/modules/usage/installation.mdx
+++ b/docs/modules/usage/installation.mdx
@@ -58,17 +58,17 @@ A system with a modern processor and a minimum of **4GB RAM** is recommended to
 The easiest way to run OpenHands is in Docker.

 ```bash
-docker pull docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik

 docker run -it --rm --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.37-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
    -e LOG_ALL_EVENTS=true \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -v ~/.openhands-state:/.openhands-state \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app \
-    docker.all-hands.dev/all-hands-ai/openhands:0.37
+    docker.all-hands.dev/all-hands-ai/openhands:0.39
 ```

 You'll find OpenHands running at http://localhost:3000!
--- a/docs/modules/usage/llms/azure-llms.md
+++ b/docs/modules/usage/llms/azure-llms.md
@@ -1,6 +1,7 @@
 # Azure

-OpenHands uses LiteLLM to make calls to Azure's chat models. You can find their documentation on using Azure as a provider [here](https://docs.litellm.ai/docs/providers/azure).
+OpenHands uses LiteLLM to make calls to Azure's chat models. You can find their documentation on using Azure as a 
+provider [here](https://docs.litellm.ai/docs/providers/azure).

 ## Azure OpenAI Configuration

@@ -18,7 +19,7 @@ docker run -it --pull=always \
    ...
 ```

-Then in the OpenHands UI Settings:
+Then in the OpenHands UI Settings under the `LLM` tab:

 :::note
 You will need your ChatGPT deployment name which can be found on the deployments page in Azure. This is referenced as
--- a/docs/modules/usage/llms/google-llms.md
+++ b/docs/modules/usage/llms/google-llms.md
@@ -7,10 +7,11 @@ OpenHands uses LiteLLM to make calls to Google's chat models. You can find their

 ## Gemini - Google AI Studio Configs

-When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
+When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings under the `LLM` tab:
 - `LLM Provider` to `Gemini`
 - `LLM Model` to the model you will be using.
-If the model is not in the list, toggle `Advanced` options, and enter it in `Custom Model` (e.g. gemini/&lt;model-name&gt; like `gemini/gemini-2.0-flash`).
+If the model is not in the list, enable `Advanced` options, and enter it in `Custom Model` 
+(e.g. gemini/&lt;model-name&gt; like `gemini/gemini-2.0-flash`).
 - `API Key` to your Gemini API key

 ## VertexAI - Google Cloud Platform Configs
@@ -24,7 +25,8 @@ VERTEXAI_PROJECT="<your-gcp-project-id>"
 VERTEXAI_LOCATION="<your-gcp-location>"
 ```

-Then set the following in the OpenHands UI through the Settings:
+Then set the following in the OpenHands UI through the Settings under the `LLM` tab:
 - `LLM Provider` to `VertexAI`
 - `LLM Model` to the model you will be using.
-If the model is not in the list, toggle `Advanced` options, and enter it in `Custom Model` (e.g. vertex_ai/&lt;model-name&gt;).
+If the model is not in the list, enable `Advanced` options, and enter it in `Custom Model` 
+(e.g. vertex_ai/&lt;model-name&gt;).
--- a/docs/modules/usage/llms/groq.md
+++ b/docs/modules/usage/llms/groq.md
@@ -1,22 +1,21 @@
 # Groq

-OpenHands uses LiteLLM to make calls to chat models on Groq. You can find their documentation on using Groq as a provider [here](https://docs.litellm.ai/docs/providers/groq).
+OpenHands uses LiteLLM to make calls to chat models on Groq. You can find their documentation on using Groq as a 
+provider [here](https://docs.litellm.ai/docs/providers/groq).

 ## Configuration

-When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
+When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings under the `LLM` tab:
 - `LLM Provider` to `Groq`
 - `LLM Model` to the model you will be using. [Visit here to see the list of
-models that Groq hosts](https://console.groq.com/docs/models). If the model is not in the list, toggle
-`Advanced` options, and enter it in `Custom Model` (e.g. groq/&lt;model-name&gt; like `groq/llama3-70b-8192`).
+models that Groq hosts](https://console.groq.com/docs/models). If the model is not in the list, 
+enable `Advanced` options, and enter it in `Custom Model` (e.g. groq/&lt;model-name&gt; like `groq/llama3-70b-8192`).
 - `API key` to your Groq API key. To find or create your Groq API Key, [see here](https://console.groq.com/keys).

-
-
 ## Using Groq as an OpenAI-Compatible Endpoint

 The Groq endpoint for chat completion is [mostly OpenAI-compatible](https://console.groq.com/docs/openai). Therefore, you can access Groq models as you
-would access any OpenAI-compatible endpoint. In the OpenHands UI through the Settings:
+would access any OpenAI-compatible endpoint. In the OpenHands UI through the Settings under the `LLM` tab:
 1. Enable `Advanced` options
 2. Set the following:
   - `Custom Model` to the prefix `openai/` + the model you will be using (e.g. `openai/llama3-70b-8192`)
--- a/docs/modules/usage/llms/litellm-proxy.md
+++ b/docs/modules/usage/llms/litellm-proxy.md
@@ -7,7 +7,7 @@ OpenHands supports using the [LiteLLM proxy](https://docs.litellm.ai/docs/proxy/
 To use LiteLLM proxy with OpenHands, you need to:

 1. Set up a LiteLLM proxy server (see [LiteLLM documentation](https://docs.litellm.ai/docs/proxy/quick_start))
-2. When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
+2. When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings under the `LLM` tab:
  * Enable `Advanced` options
  * `Custom Model` to the prefix `litellm_proxy/` + the model you will be using (e.g. `litellm_proxy/anthropic.claude-3-5-sonnet-20241022-v2:0`)
  * `Base URL` to your LiteLLM proxy URL (e.g. `https://your-litellm-proxy.com`)
@@ -15,6 +15,7 @@ To use LiteLLM proxy with OpenHands, you need to:

 ## Supported Models

-The supported models depend on your LiteLLM proxy configuration. OpenHands supports any model that your LiteLLM proxy is configured to handle.
+The supported models depend on your LiteLLM proxy configuration. OpenHands supports any model that your LiteLLM proxy 
+is configured to handle.

 Refer to your LiteLLM proxy configuration for the list of available models and their names.
--- a/docs/modules/usage/llms/llms.md
+++ b/docs/modules/usage/llms/llms.md
@@ -11,14 +11,12 @@ OpenHands can connect to any LLM supported by LiteLLM. However, it requires a po
 Based on our evaluations of language models for coding tasks (using the SWE-bench dataset), we can provide some
 recommendations for model selection. Our latest benchmarking results can be found in [this spreadsheet](https://docs.google.com/spreadsheets/d/1wOUdFCMyY6Nt0AIqF705KN4JKOWgeI4wUGUP60krXXs/edit?gid=0).

-Based on these findings and community feedback, the following models have been verified to work reasonably well with OpenHands:
+Based on these findings and community feedback, these are the latest models that have been verified to work reasonably well with OpenHands:

 - [anthropic/claude-3-7-sonnet-20250219](https://www.anthropic.com/api) (recommended)
+- [openai/o4-mini](https://openai.com/index/introducing-o3-and-o4-mini/)
 - [gemini/gemini-2.5-pro](https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/)
 - [deepseek/deepseek-chat](https://api-docs.deepseek.com/)
-- [openai/o3-mini](https://openai.com/index/openai-o3-mini/)
-- [openai/o3](https://openai.com/index/introducing-o3-and-o4-mini/)
-- [openai/o4-mini](https://openai.com/index/introducing-o3-and-o4-mini/)
 - [all-hands/openhands-lm-32b-v0.1](https://www.all-hands.dev/blog/introducing-openhands-lm-32b----a-strong-open-coding-agent-model) -- available through [OpenRouter](https://openrouter.ai/all-hands/openhands-lm-32b-v0.1)


@@ -27,8 +25,8 @@ OpenHands will issue many prompts to the LLM you configure. Most of these LLMs c
 limits and monitor usage.
 :::

-If you have successfully run OpenHands with specific LLMs not in the list, please add them to the verified list. We
-also encourage you to open a PR to share your setup process to help others using the same provider and LLM!
+If you have successfully run OpenHands with specific providers, we encourage you to open a PR to share your setup process 
+to help others using the same provider!

 For a full list of the providers and models available, please consult the
 [litellm documentation](https://docs.litellm.ai/docs/providers).
--- a/docs/modules/usage/llms/local-llms.md
+++ b/docs/modules/usage/llms/local-llms.md
@@ -1,4 +1,4 @@
-# Local LLM with SGLang or vLLM
+# Local LLMs

 :::warning
 When using a Local LLM, OpenHands may have limited functionality.
@@ -7,10 +7,91 @@ It is highly recommended that you use GPUs to serve local models for optimal exp

 ## News

+- 2025/05/21: We collaborated with Mistral AI and released [Devstral Small](https://mistral.ai/news/devstral) that achieves [46.8% on SWE-Bench Verified](https://github.com/SWE-bench/experiments/pull/228)!
 - 2025/03/31: We released an open model OpenHands LM v0.1 32B that achieves 37.1% on SWE-Bench Verified
 ([blog](https://www.all-hands.dev/blog/introducing-openhands-lm-32b----a-strong-open-coding-agent-model), [model](https://huggingface.co/all-hands/openhands-lm-32b-v0.1)).

-## Download the Model from Huggingface
+
+## Quickstart: Running OpenHands on Your MacBook
+
+### Serve the model on your MacBook
+
+We recommend using [LM Studio](https://lmstudio.ai/) for serving these models locally.
+
+1. Download [LM Studio](https://lmstudio.ai/) and install it
+
+2. Download the model:
+   - Option 1: Directly download the LLM from [this link](https://lmstudio.ai/model/devstral-small-2505-mlx) or by searching for the name `Devstral-Small-2505` in LM Studio
+   - Option 2: Download an LLM in GGUF format. For example, to download [Devstral Small 2505 GGUF](https://huggingface.co/mistralai/Devstral-Small-2505_gguf), run `huggingface-cli download mistralai/Devstral-Small-2505_gguf --local-dir mistralai/Devstral-Small-2505_gguf`. Then, in a bash terminal, run `lms import {model_name}` in the directory where you downloaded the model checkpoint (e.g. run `lms import devstralQ4_K_M.gguf` in `mistralai/Devstral-Small-2505_gguf`)
+
+3. Open the LM Studio application, switch to `power user` mode, and then open the developer tab:
+  
+![image](./screenshots/1_select_power_user.png)
+
+4. Then click `Select a model to load` on top of the application:
+
+![image](./screenshots/2_select_model.png)
+
+5. Choose the model you want to use, holding `option` on Mac to enable advanced loading options:
+
+![image](./screenshots/3_select_devstral.png)
+
+6. You should then pick an appropriate context window for OpenHands based on your hardware configuration (larger than 32768 is recommended for using OpenHands, but too large may cause you to run out of memory); Flash attention is also recommended if it works on your machine.
+
+![image](./screenshots/4_set_context_window.png)
+
+7. Start the server (if it is not already in `Running` status), un-toggle `Serve on Local Network`, and note the port number in the LM Studio URL (`1234` is the port number for `http://127.0.0.1:1234` in this example):
+
+![image](./screenshots/5_copy_url.png)
+
+8. Finally, click the `copy` button near the model name to copy the model name (`imported-models/uncategorized/devstralq4_k_m.gguf` in this example):
+
+![image](./screenshots/6_copy_to_get_model_name.png)
+
+### Start OpenHands with locally served model
+
+Check [the installation guide](https://docs.all-hands.dev/modules/usage/installation) to make sure you have all the prerequisites for running OpenHands.
+
+```bash
+export LMSTUDIO_MODEL_NAME="imported-models/uncategorized/devstralq4_k_m.gguf" # <- Replace this with the model name you copied from LM Studio
+export LMSTUDIO_URL="http://host.docker.internal:1234"  # <- Replace 1234 with the port number from LM Studio
+
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik
+
+mkdir -p ~/.openhands-state && echo '{"language":"en","agent":"CodeActAgent","max_iterations":null,"security_analyzer":null,"confirmation_mode":false,"llm_model":"lm_studio/'$LMSTUDIO_MODEL_NAME'","llm_api_key":"dummy","llm_base_url":"'$LMSTUDIO_URL/v1'","remote_runtime_resource_factor":null,"github_token":null,"enable_default_condenser":true,"user_consents_to_analytics":true}' > ~/.openhands-state/settings.json
+
+docker run -it --rm --pull=always \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
+    -e LOG_ALL_EVENTS=true \
+    -v /var/run/docker.sock:/var/run/docker.sock \
+    -v ~/.openhands-state:/.openhands-state \
+    -p 3000:3000 \
+    --add-host host.docker.internal:host-gateway \
+    --name openhands-app \
+    docker.all-hands.dev/all-hands-ai/openhands:0.39
+```
+
+Once your server is running, you can visit `http://localhost:3000` in your browser to use OpenHands with the local Devstral model. The startup output should look like this:
+```
+Digest: sha256:e72f9baecb458aedb9afc2cd5bc935118d1868719e55d50da73190d3a85c674f
+Status: Image is up to date for docker.all-hands.dev/all-hands-ai/openhands:0.39
+Starting OpenHands...
+Running OpenHands as root
+14:22:13 - openhands:INFO: server_config.py:50 - Using config class None
+INFO:     Started server process [8]
+INFO:     Waiting for application startup.
+INFO:     Application startup complete.
+INFO:     Uvicorn running on http://0.0.0.0:3000 (Press CTRL+C to quit)
+```
+
+
+## Advanced: Serving LLM on GPUs
+
+### Download model checkpoints
+
+:::note
+The model checkpoints downloaded here should NOT be in GGUF format.
+:::

 For example, to download [OpenHands LM 32B v0.1](https://huggingface.co/all-hands/openhands-lm-32b-v0.1):

@@ -18,9 +99,7 @@ For example, to download [OpenHands LM 32B v0.1](https://huggingface.co/all-hand
 huggingface-cli download all-hands/openhands-lm-32b-v0.1 --local-dir all-hands/openhands-lm-32b-v0.1
 ```

-## Create an OpenAI-Compatible Endpoint With a Model Serving Framework
-
-### Serving with SGLang
+### Create an OpenAI-Compatible Endpoint With SGLang

 - Install SGLang following [the official documentation](https://docs.sglang.ai/start/install.html).
 - Example launch command for OpenHands LM 32B (with at least 2 GPUs):
@@ -35,7 +114,7 @@ SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 python3 -m sglang.launch_server \
    --api-key mykey --context-length 131072
 ```

-### Serving with vLLM
+### Create an OpenAI-Compatible Endpoint With vLLM

 - Install vLLM following [the official documentation](https://docs.vllm.ai/en/latest/getting_started/installation.html).
 - Example launch command for OpenHands LM 32B (with at least 2 GPUs):
@@ -49,7 +128,7 @@ vllm serve all-hands/openhands-lm-32b-v0.1 \
    --enable-prefix-caching
 ```

-## Run and Configure OpenHands
+## Advanced: Run and Configure OpenHands

 ### Run OpenHands

@@ -75,7 +154,7 @@ Start OpenHands using `make run`.

 ### Configure OpenHands

-Once OpenHands is running, you'll need to set the following in the OpenHands UI through the Settings:
+Once OpenHands is running, you'll need to set the following in the OpenHands UI through the Settings under the `LLM` tab: 
 1. Enable `Advanced` options.
 2. Set the following:
 - `Custom Model` to `openai/<served-model-name>` (e.g. `openai/openhands-lm-32b-v0.1`)
--- a/docs/modules/usage/llms/openai-llms.md
+++ b/docs/modules/usage/llms/openai-llms.md
@@ -1,14 +1,15 @@
 # OpenAI

-OpenHands uses LiteLLM to make calls to OpenAI's chat models. You can find their documentation on using OpenAI as a provider [here](https://docs.litellm.ai/docs/providers/openai).
+OpenHands uses LiteLLM to make calls to OpenAI's chat models. You can find their documentation on using OpenAI as a 
+provider [here](https://docs.litellm.ai/docs/providers/openai).

 ## Configuration

-When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
+When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings under the `LLM` tab:
 * `LLM Provider` to `OpenAI`
 * `LLM Model` to the model you will be using.
 [Visit here to see a full list of OpenAI models that LiteLLM supports.](https://docs.litellm.ai/docs/providers/openai#openai-chat-completion-models)
-If the model is not in the list, toggle `Advanced` options, and enter it in `Custom Model` (e.g. openai/&lt;model-name&gt; like `openai/gpt-4o`).
+If the model is not in the list, enable `Advanced` options, and enter it in `Custom Model` (e.g. openai/&lt;model-name&gt; like `openai/gpt-4o`).
 * `API Key` to your OpenAI API key. To find or create your OpenAI Project API Key, [see here](https://platform.openai.com/api-keys).

 ## Using OpenAI-Compatible Endpoints
@@ -17,7 +18,7 @@ Just as for OpenAI Chat completions, we use LiteLLM for OpenAI-compatible endpoi

 ## Using an OpenAI Proxy

-If you're using an OpenAI proxy, in the OpenHands UI through the Settings:
+If you're using an OpenAI proxy, in the OpenHands UI through the Settings under the `LLM` tab:
 1. Enable `Advanced` options
 2. Set the following:
   - `Custom Model` to openai/&lt;model-name&gt; (e.g. `openai/gpt-4o` or openai/&lt;proxy-prefix&gt;/&lt;model-name&gt;)
--- a/docs/modules/usage/llms/openrouter.md
+++ b/docs/modules/usage/llms/openrouter.md
@@ -1,12 +1,14 @@
 # OpenRouter

-OpenHands uses LiteLLM to make calls to chat models on OpenRouter. You can find their documentation on using OpenRouter as a provider [here](https://docs.litellm.ai/docs/providers/openrouter).
+OpenHands uses LiteLLM to make calls to chat models on OpenRouter. You can find their documentation on using 
+OpenRouter as a provider [here](https://docs.litellm.ai/docs/providers/openrouter).

 ## Configuration

-When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
+When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings under the `LLM` tab:
 * `LLM Provider` to `OpenRouter`
 * `LLM Model` to the model you will be using.
 [Visit here to see a full list of OpenRouter models](https://openrouter.ai/models).
-If the model is not in the list, toggle `Advanced` options, and enter it in `Custom Model` (e.g. openrouter/&lt;model-name&gt; like `openrouter/anthropic/claude-3.5-sonnet`).
+If the model is not in the list, enable `Advanced` options, and enter it in 
+`Custom Model` (e.g. openrouter/&lt;model-name&gt; like `openrouter/anthropic/claude-3.5-sonnet`).
 * `API Key` to your OpenRouter API key.
--- a/docs/modules/usage/llms/screenshots/1_select_power_user.png
+++ b/docs/modules/usage/llms/screenshots/1_select_power_user.png
--- a/docs/modules/usage/llms/screenshots/2_select_model.png
+++ b/docs/modules/usage/llms/screenshots/2_select_model.png
--- a/docs/modules/usage/llms/screenshots/3_select_devstral.png
+++ b/docs/modules/usage/llms/screenshots/3_select_devstral.png
--- a/docs/modules/usage/llms/screenshots/4_set_context_window.png
+++ b/docs/modules/usage/llms/screenshots/4_set_context_window.png
--- a/docs/modules/usage/llms/screenshots/5_copy_url.png
+++ b/docs/modules/usage/llms/screenshots/5_copy_url.png
--- a/docs/modules/usage/llms/screenshots/6_copy_to_get_model_name.png
+++ b/docs/modules/usage/llms/screenshots/6_copy_to_get_model_name.png
--- a/docs/modules/usage/mcp.md
+++ b/docs/modules/usage/mcp.md
@@ -13,9 +13,11 @@ or custom tools. MCP is based on the open standard defined at [modelcontextproto

 ## Configuration

-MCP configuration is defined in the `[mcp]` section of your `config.toml` file.
+MCP configuration can be defined in:
+* The OpenHands UI through the Settings under the `MCP` tab.
+* The `config.toml` file under the `[mcp]` section if not using the UI.

-### Configuration Example
+### Configuration Example via config.toml

 ```toml
 [mcp]
@@ -82,7 +84,7 @@ Stdio servers are configured using an object with the following properties:

 When OpenHands starts, it:

-1. Reads the MCP configuration from `config.toml`.
+1. Reads the MCP configuration.
 2. Connects to any configured SSE servers.
 3. Starts any configured stdio servers.
 4. Registers the tools provided by these servers with the agent.
--- a/docs/modules/usage/prompting/microagents-org.md
+++ b/docs/modules/usage/prompting/microagents-org.md
@@ -0,0 +1,23 @@
+# Organization and User Microagents
+
+## Purpose
+
+Organizations and users can define microagents that apply to all repositories belonging to the organization or user.
+
+## Usage
+
+These microagents can be [any type of microagent](./microagents-overview#microagent-types) and will be loaded 
+accordingly. However, they are applied to all repositories belonging to the organization or user.
+
+Create a `.openhands` repository under the organization or user, add a `microagents` directory to it, and place the
+microagents in that directory.
+
+## Example
+
+General microagent file example for organization `Great-Co` located inside the `.openhands` repository:
+`microagents/org-microagent.md`:
+```
+* Use type hints and error boundaries; validate inputs at system boundaries and fail with meaningful error messages.
+* Document interfaces and public APIs; use implementation comments only for non-obvious logic.
+* Follow the same naming convention for variables, classes, constants, etc. already used in each repository.
+```
--- a/docs/modules/usage/prompting/microagents-overview.md
+++ b/docs/modules/usage/prompting/microagents-overview.md
@@ -7,7 +7,7 @@ They provide expert guidance, automate common tasks, and ensure consistent pract

 Currently OpenHands supports the following types of microagents:

-- [General Repository Microagents](./microagents-repo): General guidelines for OpenHands about the repository.
+- [General Microagents](./microagents-repo): General guidelines for OpenHands about the repository.
 - [Keyword-Triggered Microagents](./microagents-keyword): Guidelines activated by specific keywords in prompts.

 To customize OpenHands' behavior, create a .openhands/microagents/ directory in the root of your repository and
@@ -24,7 +24,7 @@ Example repository structure:
 some-repository/
 └── .openhands/
    └── microagents/
-        └── repo.md            # General repository guidelines
+        └── repo.md            # General guidelines
        └── trigger_this.md    # Microagent triggered by specific keywords
        └── trigger_that.md    # Microagent triggered by specific keywords
 ```
@@ -34,7 +34,7 @@ some-repository/
 Each microagent file may include frontmatter that provides additional information. In some cases, this frontmatter
 is required:

-| Microagent Type                  | Required |
-|----------------------------------|----------|
-| `General Repository Microagents` | No       |
-| `Keyword-Triggered Microagents`  | Yes      |
+| Microagent Type                 | Required |
+|---------------------------------|----------|
+| `General Microagents`           | No       |
+| `Keyword-Triggered Microagents` | Yes      |
--- a/docs/modules/usage/prompting/microagents-repo.md
+++ b/docs/modules/usage/prompting/microagents-repo.md
@@ -1,4 +1,4 @@
-# General Repository Microagents
+# General Microagents

 ## Purpose

@@ -20,7 +20,7 @@ Frontmatter should be enclosed in triple dashes (---) and may include the follow

 ## Example

-General repository microagent file example located at `.openhands/microagents/repo.md`:
+General microagent file example located at `.openhands/microagents/repo.md`:
 ```
 This project is a TODO application that allows users to track TODO items.

@@ -28,4 +28,4 @@ To set it up, you can run `npm run build`.
 Always make sure the tests are passing before committing changes. You can run the tests by running `npm run test`.
 ```

-[See more examples of general repository microagents here.](https://github.com/All-Hands-AI/OpenHands/tree/main/.openhands/microagents)
+[See more examples of general microagents here.](https://github.com/All-Hands-AI/OpenHands/tree/main/.openhands/microagents)
--- a/docs/modules/usage/runtimes/local.md
+++ b/docs/modules/usage/runtimes/local.md
@@ -13,14 +13,16 @@ files on your machine. Only use this runtime in controlled environments or when
 Before using the Local Runtime, ensure that:

 1. You can run OpenHands using the [Development workflow](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).
-2. tmux is available on your system.
+2. For Linux and Mac, tmux is available on your system.
+3. For Windows, PowerShell is available on your system.
+    - Only [CLI mode](../how-to/cli-mode) and [headless mode](../how-to/headless-mode) are supported on Windows with the Local Runtime.

 ## Configuration

 To use the Local Runtime, besides required configurations like the LLM provider, model and API key, you'll need to set
 the following options via environment variables or the [config.toml file](https://github.com/All-Hands-AI/OpenHands/blob/main/config.template.toml) when starting OpenHands:

-Via environment variables:
+Via environment variables (on Windows, use the equivalent PowerShell syntax):

 ```bash
 # Required
@@ -65,4 +67,4 @@ The Local Runtime is particularly useful for:

 - CI/CD pipelines where Docker is not available.
 - Testing and development of OpenHands itself.
- Environments where container usage is restricted.
+- Environments where container usage is restricted (e.g. native Windows).
--- a/docs/sidebars.ts
+++ b/docs/sidebars.ts
@@ -70,7 +70,7 @@ const sidebars: SidebarsConfig = {
            },
            {
              type: 'doc',
-              label: 'General Repository Microagents',
+              label: 'General Microagents',
              id: 'usage/prompting/microagents-repo',
            },
            {
@@ -78,6 +78,11 @@ const sidebars: SidebarsConfig = {
              label: 'Keyword-Triggered Microagents',
              id: 'usage/prompting/microagents-keyword',
            },
+            {
+              type: 'doc',
+              label: 'Organization and User Microagents',
+              id: 'usage/prompting/microagents-org',
+            },
            {
              type: 'doc',
              label: 'Global Microagents',
@@ -267,6 +272,11 @@ const sidebars: SidebarsConfig = {
          label: 'Evaluation',
          id: 'usage/how-to/evaluation-harness',
        },
+        {
+          type: 'doc',
+          label: 'WebSocket Connection',
+          id: 'usage/how-to/websocket-connection',
+        },
      ],
    },
    {
--- a/docs/static/openapi.json
+++ b/docs/static/openapi.json
@@ -876,6 +876,11 @@
                    "type": "string",
                    "nullable": true
                  },
+                  "conversation_instructions": {
+                    "type": "string",
+                    "nullable": true,
+                    "description": "Optional instructions the agent must follow throughout the conversation while addressing the user's initial task"
+                  },
                  "image_urls": {
                    "type": "array",
                    "items": {
--- a/evaluation/README.md
+++ b/evaluation/README.md
@@ -42,6 +42,37 @@ api_key = "XXX"
 temperature = 0.0
 ```

+### Configuring Condensers for Evaluation
+
+For benchmarks that support condenser configuration (like SWE-Bench), you can define multiple condenser configurations in your `config.toml` file. A condenser is responsible for managing conversation history to maintain context while staying within token limits - you can learn more about how it works [here](https://www.all-hands.dev/blog/openhands-context-condensensation-for-more-efficient-ai-agents):
+
+```toml
+# LLM-based summarizing condenser for evaluation
+[condenser.summarizer_for_eval]
+type = "llm"
+llm_config = "haiku"  # Reference to an LLM config to use for summarization
+keep_first = 2        # Number of initial events to always keep
+max_size = 100        # Maximum size of history before triggering summarization
+
+# Recent events condenser for evaluation
+[condenser.recent_for_eval]
+type = "recent"
+keep_first = 2        # Number of initial events to always keep
+max_events = 50       # Maximum number of events to keep in history
+```
+
+You can then specify which condenser configuration to use when running evaluation scripts, for example:
+
+```bash
+EVAL_CONDENSER=summarizer_for_eval \
+./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 500 100 1 princeton-nlp/SWE-bench_Verified test
+```
+
+The name is up to you, but it must match a condenser section defined in your `config.toml` file. The `EVAL_CONDENSER` environment variable specifies the condenser configuration to use. In this case, `summarizer_for_eval` is used, which refers to the LLM-based summarizing condenser defined above.
+
+If no condenser configuration is specified, the 'noop' condenser will be used by default, which keeps the full conversation history.
+
+
 For other configurations specific to evaluation, such as `save_trajectory_path`, these are typically set in the `get_config` function of the respective `run_infer.py` file for each benchmark.

 ## Supported Benchmarks
--- a/evaluation/benchmarks/scienceagentbench/Dockerfile.evaluator
+++ b/evaluation/benchmarks/scienceagentbench/Dockerfile.evaluator
@@ -17,7 +17,7 @@ RUN git checkout 4eddc7db6449a5ade3e37285747c8b208cd54ce7
 RUN micromamba create -n sci-agent python=3.10 pip setuptools wheel
 RUN micromamba run -n sci-agent pip install -r requirements.txt

-# Replace all occurence of conda with micromamba under the /workspace
+# Replace all occurrences of conda with micromamba under the /workspace
 RUN find ./ -type f -exec sed -i 's/conda/micromamba/g' {} \;

 # pushd evaluation/scienceagentbench
--- a/evaluation/benchmarks/swe_bench/README.md
+++ b/evaluation/benchmarks/swe_bench/README.md
@@ -45,7 +45,7 @@ For example, for instance ID `django_django-11011`, it will try to pull our pre-
 This image will be used create an OpenHands runtime image where the agent will operate on.

 ```bash
-./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
+./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] [n_runs] [mode]

 # Example
 ./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 500 100 1 princeton-nlp/SWE-bench_Verified test
@@ -63,19 +63,26 @@ to `CodeActAgent`.
 default, the script evaluates the entire SWE-bench_Lite test set (300 issues). Note:
 in order to use `eval_limit`, you must also set `agent`.
 - `max_iter`, e.g. `20`, is the maximum number of iterations for the agent to run. By
-default, it is set to 60.
+default, it is set to 100.
 - `num_workers`, e.g. `3`, is the number of parallel workers to run the evaluation. By
 default, it is set to 1.
 - `dataset`, a huggingface dataset name. e.g. `princeton-nlp/SWE-bench`, `princeton-nlp/SWE-bench_Lite`, `princeton-nlp/SWE-bench_Verified`, or `princeton-nlp/SWE-bench_Multimodal`, specifies which dataset to evaluate on.
 - `dataset_split`, split for the huggingface dataset. e.g., `test`, `dev`. Default to `test`.

+- `n_runs`, e.g. `3`, is the number of times to run the evaluation. Default is 1.
+- `mode`, e.g. `swt`, `swt-ci`, or `swe`, specifies the evaluation mode. Default is `swe`.
+
 > [!CAUTION]
 > Setting `num_workers` larger than 1 is not officially tested, YMMV.

-There is also one optional environment variable you can set.
+There are also optional environment variables you can set:

 ```bash
-export USE_HINT_TEXT=true # if you want to use hint text in the evaluation. Default to false. Ignore this if you are not sure.
+# Use hint text in the evaluation (default: false)
+export USE_HINT_TEXT=true # Ignore this if you are not sure.
+
+# Specify a condenser configuration for memory management (default: NoOpCondenser)
+export EVAL_CONDENSER=summarizer_for_eval # Name of the condenser config group in config.toml
 ```

 Let's say you'd like to run 10 instances using `llm.eval_gpt4_1106_preview` and CodeActAgent,
@@ -102,9 +109,9 @@ Fill out [this form](https://docs.google.com/forms/d/e/1FAIpQLSckVz_JFwg2_mOxNZj
 ```bash
 ./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]

-# Example - This runs evaluation on CodeActAgent for 300 instances on "princeton-nlp/SWE-bench_Lite"'s test set, with max 30 iteration per instances, with 16 number of workers running in parallel
+# Example - This runs evaluation on CodeActAgent for 300 instances on "princeton-nlp/SWE-bench_Lite"'s test set, with a maximum of 100 iterations per instance and 16 workers running in parallel
 ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
-./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test
+./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 100 16 "princeton-nlp/SWE-bench_Lite" test
 ```

 To clean-up all existing runtime you've already started, run:
@@ -176,7 +183,7 @@ Fill out [this form](https://docs.google.com/forms/d/e/1FAIpQLSckVz_JFwg2_mOxNZj

 # Example - This evaluates patches generated by CodeActAgent on Llama-3.1-70B-Instruct-Turbo on "princeton-nlp/SWE-bench_Lite"'s test set, with 16 number of workers running in parallel
 ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
-evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/swe-bench-lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
+evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/swe-bench-lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_100_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
 ```

 To clean-up all existing runtimes that you've already started, run:
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -44,6 +44,8 @@ from openhands.core.config import (
    get_llm_config_arg,
    get_parser,
 )
+from openhands.core.config.utils import get_condenser_config_arg
+from openhands.core.config.condenser_config import NoOpCondenserConfig
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.critic import AgentFinishedCritic
@@ -261,6 +263,7 @@ def get_config(
        enable_jupyter=False,
        enable_browsing=RUN_WITH_BROWSING,
        enable_llm_editor=False,
+        enable_mcp=False,
        condenser=metadata.condenser_config,
        enable_prompt_extensions=False,
    )
@@ -714,6 +717,19 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
                subset = dataset[dataset[filter_column].isin(selected_ids)]
                logger.info(f'Retained {subset.shape[0]} tasks after filtering')
                return subset
+            if 'selected_repos' in data:
+                # repos for the swe-bench instances:
+                # ['astropy/astropy', 'django/django', 'matplotlib/matplotlib', 'mwaskom/seaborn', 'pallets/flask', 'psf/requests', 'pydata/xarray', 'pylint-dev/pylint', 'pytest-dev/pytest', 'scikit-learn/scikit-learn', 'sphinx-doc/sphinx', 'sympy/sympy']
+                selected_repos = data['selected_repos']
+                if isinstance(selected_repos, str): selected_repos = [selected_repos]
+                assert isinstance(selected_repos, list)
+                logger.info(
+                    f'Filtering {selected_repos} tasks from "selected_repos"...'
+                )
+                subset = dataset[dataset["repo"].isin(selected_repos)]
+                logger.info(f'Retained {subset.shape[0]} tasks after filtering')
+                return subset
+                
    skip_ids = os.environ.get('SKIP_IDS', '').split(',')
    if len(skip_ids) > 0:
        logger.info(f'Filtering {len(skip_ids)} tasks from "SKIP_IDS"...')
@@ -742,6 +758,7 @@ if __name__ == '__main__':
        choices=['swe', 'swt', 'swt-ci'],
        help="mode to run the evaluation, either 'swe', 'swt', or 'swt-ci'",
    )
+
    args, _ = parser.parse_known_args()

    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
@@ -778,6 +795,19 @@ if __name__ == '__main__':
    if llm_config is None:
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

+    # Get condenser config from environment variable
+    condenser_name = os.environ.get('EVAL_CONDENSER')
+    if condenser_name:
+        condenser_config = get_condenser_config_arg(condenser_name)
+        if condenser_config is None:
+            raise ValueError(
+                f'Could not find Condenser config: EVAL_CONDENSER={condenser_name}'
+            )
+    else:
+        # If no specific condenser config is provided via env var, default to NoOpCondenser
+        condenser_config = NoOpCondenserConfig()
+        logger.debug('No Condenser config provided via EVAL_CONDENSER, using NoOpCondenser.')
+
    details = {'mode': args.mode}
    _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)

@@ -792,6 +822,7 @@ if __name__ == '__main__':
        args.eval_note,
        args.eval_output_dir,
        details=details,
+        condenser_config=condenser_config,
    )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
--- a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh
@@ -14,6 +14,7 @@ SPLIT=$8
 N_RUNS=$9
 MODE=${10}

+
 if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1
  echo "Number of workers not specified, use default $NUM_WORKERS"
@@ -26,8 +27,8 @@ if [ -z "$AGENT" ]; then
 fi

 if [ -z "$MAX_ITER" ]; then
-  echo "MAX_ITER not specified, use default 60"
-  MAX_ITER=60
+  echo "MAX_ITER not specified, use default 100"
+  MAX_ITER=100
 fi

 if [ -z "$RUN_WITH_BROWSING" ]; then
@@ -51,6 +52,12 @@ if [ -z "$MODE" ]; then
  echo "MODE not specified, use default $MODE"
 fi

+if [ -n "$EVAL_CONDENSER" ]; then
+  echo "Using Condenser Config: $EVAL_CONDENSER"
+else
+  echo "No Condenser Config provided via EVAL_CONDENSER, use default (NoOpCondenser)."
+fi
+
 export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
 echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"

@@ -65,6 +72,7 @@ echo "MAX_ITER: $MAX_ITER"
 echo "NUM_WORKERS: $NUM_WORKERS"
 echo "COMMIT_HASH: $COMMIT_HASH"
 echo "MODE: $MODE"
+echo "EVAL_CONDENSER: $EVAL_CONDENSER"

 # Default to NOT use Hint
 if [ -z "$USE_HINT_TEXT" ]; then
@@ -88,6 +96,10 @@ fi
 if [ "$MODE" != "swe" ]; then
  EVAL_NOTE="${EVAL_NOTE}-${MODE}"
 fi
+# Add condenser config to eval note if provided
+if [ -n "$EVAL_CONDENSER" ]; then
+  EVAL_NOTE="${EVAL_NOTE}-${EVAL_CONDENSER}"
+fi

 function run_eval() {
  local eval_note="${1}"
@@ -101,6 +113,8 @@ function run_eval() {
    --split $SPLIT \
    --mode $MODE"

+
+
  if [ -n "$EVAL_LIMIT" ]; then
    echo "EVAL_LIMIT: $EVAL_LIMIT"
    COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
--- a/evaluation/benchmarks/visual_swe_bench/README.md
+++ b/evaluation/benchmarks/visual_swe_bench/README.md
@@ -0,0 +1,172 @@
+# Visual SWE-Bench Evaluation with Docker Image
+
+This folder contains the evaluation harness that we built on top of the original [Visual SWE-Bench benchmark](https://multi-swe-bench.github.io/#/) ([paper](https://arxiv.org/abs/2412.17315)).
+
+The evaluation consists of three steps:
+
+1. Environment setup: [install python environment](../../README.md#development-environment), [configure LLM config](../../README.md#configure-openhands-and-your-llm), and [pull docker](#openhands-visual-swe-bench-instance-level-docker-support).
+2. [Run inference](#run-inference-on-visual-swe-bench-instances): Generate an edit patch for each GitHub issue.
+3. [Evaluate patches using Visual SWE-Bench docker](#evaluate-generated-patches).
+
+## Setup Environment and LLM Configuration
+
+Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.
+
+## OpenHands Visual SWE-Bench Instance-level Docker Support
+
+OpenHands now supports using the official evaluation docker for both **[inference](#run-inference-on-visual-swe-bench-instances) and [evaluation](#evaluate-generated-patches)**.
+This is now the default behavior.
+
+## Run Inference on Visual SWE-Bench Instances
+
+Make sure your Docker daemon is running, and you have ample disk space for the [instance-level docker image](#openhands-visual-swe-bench-instance-level-docker-support).
+
+When the `run_infer.sh` script is started, it will automatically pull the relevant Visual SWE-Bench images. For example, for instance ID `networkx__networkx-6503`, it will try to pull our pre-built docker image `sweb.eval.x86_64.networkx_s_networkx-6503` from DockerHub. This image will be used to create an OpenHands runtime image in which the agent will operate.
+
+```bash
+./evaluation/benchmarks/visual_swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers]
+
+# Example
+./evaluation/benchmarks/visual_swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 133 30 1
+```
+
+where `model_config` is mandatory, and the rest are optional.
+
+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
+LLM settings, as defined in your `config.toml`.
+- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version you would
+like to evaluate. It could also be a release tag like `0.6.2`.
+- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
+to `CodeActAgent`.
+- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By
+default, the script evaluates the entire Visual SWE-bench set (133 issues). Note:
+in order to use `eval_limit`, you must also set `agent`.
+- `max_iter`, e.g. `20`, is the maximum number of iterations for the agent to run. By
+default, it is set to 30.
+- `num_workers`, e.g. `3`, is the number of parallel workers to run the evaluation. By
+default, it is set to 1.
+
+There are also two optional environment variables you can set.
+
+```bash
+export USE_HINT_TEXT=true # if you want to use hint text in the evaluation. Default to false. Ignore this if you are not sure.
+export USE_INSTANCE_IMAGE=true # if you want to use instance-level docker images. Default to true
+```
+
+Let's say you'd like to run 10 instances using `llm.eval_gpt4_1106_preview` and CodeActAgent,
+
+then your command would be:
+
+```bash
+./evaluation/benchmarks/visual_swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10
+```
+
+### Specify a subset of tasks to run infer
+
+If you would like to specify a list of tasks you'd like to benchmark on, you could
+create a `config.toml` under `./evaluation/benchmarks/visual_swe_bench/` folder, and put a list
+attribute named `selected_ids`, e.g.
+
+```toml
+selected_ids = ['astropy__astropy-13838', 'matplotlib__matplotlib-21617', 'plotly__plotly.py-1966']
+```
+
+Then only these tasks (rows whose `instance_id` is in the above list) will be evaluated.
+In this case, `eval_limit` option applies to tasks that are in the `selected_ids` list.
+
+After running the inference, you will obtain an `output.jsonl` file (by default it will be saved to `evaluation/evaluation_outputs`).
+
+## Evaluate Generated Patches
+
+### Download Docker Images
+
+**(Recommended for reproducibility)** If you have extra local space (e.g., 200GB), you can pull the instance-level docker images we've prepared by running:
+
+```bash
+evaluation/benchmarks/visual_swe_bench/scripts/docker/pull_all_eval_docker.sh instance
+```
+
+If you want to save disk space a bit, while speeding up the image pre-build process, you can pull the environment-level docker images:
+
+```bash
+evaluation/benchmarks/visual_swe_bench/scripts/docker/pull_all_eval_docker.sh env
+```
+
+If you want to evaluate on the full SWE-Bench test set:
+
+```bash
+evaluation/benchmarks/visual_swe_bench/scripts/docker/pull_all_eval_docker.sh instance full
+```
+
+### Run evaluation
+
+With `output.jsonl` file, you can run `eval_infer.sh` to evaluate generated patches, and produce a fine-grained report.
+
+**This evaluation is performed using the official dockerized evaluation.**
+
+> If you want to evaluate existing results, you should first run this to clone existing outputs
+>
+>```bash
+>git clone https://huggingface.co/spaces/OpenHands/evaluation evaluation/evaluation_outputs
+>```
+
+NOTE, you should have already pulled the instance-level OR env-level docker images following [this section](#download-docker-images).
+
+Then you can run the following:
+
+```bash
+./evaluation/benchmarks/visual_swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL [instance_id]
+
+# Example
+./evaluation/benchmarks/visual_swe_bench/scripts/eval_infer.sh evaluation/evaluation_outputs/outputs/luolin101__Visual-SWE-bench-test/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl
+```
+
+The script accepts one optional argument:
+
+- `instance_id`: Specify a single instance to evaluate (optional)
+
+For example, to evaluate a specific instance:
+
+```bash
+./evaluation/benchmarks/visual_swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL instance_123
+```
+
+> You can also pass in a JSONL with SWE-Bench format to `./evaluation/benchmarks/visual_swe_bench/scripts/eval_infer.sh`, where each line is a JSON of `{"model_patch": "XXX", "model_name_or_path": "YYY", "instance_id": "ZZZ"}`.
+
+The final results will be saved to `evaluation/evaluation_outputs/outputs/visual_swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/` with the following files/directory:
+
+- `README.md`: a report showing which instances passed, failed, etc.
+- `report.json`: a JSON file that contains keys like `"resolved_ids"` pointing to instance IDs that are resolved by the agent.
+- `logs/`: a directory of test logs
+
+## Visualize Results
+
+First you need to clone `https://huggingface.co/spaces/OpenHands/evaluation` and add your own results from OpenHands to the `outputs` directory of the cloned repo.
+
+```bash
+git clone https://huggingface.co/spaces/OpenHands/evaluation
+```
+
+**(optional) setup streamlit environment with conda**:
+
+```bash
+cd evaluation
+conda create -n streamlit python=3.10
+conda activate streamlit
+pip install -r requirements.txt
+```
+
+**run the visualizer**:
+Then, in a separate Python environment with the `streamlit` library installed, you can run the following:
+
+```bash
+# Make sure you are inside the cloned `evaluation` repo
+conda activate streamlit # if you follow the optional conda env setup above
+streamlit run app.py --server.port 8501 --server.address 0.0.0.0
+```
+
+Then you can access the SWE-Bench trajectory visualizer at `localhost:8501`.
+
+## Submit your evaluation results
+
+You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
--- a/evaluation/benchmarks/visual_swe_bench/init.py
+++ b/evaluation/benchmarks/visual_swe_bench/init.py
--- a/evaluation/benchmarks/visual_swe_bench/run_infer.py
+++ b/evaluation/benchmarks/visual_swe_bench/run_infer.py
@@ -0,0 +1,641 @@
+import asyncio
+import json
+import os
+import tempfile
+from typing import Any
+
+import pandas as pd
+import toml
+from datasets import load_dataset
+
+import openhands.agenthub
+from evaluation.benchmarks.swe_bench.resource.mapping import (
+    get_instance_resource_factor,
+)
+from evaluation.utils.shared import (
+    EvalException,
+    EvalMetadata,
+    EvalOutput,
+    assert_and_raise,
+    codeact_user_response,
+    get_default_sandbox_config_for_eval,
+    get_metrics,
+    is_fatal_evaluation_error,
+    make_metadata,
+    prepare_dataset,
+    reset_logger_for_multiprocessing,
+    run_evaluation,
+    update_llm_config_for_completions_logging,
+)
+from openhands.controller.state.state import State
+from openhands.core.config import (
+    AgentConfig,
+    AppConfig,
+    get_llm_config_arg,
+    get_parser,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import CmdRunAction, MessageAction
+from openhands.events.observation import CmdOutputObservation, ErrorObservation
+from openhands.events.serialization.event import event_to_dict
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
+from openhands.utils.shutdown_listener import sleep_if_should_continue
+
+# Feature flags, read once at import time from the environment. Each flag is on
+# only when its variable is set to 'true' (case-insensitive); the default is off.
+# NOTE(review): USE_HINT_TEXT does not appear to be read anywhere else in this module.
+USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
+# Drives AgentConfig.enable_browsing and the extra "never browse the web" prompt note.
+RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
+
+
+# Maps an agent class name to the function passed to run_controller as
+# fake_user_response_fn; process_instance indexes it with metadata.agent_class.
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+}
+
+
+def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:
+    """Return the workspace directory name for an instance.
+
+    The name is '<repo>__<version>' with every '/' replaced by '__'.
+    """
+    raw_name = '{}__{}'.format(instance.repo, instance.version)
+    return '__'.join(raw_name.split('/'))
+
+
+def get_instruction(instance: pd.Series, metadata: EvalMetadata):
+    """Build the initial task prompt sent to the agent for one instance.
+
+    The prompt names the uploaded repository directory, embeds the instance's
+    problem statement, and lists the steps the agent is asked to follow,
+    ending with a review against the instance's base commit. When
+    RUN_WITH_BROWSING is set, a note forbidding web browsing is appended.
+
+    Args:
+        instance: Row providing `repo`, `version`, `problem_statement` and
+            `base_commit`.
+        metadata: Evaluation metadata; not read by this function.
+
+    Returns:
+        The complete instruction text.
+    """
+    repo_dir = _get_swebench_workspace_dir_name(instance)
+    # Instruction based on Anthropic's official trajectory
+    # https://github.com/eschluntz/swe-bench-experiments/tree/main/evaluation/verified/20241022_tools_claude-3-5-sonnet-updated/trajs
+    prompt_parts = [
+        '<uploaded_files>\n',
+        f'/workspace/{repo_dir}\n',
+        '</uploaded_files>\n',
+        f"I've uploaded a python code repository in the directory {repo_dir}. Consider the following issue description:\n\n",
+        '<issue_description>\n',
+        f'{instance.problem_statement}\n',
+        '</issue_description>\n\n',
+        'Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n',
+        "I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n",
+        "Also the development Python environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.\n",
+        'Your task is to make the minimal changes to non-test files in the /workspace directory to ensure the <issue_description> is satisfied.\n',
+        'Follow these steps to resolve the issue:\n',
+        '1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.\n',
+        '2. Create a script to reproduce the error and execute it with `python <filename.py>` using the BashTool, to confirm the error\n',
+        '3. Edit the sourcecode of the repo to resolve the issue\n',
+        '4. Rerun your reproduce script and confirm that the error is fixed!\n',
+        '5. Think about edgecases, add comprehensive tests for them in your reproduce script, and run them to make sure your fix handles them as well\n',
+        f'6. Once you are done with the initial implementation, please carefully re-read the problem description and check the difference between the current code and the base commit {instance["base_commit"]}. Do you think that the issue has been completely and comprehensively solved? Write tests to check the correctness of the solution, specifically focusing on tests that may point out any remaining problems that are not yet solved. Run all of the tests in the repo and check if any of them fail, and if they do fix the code. Repeat this process of carefully reading the problem description and current implementation, testing, and fixing any problems until you are confident that the current implementation is correct. Find and run any tests in the repo that are related to:\n',
+        '   - The issue you are fixing\n',
+        '   - The files you modified\n',
+        '   - The functions you changed\n',
+        '   Make sure all these tests pass with your changes.\n',
+        "Your thinking should be thorough and so it's fine if it's very long.\n",
+    ]
+
+    if RUN_WITH_BROWSING:
+        prompt_parts.append(
+            '<IMPORTANT!>\nYou SHOULD NEVER attempt to browse the web. </IMPORTANT!>\n'
+        )
+    return ''.join(prompt_parts)
+
+
+# TODO: migrate all swe-bench docker to ghcr.io/openhands
+# Registry/namespace prefix used when building per-instance image names.
+# Override it with the EVAL_DOCKER_IMAGE_PREFIX environment variable.
+DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/')
+# Logged once at import time so the run log records which prefix is in effect.
+logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')
+
+
+def get_instance_docker_image(instance_id: str, official_image: bool = False) -> str:
+    """Return the lower-cased docker image reference for an instance.
+
+    The image name is 'sweb.eval.x86_64.<instance_id>' with '__' replaced by
+    '_s_'. A fixed set of instances is served from 'docker.io/luolin101';
+    every other instance uses DOCKER_IMAGE_PREFIX.
+
+    Args:
+        instance_id: Benchmark instance identifier, e.g. 'vega__altair-974'.
+        official_image: Accepted for interface compatibility; not read here.
+
+    Returns:
+        '<prefix>/<image name>', lower-cased.
+    """
+    # '__' is replaced to comply with docker image naming convention
+    image_name = ('sweb.eval.x86_64.' + instance_id).replace('__', '_s_')
+
+    # Instances whose images live under the luolin101 namespace.
+    luolin101_instance_ids = {
+        'plotly__plotly.py-4083',
+        'plotly__plotly.py-2600',
+        'plotly__plotly.py-2591',
+        'plotly__plotly.py-1966',
+        'networkx__networkx-6503',
+        'networkx__networkx-6098',
+        'networkx__networkx-5616',
+        'networkx__networkx-5354',
+        'networkx__networkx-5058',
+        'networkx__networkx-4378',
+        'networkx__networkx-3764',
+        'vega__altair-2785',
+        'vega__altair-1092',
+        'vega__altair-974',
+        'vega__altair-830',
+        'matplotlib__matplotlib-27754',
+        'matplotlib__matplotlib-26926',
+        'matplotlib__matplotlib-26788',
+        'matplotlib__matplotlib-26586',
+        'sympy__sympy-26941',
+        'mwaskom__seaborn-3458',
+        'mwaskom__seaborn-3454',
+    }
+    if instance_id in luolin101_instance_ids:
+        registry_prefix = 'docker.io/luolin101'
+    else:
+        registry_prefix = DOCKER_IMAGE_PREFIX.rstrip('/')
+    return f'{registry_prefix}/{image_name}'.lower()
+
+
+def get_config(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+) -> AppConfig:
+    """Assemble the AppConfig used to run the agent on a single instance.
+
+    Selects the per-instance container image, configures the sandbox
+    (auto-lint on, host network off, linux/amd64 platform, per-instance
+    resource factor), attaches the completions-logging LLM config, and sets
+    an agent config with Jupyter, the LLM editor and prompt extensions off.
+
+    Args:
+        instance: Row providing at least `instance_id`.
+        metadata: Evaluation metadata (dataset, agent class, iteration limit,
+            LLM config, output directory, condenser config).
+
+    Returns:
+        The fully populated AppConfig.
+    """
+    instance_id = instance['instance_id']
+    dataset_lower = metadata.dataset.lower()
+
+    # We use a different instance image for the each instance of swe-bench eval
+    use_official_image = any(tag in dataset_lower for tag in ('verified', 'lite'))
+    base_container_image = get_instance_docker_image(instance_id, use_official_image)
+    logger.info(
+        f'Using instance container image: {base_container_image}. '
+        f'Please make sure this image exists. '
+        f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
+    )
+
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = base_container_image
+    sandbox_config.enable_auto_lint = True
+    sandbox_config.use_host_network = False
+    # Add platform to the sandbox config to solve issue 4401
+    sandbox_config.platform = 'linux/amd64'
+    sandbox_config.remote_runtime_resource_factor = get_instance_resource_factor(
+        dataset_name=metadata.dataset,
+        instance_id=instance_id,
+    )
+
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_openhands=False,
+        max_iterations=metadata.max_iterations,
+        runtime=os.environ.get('RUNTIME', 'docker'),
+        sandbox=sandbox_config,
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+
+    llm_config = update_llm_config_for_completions_logging(
+        metadata.llm_config, metadata.eval_output_dir, instance_id
+    )
+    config.set_llm_config(llm_config)
+
+    config.set_agent_config(
+        AgentConfig(
+            enable_jupyter=False,
+            enable_browsing=RUN_WITH_BROWSING,
+            enable_llm_editor=False,
+            condenser=metadata.condenser_config,
+            enable_prompt_extensions=False,
+        )
+    )
+    return config
+
+
+def _run_init_command(
+    runtime: Runtime,
+    command: str,
+    failure_prefix: str,
+    required_output: str | None = None,
+):
+    """Run one setup command in the sandbox and fail the instance if it did not succeed.
+
+    The action and its observation are both logged. Success means the
+    observation is a CmdOutputObservation with exit code 0 and, when
+    `required_output` is given, that substring appears in the output.
+
+    Args:
+        runtime: Sandbox runtime to execute the command in.
+        command: Shell command line.
+        failure_prefix: Text placed before ': <observation>' in the failure message.
+        required_output: Optional substring that must be present in the output.
+
+    Returns:
+        The observation produced by the command.
+    """
+    action = CmdRunAction(command=command)
+    action.set_hard_timeout(600)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+    failure_msg = f'{failure_prefix}: {str(obs)}'
+    if isinstance(obs, ErrorObservation):
+        logger.error(failure_msg)
+
+    # Check the observation type before touching exit_code, matching
+    # complete_runtime, so an error observation is reported through
+    # assert_and_raise instead of failing on a missing attribute.
+    succeeded = isinstance(obs, CmdOutputObservation) and obs.exit_code == 0
+    if succeeded and required_output is not None:
+        succeeded = required_output in obs.content
+    assert_and_raise(succeeded, failure_msg)
+    return obs
+
+
+def initialize_runtime(
+    runtime: Runtime,
+    instance: pd.Series,  # this argument is not required
+):
+    """Initialize the runtime for the agent.
+
+    This function is called before the runtime is used to run the agent.
+    It exports per-instance settings into ~/.bashrc, copies the instance JSON
+    and the instance entry script into /swe_util, sources both, moves into the
+    instance workspace, resets the git checkout, removes all git remotes, and
+    checks that `which python` resolves to a path containing 'testbed'.
+
+    Args:
+        runtime: Connected sandbox runtime.
+        instance: The benchmark instance (a pandas Series or a plain dict).
+    """
+    logger.info('-' * 30)
+    logger.info('BEGIN Runtime Initialization Fn')
+    logger.info('-' * 30)
+    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
+
+    # Set instance id
+    _run_init_command(
+        runtime,
+        f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc""",
+        'Failed to export SWE_INSTANCE_ID',
+    )
+    _run_init_command(
+        runtime,
+        """export USER=$(whoami); echo USER=${USER} """,
+        'Failed to export USER',
+    )
+
+    # Directory of this module; the instance entry script is located relative to it.
+    script_dir = os.path.dirname(__file__)
+
+    # inject the instance info
+    _run_init_command(
+        runtime,
+        'mkdir -p /swe_util/eval_data/instances',
+        'Failed to create /swe_util/eval_data/instances',
+    )
+
+    swe_instance_json_name = 'swe-bench-instance.json'
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Write under a temporary directory so the file carries the desired name
+        temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
+        with open(temp_file_path, 'w') as f:
+            if not isinstance(instance, dict):
+                json.dump([instance.to_dict()], f)
+            else:
+                json.dump([instance], f)
+
+        # Copy the file to the desired location
+        runtime.copy_to(temp_file_path, '/swe_util/eval_data/instances/')
+
+        # inject the instance swe entry
+        runtime.copy_to(
+            str(os.path.join(script_dir, 'scripts/setup/instance_swe_entry.sh')),
+            '/swe_util/',
+        )
+
+    _run_init_command(runtime, 'cat ~/.bashrc', 'Failed to cat ~/.bashrc')
+    _run_init_command(runtime, 'source ~/.bashrc', 'Failed to source ~/.bashrc')
+    _run_init_command(
+        runtime,
+        'source /swe_util/instance_swe_entry.sh',
+        'Failed to source /swe_util/instance_swe_entry.sh',
+    )
+    _run_init_command(
+        runtime,
+        f'cd /workspace/{workspace_dir_name}',
+        f'Failed to cd to /workspace/{workspace_dir_name}',
+    )
+    _run_init_command(runtime, 'git reset --hard', 'Failed to git reset --hard')
+    _run_init_command(
+        runtime,
+        'for remote_name in $(git remote); do git remote remove "${remote_name}"; done',
+        'Failed to remove git remotes',
+    )
+    # The interpreter must come from the 'testbed' environment.
+    _run_init_command(
+        runtime,
+        'which python',
+        'Expected to find python interpreter from testbed, but got',
+        required_output='testbed',
+    )
+
+    logger.info('-' * 30)
+    logger.info('END Runtime Initialization Fn')
+    logger.info('-' * 30)
+
+
+def complete_runtime(
+    runtime: Runtime,
+    instance: pd.Series,  # this argument is not required, but it is used to get the workspace_dir_name
+) -> dict[str, Any]:
+    """Complete the runtime for the agent.
+
+    This function is called after the agent has run (see process_instance).
+    If you need to do something in the sandbox to get the correctness metric after
+    the agent has run, modify this function.
+
+    It moves into the instance workspace, disables the git pager, removes any
+    nested `.git` directories, stages every change, and collects the staged
+    diff against the instance's base commit.
+
+    Args:
+        runtime: Sandbox runtime the agent ran in.
+        instance: The benchmark instance; provides `repo`, `version` and
+            `base_commit`.
+
+    Returns:
+        A dict with a single key, 'git_patch', holding the stripped diff text.
+    """
+    logger.info('-' * 30)
+    logger.info('BEGIN Runtime Completion Fn')
+    logger.info('-' * 30)
+    obs: CmdOutputObservation
+    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
+
+    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
+    action.set_hard_timeout(600)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+    if obs.exit_code == -1:
+        # The previous command is still running
+        # We need to kill previous command
+        logger.info('The previous command is still running, trying to kill it...')
+        action = CmdRunAction(command='C-c')
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+        # Then run the command again
+        action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
+        action.set_hard_timeout(600)
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+    assert_and_raise(
+        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
+        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
+    )
+
+    # Disable the pager so git output is returned directly instead of paged.
+    action = CmdRunAction(command='git config --global core.pager ""')
+    action.set_hard_timeout(600)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
+        f'Failed to git config --global core.pager "": {str(obs)}',
+    )
+
+    # First check for any git repositories in subdirectories
+    action = CmdRunAction(command='find . -type d -name .git -not -path "./.git"')
+    action.set_hard_timeout(600)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
+        f'Failed to find git repositories: {str(obs)}',
+    )
+
+    # One path per output line; blank lines are dropped.
+    git_dirs = [p for p in obs.content.strip().split('\n') if p]
+    if git_dirs:
+        # Remove all .git directories in subdirectories
+        for git_dir in git_dirs:
+            action = CmdRunAction(command=f'rm -rf "{git_dir}"')
+            action.set_hard_timeout(600)
+            logger.info(action, extra={'msg_type': 'ACTION'})
+            obs = runtime.run_action(action)
+            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+            assert_and_raise(
+                isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
+                f'Failed to remove git directory {git_dir}: {str(obs)}',
+            )
+
+    # add all files
+    action = CmdRunAction(command='git add -A')
+    action.set_hard_timeout(600)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
+        f'Failed to git add -A: {str(obs)}',
+    )
+
+    # Collect the staged diff against the base commit, trying up to five times.
+    n_retries = 0
+    git_patch = None
+    while n_retries < 5:
+        action = CmdRunAction(
+            command=f'git diff --no-color --cached {instance["base_commit"]}'
+        )
+        # With max(), the timeout is 600s for the first four attempts and 700s on the fifth.
+        # NOTE(review): the 300 + 100 * n_retries ramp never drops below 600 here - confirm max() is intended.
+        action.set_hard_timeout(max(300 + 100 * n_retries, 600))
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        n_retries += 1
+        if isinstance(obs, CmdOutputObservation):
+            if obs.exit_code == 0:
+                git_patch = obs.content.strip()
+                break
+            else:
+                logger.info('Failed to get git diff, retrying...')
+                sleep_if_should_continue(10)
+        elif isinstance(obs, ErrorObservation):
+            logger.error(f'Error occurred: {obs.content}. Retrying...')
+            sleep_if_should_continue(10)
+        else:
+            assert_and_raise(False, f'Unexpected observation type: {str(obs)}')
+
+    # git_patch is still None only if every attempt failed.
+    assert_and_raise(git_patch is not None, 'Failed to get git diff (None)')
+
+    logger.info('-' * 30)
+    logger.info('END Runtime Completion Fn')
+    logger.info('-' * 30)
+    return {'git_patch': git_patch}
+
+
+def process_instance(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+    reset_logger: bool = True,
+    runtime_failure_count: int = 0,
+) -> EvalOutput:
+    """Run the agent on one instance and package the result.
+
+    Builds the config, creates and connects a runtime, initializes it, runs
+    the controller with the generated instruction, then collects the git
+    patch. The runtime is always closed, even on failure.
+
+    Args:
+        instance: The benchmark instance to solve.
+        metadata: Evaluation metadata for this run.
+        reset_logger: When True, redirect logging to a per-instance file under
+            '<eval_output_dir>/infer_logs'.
+        runtime_failure_count: Number of earlier runtime failures for this
+            instance; each one doubles the remote resource factor, capped at 8.
+
+    Returns:
+        The EvalOutput for this instance.
+
+    Raises:
+        ValueError: If the controller returns no state.
+        EvalException: If the final state carries a fatal evaluation error.
+    """
+    config = get_config(instance, metadata)
+
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance.instance_id}.')
+
+    # Increase resource_factor with increasing attempt_id
+    if runtime_failure_count > 0:
+        config.sandbox.remote_runtime_resource_factor = min(
+            config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
+            8,
+        )
+        logger.warning(
+            f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
+        )
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
+
+    try:
+        initialize_runtime(runtime, instance)
+
+        instruction = get_instruction(instance, metadata)
+
+        # Here's how you can run the agent (similar to the `main` function) and get the final task state
+        state: State | None = asyncio.run(
+            run_controller(
+                config=config,
+                initial_user_action=MessageAction(content=instruction),
+                runtime=runtime,
+                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
+                    metadata.agent_class
+                ],
+            )
+        )
+
+        # Validate the state before reading any attribute from it, so a missing
+        # state surfaces as this explicit error and not as an attribute error.
+        if state is None:
+            raise ValueError('State should not be None.')
+
+        # if fatal error, throw EvalError to trigger re-run
+        if is_fatal_evaluation_error(state.last_error):
+            raise EvalException('Fatal error detected: ' + state.last_error)
+
+        # ======= THIS IS SWE-Bench specific =======
+        # Get git patch
+        return_val = complete_runtime(runtime, instance)
+        git_patch = return_val['git_patch']
+        logger.info(
+            f'Got git diff for instance {instance.instance_id}:\n--------\n{git_patch}\n--------'
+        )
+    finally:
+        runtime.close()
+    # ==========================================
+
+    # ======= Attempt to evaluate the agent's edits =======
+    # we use eval_infer.sh to evaluate the agent's edits, not here
+    # because the agent may alter the environment / testcases
+    test_result = {
+        'git_patch': git_patch,
+    }
+
+    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+
+    # NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events
+    histories = [event_to_dict(event) for event in state.history]
+    metrics = get_metrics(state)
+
+    # Save the output
+    output = EvalOutput(
+        instance_id=instance.instance_id,
+        instruction=instruction,
+        instance=instance.to_dict(),  # SWE Bench specific
+        test_result=test_result,
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+    )
+    return output
+
+
+def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
+    """Restrict the dataset by a `selected_ids` allow-list or a SKIP_IDS deny-list.
+
+    If a `config.toml` next to this module defines `selected_ids`, only rows
+    whose `filter_column` value is listed there are kept. Otherwise, rows whose
+    value appears in the comma-separated SKIP_IDS environment variable are
+    dropped. With neither configured, the dataset is returned unchanged.
+
+    Args:
+        dataset: Table of benchmark instances.
+        filter_column: Name of the column holding the instance identifier.
+
+    Returns:
+        The filtered table, or `dataset` itself when no filter applies.
+    """
+    file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.toml')
+    if os.path.exists(file_path):
+        with open(file_path, 'r') as file:
+            data = toml.load(file)
+        if 'selected_ids' in data:
+            selected_ids = data['selected_ids']
+            logger.info(f'Filtering {len(selected_ids)} tasks from "selected_ids"...')
+            subset = dataset[dataset[filter_column].isin(selected_ids)]
+            logger.info(f'Retained {subset.shape[0]} tasks after filtering')
+            return subset
+
+    # ''.split(',') yields [''], so empty entries must be dropped; otherwise an
+    # unset SKIP_IDS would still be treated as one id to skip.
+    skip_ids = [
+        skip_id.strip()
+        for skip_id in os.environ.get('SKIP_IDS', '').split(',')
+        if skip_id.strip()
+    ]
+    if skip_ids:
+        logger.info(f'Filtering {len(skip_ids)} tasks from "SKIP_IDS"...')
+        return dataset[~dataset[filter_column].isin(skip_ids)]
+    return dataset
+
+
+# A list of instances that are known to be tricky to infer
+# (will cause runtime failure even with resource factor = 8)
+# The __main__ section drops these only when the dataset name contains 'SWE-Gym'.
+SWEGYM_EXCLUDE_IDS = [
+    'dask__dask-10422',
+    'pandas-dev__pandas-50548',
+    'pandas-dev__pandas-53672',
+    'pandas-dev__pandas-54174',
+    'pandas-dev__pandas-55518',
+    'pandas-dev__pandas-58383',
+    'pydata__xarray-6721',
+    'pytest-dev__pytest-10081',
+    'pytest-dev__pytest-7236',
+]
+
+# Command-line entry point: load the dataset, resolve the LLM config, build the
+# run metadata, and evaluate every remaining instance with process_instance.
+if __name__ == '__main__':
+    parser = get_parser()
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        default='princeton-nlp/SWE-bench',
+        help='data set to evaluate on, either full-test or lite-test',
+    )
+    parser.add_argument(
+        '--split',
+        type=str,
+        default='test',
+        help='split to evaluate on',
+    )
+    args, _ = parser.parse_known_args()
+
+    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
+    # so we don't need to manage file uploading to OpenHands's repo
+    dataset = load_dataset(args.dataset, split=args.split)
+    swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
+    logger.info(
+        f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
+    )
+    if 'SWE-Gym' in args.dataset:
+        swe_bench_tests = swe_bench_tests[
+            ~swe_bench_tests['instance_id'].isin(SWEGYM_EXCLUDE_IDS)
+        ]
+        logger.info(
+            f'{len(swe_bench_tests)} tasks left after excluding SWE-Gym excluded tasks'
+        )
+
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+
+    # Check for a missing config before touching its attributes, so an unknown
+    # --llm_config name produces this error and not an attribute error.
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
+
+    llm_config.log_completions = True
+    # modify_params must be False for evaluation purpose, for reproducibility and accuracy of results
+    llm_config.modify_params = False
+
+    details = {}
+    # The result is not used further; the lookup is kept from the original code.
+    # NOTE(review): presumably this validates the agent class name early - confirm.
+    _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)
+
+    dataset_description = (
+        args.dataset.replace('/', '__') + '-' + args.split.replace('/', '__')
+    )
+    metadata = make_metadata(
+        llm_config,
+        dataset_description,
+        args.agent_cls,
+        args.max_iterations,
+        args.eval_note,
+        args.eval_output_dir,
+        details=details,
+    )
+
+    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
+    print(f'### OUTPUT FILE: {output_file} ###')
+    instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit)
+
+    # Convert the test-list columns to strings when the first PASS_TO_PASS
+    # value is not already a string.
+    if len(instances) > 0 and not isinstance(
+        instances['PASS_TO_PASS'][instances['PASS_TO_PASS'].index[0]], str
+    ):
+        for col in ['PASS_TO_PASS', 'FAIL_TO_PASS']:
+            instances[col] = instances[col].apply(lambda x: str(x))
+
+    run_evaluation(
+        instances,
+        metadata,
+        output_file,
+        args.eval_num_workers,
+        process_instance,
+        timeout_seconds=8 * 60 * 60,  # 8 hour PER instance should be more than enough
+        max_retries=5,
+    )
--- a/evaluation/benchmarks/visual_swe_bench/scripts/docker/all-visualswebench-full-instance-images.txt
+++ b/evaluation/benchmarks/visual_swe_bench/scripts/docker/all-visualswebench-full-instance-images.txt
@@ -0,0 +1,157 @@
+xingyaoww/sweb.eval.x86_64.astropy_s_astropy-11693:latest
+xingyaoww/sweb.eval.x86_64.astropy_s_astropy-13838:latest
+xingyaoww/sweb.eval.x86_64.astropy_s_astropy-14295:latest
+xingyaoww/sweb.eval.x86_64.astropy_s_astropy-8292:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-13908:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-13980:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-13983:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-13984:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-14043:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-14623:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-19763:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-20470:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-20518:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-20584:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-20761:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-20826:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-21443:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-21490:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-21550:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-21568:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-21617:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-22865:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-22871:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-22931:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-23047:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-23111:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-23412:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-24088:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-24177:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-24189:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-24570:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-24691:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-24749:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-24768:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-24849:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-24870:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-24971:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-25287:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-25334:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-25340:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-25346:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-25405:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-25499:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-25565:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-25640:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-25667:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-25779:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-26078:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-26466:latest
+xingyaoww/sweb.eval.x86_64.mwaskom_s_seaborn-2576:latest
+xingyaoww/sweb.eval.x86_64.mwaskom_s_seaborn-2846:latest
+xingyaoww/sweb.eval.x86_64.mwaskom_s_seaborn-2979:latest
+xingyaoww/sweb.eval.x86_64.mwaskom_s_seaborn-3180:latest
+xingyaoww/sweb.eval.x86_64.mwaskom_s_seaborn-3187:latest
+xingyaoww/sweb.eval.x86_64.mwaskom_s_seaborn-3202:latest
+xingyaoww/sweb.eval.x86_64.mwaskom_s_seaborn-3216:latest
+xingyaoww/sweb.eval.x86_64.mwaskom_s_seaborn-3217:latest
+xingyaoww/sweb.eval.x86_64.mwaskom_s_seaborn-3276:latest
+xingyaoww/sweb.eval.x86_64.mwaskom_s_seaborn-3394:latest
+xingyaoww/sweb.eval.x86_64.pydata_s_xarray-4182:latest
+xingyaoww/sweb.eval.x86_64.pydata_s_xarray-5682:latest
+xingyaoww/sweb.eval.x86_64.pylint-dev_s_pylint-4551:latest
+xingyaoww/sweb.eval.x86_64.scikit-learn_s_scikit-learn-13087:latest
+xingyaoww/sweb.eval.x86_64.scikit-learn_s_scikit-learn-13618:latest
+xingyaoww/sweb.eval.x86_64.scikit-learn_s_scikit-learn-14067:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-10048:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-10097:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-10191:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-10435:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-11266:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-11502:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-7615:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-7757:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-8028:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-8056:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-8075:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-8120:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-8265:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-8278:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-8620:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-8621:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-8638:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-8658:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-9229:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-9230:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-9289:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-9320:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-9350:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-9464:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-9673:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-9698:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-9797:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-9982:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-9987:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-9997:latest
+xingyaoww/sweb.eval.x86_64.sphinx-doc_s_sphinx-9999:latest
+xingyaoww/sweb.eval.x86_64.sympy_s_sympy-11787:latest
+xingyaoww/sweb.eval.x86_64.sympy_s_sympy-11788:latest
+xingyaoww/sweb.eval.x86_64.sympy_s_sympy-13264:latest
+xingyaoww/sweb.eval.x86_64.sympy_s_sympy-13840:latest
+xingyaoww/sweb.eval.x86_64.sympy_s_sympy-15151:latest
+xingyaoww/sweb.eval.x86_64.sympy_s_sympy-15304:latest
+xingyaoww/sweb.eval.x86_64.sympy_s_sympy-15625:latest
+xingyaoww/sweb.eval.x86_64.sympy_s_sympy-15976:latest
+xingyaoww/sweb.eval.x86_64.sympy_s_sympy-16003:latest
+xingyaoww/sweb.eval.x86_64.sympy_s_sympy-17067:latest
+xingyaoww/sweb.eval.x86_64.sympy_s_sympy-17115:latest
+xingyaoww/sweb.eval.x86_64.sympy_s_sympy-18922:latest
+xingyaoww/sweb.eval.x86_64.sympy_s_sympy-21769:latest
+xingyaoww/sweb.eval.x86_64.sympy_s_sympy-24723:latest
+luolin101/sweb.eval.x86_64.plotly_s_plotly.py-4083:latest
+luolin101/sweb.eval.x86_64.plotly_s_plotly.py-2600:latest
+luolin101/sweb.eval.x86_64.plotly_s_plotly.py-2591:latest
+luolin101/sweb.eval.x86_64.plotly_s_plotly.py-1966:latest
+luolin101/sweb.eval.x86_64.networkx_s_networkx-6503:latest
+luolin101/sweb.eval.x86_64.networkx_s_networkx-6098:latest
+luolin101/sweb.eval.x86_64.networkx_s_networkx-5616:latest
+luolin101/sweb.eval.x86_64.networkx_s_networkx-5354:latest
+luolin101/sweb.eval.x86_64.networkx_s_networkx-5058:latest
+luolin101/sweb.eval.x86_64.networkx_s_networkx-4378:latest
+luolin101/sweb.eval.x86_64.networkx_s_networkx-3764:latest
+luolin101/sweb.eval.x86_64.vega_s_altair-2785:latest
+luolin101/sweb.eval.x86_64.vega_s_altair-1092:latest
+luolin101/sweb.eval.x86_64.vega_s_altair-974:latest
+luolin101/sweb.eval.x86_64.vega_s_altair-830:latest
+luolin101/sweb.eval.x86_64.matplotlib_s_matplotlib-27754:latest
+luolin101/sweb.eval.x86_64.matplotlib_s_matplotlib-26926:latest
+luolin101/sweb.eval.x86_64.matplotlib_s_matplotlib-26788:latest
+luolin101/sweb.eval.x86_64.matplotlib_s_matplotlib-26586:latest
+luolin101/sweb.eval.x86_64.sympy_s_sympy-26941:latest
+luolin101/sweb.eval.x86_64.mwaskom_s_seaborn-3458:latest
+luolin101/sweb.eval.x86_64.mwaskom_s_seaborn-3454:latest
+xingyaoww/sweb.eval.x86_64.matplotlib_s_matplotlib-25631:latest
+xingyaoww/sweb.env.x86_64.428468730904ff6b4232aa:latest
+xingyaoww/sweb.env.x86_64.89a9e6df7ab7bcb9e010c8:latest
+xingyaoww/sweb.env.x86_64.15374367de368534f261e3:latest
+xingyaoww/sweb.env.x86_64.6b007979cf533f0f3016e8:latest
+xingyaoww/sweb.env.x86_64.b382c45e0a94d34ef0fc86:latest
+xingyaoww/sweb.env.x86_64.7037e8c448a4b8ebfe9b13:latest
+xingyaoww/sweb.env.x86_64.31244378a92e3bcce809ac:latest
+xingyaoww/sweb.env.x86_64.efa6065ed5bf204410fd53:latest
+xingyaoww/sweb.env.x86_64.a0efca7a0fe6719dbf65c2:latest
+xingyaoww/sweb.env.x86_64.502d8fc6ebccd881244091:latest
+luolin101/sweb.env.x86_64.eb002359cfcbe2edb56088:latest
+xingyaoww/sweb.env.x86_64.d905bb51fb68acc5d4221b:latest
+xingyaoww/sweb.env.x86_64.aa92880033da20ca313928:latest
+luolin101/sweb.env.x86_64.c6d251a05e0af7688b64fd:latest
+xingyaoww/sweb.env.x86_64.c795f4b88616b8462021ed:latest
+luolin101/sweb.env.x86_64.1e5a06e76ee016d067d77e:latest
+luolin101/sweb.env.x86_64.2e03d8e4d4bd373937a9ef:latest
+luolin101/sweb.env.x86_64.4c16026920d27ea78f3b7a:latest
+luolin101/sweb.env.x86_64.d15120dfdbda9831e9646b:latest
+luolin101/sweb.env.x86_64.c581ba273c3275679773dd:latest
+luolin101/sweb.env.x86_64.dc800a1bbe275c5de0c4aa:latest
+luolin101/sweb.env.x86_64.59bd7d84a0939c7caba7e6:latest
+xingyaoww/sweb.env.x86_64.0d80c7dec81ee2f2f513e2:latest
+xingyaoww/sweb.base.x86_64:latest
--- a/evaluation/benchmarks/visual_swe_bench/scripts/docker/pull_all_eval_docker.sh
+++ b/evaluation/benchmarks/visual_swe_bench/scripts/docker/pull_all_eval_docker.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+set -e
+
+LEVEL=$1
+# three cumulative levels (each one also pulls the levels listed before it):
+# - base, keyword "sweb.base"
+# - env, keywords "sweb.base" and "sweb.env"
+# - instance, keywords "sweb.base", "sweb.env" and "sweb.eval"
+SET=$2
+
+if [ -z "$LEVEL" ]; then
+    echo "Usage: $0 <cache_level> <set>"
+    echo "cache_level: base, env, or instance"
+    echo "set: lite, full"
+    exit 1
+fi
+
+if [ -z "$SET" ]; then
+    echo "Usage: $0 <cache_level> <set>"
+    echo "cache_level: base, env, or instance"
+    echo "set: lite, full, default is lite"
+    SET="lite"
+fi
+
+# NOTE(review): both branches point at the same list, so SET does not change which images are pulled - confirm whether a lite list was intended
+if [ "$SET" == "full" ]; then
+    IMAGE_FILE="$(dirname "$0")/all-visualswebench-full-instance-images.txt"
+else
+    IMAGE_FILE="$(dirname "$0")/all-visualswebench-full-instance-images.txt"
+fi
+
+# Pick the grep pattern for the level; "\|" is alternation, so env and instance also match the lower levels
+case $LEVEL in
+    base)
+        PATTERN="sweb.base"
+        ;;
+    env)
+        PATTERN="sweb.base\|sweb.env"
+        ;;
+    instance)
+        PATTERN="sweb.base\|sweb.env\|sweb.eval"
+        ;;
+    *)
+        echo "Invalid cache level: $LEVEL"
+        echo "Valid levels are: base, env, instance"
+        exit 1
+        ;;
+esac
+
+echo "Pulling docker images for [$LEVEL] level"
+
+echo "Pattern: $PATTERN"
+echo "Image file: $IMAGE_FILE"
+
+# Pull every listed image that matches the pattern, then re-tag it under its local name
+grep "$PATTERN" "$IMAGE_FILE" | while IFS= read -r image; do
+    # local name: drop the registry namespace and replace _s_ with __
+    renamed_image=$(echo "$image" | sed 's|.*/||; s/_s_/__/g')
+    echo "Pulling $image into $renamed_image"
+    docker pull "$image"
+    docker tag "$image" "$renamed_image"
+done
--- a/evaluation/benchmarks/visual_swe_bench/scripts/eval_infer.sh
+++ b/evaluation/benchmarks/visual_swe_bench/scripts/eval_infer.sh
@@ -0,0 +1,141 @@
+#!/bin/bash
+
+PROCESS_FILEPATH=$1
+if [ -z "$PROCESS_FILEPATH" ]; then
+    echo "Error: PROCESS_FILEPATH is empty. Usage: ./eval_infer.sh <output_file> [instance_id] [dataset_name] [split]"
+    exit 1
+fi
+# The path is quoted everywhere below so that files with spaces or glob characters are handled correctly
+if [ ! -f "$PROCESS_FILEPATH" ]; then
+    echo "Error: $PROCESS_FILEPATH is not a file"
+    exit 1
+fi
+
+# If instance_id is empty, it means we want to eval on the whole $PROCESS_FILEPATH
+# otherwise, we want to eval on the instance_id
+INSTANCE_ID=$2
+DATASET_NAME=${3:-"luolin101/Visual-SWE-bench"}
+SPLIT=${4:-"test"}
+
+echo "INSTANCE_ID: $INSTANCE_ID"
+echo "DATASET_NAME: $DATASET_NAME"
+echo "SPLIT: $SPLIT"
+# Resolve to an absolute path, then split it into directory and file name for the messages below
+PROCESS_FILEPATH=$(realpath "$PROCESS_FILEPATH")
+FILE_DIR=$(dirname "$PROCESS_FILEPATH")
+FILE_NAME=$(basename "$PROCESS_FILEPATH")
+
+echo "Evaluating $FILE_NAME @ $FILE_DIR"
+
+# ================================================
+# detect whether PROCESS_FILEPATH is in OH format or in SWE-bench format
+echo "=============================================================="
+echo "Detecting whether PROCESS_FILEPATH is in OH format or in SWE-bench format"
+echo "=============================================================="
+# SWE-bench format is a JSONL where every line has three fields: model_name_or_path, instance_id, and model_patch
+function is_swebench_format() {
+    # Returns 0 when the first JSONL record of the file has the three SWE-bench
+    # keys, 1 otherwise. $1 is the file; it defaults to the global PROCESS_FILEPATH.
+    local first_line jsonl_path="${1:-$PROCESS_FILEPATH}"
+    read -r first_line < "$jsonl_path"
+
+    # jq -e turns a false/null result (or a parse error) into a non-zero exit status
+    if ! echo "$first_line" | jq -e '. | has("model_name_or_path") and has("instance_id") and has("model_patch")' > /dev/null; then
+        return 1 # Return 1 if the first line does not have the required fields
+    fi
+
+    return 0 # Return 0 if the first line has the required fields
+}
+# Call the function with the file path
+is_swebench_format "$PROCESS_FILEPATH"
+IS_SWEBENCH_FORMAT=$?
+# Use the result in an if-else statement
+if [ $IS_SWEBENCH_FORMAT -eq 0 ]; then
+    echo "The file IS in SWE-bench format."
+    SWEBENCH_FORMAT_JSONL=$PROCESS_FILEPATH
+else
+    echo "The file IS NOT in SWE-bench format."
+
+    # ==== Convert OH format to SWE-bench format ====
+    echo "Merged output file with fine-grained report will be saved to $FILE_DIR"
+    poetry run python3 evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py "$PROCESS_FILEPATH"
+    # replace .jsonl with .swebench.jsonl in filename
+    SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl}
+    echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL"
+    # assert that the file exists (quoted so paths containing spaces are tested as one word)
+    if [ ! -f "$SWEBENCH_FORMAT_JSONL" ]; then
+        echo "Error: $SWEBENCH_FORMAT_JSONL does not exist. There is probably an error in the conversion process."
+        exit 1
+    fi
+    SWEBENCH_FORMAT_JSONL=$(realpath "$SWEBENCH_FORMAT_JSONL")
+fi
+# ================================================
+
+echo "=============================================================="
+echo "Running SWE-bench evaluation"
+echo "=============================================================="
+
+RUN_ID=$(date +"%Y%m%d_%H%M%S")
+N_PROCESS=16
+# NOTE: $INSTANCE_ID stays unquoted in the else branch so space-separated ids split into separate arguments (assumes --instance_ids takes a list - confirm)
+if [ -z "$INSTANCE_ID" ]; then
+    echo "Running SWE-bench evaluation on the whole input file..."
+    # Dataset and split come from DATASET_NAME / SPLIT
+    # pass [dataset_name] [split] on the command line to evaluate a different dataset
+
+    poetry run python -m visualswebench.harness.run_evaluation \
+        --dataset_name "$DATASET_NAME" \
+        --split "$SPLIT" \
+        --predictions_path "$SWEBENCH_FORMAT_JSONL" \
+        --timeout 1800 \
+        --cache_level instance \
+        --max_workers "$N_PROCESS" \
+        --run_id "$RUN_ID"
+
+    # get the "model_name_or_path" from the first line of the SWEBENCH_FORMAT_JSONL
+    MODEL_NAME_OR_PATH=$(jq -r '.model_name_or_path' "$SWEBENCH_FORMAT_JSONL" | head -n 1)
+    echo "MODEL_NAME_OR_PATH: $MODEL_NAME_OR_PATH"
+
+    RESULT_OUTPUT_DIR=$(dirname "$SWEBENCH_FORMAT_JSONL")
+    echo "RESULT_OUTPUT_DIR: $RESULT_OUTPUT_DIR"
+
+    # move the eval results to the target directory
+    mkdir -p "$RESULT_OUTPUT_DIR"
+    # rm eval_outputs directory if it exists
+    if [ -d "$RESULT_OUTPUT_DIR/eval_outputs" ]; then
+        rm -rf "$RESULT_OUTPUT_DIR/eval_outputs"
+    fi
+
+    mv "logs/run_evaluation/$RUN_ID/$MODEL_NAME_OR_PATH" "$RESULT_OUTPUT_DIR"
+    mv "$RESULT_OUTPUT_DIR/$MODEL_NAME_OR_PATH" "$RESULT_OUTPUT_DIR/eval_outputs"
+    echo "RUN_ID: $RUN_ID" > "$RESULT_OUTPUT_DIR/run_id.txt"
+
+    # move report file
+    REPORT_PATH=$MODEL_NAME_OR_PATH.$RUN_ID.json
+    if [ -f "$REPORT_PATH" ]; then
+        # keep one backup of a previous report.json before replacing it
+        if [ -f "$RESULT_OUTPUT_DIR/report.json" ]; then
+            echo "Report file $RESULT_OUTPUT_DIR/report.json already exists. Overwriting..."
+            if [ -f "$RESULT_OUTPUT_DIR/report.json.bak" ]; then
+                rm "$RESULT_OUTPUT_DIR/report.json.bak"
+            fi
+            mv "$RESULT_OUTPUT_DIR/report.json" "$RESULT_OUTPUT_DIR/report.json.bak"
+        fi
+
+        mv "$REPORT_PATH" "$RESULT_OUTPUT_DIR/report.json"
+    fi
+
+    poetry run python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py "$PROCESS_FILEPATH"
+
+else
+    echo "Running SWE-bench evaluation on the instance_id: $INSTANCE_ID"
+    poetry run python -m visualswebench.harness.run_evaluation \
+        --dataset_name "$DATASET_NAME" \
+        --split "$SPLIT" \
+        --predictions_path "$SWEBENCH_FORMAT_JSONL" \
+        --timeout 1800 \
+        --instance_ids $INSTANCE_ID \
+        --cache_level instance \
+        --max_workers "$N_PROCESS" \
+        --run_id "$RUN_ID"
+fi
--- a/evaluation/benchmarks/visual_swe_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/visual_swe_bench/scripts/run_infer.sh
@@ -0,0 +1,117 @@
+#!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+# Positional arguments; AGENT, MAX_ITER, NUM_WORKERS, DATASET, SPLIT and N_RUNS fall back to defaults when empty
+MODEL_CONFIG=$1
+COMMIT_HASH=$2
+AGENT=$3
+EVAL_LIMIT=$4
+MAX_ITER=$5
+NUM_WORKERS=$6
+DATASET=$7
+SPLIT=$8
+N_RUNS=$9
+# NOTE(review): COMMIT_HASH is not read in this script; presumably checkout_eval_branch (version_control.sh) uses it - confirm
+if [ -z "$NUM_WORKERS" ]; then
+  NUM_WORKERS=1
+  echo "Number of workers not specified, use default $NUM_WORKERS"
+fi
+checkout_eval_branch
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+if [ -z "$MAX_ITER" ]; then
+  echo "MAX_ITER not specified, use default 100"
+  MAX_ITER=100
+fi
+
+if [ -z "$USE_INSTANCE_IMAGE" ]; then
+  echo "USE_INSTANCE_IMAGE not specified, use default true"
+  USE_INSTANCE_IMAGE=true
+fi
+
+if [ -z "$RUN_WITH_BROWSING" ]; then
+  echo "RUN_WITH_BROWSING not specified, use default false"
+  RUN_WITH_BROWSING=false
+fi
+
+# USE_INSTANCE_IMAGE and RUN_WITH_BROWSING above are taken from the environment, not from positional arguments
+if [ -z "$DATASET" ]; then
+  echo "DATASET not specified, use default luolin101/Visual-SWE-bench"
+  DATASET="luolin101/Visual-SWE-bench"
+fi
+
+if [ -z "$SPLIT" ]; then
+  echo "SPLIT not specified, use default test"
+  SPLIT="test"
+fi
+# Export so that child processes started by this script inherit the two settings
+export USE_INSTANCE_IMAGE=$USE_INSTANCE_IMAGE
+echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
+export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
+echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
+
+get_openhands_version
+
+echo "AGENT: $AGENT"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+echo "DATASET: $DATASET"
+echo "SPLIT: $SPLIT"
+
+# Default to NOT use Hint
+if [ -z "$USE_HINT_TEXT" ]; then
+  export USE_HINT_TEXT=false
+fi
+echo "USE_HINT_TEXT: $USE_HINT_TEXT"
+EVAL_NOTE="$OPENHANDS_VERSION"
+# if not using Hint, add -no-hint to the eval note
+if [ "$USE_HINT_TEXT" = false ]; then
+  EVAL_NOTE="$EVAL_NOTE-no-hint"
+fi
+
+if [ "$RUN_WITH_BROWSING" = true ]; then
+  EVAL_NOTE="$EVAL_NOTE-with-browsing"
+fi
+
+if [ -n "$EXP_NAME" ]; then
+  EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
+fi
+
+function run_eval() {
+  # Run one inference pass over the dataset; $1 is the eval note labelling the run.
+  local eval_note=$1
+  # Build argv as an array (instead of a string passed to eval) so every value
+  # stays exactly one argument and is never re-parsed by the shell.
+  local -a cmd=(
+    poetry run python evaluation/benchmarks/visual_swe_bench/run_infer.py
+    --agent-cls "$AGENT" --llm-config "$MODEL_CONFIG"
+    --max-iterations "$MAX_ITER" --eval-num-workers "$NUM_WORKERS"
+    --eval-note "$eval_note" --dataset "$DATASET" --split "$SPLIT"
+  )
+
+  if [ -n "$EVAL_LIMIT" ]; then
+    echo "EVAL_LIMIT: $EVAL_LIMIT"
+    cmd+=(--eval-n-limit "$EVAL_LIMIT")
+  fi
+
+  "${cmd[@]}"
+}
+
+unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
+if [ -z "$N_RUNS" ]; then
+  N_RUNS=1
+  echo "N_RUNS not specified, use default $N_RUNS"
+fi
+
+for i in $(seq 1 "$N_RUNS"); do
+  current_eval_note="$EVAL_NOTE-run_$i"
+  echo "EVAL_NOTE: $current_eval_note"
+  run_eval "$current_eval_note" # quoted: the note must reach run_eval as a single argument
+done
+# NOTE(review): under `set -e` a failing run exits before the next line is reached - confirm that is acceptable
+checkout_original_branch
--- a/evaluation/benchmarks/visual_swe_bench/scripts/setup/instance_swe_entry.sh
+++ b/evaluation/benchmarks/visual_swe_bench/scripts/setup/instance_swe_entry.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Prepares /workspace for one instance: copies /testbed into it and activates the testbed conda env
+source ~/.bashrc
+SWEUTIL_DIR=/swe_util
+
+# FIXME: Cannot read SWE_INSTANCE_ID from the environment variable
+# SWE_INSTANCE_ID=django__django-11099
+if [ -z "$SWE_INSTANCE_ID" ]; then
+    echo "Error: SWE_INSTANCE_ID is not set." >&2
+    exit 1
+fi
+
+# Look up this instance's record in swe-bench-instance.json by its instance_id
+item=$(jq --arg INSTANCE_ID "$SWE_INSTANCE_ID" '.[] | select(.instance_id == $INSTANCE_ID)' "$SWEUTIL_DIR/eval_data/instances/swe-bench-instance.json")
+
+if [[ -z "$item" ]]; then
+  echo "No item found for the provided instance ID." >&2
+  exit 1
+fi
+
+WORKSPACE_NAME=$(echo "$item" | jq -r '(.repo | tostring) + "__" + (.version | tostring) | gsub("/"; "__")')
+
+echo "WORKSPACE_NAME: $WORKSPACE_NAME"
+
+# Clear the workspace
+if [ -d /workspace ]; then
+    rm -rf /workspace/*
+else
+    mkdir /workspace
+fi
+# Copy repo to workspace (the glob above skips dot-entries, so remove a stale copy explicitly)
+if [ -d "/workspace/$WORKSPACE_NAME" ]; then
+    rm -rf "/workspace/$WORKSPACE_NAME"
+fi
+mkdir -p /workspace
+cp -r /testbed "/workspace/$WORKSPACE_NAME"
+
+# Activate instance-specific environment
+. /opt/miniconda3/etc/profile.d/conda.sh
+conda activate testbed
--- a/frontend/.husky/pre-commit
+++ b/frontend/.husky/pre-commit
@@ -2,6 +2,7 @@
 echo "Running frontend checks..."
 cd frontend
 npm run check-unlocalized-strings
+npm run check-translation-completeness
 npx lint-staged

 # Run backend pre-commit
--- a/frontend/README.md
+++ b/frontend/README.md
@@ -61,7 +61,7 @@ make build
 # Start the application
 make run
 ```
-Or to run backend and frontend seperately.
+Or to run backend and frontend separately.

 ```sh
 # Start the backend from the root directory
--- a/frontend/tests/components/chat-message.test.tsx
+++ b/frontend/tests/components/chat-message.test.tsx
@@ -10,11 +10,7 @@ describe("ChatMessage", () => {
    expect(screen.getByText("Hello, World!")).toBeInTheDocument();
  });

-  it("should render an assistant message", () => {
-    render(<ChatMessage type="assistant" message="Hello, World!" />);
-    expect(screen.getByTestId("assistant-message")).toBeInTheDocument();
-    expect(screen.getByText("Hello, World!")).toBeInTheDocument();
-  });
+  it.todo("should render an assistant message");

  it.skip("should support code syntax highlighting", () => {
    const code = "```js\nconsole.log('Hello, World!')\n```";
@@ -66,10 +62,7 @@ describe("ChatMessage", () => {

  it("should apply correct styles to inline code", () => {
    render(
-      <ChatMessage
-        type="assistant"
-        message="Here is some `inline code` text"
-      />,
+      <ChatMessage type="agent" message="Here is some `inline code` text" />,
    );
    const codeElement = screen.getByText("inline code");

--- a/frontend/tests/components/chat/action-suggestions.test.tsx
+++ b/frontend/tests/components/chat/action-suggestions.test.tsx
@@ -4,6 +4,7 @@ import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
 import { ActionSuggestions } from "#/components/features/chat/action-suggestions";
 import OpenHands from "#/api/open-hands";
 import { MOCK_DEFAULT_USER_SETTINGS } from "#/mocks/handlers";
+import { ConversationProvider } from "#/context/conversation-context";

 // Mock dependencies
 vi.mock("posthog-js", () => ({
@@ -38,12 +39,20 @@ vi.mock("react-i18next", () => ({
  }),
 }));

+vi.mock("react-router", () => ({
+  useParams: () => ({
+    conversationId: "test-conversation-id",
+  }),
+}));
+
 const renderActionSuggestions = () =>
  render(<ActionSuggestions onSuggestionsClick={() => {}} />, {
    wrapper: ({ children }) => (
-      <QueryClientProvider client={new QueryClient()}>
-        {children}
-      </QueryClientProvider>
+      <ConversationProvider>
+        <QueryClientProvider client={new QueryClient()}>
+          {children}
+        </QueryClientProvider>
+      </ConversationProvider>
    ),
  });

@@ -65,6 +74,11 @@ describe("ActionSuggestions", () => {
  });

  it("should render both GitHub buttons when GitHub token is set and repository is selected", async () => {
+    const getConversationSpy = vi.spyOn(OpenHands, "getConversation");
+    // @ts-expect-error - only required for testing
+    getConversationSpy.mockResolvedValue({
+      selected_repository: "test-repo",
+    });
    renderActionSuggestions();

    // Find all buttons with data-testid="suggestion"
--- a/frontend/tests/components/chat/chat-interface.test.tsx
+++ b/frontend/tests/components/chat/chat-interface.test.tsx
@@ -1,11 +1,9 @@
 import { afterEach, beforeAll, describe, expect, it, vi } from "vitest";
-import { act, screen, waitFor, within } from "@testing-library/react";
+import { screen, waitFor, within } from "@testing-library/react";
 import userEvent from "@testing-library/user-event";
 import { renderWithProviders } from "test-utils";
 import type { Message } from "#/message";
-import { addUserMessage } from "#/state/chat-slice";
 import { SUGGESTIONS } from "#/utils/suggestions";
-import * as ChatSlice from "#/state/chat-slice";
 import { WsClientProviderStatus } from "#/context/ws-client-provider";
 import { ChatInterface } from "#/components/features/chat/chat-interface";

@@ -42,51 +40,10 @@ describe("Empty state", () => {
    vi.clearAllMocks();
  });

-  it("should render suggestions if empty", () => {
-    const { store } = renderWithProviders(<ChatInterface />, {
-      preloadedState: {
-        chat: {
-          messages: [],
-          systemMessage: {
-            content: "",
-            tools: [],
-            openhands_version: null,
-            agent_class: null
-          }
-        },
-      },
-    });
-
-    expect(screen.getByTestId("suggestions")).toBeInTheDocument();
-
-    act(() => {
-      store.dispatch(
-        addUserMessage({
-          content: "Hello",
-          imageUrls: [],
-          timestamp: new Date().toISOString(),
-          pending: true,
-        }),
-      );
-    });
-
-    expect(screen.queryByTestId("suggestions")).not.toBeInTheDocument();
-  });
+  it.todo("should render suggestions if empty");

  it("should render the default suggestions", () => {
-    renderWithProviders(<ChatInterface />, {
-      preloadedState: {
-        chat: {
-          messages: [],
-          systemMessage: {
-            content: "",
-            tools: [],
-            openhands_version: null,
-            agent_class: null
-          }
-        },
-      },
-    });
+    renderWithProviders(<ChatInterface />);

    const suggestions = screen.getByTestId("suggestions");
    const repoSuggestions = Object.keys(SUGGESTIONS.repo);
@@ -110,21 +67,8 @@ describe("Empty state", () => {
        status: WsClientProviderStatus.CONNECTED,
        isLoadingMessages: false,
      }));
-      const addUserMessageSpy = vi.spyOn(ChatSlice, "addUserMessage");
      const user = userEvent.setup();
-      const { store } = renderWithProviders(<ChatInterface />, {
-        preloadedState: {
-          chat: {
-            messages: [],
-            systemMessage: {
-              content: "",
-              tools: [],
-              openhands_version: null,
-              agent_class: null
-            }
-          },
-        },
-      });
+      renderWithProviders(<ChatInterface />);

      const suggestions = screen.getByTestId("suggestions");
      const displayedSuggestions = within(suggestions).getAllByRole("button");
@@ -133,9 +77,7 @@ describe("Empty state", () => {
      await user.click(displayedSuggestions[0]);

      // user message loaded to input
-      expect(addUserMessageSpy).not.toHaveBeenCalled();
      expect(screen.queryByTestId("suggestions")).toBeInTheDocument();
-      expect(store.getState().chat.messages).toHaveLength(0);
      expect(input).toHaveValue(displayedSuggestions[0].textContent);
    },
  );
@@ -149,19 +91,7 @@ describe("Empty state", () => {
        isLoadingMessages: false,
      }));
      const user = userEvent.setup();
-      const { rerender } = renderWithProviders(<ChatInterface />, {
-        preloadedState: {
-          chat: {
-            messages: [],
-            systemMessage: {
-              content: "",
-              tools: [],
-              openhands_version: null,
-              agent_class: null
-            }
-          },
-        },
-      });
+      const { rerender } = renderWithProviders(<ChatInterface />);

      const suggestions = screen.getByTestId("suggestions");
      const displayedSuggestions = within(suggestions).getAllByRole("button");
--- a/frontend/tests/components/context-menu/account-settings-context-menu.test.tsx
+++ b/frontend/tests/components/context-menu/account-settings-context-menu.test.tsx
@@ -20,7 +20,6 @@ describe("AccountSettingsContextMenu", () => {
      <AccountSettingsContextMenu
        onLogout={onLogoutMock}
        onClose={onCloseMock}
-        isLoggedIn
      />,
    );

@@ -35,7 +34,6 @@ describe("AccountSettingsContextMenu", () => {
      <AccountSettingsContextMenu
        onLogout={onLogoutMock}
        onClose={onCloseMock}
-        isLoggedIn
      />,
    );

@@ -45,19 +43,18 @@ describe("AccountSettingsContextMenu", () => {
    expect(onLogoutMock).toHaveBeenCalledOnce();
  });

-  test("onLogout should be disabled if the user is not logged in", async () => {
+  test("logout button is always enabled", async () => {
    render(
      <AccountSettingsContextMenu
        onLogout={onLogoutMock}
        onClose={onCloseMock}
-        isLoggedIn={false}
      />,
    );

    const logoutOption = screen.getByText("ACCOUNT_SETTINGS$LOGOUT");
    await user.click(logoutOption);

-    expect(onLogoutMock).not.toHaveBeenCalled();
+    expect(onLogoutMock).toHaveBeenCalledOnce();
  });

  it("should call onClose when clicking outside of the element", async () => {
@@ -65,7 +62,6 @@ describe("AccountSettingsContextMenu", () => {
      <AccountSettingsContextMenu
        onLogout={onLogoutMock}
        onClose={onCloseMock}
-        isLoggedIn
      />,
    );

--- a/frontend/tests/components/features/conversation-panel/conversation-panel.test.tsx
+++ b/frontend/tests/components/features/conversation-panel/conversation-panel.test.tsx
@@ -45,6 +45,8 @@ describe("ConversationPanel", () => {
      last_updated_at: "2021-10-01T12:00:00Z",
      created_at: "2021-10-01T12:00:00Z",
      status: "STOPPED" as const,
+      url: null,
+      session_api_key: null,
    },
    {
      conversation_id: "2",
@@ -53,6 +55,8 @@ describe("ConversationPanel", () => {
      last_updated_at: "2021-10-02T12:00:00Z",
      created_at: "2021-10-02T12:00:00Z",
      status: "STOPPED" as const,
+      url: null,
+      session_api_key: null,
    },
    {
      conversation_id: "3",
@@ -61,6 +65,8 @@ describe("ConversationPanel", () => {
      last_updated_at: "2021-10-03T12:00:00Z",
      created_at: "2021-10-03T12:00:00Z",
      status: "STOPPED" as const,
+      url: null,
+      session_api_key: null,
    },
  ];

@@ -143,6 +149,8 @@ describe("ConversationPanel", () => {
        last_updated_at: "2021-10-01T12:00:00Z",
        created_at: "2021-10-01T12:00:00Z",
        status: "STOPPED" as const,
+        url: null,
+        session_api_key: null,
      },
      {
        conversation_id: "2",
@@ -151,6 +159,8 @@ describe("ConversationPanel", () => {
        last_updated_at: "2021-10-02T12:00:00Z",
        created_at: "2021-10-02T12:00:00Z",
        status: "STOPPED" as const,
+        url: null,
+        session_api_key: null,
      },
      {
        conversation_id: "3",
@@ -159,6 +169,8 @@ describe("ConversationPanel", () => {
        last_updated_at: "2021-10-03T12:00:00Z",
        created_at: "2021-10-03T12:00:00Z",
        status: "STOPPED" as const,
+        url: null,
+        session_api_key: null,
      },
    ];

--- a/frontend/tests/components/features/git/git-repo-selector.test.tsx
+++ b/frontend/tests/components/features/git/git-repo-selector.test.tsx
@@ -1,89 +0,0 @@
-import { screen } from "@testing-library/react";
-import { describe, expect, it, vi } from "vitest";
-import { renderWithProviders } from "test-utils";
-import { GitRepositorySelector } from "#/components/features/git/git-repo-selector";
-import OpenHands from "#/api/open-hands";
-import { Provider } from "#/types/settings";
-
-describe("GitRepositorySelector", () => {
-  const onInputChangeMock = vi.fn();
-  const onSelectMock = vi.fn();
-
-  it("should render the search input", () => {
-    renderWithProviders(
-      <GitRepositorySelector
-        onInputChange={onInputChangeMock}
-        onSelect={onSelectMock}
-        publicRepositories={[]}
-        userRepositories={[]}
-      />,
-    );
-
-    expect(
-      screen.getByPlaceholderText("LANDING$SELECT_GIT_REPO"),
-    ).toBeInTheDocument();
-  });
-
-  it("should show the GitHub login button in OSS mode", () => {
-    const getConfigSpy = vi.spyOn(OpenHands, "getConfig");
-    getConfigSpy.mockResolvedValue({
-      APP_MODE: "oss",
-      APP_SLUG: "openhands",
-      GITHUB_CLIENT_ID: "test-client-id",
-      POSTHOG_CLIENT_KEY: "test-posthog-key",
-      FEATURE_FLAGS: {
-        ENABLE_BILLING: false,
-        HIDE_LLM_SETTINGS: false,
-      },
-    });
-
-    renderWithProviders(
-      <GitRepositorySelector
-        onInputChange={onInputChangeMock}
-        onSelect={onSelectMock}
-        publicRepositories={[]}
-        userRepositories={[]}
-      />,
-    );
-
-    expect(screen.getByTestId("github-repo-selector")).toBeInTheDocument();
-  });
-
-  it("should show the search results", () => {
-    const mockSearchedRepos = [
-      {
-        id: 1,
-        full_name: "test/repo1",
-        git_provider: "github" as Provider,
-        stargazers_count: 100,
-        is_public: true,
-        pushed_at: "2023-01-01T00:00:00Z",
-      },
-      {
-        id: 2,
-        full_name: "test/repo2",
-        git_provider: "github" as Provider,
-        stargazers_count: 200,
-        is_public: true,
-        pushed_at: "2023-01-02T00:00:00Z",
-      },
-    ];
-
-    const searchPublicRepositoriesSpy = vi.spyOn(
-      OpenHands,
-      "searchGitRepositories",
-    );
-    searchPublicRepositoriesSpy.mockResolvedValue(mockSearchedRepos);
-
-    renderWithProviders(
-      <GitRepositorySelector
-        onInputChange={onInputChangeMock}
-        onSelect={onSelectMock}
-        publicRepositories={[]}
-        userRepositories={[]}
-      />,
-    );
-
-    expect(screen.getByTestId("github-repo-selector")).toBeInTheDocument();
-  });
-});
--- a/frontend/tests/components/features/home/home-header.test.tsx
+++ b/frontend/tests/components/features/home/home-header.test.tsx
@@ -43,7 +43,6 @@ describe("HomeHeader", () => {
    await userEvent.click(launchButton);

    expect(createConversationSpy).toHaveBeenCalledExactlyOnceWith(
-      "gui",
      undefined,
      undefined,
      undefined,
--- a/frontend/tests/components/features/home/repo-connector.test.tsx
+++ b/frontend/tests/components/features/home/repo-connector.test.tsx
@@ -22,7 +22,7 @@ const renderRepoConnector = () => {
      path: "/conversations/:conversationId",
    },
    {
-      Component: Outlet,
+      Component: () => <Outlet />,
      path: "/settings",
      children: [
        {
@@ -173,7 +173,6 @@ describe("RepoConnector", () => {
    await userEvent.click(launchButton);

    expect(createConversationSpy).toHaveBeenCalledExactlyOnceWith(
-      "gui",
      "rbren/polaris",
      "github",
      undefined,
--- a/frontend/tests/components/features/home/repo-selection-form.test.tsx
+++ b/frontend/tests/components/features/home/repo-selection-form.test.tsx
@@ -0,0 +1,259 @@
+import { render, screen } from "@testing-library/react";
+import { describe, expect, vi, beforeEach, it } from "vitest";
+import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
+import userEvent from "@testing-library/user-event";
+import { RepositorySelectionForm } from "../../../../src/components/features/home/repo-selection-form";
+import OpenHands from "#/api/open-hands";
+import { GitRepository } from "#/types/git";
+
+// Mock functions that the vi.mock() factories further down delegate to
+const mockUseUserRepositories = vi.fn();
+const mockUseCreateConversation = vi.fn();
+const mockUseIsCreatingConversation = vi.fn();
+const mockUseTranslation = vi.fn();
+const mockUseAuth = vi.fn();
+
+// Default return values (NOTE(review): mockUseUserRepositories is configured here but no vi.mock() in this file calls it)
+mockUseUserRepositories.mockReturnValue({
+  data: [],
+  isLoading: false,
+  isError: false,
+});
+
+mockUseCreateConversation.mockReturnValue({
+  mutate: vi.fn(),
+  isPending: false,
+  isSuccess: false,
+});
+
+mockUseIsCreatingConversation.mockReturnValue(false);
+
+mockUseTranslation.mockReturnValue({ t: (key: string) => key }); // identity t(): the tests below assert on raw i18n keys
+
+mockUseAuth.mockReturnValue({
+  isAuthenticated: true,
+  isLoading: false,
+  providersAreSet: true,
+  user: {
+    id: 1,
+    login: "testuser",
+    avatar_url: "https://example.com/avatar.png",
+    name: "Test User",
+    email: "test@example.com",
+    company: "Test Company",
+  },
+  login: vi.fn(),
+  logout: vi.fn(),
+});
+
+vi.mock("#/hooks/mutation/use-create-conversation", () => ({
+  useCreateConversation: () => mockUseCreateConversation(),
+}));
+
+vi.mock("#/hooks/use-is-creating-conversation", () => ({
+  useIsCreatingConversation: () => mockUseIsCreatingConversation(),
+}));
+
+vi.mock("react-i18next", () => ({
+  useTranslation: () => mockUseTranslation(),
+}));
+
+vi.mock("#/context/auth-context", () => ({
+  useAuth: () => mockUseAuth(),
+}));
+
+vi.mock("#/hooks/use-debounce", () => ({
+  useDebounce: (value: string) => value,
+}));
+
+const mockOnRepoSelection = vi.fn();
+const renderForm = () =>
+  render(<RepositorySelectionForm onRepoSelection={mockOnRepoSelection} />, {
+    wrapper: ({ children }) => (
+      <QueryClientProvider
+        client={
+          new QueryClient({
+            defaultOptions: {
+              queries: {
+                retry: false,
+              },
+            },
+          })
+        }
+      >
+        {children}
+      </QueryClientProvider>
+    ),
+  });
+
+describe("RepositorySelectionForm", () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it("shows loading indicator when repositories are being fetched", () => {
+    const MOCK_REPOS: GitRepository[] = [
+      {
+        id: 1,
+        full_name: "user/repo1",
+        git_provider: "github",
+        is_public: true,
+      },
+      {
+        id: 2,
+        full_name: "user/repo2",
+        git_provider: "github",
+        is_public: true,
+      },
+    ];
+    const retrieveUserGitRepositoriesSpy = vi.spyOn(
+      OpenHands,
+      "retrieveUserGitRepositories",
+    );
+    retrieveUserGitRepositoriesSpy.mockResolvedValue(MOCK_REPOS);
+
+    renderForm();
+
+    // Check if loading indicator is displayed
+    expect(screen.getByTestId("repo-dropdown-loading")).toBeInTheDocument();
+    expect(screen.getByText("HOME$LOADING_REPOSITORIES")).toBeInTheDocument();
+  });
+
+  it("shows dropdown when repositories are loaded", async () => {
+    const MOCK_REPOS: GitRepository[] = [
+      {
+        id: 1,
+        full_name: "user/repo1",
+        git_provider: "github",
+        is_public: true,
+      },
+      {
+        id: 2,
+        full_name: "user/repo2",
+        git_provider: "github",
+        is_public: true,
+      },
+    ];
+    const retrieveUserGitRepositoriesSpy = vi.spyOn(
+      OpenHands,
+      "retrieveUserGitRepositories",
+    );
+    retrieveUserGitRepositoriesSpy.mockResolvedValue(MOCK_REPOS);
+
+    renderForm();
+    expect(await screen.findByTestId("repo-dropdown")).toBeInTheDocument();
+  });
+
+  it("shows error message when repository fetch fails", async () => {
+    const retrieveUserGitRepositoriesSpy = vi.spyOn(
+      OpenHands,
+      "retrieveUserGitRepositories",
+    );
+    retrieveUserGitRepositoriesSpy.mockRejectedValue(
+      new Error("Failed to load"),
+    );
+
+    renderForm();
+
+    expect(
+      await screen.findByTestId("repo-dropdown-error"),
+    ).toBeInTheDocument();
+    expect(
+      screen.getByText("HOME$FAILED_TO_LOAD_REPOSITORIES"),
+    ).toBeInTheDocument();
+  });
+
+  it("should call the search repos API when searching a URL", async () => {
+    const MOCK_REPOS: GitRepository[] = [
+      {
+        id: 1,
+        full_name: "user/repo1",
+        git_provider: "github",
+        is_public: true,
+      },
+      {
+        id: 2,
+        full_name: "user/repo2",
+        git_provider: "github",
+        is_public: true,
+      },
+    ];
+    // Returned by the mocked search endpoint instead of the user's own repos
+    const MOCK_SEARCH_REPOS: GitRepository[] = [
+      {
+        id: 3,
+        full_name: "kubernetes/kubernetes",
+        git_provider: "github",
+        is_public: true,
+      },
+    ];
+
+    const searchGitReposSpy = vi.spyOn(OpenHands, "searchGitRepositories");
+    const retrieveUserGitRepositoriesSpy = vi.spyOn(
+      OpenHands,
+      "retrieveUserGitRepositories",
+    );
+
+    searchGitReposSpy.mockResolvedValue(MOCK_SEARCH_REPOS);
+    retrieveUserGitRepositoriesSpy.mockResolvedValue(MOCK_REPOS);
+
+    renderForm();
+    // Opening the dropdown first lists only the user's own repositories
+    const input = await screen.findByTestId("repo-dropdown");
+    await userEvent.click(input);
+
+    for (const repo of MOCK_REPOS) {
+      expect(screen.getByText(repo.full_name)).toBeInTheDocument();
+    }
+    expect(
+      screen.queryByText(MOCK_SEARCH_REPOS[0].full_name),
+    ).not.toBeInTheDocument();
+
+    expect(searchGitReposSpy).not.toHaveBeenCalled();
+    // Typing a repository URL triggers a search for its "owner/repo" part
+    await userEvent.type(input, "https://github.com/kubernetes/kubernetes");
+    expect(searchGitReposSpy).toHaveBeenLastCalledWith(
+      "kubernetes/kubernetes",
+      3,
+    );
+    // The search is mocked with a promise, so wait for its result to render
+    expect(
+      await screen.findByText(MOCK_SEARCH_REPOS[0].full_name),
+    ).toBeInTheDocument();
+    for (const repo of MOCK_REPOS) {
+      expect(screen.queryByText(repo.full_name)).not.toBeInTheDocument();
+    }
+  });
+
+  it("should call onRepoSelection when a searched repository is selected", async () => {
+    const MOCK_SEARCH_REPOS: GitRepository[] = [
+      {
+        id: 3,
+        full_name: "kubernetes/kubernetes",
+        git_provider: "github",
+        is_public: true,
+      },
+    ];
+
+    const searchGitReposSpy = vi.spyOn(OpenHands, "searchGitRepositories");
+    searchGitReposSpy.mockResolvedValue(MOCK_SEARCH_REPOS);
+
+    renderForm();
+
+    const input = await screen.findByTestId("repo-dropdown");
+    // Typing a repository URL triggers a search for its "owner/repo" part
+    await userEvent.type(input, "https://github.com/kubernetes/kubernetes");
+    expect(searchGitReposSpy).toHaveBeenLastCalledWith(
+      "kubernetes/kubernetes",
+      3,
+    );
+    // The search is mocked with a promise, so wait for the option to render
+    const repoOption = await screen.findByText(MOCK_SEARCH_REPOS[0].full_name);
+    expect(repoOption).toBeInTheDocument();
+    // Picking the searched option passes its full name to onRepoSelection
+    await userEvent.click(repoOption);
+    expect(mockOnRepoSelection).toHaveBeenCalledWith(
+      MOCK_SEARCH_REPOS[0].full_name,
+    );
+  });
+});
--- a/frontend/tests/components/features/home/task-card.test.tsx
+++ b/frontend/tests/components/features/home/task-card.test.tsx
@@ -85,7 +85,6 @@ describe("TaskCard", () => {
      await userEvent.click(launchButton);

      expect(createConversationSpy).toHaveBeenCalledWith(
-        "suggested_task",
        MOCK_RESPOSITORIES[0].full_name,
        MOCK_RESPOSITORIES[0].git_provider,
        undefined,
--- a/frontend/tests/components/features/home/task-suggestions.test.tsx
+++ b/frontend/tests/components/features/home/task-suggestions.test.tsx
@@ -11,7 +11,7 @@ import { MOCK_TASKS } from "#/mocks/task-suggestions-handlers";
 const renderTaskSuggestions = () => {
  const RouterStub = createRoutesStub([
    {
-      Component: TaskSuggestions,
+      Component: () => <TaskSuggestions />,
      path: "/",
    },
    {
--- a/frontend/tests/components/features/settings/api-keys-manager.test.tsx
+++ b/frontend/tests/components/features/settings/api-keys-manager.test.tsx
@@ -0,0 +1,59 @@
+import { render, screen } from "@testing-library/react";
+import { describe, expect, it, vi } from "vitest";
+import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
+import { ApiKeysManager } from "#/components/features/settings/api-keys-manager";
+
+// Mock react-i18next: t() returns the key unchanged and Trans is replaced by a simplified stand-in
+vi.mock("react-i18next", async () => {
+  const actual = await vi.importActual<typeof import("react-i18next")>("react-i18next");
+  return {
+    ...actual,
+    useTranslation: () => ({
+      t: (key: string) => key,
+    }),
+    Trans: ({ i18nKey, components }: { i18nKey: string; components: Record<string, React.ReactNode> }) => {
+      // Simplified Trans component that renders the link
+      if (i18nKey === "SETTINGS$API_KEYS_DESCRIPTION") {
+        return (
+          <span>
+            API keys allow you to authenticate with the OpenHands API programmatically. 
+            Keep your API keys secure; anyone with your API key can access your account. 
+            For more information on how to use the API, see our {components.a}
+          </span>
+        );
+      }
+      return <span>{i18nKey}</span>;
+    },
+  };
+});
+
+// Mock the API keys hook
+vi.mock("#/hooks/query/use-api-keys", () => ({
+  useApiKeys: () => ({
+    data: [],
+    isLoading: false,
+    error: null,
+  }),
+}));
+
+describe("ApiKeysManager", () => {
+  const renderComponent = () => {
+    const queryClient = new QueryClient();
+    return render(
+      <QueryClientProvider client={queryClient}>
+        <ApiKeysManager />
+      </QueryClientProvider>
+    );
+  };
+
+  it("should render the API documentation link", () => {
+    renderComponent();
+    
+    // Find the link to the API documentation
+    const link = screen.getByRole("link");
+    expect(link).toBeInTheDocument();
+    expect(link).toHaveAttribute("href", "https://docs.all-hands.dev/modules/usage/cloud/cloud-api");
+    expect(link).toHaveAttribute("target", "_blank");
+    expect(link).toHaveAttribute("rel", "noopener noreferrer");
+  });
+});
--- a/frontend/tests/components/file-operations.test.tsx
+++ b/frontend/tests/components/file-operations.test.tsx
@@ -1,92 +1,11 @@
-import { render, screen } from "@testing-library/react";
-import { describe, it, expect, vi } from "vitest";
-import { Messages } from "#/components/features/chat/messages";
-import type { Message } from "#/message";
-import { renderWithProviders } from "test-utils";
-
-// Mock the useParams hook to provide a conversationId
-vi.mock("react-router", async () => {
-  const actual = await vi.importActual<typeof import("react-router")>("react-router");
-  return {
-    ...actual,
-    useParams: () => ({ conversationId: "test-conversation-id" }),
-  };
-});
+import { describe, it } from "vitest";

 describe("File Operations Messages", () => {
-  it("should show success indicator for successful file read operation", () => {
-    const messages: Message[] = [
-      {
-        type: "action",
-        translationID: "read_file_contents",
-        content: "Successfully read file contents",
-        success: true,
-        sender: "assistant",
-        timestamp: new Date().toISOString(),
-      },
-    ];
+  it.todo("should show success indicator for successful file read operation");

-    renderWithProviders(<Messages messages={messages} isAwaitingUserConfirmation={false} />);
+  it.todo("should show failure indicator for failed file read operation");

-    const statusIcon = screen.getByTestId("status-icon");
-    expect(statusIcon).toBeInTheDocument();
-    expect(statusIcon.closest("svg")).toHaveClass("fill-success");
-  });
+  it.todo("should show success indicator for successful file edit operation");

-  it("should show failure indicator for failed file read operation", () => {
-    const messages: Message[] = [
-      {
-        type: "action",
-        translationID: "read_file_contents",
-        content: "Failed to read file contents",
-        success: false,
-        sender: "assistant",
-        timestamp: new Date().toISOString(),
-      },
-    ];
-
-    renderWithProviders(<Messages messages={messages} isAwaitingUserConfirmation={false} />);
-
-    const statusIcon = screen.getByTestId("status-icon");
-    expect(statusIcon).toBeInTheDocument();
-    expect(statusIcon.closest("svg")).toHaveClass("fill-danger");
-  });
-
-  it("should show success indicator for successful file edit operation", () => {
-    const messages: Message[] = [
-      {
-        type: "action",
-        translationID: "edit_file_contents",
-        content: "Successfully edited file contents",
-        success: true,
-        sender: "assistant",
-        timestamp: new Date().toISOString(),
-      },
-    ];
-
-    renderWithProviders(<Messages messages={messages} isAwaitingUserConfirmation={false} />);
-
-    const statusIcon = screen.getByTestId("status-icon");
-    expect(statusIcon).toBeInTheDocument();
-    expect(statusIcon.closest("svg")).toHaveClass("fill-success");
-  });
-
-  it("should show failure indicator for failed file edit operation", () => {
-    const messages: Message[] = [
-      {
-        type: "action",
-        translationID: "edit_file_contents",
-        content: "Failed to edit file contents",
-        success: false,
-        sender: "assistant",
-        timestamp: new Date().toISOString(),
-      },
-    ];
-
-    renderWithProviders(<Messages messages={messages} isAwaitingUserConfirmation={false} />);
-
-    const statusIcon = screen.getByTestId("status-icon");
-    expect(statusIcon).toBeInTheDocument();
-    expect(statusIcon.closest("svg")).toHaveClass("fill-danger");
-  });
+  it.todo("should show failure indicator for failed file edit operation");
 });
--- a/frontend/tests/components/user-actions.test.tsx
+++ b/frontend/tests/components/user-actions.test.tsx
@@ -57,7 +57,7 @@ describe("UserActions", () => {
    ).not.toBeInTheDocument();
  });

-  test("onLogout should not be called when the user is not logged in", async () => {
+  test("logout button is always enabled", async () => {
    render(<UserActions onLogout={onLogoutMock} />);

    const userAvatar = screen.getByTestId("user-avatar");
@@ -66,6 +66,6 @@ describe("UserActions", () => {
    const logoutOption = screen.getByText("ACCOUNT_SETTINGS$LOGOUT");
    await user.click(logoutOption);

-    expect(onLogoutMock).not.toHaveBeenCalled();
+    expect(onLogoutMock).toHaveBeenCalledOnce();
  });
 });
--- a/frontend/tests/context/ws-client-provider.test.tsx
+++ b/frontend/tests/context/ws-client-provider.test.tsx
@@ -2,7 +2,6 @@ import { describe, it, expect, vi, beforeEach } from "vitest";
 import { render, waitFor } from "@testing-library/react";
 import React from "react";
 import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
-import * as ChatSlice from "#/state/chat-slice";
 import {
  updateStatusWhenErrorMessagePresent,
  WsClientProvider,
@@ -11,42 +10,15 @@ import {

 describe("Propagate error message", () => {
  it("should do nothing when no message was passed from server", () => {
-    const addErrorMessageSpy = vi.spyOn(ChatSlice, "addErrorMessage");
    updateStatusWhenErrorMessagePresent(null);
    updateStatusWhenErrorMessagePresent(undefined);
    updateStatusWhenErrorMessagePresent({});
    updateStatusWhenErrorMessagePresent({ message: null });
-
-    expect(addErrorMessageSpy).not.toHaveBeenCalled();
  });

-  it("should display error to user when present", () => {
-    const message = "We have a problem!";
-    const addErrorMessageSpy = vi.spyOn(ChatSlice, "addErrorMessage");
-    updateStatusWhenErrorMessagePresent({ message });
+  it.todo("should display error to user when present");

-    expect(addErrorMessageSpy).toHaveBeenCalledWith({
-      message,
-      status_update: true,
-      type: "error",
-    });
-  });
-
-  it("should display error including translation id when present", () => {
-    const message = "We have a problem!";
-    const addErrorMessageSpy = vi.spyOn(ChatSlice, "addErrorMessage");
-    updateStatusWhenErrorMessagePresent({
-      message,
-      data: { msg_id: "..id.." },
-    });
-
-    expect(addErrorMessageSpy).toHaveBeenCalledWith({
-      message,
-      id: "..id..",
-      status_update: true,
-      type: "error",
-    });
-  });
+  it.todo("should display error including translation id when present");
 });

 // Create a mock for socket.io-client
@@ -84,6 +56,19 @@ function TestComponent() {
 describe("WsClientProvider", () => {
  beforeEach(() => {
    vi.clearAllMocks();
+    vi.mock("#/hooks/query/use-user-conversation", () => ({ // NOTE(review): Vitest hoists vi.mock() calls to the top of the module, so this likely applies file-wide rather than once per beforeEach run — confirm and consider moving it to top level
+      useUserConversation: () => {
+        return { data: {
+        conversation_id: "1",
+        title: "Conversation 1",
+        selected_repository: null,
+        last_updated_at: "2021-10-01T12:00:00Z",
+        created_at: "2021-10-01T12:00:00Z",
+        status: "STOPPED" as const,
+        url: null,
+        session_api_key: null,
+      }}},
+    }));
  });

  it("should emit oh_user_action event when send is called", async () => {
--- a/frontend/tests/i18n/translations.test.tsx
+++ b/frontend/tests/i18n/translations.test.tsx
@@ -11,7 +11,6 @@ describe("Translations", () => {
      <AccountSettingsContextMenu
        onLogout={() => {}}
        onClose={() => {}}
-        isLoggedIn
      />,
    );
    expect(
--- a/frontend/tests/routes/llm-settings.test.tsx
+++ b/frontend/tests/routes/llm-settings.test.tsx
@@ -48,7 +48,7 @@ describe("Content", () => {

      await waitFor(() => {
        expect(provider).toHaveValue("Anthropic");
-        expect(model).toHaveValue("claude-3-5-sonnet-20241022");
+        expect(model).toHaveValue("claude-3-7-sonnet-20250219");

        expect(apiKey).toHaveValue("");
        expect(apiKey).toHaveProperty("placeholder", "");
@@ -135,7 +135,7 @@ describe("Content", () => {
      );
      const condensor = screen.getByTestId("enable-memory-condenser-switch");

-      expect(model).toHaveValue("anthropic/claude-3-5-sonnet-20241022");
+      expect(model).toHaveValue("anthropic/claude-3-7-sonnet-20250219");
      expect(baseUrl).toHaveValue("");
      expect(apiKey).toHaveValue("");
      expect(apiKey).toHaveProperty("placeholder", "");
@@ -542,7 +542,7 @@ describe("Form submission", () => {

    // select model
    await userEvent.click(model);
-    const modelOption = screen.getByText("claude-3-5-sonnet-20241022");
+    const modelOption = screen.getByText("claude-3-7-sonnet-20250219");
    await userEvent.click(modelOption);

    const submitButton = screen.getByTestId("submit-button");
@@ -550,7 +550,7 @@ describe("Form submission", () => {

    expect(saveSettingsSpy).toHaveBeenCalledWith(
      expect.objectContaining({
-        llm_model: "anthropic/claude-3-5-sonnet-20241022",
+        llm_model: "anthropic/claude-3-7-sonnet-20250219",
        llm_base_url: "",
        confirmation_mode: false,
      }),
--- a/frontend/tests/routes/secrets-settings.test.tsx
+++ b/frontend/tests/routes/secrets-settings.test.tsx
@@ -0,0 +1,565 @@
+import { render, screen, waitFor, within } from "@testing-library/react";
+import { beforeEach, describe, expect, it, vi } from "vitest";
+import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
+import userEvent from "@testing-library/user-event";
+import { createRoutesStub, Outlet } from "react-router";
+import SecretsSettingsScreen from "#/routes/secrets-settings";
+import { SecretsService } from "#/api/secrets-service";
+import { GetSecretsResponse } from "#/api/secrets-service.types";
+import OpenHands from "#/api/open-hands";
+import { MOCK_DEFAULT_USER_SETTINGS } from "#/mocks/handlers";
+
+const MOCK_GET_SECRETS_RESPONSE: GetSecretsResponse["custom_secrets"] = [
+  {
+    name: "My_Secret_1",
+    description: "My first secret",
+  },
+  {
+    name: "My_Secret_2",
+    description: "My second secret",
+  },
+];
+
+const RouterStub = createRoutesStub([
+  {
+    Component: () => <Outlet />,
+    path: "/settings",
+    children: [
+      {
+        Component: SecretsSettingsScreen,
+        path: "/settings/secrets",
+      },
+      {
+        Component: () => <div data-testid="git-settings-screen" />,
+        path: "/settings/git",
+      },
+    ],
+  },
+]);
+
+const renderSecretsSettings = () =>
+  render(<RouterStub initialEntries={["/settings/secrets"]} />, {
+    wrapper: ({ children }) => (
+      <QueryClientProvider
+        client={
+          new QueryClient({
+            defaultOptions: { queries: { retry: false } },
+          })
+        }
+      >
+        {children}
+      </QueryClientProvider>
+    ),
+  });
+
+beforeEach(() => {
+  const getConfigSpy = vi.spyOn(OpenHands, "getConfig");
+  // @ts-expect-error - only return the config we need
+  getConfigSpy.mockResolvedValue({
+    APP_MODE: "oss",
+  });
+});
+
+describe("Content", () => {
+  it("should render the secrets settings screen", () => {
+    renderSecretsSettings();
+    screen.getByTestId("secrets-settings-screen");
+  });
+
+  it("should NOT render a button to connect with git if they havent already in oss", async () => {
+    const getConfigSpy = vi.spyOn(OpenHands, "getConfig");
+    const getSettingsSpy = vi.spyOn(OpenHands, "getSettings");
+    const getSecretsSpy = vi.spyOn(SecretsService, "getSecrets");
+    // @ts-expect-error - only return the config we need
+    getConfigSpy.mockResolvedValue({
+      APP_MODE: "oss",
+    });
+    getSettingsSpy.mockResolvedValue({
+      ...MOCK_DEFAULT_USER_SETTINGS,
+      provider_tokens_set: {},
+    });
+
+    renderSecretsSettings();
+
+    expect(getConfigSpy).toHaveBeenCalled();
+    await waitFor(() => expect(getSecretsSpy).toHaveBeenCalled());
+    expect(screen.queryByTestId("connect-git-button")).not.toBeInTheDocument();
+  });
+
+  it("should render a button to connect with git if they havent already in saas", async () => {
+    const getConfigSpy = vi.spyOn(OpenHands, "getConfig");
+    const getSettingsSpy = vi.spyOn(OpenHands, "getSettings");
+    const getSecretsSpy = vi.spyOn(SecretsService, "getSecrets");
+    // @ts-expect-error - only return the config we need
+    getConfigSpy.mockResolvedValue({
+      APP_MODE: "saas",
+    });
+    getSettingsSpy.mockResolvedValue({
+      ...MOCK_DEFAULT_USER_SETTINGS,
+      provider_tokens_set: {},
+    });
+
+    renderSecretsSettings();
+
+    expect(getSecretsSpy).not.toHaveBeenCalled();
+    await waitFor(() =>
+      expect(screen.queryByTestId("add-secret-button")).not.toBeInTheDocument(),
+    );
+    const button = await screen.findByTestId("connect-git-button");
+    await userEvent.click(button);
+
+    screen.getByTestId("git-settings-screen");
+  });
+
+  it("should render a message if there are no existing secrets", async () => {
+    const getSecretsSpy = vi.spyOn(SecretsService, "getSecrets");
+    getSecretsSpy.mockResolvedValue([]);
+    renderSecretsSettings();
+
+    await screen.findByTestId("no-secrets-message");
+  });
+
+  it("should render existing secrets", async () => {
+    const getSecretsSpy = vi.spyOn(SecretsService, "getSecrets");
+    getSecretsSpy.mockResolvedValue(MOCK_GET_SECRETS_RESPONSE);
+    renderSecretsSettings();
+
+    const secrets = await screen.findAllByTestId("secret-item");
+    expect(secrets).toHaveLength(2);
+    expect(screen.queryByTestId("no-secrets-message")).not.toBeInTheDocument();
+  });
+});
+
+describe("Secret actions", () => {
+  it("should create a new secret", async () => {
+    const createSecretSpy = vi.spyOn(SecretsService, "createSecret");
+    const getSecretsSpy = vi.spyOn(SecretsService, "getSecrets");
+    createSecretSpy.mockResolvedValue(true);
+    renderSecretsSettings();
+
+    // render form & hide items
+    expect(screen.queryByTestId("add-secret-form")).not.toBeInTheDocument();
+    const button = await screen.findByTestId("add-secret-button");
+    await userEvent.click(button);
+
+    const secretForm = screen.getByTestId("add-secret-form");
+    const secrets = screen.queryAllByTestId("secret-item");
+
+    expect(screen.queryByTestId("add-secret-button")).not.toBeInTheDocument();
+    expect(secretForm).toBeInTheDocument();
+    expect(secrets).toHaveLength(0);
+
+    // enter details
+    const nameInput = within(secretForm).getByTestId("name-input");
+    const valueInput = within(secretForm).getByTestId("value-input");
+    const descriptionInput =
+      within(secretForm).getByTestId("description-input");
+
+    const submitButton = within(secretForm).getByTestId("submit-button");
+
+    vi.clearAllMocks(); // reset mocks to check for upcoming calls
+
+    await userEvent.type(nameInput, "My_Custom_Secret");
+    await userEvent.type(valueInput, "my-custom-secret-value");
+    await userEvent.type(descriptionInput, "My custom secret description");
+
+    await userEvent.click(submitButton);
+
+    // make POST request
+    expect(createSecretSpy).toHaveBeenCalledWith(
+      "My_Custom_Secret",
+      "my-custom-secret-value",
+      "My custom secret description",
+    );
+
+    // hide form & render items
+    expect(screen.queryByTestId("add-secret-form")).not.toBeInTheDocument();
+    expect(getSecretsSpy).toHaveBeenCalled();
+  });
+
+  it("should edit a secret", async () => {
+    const getSecretsSpy = vi.spyOn(SecretsService, "getSecrets");
+    const updateSecretSpy = vi.spyOn(SecretsService, "updateSecret");
+    getSecretsSpy.mockResolvedValue(MOCK_GET_SECRETS_RESPONSE);
+    updateSecretSpy.mockResolvedValue(true);
+    renderSecretsSettings();
+
+    // render edit button within a secret list item
+    const secrets = await screen.findAllByTestId("secret-item");
+    const firstSecret = within(secrets[0]);
+    const editButton = firstSecret.getByTestId("edit-secret-button");
+
+    await userEvent.click(editButton);
+
+    // render edit form
+    const editForm = screen.getByTestId("edit-secret-form");
+
+    expect(screen.queryByTestId("add-secret-button")).not.toBeInTheDocument();
+    expect(editForm).toBeInTheDocument();
+    expect(screen.queryAllByTestId("secret-item")).toHaveLength(0);
+
+    // enter details
+    const nameInput = within(editForm).getByTestId("name-input");
+    const descriptionInput = within(editForm).getByTestId("description-input");
+    const submitButton = within(editForm).getByTestId("submit-button");
+
+    // should not show value input
+    const valueInput = within(editForm).queryByTestId("value-input");
+    expect(valueInput).not.toBeInTheDocument();
+
+    expect(nameInput).toHaveValue("My_Secret_1");
+    expect(descriptionInput).toHaveValue("My first secret");
+
+    await userEvent.clear(nameInput);
+    await userEvent.type(nameInput, "My_Edited_Secret");
+
+    await userEvent.clear(descriptionInput);
+    await userEvent.type(descriptionInput, "My edited secret description");
+
+    await userEvent.click(submitButton);
+
+    // make POST request
+    expect(updateSecretSpy).toHaveBeenCalledWith(
+      "My_Secret_1",
+      "My_Edited_Secret",
+      "My edited secret description",
+    );
+
+    // hide form
+    expect(screen.queryByTestId("edit-secret-form")).not.toBeInTheDocument();
+
+    // optimistic update
+    const updatedSecrets = await screen.findAllByTestId("secret-item");
+    expect(updatedSecrets).toHaveLength(2);
+    expect(updatedSecrets[0]).toHaveTextContent(/my_edited_secret/i);
+  });
+
+  it("should be able to cancel the create or edit form", async () => {
+    const getSecretsSpy = vi.spyOn(SecretsService, "getSecrets");
+    getSecretsSpy.mockResolvedValue(MOCK_GET_SECRETS_RESPONSE);
+    renderSecretsSettings();
+
+    // render form & hide items
+    expect(screen.queryByTestId("add-secret-form")).not.toBeInTheDocument();
+    const button = await screen.findByTestId("add-secret-button");
+    await userEvent.click(button);
+    const secretForm = screen.getByTestId("add-secret-form");
+    expect(secretForm).toBeInTheDocument();
+
+    // cancel button
+    const cancelButton = within(secretForm).getByTestId("cancel-button");
+    await userEvent.click(cancelButton);
+    expect(screen.queryByTestId("add-secret-form")).not.toBeInTheDocument();
+    expect(screen.queryByTestId("add-secret-button")).toBeInTheDocument();
+
+    // render edit button within a secret list item
+    const secrets = await screen.findAllByTestId("secret-item");
+    const firstSecret = within(secrets[0]);
+    const editButton = firstSecret.getByTestId("edit-secret-button");
+    await userEvent.click(editButton);
+
+    // render edit form
+    const editForm = screen.getByTestId("edit-secret-form");
+    expect(editForm).toBeInTheDocument();
+    expect(screen.queryAllByTestId("secret-item")).toHaveLength(0);
+
+    // cancel button
+    const cancelEditButton = within(editForm).getByTestId("cancel-button");
+    await userEvent.click(cancelEditButton);
+    expect(screen.queryByTestId("edit-secret-form")).not.toBeInTheDocument();
+    expect(screen.queryAllByTestId("secret-item")).toHaveLength(2);
+  });
+
+  it("should undo the optimistic update if the request fails", async () => {
+    const getSecretsSpy = vi.spyOn(SecretsService, "getSecrets");
+    const updateSecretSpy = vi.spyOn(SecretsService, "updateSecret");
+    getSecretsSpy.mockResolvedValue(MOCK_GET_SECRETS_RESPONSE);
+    updateSecretSpy.mockRejectedValue(new Error("Failed to update secret"));
+    renderSecretsSettings();
+
+    // open the edit form from the first secret list item
+    const secrets = await screen.findAllByTestId("secret-item");
+    const firstSecret = within(secrets[0]);
+    const editButton = firstSecret.getByTestId("edit-secret-button");
+
+    await userEvent.click(editButton);
+
+    // the edit form replaces the list of secrets
+    const editForm = screen.getByTestId("edit-secret-form");
+
+    expect(editForm).toBeInTheDocument();
+    expect(screen.queryAllByTestId("secret-item")).toHaveLength(0);
+
+    // only the name is edited in this test; the description is left as-is
+    const nameInput = within(editForm).getByTestId("name-input");
+    const submitButton = within(editForm).getByTestId("submit-button");
+
+    // should not show value input
+    const valueInput = within(editForm).queryByTestId("value-input");
+    expect(valueInput).not.toBeInTheDocument();
+
+    await userEvent.clear(nameInput);
+    await userEvent.type(nameInput, "My_Edited_Secret");
+    await userEvent.click(submitButton);
+
+    // the update is requested with the new name and the untouched description
+    expect(updateSecretSpy).toHaveBeenCalledWith(
+      "My_Secret_1",
+      "My_Edited_Secret",
+      "My first secret",
+    );
+
+    // hide form
+    expect(screen.queryByTestId("edit-secret-form")).not.toBeInTheDocument();
+
+    // the request was rejected, so the edited name must not be shown (names use underscores, so the pattern must too)
+    const updatedSecrets = await screen.findAllByTestId("secret-item");
+    expect(updatedSecrets).toHaveLength(2);
+    expect(updatedSecrets[0]).not.toHaveTextContent(/my_edited_secret/i);
+  });
+
+  it("should remove the secret from the list after deletion", async () => {
+    const getSecretsSpy = vi.spyOn(SecretsService, "getSecrets");
+    const deleteSecretSpy = vi.spyOn(SecretsService, "deleteSecret");
+    getSecretsSpy.mockResolvedValue(MOCK_GET_SECRETS_RESPONSE);
+    deleteSecretSpy.mockResolvedValue(true);
+    renderSecretsSettings();
+
+    // wait for the mocked secrets to render, then click delete on the second item
+    const secrets = await screen.findAllByTestId("secret-item");
+    const secondSecret = within(secrets[1]);
+    const deleteButton = secondSecret.getByTestId("delete-secret-button");
+    await userEvent.click(deleteButton);
+
+    // confirm the deletion in the confirmation modal
+    const confirmationModal = screen.getByTestId("confirmation-modal");
+    const confirmButton =
+      within(confirmationModal).getByTestId("confirm-button");
+    await userEvent.click(confirmButton);
+
+    // deleteSecret is called with "My_Secret_2" and the modal is closed
+    expect(deleteSecretSpy).toHaveBeenCalledWith("My_Secret_2");
+    expect(screen.queryByTestId("confirmation-modal")).not.toBeInTheDocument();
+
+    // optimistic update: one item remains and "My_Secret_2" is no longer shown
+    expect(screen.queryAllByTestId("secret-item")).toHaveLength(1);
+    expect(screen.queryByText("My_Secret_2")).not.toBeInTheDocument();
+  });
+
+  it("should be able to cancel the delete confirmation modal", async () => {
+    const getSecretsSpy = vi.spyOn(SecretsService, "getSecrets");
+    const deleteSecretSpy = vi.spyOn(SecretsService, "deleteSecret");
+    getSecretsSpy.mockResolvedValue(MOCK_GET_SECRETS_RESPONSE);
+    deleteSecretSpy.mockResolvedValue(true);
+    renderSecretsSettings();
+
+    // wait for the mocked secrets to render, then click delete on the second item
+    const secrets = await screen.findAllByTestId("secret-item");
+    const secondSecret = within(secrets[1]);
+    const deleteButton = secondSecret.getByTestId("delete-secret-button");
+    await userEvent.click(deleteButton);
+
+    // dismiss the confirmation modal through its cancel button
+    const confirmationModal = screen.getByTestId("confirmation-modal");
+    const cancelButton = within(confirmationModal).getByTestId("cancel-button");
+    await userEvent.click(cancelButton);
+
+    // no DELETE request: the modal is closed and both items are still listed
+    expect(deleteSecretSpy).not.toHaveBeenCalled();
+    expect(screen.queryByTestId("confirmation-modal")).not.toBeInTheDocument();
+    expect(screen.queryAllByTestId("secret-item")).toHaveLength(2);
+  });
+
+  it("should revert the optimistic update if the request fails", async () => {
+    const getSecretsSpy = vi.spyOn(SecretsService, "getSecrets");
+    const deleteSecretSpy = vi.spyOn(SecretsService, "deleteSecret");
+    getSecretsSpy.mockResolvedValue(MOCK_GET_SECRETS_RESPONSE);
+    deleteSecretSpy.mockRejectedValue(new Error("Failed to delete secret"));
+    renderSecretsSettings();
+
+    // wait for the mocked secrets to render, then click delete on the second item
+    const secrets = await screen.findAllByTestId("secret-item");
+    const secondSecret = within(secrets[1]);
+    const deleteButton = secondSecret.getByTestId("delete-secret-button");
+    await userEvent.click(deleteButton);
+
+    // confirm the deletion in the confirmation modal
+    const confirmationModal = screen.getByTestId("confirmation-modal");
+    const confirmButton =
+      within(confirmationModal).getByTestId("confirm-button");
+    await userEvent.click(confirmButton);
+
+    // deleteSecret is called (it rejects, per the mock above) and the modal is closed
+    expect(deleteSecretSpy).toHaveBeenCalledWith("My_Secret_2");
+    expect(screen.queryByTestId("confirmation-modal")).not.toBeInTheDocument();
+
+    // the optimistic removal is reverted: both items remain, including "My_Secret_2"
+    expect(screen.queryAllByTestId("secret-item")).toHaveLength(2);
+    expect(screen.queryByText("My_Secret_2")).toBeInTheDocument();
+  });
+
+  it("should hide the no items message when in form view", async () => {
+    const getSecretsSpy = vi.spyOn(SecretsService, "getSecrets");
+    getSecretsSpy.mockResolvedValue([]);
+    renderSecretsSettings();
+
+    // NOTE(review): checked before the mocked (empty) fetch is awaited — confirm intent
+    expect(screen.queryByTestId("no-secrets-message")).not.toBeInTheDocument();
+    const button = await screen.findByTestId("add-secret-button");
+    await userEvent.click(button);
+    // form view: the add form is rendered and the no-secrets message is not
+    const secretForm = screen.getByTestId("add-secret-form");
+    expect(secretForm).toBeInTheDocument();
+    expect(screen.queryByTestId("no-secrets-message")).not.toBeInTheDocument();
+  });
+
+  it("should not allow spaces in secret names", async () => {
+    const createSecretSpy = vi.spyOn(SecretsService, "createSecret");
+    renderSecretsSettings();
+
+    // the add form is not rendered until the add-secret button is clicked
+    expect(screen.queryByTestId("add-secret-form")).not.toBeInTheDocument();
+    const button = await screen.findByTestId("add-secret-button");
+    await userEvent.click(button);
+
+    const secretForm = screen.getByTestId("add-secret-form");
+    expect(secretForm).toBeInTheDocument();
+
+    // submit a name containing spaces together with a non-empty value
+    const nameInput = within(secretForm).getByTestId("name-input");
+    const valueInput = within(secretForm).getByTestId("value-input");
+    const submitButton = within(secretForm).getByTestId("submit-button");
+
+    await userEvent.type(nameInput, "My Custom Secret With Spaces");
+    await userEvent.type(valueInput, "my-custom-secret-value");
+    await userEvent.click(submitButton);
+
+    // no POST request: the name with spaces is not submitted
+    expect(createSecretSpy).not.toHaveBeenCalled();
+
+    await userEvent.clear(nameInput);
+    await userEvent.type(nameInput, "MyCustomSecret");
+    await userEvent.click(submitButton);
+    // the space-free name is submitted with the previously typed value
+    expect(createSecretSpy).toHaveBeenCalledWith(
+      "MyCustomSecret",
+      "my-custom-secret-value",
+      undefined,
+    );
+  });
+
+  it("should not allow existing secret names", async () => {
+    const createSecretSpy = vi.spyOn(SecretsService, "createSecret");
+    const getSecretsSpy = vi.spyOn(SecretsService, "getSecrets");
+    getSecretsSpy.mockResolvedValue(MOCK_GET_SECRETS_RESPONSE.slice(0, 1));
+    renderSecretsSettings();
+
+    // the add form is not rendered until the add-secret button is clicked
+    expect(screen.queryByTestId("add-secret-form")).not.toBeInTheDocument();
+    const button = await screen.findByTestId("add-secret-button");
+    await userEvent.click(button);
+
+    const secretForm = screen.getByTestId("add-secret-form");
+    expect(secretForm).toBeInTheDocument();
+
+    // submit a name assumed to match the one mocked secret — verify against the mock
+    const nameInput = within(secretForm).getByTestId("name-input");
+    const valueInput = within(secretForm).getByTestId("value-input");
+    const submitButton = within(secretForm).getByTestId("submit-button");
+
+    await userEvent.type(nameInput, "My_Secret_1");
+    await userEvent.type(valueInput, "my-custom-secret-value");
+    await userEvent.click(submitButton);
+
+    // no POST request: the "secret already exists" message is shown instead
+    expect(createSecretSpy).not.toHaveBeenCalled();
+    expect(screen.queryByText(/secret already exists/i)).toBeInTheDocument();
+
+    await userEvent.clear(nameInput);
+    await userEvent.type(nameInput, "My_Custom_Secret");
+
+    await userEvent.clear(valueInput);
+    await userEvent.type(valueInput, "my-custom-secret-value");
+
+    await userEvent.click(submitButton);
+    // the new name is submitted and no value-required message is shown
+    expect(createSecretSpy).toHaveBeenCalledWith(
+      "My_Custom_Secret",
+      "my-custom-secret-value",
+      undefined,
+    );
+    expect(
+      screen.queryByText("SECRETS$SECRET_VALUE_REQUIRED"),
+    ).not.toBeInTheDocument();
+  });
+
+  it("should not submit whitespace secret names or values", async () => {
+    const createSecretSpy = vi.spyOn(SecretsService, "createSecret");
+    renderSecretsSettings();
+
+    // the add form is not rendered until the add-secret button is clicked
+    expect(screen.queryByTestId("add-secret-form")).not.toBeInTheDocument();
+    const button = await screen.findByTestId("add-secret-button");
+    await userEvent.click(button);
+
+    const secretForm = screen.getByTestId("add-secret-form");
+    expect(secretForm).toBeInTheDocument();
+
+    // first attempt: whitespace-only name with a non-empty value
+    const nameInput = within(secretForm).getByTestId("name-input");
+    const valueInput = within(secretForm).getByTestId("value-input");
+    const submitButton = within(secretForm).getByTestId("submit-button");
+
+    await userEvent.type(nameInput, "   ");
+    await userEvent.type(valueInput, "my-custom-secret-value");
+    await userEvent.click(submitButton);
+
+    // no POST request: the whitespace-only name is not submitted
+    expect(createSecretSpy).not.toHaveBeenCalled();
+
+    await userEvent.clear(nameInput);
+    await userEvent.type(nameInput, "My_Custom_Secret");
+
+    await userEvent.clear(valueInput);
+    await userEvent.type(valueInput, "   ");
+
+    await userEvent.click(submitButton);
+    // second attempt (whitespace-only value): still no POST, value-required text shown
+    expect(createSecretSpy).not.toHaveBeenCalled();
+    expect(
+      screen.queryByText("SECRETS$SECRET_VALUE_REQUIRED"),
+    ).toBeInTheDocument();
+  });
+
+  it("should not reset input values on an invalid submit", async () => {
+    const getSecretsSpy = vi.spyOn(SecretsService, "getSecrets");
+    const createSecretSpy = vi.spyOn(SecretsService, "createSecret");
+    getSecretsSpy.mockResolvedValue(MOCK_GET_SECRETS_RESPONSE);
+
+    renderSecretsSettings();
+
+    // the add form is not rendered until the add-secret button is clicked
+    expect(screen.queryByTestId("add-secret-form")).not.toBeInTheDocument();
+    const button = await screen.findByTestId("add-secret-button");
+    await userEvent.click(button);
+
+    const secretForm = screen.getByTestId("add-secret-form");
+    expect(secretForm).toBeInTheDocument();
+
+    // submit the name of the first mocked secret, i.e. a duplicate name
+    const nameInput = within(secretForm).getByTestId("name-input");
+    const valueInput = within(secretForm).getByTestId("value-input");
+    const submitButton = within(secretForm).getByTestId("submit-button");
+
+    await userEvent.type(nameInput, MOCK_GET_SECRETS_RESPONSE[0].name);
+    await userEvent.type(valueInput, "my-custom-secret-value");
+    await userEvent.click(submitButton);
+
+    // no POST request: the "secret already exists" message is shown instead
+    expect(createSecretSpy).not.toHaveBeenCalled();
+    expect(screen.queryByText(/secret already exists/i)).toBeInTheDocument();
+
+    expect(nameInput).toHaveValue(MOCK_GET_SECRETS_RESPONSE[0].name);
+    expect(valueInput).toHaveValue("my-custom-secret-value");
+  });
+});
--- a/frontend/tests/routes/settings.test.tsx
+++ b/frontend/tests/routes/settings.test.tsx
@@ -79,7 +79,7 @@ describe("Settings Screen", () => {
  };

  it("should render the navbar", async () => {
-    const sectionsToInclude = ["llm", "git", "application"];
+    const sectionsToInclude = ["llm", "git", "application", "secrets"];
    const sectionsToExclude = ["api keys", "credits"];
    const getConfigSpy = vi.spyOn(OpenHands, "getConfig");
    // @ts-expect-error - only return app mode
@@ -110,7 +110,13 @@ describe("Settings Screen", () => {
    getConfigSpy.mockResolvedValue({
      APP_MODE: "saas",
    });
-    const sectionsToInclude = ["git", "application", "credits", "api keys"];
+    const sectionsToInclude = [
+      "git",
+      "application",
+      "credits",
+      "secrets",
+      "api keys",
+    ];
    const sectionsToExclude = ["llm"];

    renderSettingsScreen();
--- a/frontend/tests/services/actions.test.ts
+++ b/frontend/tests/services/actions.test.ts
@@ -1,146 +0,0 @@
-import { describe, it, expect, vi, beforeEach } from "vitest";
-import { handleStatusMessage, handleActionMessage } from "#/services/actions";
-import store from "#/store";
-import { trackError } from "#/utils/error-handler";
-import ActionType from "#/types/action-type";
-import { ActionMessage } from "#/types/message";
-
-// Mock dependencies
-vi.mock("#/utils/error-handler", () => ({
-  trackError: vi.fn(),
-}));
-
-vi.mock("#/store", () => ({
-  default: {
-    dispatch: vi.fn(),
-  },
-}));
-
-describe("Actions Service", () => {
-  beforeEach(() => {
-    vi.clearAllMocks();
-  });
-
-  describe("handleStatusMessage", () => {
-    it("should dispatch info messages to status state", () => {
-      const message = {
-        type: "info",
-        message: "Runtime is not available",
-        id: "runtime.unavailable",
-        status_update: true as const,
-      };
-
-      handleStatusMessage(message);
-
-      expect(store.dispatch).toHaveBeenCalledWith(expect.objectContaining({
-        payload: message,
-      }));
-    });
-
-    it("should log error messages and display them in chat", () => {
-      const message = {
-        type: "error",
-        message: "Runtime connection failed",
-        id: "runtime.connection.failed",
-        status_update: true as const,
-      };
-
-      handleStatusMessage(message);
-
-      expect(trackError).toHaveBeenCalledWith({
-        message: "Runtime connection failed",
-        source: "chat",
-        metadata: { msgId: "runtime.connection.failed" },
-      });
-
-      expect(store.dispatch).toHaveBeenCalledWith(expect.objectContaining({
-        payload: message,
-      }));
-    });
-  });
-
-  describe("handleActionMessage", () => {
-    it("should use first-person perspective for task completion messages", () => {
-      // Test partial completion
-      const messagePartial: ActionMessage = {
-        id: 1,
-        action: ActionType.FINISH,
-        source: "agent",
-        message: "",
-        timestamp: new Date().toISOString(),
-        args: {
-          final_thought: "",
-          task_completed: "partial",
-          outputs: "",
-          thought: ""
-        }
-      };
-
-      // Mock implementation to capture the message
-      let capturedPartialMessage = "";
-      (store.dispatch as any).mockImplementation((action: any) => {
-        if (action.type === "chat/addAssistantMessage" &&
-            action.payload.includes("believe that the task was **completed partially**")) {
-          capturedPartialMessage = action.payload;
-        }
-      });
-
-      handleActionMessage(messagePartial);
-      expect(capturedPartialMessage).toContain("I believe that the task was **completed partially**");
-
-      // Test not completed
-      const messageNotCompleted: ActionMessage = {
-        id: 2,
-        action: ActionType.FINISH,
-        source: "agent",
-        message: "",
-        timestamp: new Date().toISOString(),
-        args: {
-          final_thought: "",
-          task_completed: "false",
-          outputs: "",
-          thought: ""
-        }
-      };
-
-      // Mock implementation to capture the message
-      let capturedNotCompletedMessage = "";
-      (store.dispatch as any).mockImplementation((action: any) => {
-        if (action.type === "chat/addAssistantMessage" &&
-            action.payload.includes("believe that the task was **not completed**")) {
-          capturedNotCompletedMessage = action.payload;
-        }
-      });
-
-      handleActionMessage(messageNotCompleted);
-      expect(capturedNotCompletedMessage).toContain("I believe that the task was **not completed**");
-
-      // Test completed successfully
-      const messageCompleted: ActionMessage = {
-        id: 3,
-        action: ActionType.FINISH,
-        source: "agent",
-        message: "",
-        timestamp: new Date().toISOString(),
-        args: {
-          final_thought: "",
-          task_completed: "true",
-          outputs: "",
-          thought: ""
-        }
-      };
-
-      // Mock implementation to capture the message
-      let capturedCompletedMessage = "";
-      (store.dispatch as any).mockImplementation((action: any) => {
-        if (action.type === "chat/addAssistantMessage" &&
-            action.payload.includes("believe that the task was **completed successfully**")) {
-          capturedCompletedMessage = action.payload;
-        }
-      });
-
-      handleActionMessage(messageCompleted);
-      expect(capturedCompletedMessage).toContain("I believe that the task was **completed successfully**");
-    });
-  });
-});
--- a/frontend/tests/services/observations.test.ts
+++ b/frontend/tests/services/observations.test.ts
@@ -1,51 +0,0 @@
-import { beforeEach, describe, expect, it, vi } from "vitest";
-import { handleObservationMessage } from "#/services/observations";
-import store from "#/store";
-import { ObservationMessage } from "#/types/message";
-
-// Mock dependencies
-vi.mock("#/store", () => ({
-  default: {
-    dispatch: vi.fn(),
-  },
-}));
-
-describe("Observations Service", () => {
-  beforeEach(() => {
-    vi.clearAllMocks();
-  });
-
-  describe("handleObservationMessage", () => {
-    const createErrorMessage = (): ObservationMessage => ({
-      id: 14,
-      timestamp: "2025-04-14T13:37:54.451843",
-      message: "The action has not been executed.",
-      cause: 12,
-      observation: "error",
-      content: "The action has not been executed.",
-      extras: {
-        error_id: "",
-        metadata: {},
-      },
-    });
-
-    it("should dispatch error messages exactly once", () => {
-      const errorMessage = createErrorMessage();
-
-      handleObservationMessage(errorMessage);
-
-      expect(store.dispatch).toHaveBeenCalledTimes(1);
-      expect(store.dispatch).toHaveBeenCalledWith({
-        type: "chat/addAssistantObservation",
-        payload: expect.objectContaining({
-          observation: "error",
-          content: "The action has not been executed.",
-          source: "user",
-          extras: {
-            error_id: "",
-          },
-        }),
-      });
-    });
-  });
-});
--- a/frontend/tests/services/observations.test.tsx
+++ b/frontend/tests/services/observations.test.tsx
@@ -1,8 +1,4 @@
-import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
-import { handleObservationMessage } from "#/services/observations";
-import { setScreenshotSrc, setUrl } from "#/state/browser-slice";
-import ObservationType from "#/types/observation-type";
-import store from "#/store";
+import { describe, it, vi, beforeEach, afterEach } from "vitest";

 // Mock the store module
 vi.mock("#/store", () => ({
@@ -20,43 +16,9 @@ describe("handleObservationMessage", () => {
    vi.resetAllMocks();
  });

-  it("updates browser state when receiving a browse observation", () => {
-    const message = {
-      id: "test-id",
-      cause: "test-cause",
-      observation: ObservationType.BROWSE,
-      content: "test content",
-      message: "test message",
-      extras: {
-        url: "https://example.com",
-        screenshot: "base64-screenshot-data",
-      },
-    };
-    
-    handleObservationMessage(message);
+  it.todo("updates browser state when receiving a browse observation");

-    // Check that setScreenshotSrc and setUrl were called with the correct values
-    expect(store.dispatch).toHaveBeenCalledWith(setScreenshotSrc("base64-screenshot-data"));
-    expect(store.dispatch).toHaveBeenCalledWith(setUrl("https://example.com"));
-  });
-
-  it("updates browser state when receiving a browse_interactive observation", () => {
-    const message = {
-      id: "test-id",
-      cause: "test-cause",
-      observation: ObservationType.BROWSE_INTERACTIVE,
-      content: "test content",
-      message: "test message",
-      extras: {
-        url: "https://example.com",
-        screenshot: "base64-screenshot-data",
-      },
-    };
-    
-    handleObservationMessage(message);
-
-    // Check that setScreenshotSrc and setUrl were called with the correct values
-    expect(store.dispatch).toHaveBeenCalledWith(setScreenshotSrc("base64-screenshot-data"));
-    expect(store.dispatch).toHaveBeenCalledWith(setUrl("https://example.com"));
-  });
-});
+  it.todo(
+    "updates browser state when receiving a browse_interactive observation",
+  );
+});
--- a/frontend/package-lock.json
+++ b/frontend/package-lock.json
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -1,6 +1,6 @@
 {
  "name": "openhands-frontend",
-  "version": "0.37.0",
+  "version": "0.39.0",
  "private": true,
  "type": "module",
  "engines": {
@@ -8,30 +8,30 @@
  },
  "dependencies": {
    "@heroui/react": "2.7.8",
-    "@microlink/react-json-view": "^1.26.1",
+    "@microlink/react-json-view": "^1.26.2",
    "@monaco-editor/react": "^4.7.0-rc.0",
-    "@react-router/node": "^7.5.3",
-    "@react-router/serve": "^7.5.3",
-    "@react-types/shared": "^3.29.0",
-    "@reduxjs/toolkit": "^2.7.0",
+    "@react-router/node": "^7.6.0",
+    "@react-router/serve": "^7.6.0",
+    "@react-types/shared": "^3.29.1",
+    "@reduxjs/toolkit": "^2.8.2",
    "@stripe/react-stripe-js": "^3.7.0",
    "@stripe/stripe-js": "^7.3.0",
-    "@tanstack/react-query": "^5.75.4",
+    "@tanstack/react-query": "^5.76.1",
    "@vitejs/plugin-react": "^4.4.0",
    "@xterm/addon-fit": "^0.10.0",
    "@xterm/xterm": "^5.4.0",
    "axios": "^1.9.0",
    "clsx": "^2.1.1",
    "eslint-config-airbnb-typescript": "^18.0.0",
-    "framer-motion": "^12.10.0",
-    "i18next": "^25.1.1",
+    "framer-motion": "^12.12.1",
+    "i18next": "^25.1.3",
    "i18next-browser-languagedetector": "^8.1.0",
    "i18next-http-backend": "^3.0.2",
-    "isbot": "^5.1.27",
+    "isbot": "^5.1.28",
    "jose": "^6.0.11",
-    "lucide-react": "^0.507.0",
+    "lucide-react": "^0.511.0",
    "monaco-editor": "^0.52.2",
-    "posthog-js": "^1.239.1",
+    "posthog-js": "^1.245.1",
    "react": "^19.1.0",
    "react-dom": "^19.1.0",
    "react-highlight": "^0.15.0",
@@ -40,15 +40,15 @@
    "react-icons": "^5.5.0",
    "react-markdown": "^10.1.0",
    "react-redux": "^9.2.0",
-    "react-router": "^7.5.3",
+    "react-router": "^7.6.0",
    "react-syntax-highlighter": "^15.6.1",
    "react-textarea-autosize": "^8.5.9",
    "remark-gfm": "^4.0.1",
    "sirv-cli": "^3.0.1",
    "socket.io-client": "^4.8.1",
-    "tailwind-merge": "^3.2.0",
+    "tailwind-merge": "^3.3.0",
    "vite": "^6.3.5",
-    "web-vitals": "^3.5.2",
+    "web-vitals": "^5.0.1",
    "ws": "^8.18.2"
  },
  "scripts": {
@@ -68,7 +68,8 @@
    "lint:fix": "eslint src --ext .ts,.tsx,.js --fix && prettier --write src/**/*.{ts,tsx}",
    "prepare": "cd .. && husky frontend/.husky",
    "typecheck": "react-router typegen && tsc",
-    "check-unlocalized-strings": "node scripts/check-unlocalized-strings.cjs"
+    "check-unlocalized-strings": "node scripts/check-unlocalized-strings.cjs",
+    "check-translation-completeness": "node scripts/check-translation-completeness.cjs"
  },
  "lint-staged": {
    "src/**/*.{ts,tsx,js}": [
@@ -82,28 +83,28 @@
    "@babel/types": "^7.27.0",
    "@mswjs/socket.io-binding": "^0.1.1",
    "@playwright/test": "^1.52.0",
-    "@react-router/dev": "^7.5.3",
+    "@react-router/dev": "^7.6.0",
    "@tailwindcss/typography": "^0.5.16",
    "@tanstack/eslint-plugin-query": "^5.74.7",
    "@testing-library/dom": "^10.4.0",
    "@testing-library/jest-dom": "^6.6.1",
    "@testing-library/react": "^16.3.0",
    "@testing-library/user-event": "^14.6.1",
-    "@types/node": "^22.15.12",
-    "@types/react": "^19.1.3",
-    "@types/react-dom": "^19.1.3",
+    "@types/node": "^22.15.21",
+    "@types/react": "^19.1.5",
+    "@types/react-dom": "^19.1.5",
    "@types/react-highlight": "^0.12.8",
    "@types/react-syntax-highlighter": "^15.5.13",
    "@types/ws": "^8.18.1",
    "@typescript-eslint/eslint-plugin": "^7.18.0",
    "@typescript-eslint/parser": "^7.18.0",
-    "@vitest/coverage-v8": "^3.1.3",
+    "@vitest/coverage-v8": "^3.1.4",
    "autoprefixer": "^10.4.21",
    "cross-env": "^7.0.3",
    "eslint": "^8.57.0",
    "eslint-config-airbnb": "^19.0.4",
    "eslint-config-airbnb-typescript": "^18.0.0",
-    "eslint-config-prettier": "^10.1.3",
+    "eslint-config-prettier": "^10.1.5",
    "eslint-plugin-import": "^2.29.1",
    "eslint-plugin-jsx-a11y": "^6.10.2",
    "eslint-plugin-prettier": "^5.4.0",
@@ -112,11 +113,11 @@
    "eslint-plugin-unused-imports": "^4.1.4",
    "husky": "^9.1.7",
    "jsdom": "^26.1.0",
-    "lint-staged": "^15.5.2",
+    "lint-staged": "^16.0.0",
    "msw": "^2.6.6",
    "postcss": "^8.5.2",
    "prettier": "^3.5.3",
-    "stripe": "^18.1.0",
+    "stripe": "^18.1.1",
    "tailwindcss": "^3.4.17",
    "typescript": "^5.8.3",
    "vite-plugin-svgr": "^4.2.0",
--- a/frontend/scripts/check-translation-completeness.cjs
+++ b/frontend/scripts/check-translation-completeness.cjs
@@ -0,0 +1,98 @@
+#!/usr/bin/env node
+
+/**
+ * Pre-commit hook script to check for translation completeness.
+ * This script ensures that all translation keys have entries for all supported
+ * languages and none for unsupported ones; otherwise it exits with status 1.
+ */
+
+const fs = require('fs');
+const path = require('path');
+
+// Load the translation file
+const translationJsonPath = path.join(__dirname, '../src/i18n/translation.json');
+const translationJson = require(translationJsonPath);
+
+// Load the available languages from the i18n index file
+const i18nIndexPath = path.join(__dirname, '../src/i18n/index.ts');
+const i18nIndexContent = fs.readFileSync(i18nIndexPath, 'utf8');
+
+// Extract the language codes from the AvailableLanguages array
+const languageCodesRegex = /\{ label: "[^"]+", value: "([^"]+)" \}/g;
+const supportedLanguageCodes = [];
+let match;
+
+while ((match = languageCodesRegex.exec(i18nIndexContent)) !== null) {
+  supportedLanguageCodes.push(match[1]);
+}
+
+// The regex depends on the exact formatting of the entries in index.ts. If it
+// matches nothing, every translation below would be misreported as an "extra"
+// language, so fail early with an accurate message instead.
+if (supportedLanguageCodes.length === 0) {
+  console.error('\x1b[31m%s\x1b[0m', 'ERROR: No supported languages detected');
+  console.error(`Could not extract any language codes from ${i18nIndexPath}.`);
+  process.exit(1);
+}
+
+// Track missing and extra translations
+const missingTranslations = {};
+const extraLanguages = {};
+let hasErrors = false;
+
+// Check each translation key
+Object.entries(translationJson).forEach(([key, translations]) => {
+  // Get the languages available for this key
+  const availableLanguages = Object.keys(translations);
+
+  // Find missing languages for this key
+  const missing = supportedLanguageCodes.filter(
+    (langCode) => !availableLanguages.includes(langCode)
+  );
+
+  if (missing.length > 0) {
+    missingTranslations[key] = missing;
+    hasErrors = true;
+  }
+
+  // Find extra languages for this key
+  const extra = availableLanguages.filter(
+    (langCode) => !supportedLanguageCodes.includes(langCode)
+  );
+
+  if (extra.length > 0) {
+    extraLanguages[key] = extra;
+    hasErrors = true;
+  }
+});
+
+// Generate detailed error message if there are missing translations
+if (Object.keys(missingTranslations).length > 0) {
+  console.error('\x1b[31m%s\x1b[0m', 'ERROR: Missing translations detected');
+  console.error(`Found ${Object.keys(missingTranslations).length} translation keys with missing languages:`);
+
+  Object.entries(missingTranslations).forEach(([key, langs]) => {
+    console.error(`- Key "${key}" is missing translations for: ${langs.join(', ')}`);
+  });
+
+  console.error('\nPlease add the missing translations before committing.');
+}
+
+// Generate detailed error message if there are extra languages
+if (Object.keys(extraLanguages).length > 0) {
+  console.error('\x1b[31m%s\x1b[0m', 'ERROR: Extra languages detected');
+  console.error(`Found ${Object.keys(extraLanguages).length} translation keys with extra languages not in AvailableLanguages:`);
+
+  Object.entries(extraLanguages).forEach(([key, langs]) => {
+    console.error(`- Key "${key}" has translations for unsupported languages: ${langs.join(', ')}`);
+  });
+
+  console.error('\nPlease remove the extra languages before committing.');
+}
+
+// Exit with error code if there are issues
+if (hasErrors) {
+  process.exit(1);
+} else {
+  console.log('\x1b[32m%s\x1b[0m', 'All translation keys have complete language coverage!');
+}
--- a/frontend/scripts/check-unlocalized-strings.cjs
+++ b/frontend/scripts/check-unlocalized-strings.cjs
@@ -111,12 +111,26 @@ const EXCLUDED_TECHNICAL_STRINGS = [
  "GitLab API", // Git provider specific terminology
  "Pull Request", // Git provider specific terminology
  "GitHub API", // Git provider specific terminology
+  "add-secret-form", // Test ID for secret form
+  "edit-secret-form", // Test ID for secret form
 ];

 function isExcludedTechnicalString(str) {
  return EXCLUDED_TECHNICAL_STRINGS.includes(str);
 }

+function isLikelyCode(str) {
+  // Codes such as "browser_interactive" or "error:" are single tokens:
+  // they contain no spaces and carry at least one underscore or colon.
+  const hasSpace = str.includes(" ");
+  const hasCodeMarker = str.includes(":") || str.includes("_");
+
+  if (hasSpace) {
+    return false;
+  }
+  return hasCodeMarker;
+}
+
 function isCommonDevelopmentString(str) {
  // Technical patterns that are definitely not UI strings
  const technicalPatterns = [
@@ -383,6 +397,11 @@ function isLikelyUserFacingText(str) {
    return false;
  }

+  // Check if it looks like a code rather than a key
+  if (isLikelyCode(str)) {
+    return false
+  }
+
  // Check if it's a raw translation key that should be wrapped in t()
  if (isRawTranslationKey(str)) {
    return true;
--- a/frontend/src/api/file-service/file-service.utils.ts
+++ b/frontend/src/api/file-service/file-service.utils.ts
@@ -1,7 +1,9 @@
+import OpenHands from "#/api/open-hands";
+
 /**
 * Returns a URL compatible for the file service
 * @param conversationId ID of the conversation
 * @returns URL of the conversation
 */
 export const getConversationUrl = (conversationId: string) =>
-  `/api/conversations/${conversationId}`;
+  OpenHands.getConversationUrl(conversationId);
--- a/Show More
+++ b/Show More