Update coverage enforcement calculation

chore - Enforce python test coverage
2026-04-29 03:00:45 -04:00 · 2025-10-29 01:27:12 -05:00 · 2025-10-28 17:59:53 -05:00
704 changed files with 19006 additions and 38676 deletions
--- a/.devcontainer/README.md
+++ b/.devcontainer/README.md
@@ -1 +0,0 @@
-This way of running OpenHands is not officially supported. It is maintained by the community.
--- a/.devcontainer/setup.sh
+++ b/.devcontainer/setup.sh
@@ -7,8 +7,5 @@ git config --global --add safe.directory "$(realpath .)"
 # Install `nc`
 sudo apt update && sudo apt install netcat -y

-# Install `uv` and `uvx`
-wget -qO- https://astral.sh/uv/install.sh | sh
-
 # Do common setup tasks
 source .openhands/setup.sh
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,8 +1,12 @@
 # CODEOWNERS file for OpenHands repository
 # See https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners

-/frontend/ @amanape @hieptl
-/openhands-ui/ @amanape @hieptl
-/openhands/ @tofarr @malhotra5 @hieptl
-/enterprise/ @chuckbutkus @tofarr @malhotra5
+# Frontend code owners
+/frontend/ @amanape
+/openhands-ui/ @amanape
+
+# Evaluation code owners
 /evaluation/ @xingyaoww @neubig
+
+# Documentation code owners
+/docs/ @mamoodi
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -13,7 +13,6 @@
 - [ ] Other (dependency update, docs, typo fixes, etc.)

 ## Checklist
-<!-- AI/LLM AGENTS: This checklist is for a human author to complete. Do NOT check either of the two boxes below. Leave them unchecked until a human has personally reviewed and tested the changes. -->

 - [ ] I have read and reviewed the code and I understand what the code is doing.
 - [ ] I have tested the code to the best of my ability and ensured it works as expected.
--- a/.github/scripts/check_version_consistency.py
+++ b/.github/scripts/check_version_consistency.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+import os
+import re
+import sys
+
+
+def find_version_references(directory: str) -> tuple[set[str], set[str]]:
+    openhands_versions = set()
+    runtime_versions = set()
+
+    version_pattern_openhands = re.compile(r'openhands:(\d{1})\.(\d{2})')
+    version_pattern_runtime = re.compile(r'runtime:(\d{1})\.(\d{2})')
+
+    for root, _, files in os.walk(directory):
+        # Skip .git directory and docs/build directory
+        if '.git' in root or 'docs/build' in root:
+            continue
+
+        for file in files:
+            if file.endswith(
+                ('.md', '.yml', '.yaml', '.txt', '.html', '.py', '.js', '.ts')
+            ):
+                file_path = os.path.join(root, file)
+                try:
+                    with open(file_path, 'r', encoding='utf-8') as f:
+                        content = f.read()
+
+                        # Find all openhands version references
+                        matches = version_pattern_openhands.findall(content)
+                        if matches:
+                            print(f'Found openhands version {matches} in {file_path}')
+                            openhands_versions.update(matches)
+
+                        # Find all runtime version references
+                        matches = version_pattern_runtime.findall(content)
+                        if matches:
+                            print(f'Found runtime version {matches} in {file_path}')
+                            runtime_versions.update(matches)
+                except Exception as e:
+                    print(f'Error reading {file_path}: {e}', file=sys.stderr)
+
+    return openhands_versions, runtime_versions
+
+
+def main():
+    repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
+    print(f'Checking version consistency in {repo_root}')
+    openhands_versions, runtime_versions = find_version_references(repo_root)
+
+    print(f'Found openhands versions: {sorted(openhands_versions)}')
+    print(f'Found runtime versions: {sorted(runtime_versions)}')
+
+    exit_code = 0
+
+    if len(openhands_versions) > 1:
+        print('Error: Multiple openhands versions found:', file=sys.stderr)
+        print('Found versions:', sorted(openhands_versions), file=sys.stderr)
+        exit_code = 1
+    elif len(openhands_versions) == 0:
+        print('Warning: No openhands version references found', file=sys.stderr)
+
+    if len(runtime_versions) > 1:
+        print('Error: Multiple runtime versions found:', file=sys.stderr)
+        print('Found versions:', sorted(runtime_versions), file=sys.stderr)
+        exit_code = 1
+    elif len(runtime_versions) == 0:
+        print('Warning: No runtime version references found', file=sys.stderr)
+
+    sys.exit(exit_code)
+
+
+if __name__ == '__main__':
+    main()
--- a/.github/scripts/update_pr_description.sh
+++ b/.github/scripts/update_pr_description.sh
@@ -13,9 +13,12 @@ DOCKER_RUN_COMMAND="docker run -it --rm \
  -p 3000:3000 \
  -v /var/run/docker.sock:/var/run/docker.sock \
  --add-host host.docker.internal:host-gateway \
-  -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.openhands.dev/openhands/runtime:${SHORT_SHA}-nikolaik \
+  -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/openhands/runtime:${SHORT_SHA}-nikolaik \
  --name openhands-app-${SHORT_SHA} \
-  docker.openhands.dev/openhands/openhands:${SHORT_SHA}"
+  docker.all-hands.dev/openhands/openhands:${SHORT_SHA}"
+
+# Define the uvx command
+UVX_RUN_COMMAND="uvx --python 3.12 --from git+https://github.com/OpenHands/OpenHands@${BRANCH_NAME}#subdirectory=openhands-cli openhands"

 # Get the current PR body
 PR_BODY=$(gh pr view "$PR_NUMBER" --json body --jq .body)
@@ -34,6 +37,11 @@ GUI with Docker:
 \`\`\`
 ${DOCKER_RUN_COMMAND}
 \`\`\`
+
+CLI with uvx:
+\`\`\`
+${UVX_RUN_COMMAND}
+\`\`\`
 EOF
 )
 else
@@ -49,6 +57,11 @@ GUI with Docker:
 \`\`\`
 ${DOCKER_RUN_COMMAND}
 \`\`\`
+
+CLI with uvx:
+\`\`\`
+${UVX_RUN_COMMAND}
+\`\`\`
 EOF
 )
 fi
--- a/.github/workflows/check-package-versions.yml
+++ b/.github/workflows/check-package-versions.yml
@@ -1,65 +0,0 @@
-name: Check Package Versions
-
-on:
-  push:
-    branches: [main]
-  pull_request:
-  workflow_dispatch:
-
-jobs:
-  check-package-versions:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.12"
-
-      - name: Check for any 'rev' fields in pyproject.toml
-        run: |
-          python - <<'PY'
-          import sys, tomllib, pathlib
-
-          path = pathlib.Path("pyproject.toml")
-          if not path.exists():
-              print("❌ ERROR: pyproject.toml not found")
-              sys.exit(1)
-
-          try:
-              data = tomllib.loads(path.read_text(encoding="utf-8"))
-          except Exception as e:
-              print(f"❌ ERROR: Failed to parse pyproject.toml: {e}")
-              sys.exit(1)
-
-          poetry = data.get("tool", {}).get("poetry", {})
-          sections = {
-              "dependencies": poetry.get("dependencies", {}),
-          }
-
-          errors = []
-
-          print("🔍 Checking for any dependencies with 'rev' fields...\n")
-          for section_name, deps in sections.items():
-              if not isinstance(deps, dict):
-                  continue
-
-              for pkg_name, cfg in deps.items():
-                  if isinstance(cfg, dict) and "rev" in cfg:
-                      msg = f"  ✖ {pkg_name} in [{section_name}] uses rev='{cfg['rev']}' (NOT ALLOWED)"
-                      print(msg)
-                      errors.append(msg)
-                  else:
-                      print(f"  • {pkg_name}: OK")
-
-          if errors:
-              print("\n❌ FAILED: Found dependencies using 'rev' fields:\n" + "\n".join(errors))
-              print("\nPlease use versioned releases instead, e.g.:")
-              print('  my-package = "1.0.0"')
-              sys.exit(1)
-
-          print("\n✅ SUCCESS: No 'rev' fields found. All dependencies are using proper versioned releases.")
-          PY
--- a/.github/workflows/clean-up.yml
+++ b/.github/workflows/clean-up.yml
@@ -0,0 +1,69 @@
+# Workflow that cleans up old and outdated workflow runs to prevent out-of-disk issues
+name: Delete old workflow runs
+
+# This workflow is currently only triggered manually
+on:
+  workflow_dispatch:
+    inputs:
+      days:
+        description: 'Days-worth of runs to keep for each workflow'
+        required: true
+        default: '30'
+      minimum_runs:
+        description: 'Minimum runs to keep for each workflow'
+        required: true
+        default: '10'
+      delete_workflow_pattern:
+        description: 'Name or filename of the workflow (if not set, all workflows are targeted)'
+        required: false
+      delete_workflow_by_state_pattern:
+        description: 'Filter workflows by state: active, deleted, disabled_fork, disabled_inactivity, disabled_manually'
+        required: true
+        default: "ALL"
+        type: choice
+        options:
+          - "ALL"
+          - active
+          - deleted
+          - disabled_inactivity
+          - disabled_manually
+      delete_run_by_conclusion_pattern:
+        description: 'Remove runs based on conclusion: action_required, cancelled, failure, skipped, success'
+        required: true
+        default: 'ALL'
+        type: choice
+        options:
+          - 'ALL'
+          - 'Unsuccessful: action_required,cancelled,failure,skipped'
+          - action_required
+          - cancelled
+          - failure
+          - skipped
+          - success
+      dry_run:
+        description: 'Logs simulated changes, no deletions are performed'
+        required: false
+
+jobs:
+  del_runs:
+    runs-on: blacksmith-4vcpu-ubuntu-2204
+    permissions:
+      actions: write
+      contents: read
+    steps:
+      - name: Delete workflow runs
+        uses: Mattraks/delete-workflow-runs@v2
+        with:
+          token: ${{ github.token }}
+          repository: ${{ github.repository }}
+          retain_days: ${{ github.event.inputs.days }}
+          keep_minimum_runs: ${{ github.event.inputs.minimum_runs }}
+          delete_workflow_pattern: ${{ github.event.inputs.delete_workflow_pattern }}
+          delete_workflow_by_state_pattern: ${{ github.event.inputs.delete_workflow_by_state_pattern }}
+          delete_run_by_conclusion_pattern: >-
+            ${{
+              startsWith(github.event.inputs.delete_run_by_conclusion_pattern, 'Unsuccessful:')
+              && 'action_required,cancelled,failure,skipped'
+              || github.event.inputs.delete_run_by_conclusion_pattern
+            }}
+          dry_run: ${{ github.event.inputs.dry_run }}
--- a/.github/workflows/cli-build-binary-and-optionally-release.yml
+++ b/.github/workflows/cli-build-binary-and-optionally-release.yml
@@ -0,0 +1,114 @@
+# Workflow that builds and tests the CLI binary executable
+name: CLI - Build binary and optionally release
+
+# Run on pushes to main branch and CLI tags, and on pull requests when CLI files change
+on:
+  push:
+    branches:
+      - main
+    tags:
+      - "*-cli"
+  pull_request:
+    paths:
+      - "openhands-cli/**"
+
+permissions:
+  contents: write       # needed to create releases or upload assets
+
+# Cancel previous runs if a new commit is pushed
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  build-binary:
+    name: Build binary executable
+    strategy:
+      matrix:
+        include:
+          # Build on Ubuntu 22.04 for maximum GLIBC compatibility (GLIBC 2.35)
+          - os: ubuntu-22.04
+            platform: linux
+            artifact_name: openhands-cli-linux
+          # Build on macOS for macOS users
+          - os: macos-15
+            platform: macos
+            artifact_name: openhands-cli-macos
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.12
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+        with:
+          version: "latest"
+
+      - name: Install dependencies
+        working-directory: openhands-cli
+        run: |
+          uv sync
+
+      - name: Build binary executable
+        working-directory: openhands-cli
+        run: |
+          ./build.sh --install-pyinstaller | tee output.log
+          echo "Full output:"
+          cat output.log
+
+          if grep -q "❌" output.log; then
+            echo "❌ Found failure marker in output"
+            exit 1
+          fi
+
+          echo "✅ Build & test finished without ❌ markers"
+
+      - name: Upload binary artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ matrix.artifact_name }}
+          path: openhands-cli/dist/openhands*
+          retention-days: 30
+
+  create-github-release:
+    name: Create GitHub Release
+    runs-on: ubuntu-latest
+    needs: build-binary
+    if: startsWith(github.ref, 'refs/tags/')
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Download all artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: artifacts
+
+      - name: Prepare release assets
+        run: |
+          mkdir -p release-assets
+          # Copy binaries with appropriate names for release
+          if [ -f artifacts/openhands-cli-linux/openhands ]; then
+            cp artifacts/openhands-cli-linux/openhands release-assets/openhands-linux
+          fi
+          if [ -f artifacts/openhands-cli-macos/openhands ]; then
+            cp artifacts/openhands-cli-macos/openhands release-assets/openhands-macos
+          fi
+          ls -la release-assets/
+
+      - name: Create GitHub Release
+        uses: softprops/action-gh-release@v2
+        with:
+          files: release-assets/*
+          draft: true
+          prerelease: false
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/dispatch-to-docs.yml
+++ b/.github/workflows/dispatch-to-docs.yml
@@ -0,0 +1,23 @@
+name: Dispatch to docs repo
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - 'docs/**'
+  workflow_dispatch:
+
+jobs:
+  dispatch:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        repo: ["OpenHands/docs"]
+    steps:
+      - name: Push to docs repo
+        uses: peter-evans/repository-dispatch@v3
+        with:
+          token: ${{ secrets.ALLHANDS_BOT_GITHUB_PAT }}
+          repository: ${{ matrix.repo }}
+          event-type: update
+          client-payload: '{"ref": "${{ github.ref }}", "sha": "${{ github.sha }}", "module": "openhands", "branch": "main"}'
--- a/.github/workflows/fe-e2e-tests.yml
+++ b/.github/workflows/fe-e2e-tests.yml
@@ -1,47 +0,0 @@
-# Workflow that runs frontend e2e tests with Playwright
-name: Run Frontend E2E Tests
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    paths:
-      - "frontend/**"
-      - ".github/workflows/fe-e2e-tests.yml"
-
-concurrency:
-  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  fe-e2e-test:
-    name: FE E2E Tests
-    runs-on: blacksmith-4vcpu-ubuntu-2204
-    strategy:
-      matrix:
-        node-version: [22]
-      fail-fast: true
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Set up Node.js
-        uses: useblacksmith/setup-node@v5
-        with:
-          node-version: ${{ matrix.node-version }}
-      - name: Install dependencies
-        working-directory: ./frontend
-        run: npm ci
-      - name: Install Playwright browsers
-        working-directory: ./frontend
-        run: npx playwright install --with-deps chromium
-      - name: Run Playwright tests
-        working-directory: ./frontend
-        run: npx playwright test --project=chromium
-      - name: Upload Playwright report
-        uses: actions/upload-artifact@v4
-        if: always()
-        with:
-          name: playwright-report
-          path: frontend/playwright-report/
-          retention-days: 30
--- a/.github/workflows/ghcr-build.yml
+++ b/.github/workflows/ghcr-build.yml
@@ -86,7 +86,7 @@ jobs:

  # Builds the runtime Docker images
  ghcr_build_runtime:
-    name: Build Runtime Image
+    name: Build Image
    runs-on: blacksmith-8vcpu-ubuntu-2204
    if: "!(github.event_name == 'push' && startsWith(github.ref, 'refs/tags/ext-v'))"
    permissions:
@@ -256,7 +256,7 @@ jobs:
  test_runtime_root:
    name: RT Unit Tests (Root)
    needs: [ghcr_build_runtime, define-matrix]
-    runs-on: blacksmith-4vcpu-ubuntu-2404
+    runs-on: blacksmith-8vcpu-ubuntu-2204
    strategy:
      fail-fast: false
      matrix:
@@ -298,7 +298,7 @@ jobs:
          # We install pytest-xdist in order to run tests across CPUs
          poetry run pip install pytest-xdist

-          # Install to be able to retry on failures for flakey tests
+          # Install to be able to retry on failures for flaky tests
          poetry run pip install pytest-rerunfailures

          image_name=ghcr.io/${{ env.REPO_OWNER }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image.tag }}
@@ -311,14 +311,14 @@ jobs:
          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
          RUN_AS_OPENHANDS=false \
-          poetry run pytest -n 5 -raRs --reruns 2 --reruns-delay 3 -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py --durations=10
+          poetry run pytest -n 0 -raRs --reruns 2 --reruns-delay 5 -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py --durations=10
        env:
          DEBUG: "1"

  # Run unit tests with the Docker runtime Docker images as openhands user
  test_runtime_oh:
    name: RT Unit Tests (openhands)
-    runs-on: blacksmith-4vcpu-ubuntu-2404
+    runs-on: blacksmith-8vcpu-ubuntu-2204
    needs: [ghcr_build_runtime, define-matrix]
    strategy:
      matrix:
@@ -370,7 +370,7 @@ jobs:
          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
          RUN_AS_OPENHANDS=true \
-          poetry run pytest -n 5 -raRs --reruns 2 --reruns-delay 3 -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py --durations=10
+          poetry run pytest -n 0 -raRs --reruns 2 --reruns-delay 5 -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py --durations=10
        env:
          DEBUG: "1"

--- a/.github/workflows/integration-runner.yml
+++ b/.github/workflows/integration-runner.yml
@@ -0,0 +1,199 @@
+name: Run Integration Tests
+
+on:
+  pull_request:
+    types: [labeled]
+  workflow_dispatch:
+    inputs:
+      reason:
+        description: 'Reason for manual trigger'
+        required: true
+        default: ''
+  schedule:
+    - cron: '30 22 * * *'  # Runs at 10:30pm UTC every day
+
+env:
+  N_PROCESSES: 10 # Global configuration for number of parallel processes for evaluation
+
+jobs:
+  run-integration-tests:
+    if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
+    runs-on: blacksmith-4vcpu-ubuntu-2204
+    permissions:
+      contents: "read"
+      id-token: "write"
+      pull-requests: "write"
+      issues: "write"
+    strategy:
+      matrix:
+        python-version: ["3.12"]
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install poetry via pipx
+        run: pipx install poetry
+
+      - name: Set up Python
+        uses: useblacksmith/setup-python@v6
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: "poetry"
+
+      - name: Setup Node.js
+        uses: useblacksmith/setup-node@v5
+        with:
+          node-version: '22.x'
+
+      - name: Comment on PR if 'integration-test' label is present
+        if: github.event_name == 'pull_request' && github.event.label.name == 'integration-test'
+        uses: KeisukeYamashita/create-comment@v1
+        with:
+          unique: false
+          comment: |
+            Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.
+
+      - name: Install Python dependencies using Poetry
+        run: poetry install --with dev,test,runtime,evaluation
+
+      - name: Configure config.toml for testing with Haiku
+        env:
+          LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+          MAX_ITERATIONS: 10
+        run: |
+          echo "[llm.eval]" > config.toml
+          echo "model = \"$LLM_MODEL\"" >> config.toml
+          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
+          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
+          echo "temperature = 0.0" >> config.toml
+
+      - name: Build environment
+        run: make build
+
+      - name: Run integration test evaluation for Haiku
+        env:
+          SANDBOX_FORCE_REBUILD_RUNTIME: True
+        run: |
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' 10 $N_PROCESSES '' 'haiku_run'
+
+          # get integration tests report
+          REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1)
+          echo "REPORT_FILE: $REPORT_FILE_HAIKU"
+          echo "INTEGRATION_TEST_REPORT_HAIKU<<EOF" >> $GITHUB_ENV
+          cat $REPORT_FILE_HAIKU >> $GITHUB_ENV
+          echo >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+      - name: Wait a little bit
+        run: sleep 10
+
+      - name: Configure config.toml for testing with DeepSeek
+        env:
+          LLM_MODEL: "litellm_proxy/deepseek-chat"
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+          MAX_ITERATIONS: 10
+        run: |
+          echo "[llm.eval]" > config.toml
+          echo "model = \"$LLM_MODEL\"" >> config.toml
+          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
+          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
+          echo "temperature = 0.0" >> config.toml
+
+      - name: Run integration test evaluation for DeepSeek
+        env:
+          SANDBOX_FORCE_REBUILD_RUNTIME: True
+        run: |
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' 10 $N_PROCESSES '' 'deepseek_run'
+
+          # get integration tests report
+          REPORT_FILE_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1)
+          echo "REPORT_FILE: $REPORT_FILE_DEEPSEEK"
+          echo "INTEGRATION_TEST_REPORT_DEEPSEEK<<EOF" >> $GITHUB_ENV
+          cat $REPORT_FILE_DEEPSEEK >> $GITHUB_ENV
+          echo >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+      # -------------------------------------------------------------
+      # Run VisualBrowsingAgent tests for DeepSeek, limited to t05 and t06
+      - name: Wait a little bit (again)
+        run: sleep 5
+
+      - name: Configure config.toml for testing VisualBrowsingAgent (DeepSeek)
+        env:
+          LLM_MODEL: "litellm_proxy/deepseek-chat"
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+          MAX_ITERATIONS: 15
+        run: |
+          echo "[llm.eval]" > config.toml
+          echo "model = \"$LLM_MODEL\"" >> config.toml
+          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
+          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
+          echo "temperature = 0.0" >> config.toml
+      - name: Run integration test evaluation for VisualBrowsingAgent (DeepSeek)
+        env:
+          SANDBOX_FORCE_REBUILD_RUNTIME: True
+        run: |
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD VisualBrowsingAgent '' 15 $N_PROCESSES "t05_simple_browsing,t06_github_pr_browsing.py" 'visualbrowsing_deepseek_run'
+
+          # Find and export the visual browsing agent test results
+          REPORT_FILE_VISUALBROWSING_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/VisualBrowsingAgent/deepseek*_maxiter_15_N* -name "report.md" -type f | head -n 1)
+          echo "REPORT_FILE_VISUALBROWSING_DEEPSEEK: $REPORT_FILE_VISUALBROWSING_DEEPSEEK"
+          echo "INTEGRATION_TEST_REPORT_VISUALBROWSING_DEEPSEEK<<EOF" >> $GITHUB_ENV
+          cat $REPORT_FILE_VISUALBROWSING_DEEPSEEK >> $GITHUB_ENV
+          echo >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+      - name: Create archive of evaluation outputs
+        run: |
+          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
+          cd evaluation/evaluation_outputs/outputs  # Change to the outputs directory
+          tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* integration_tests/VisualBrowsingAgent/* # Only include the actual result directories
+
+      - name: Upload evaluation results as artifact
+        uses: actions/upload-artifact@v4
+        id: upload_results_artifact
+        with:
+          name: integration-test-outputs-${{ github.run_id }}-${{ github.run_attempt }}
+          path: integration_tests_*.tar.gz
+
+      - name: Get artifact URLs
+        run: |
+          echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV
+
+      - name: Set timestamp and trigger reason
+        run: |
+          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV
+          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
+            echo "TRIGGER_REASON=pr-${{ github.event.pull_request.number }}" >> $GITHUB_ENV
+          elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+            echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV
+          else
+            echo "TRIGGER_REASON=nightly-scheduled" >> $GITHUB_ENV
+          fi
+
+      - name: Comment with results and artifact link
+        id: create_comment
+        uses: KeisukeYamashita/create-comment@v1
+        with:
+          # if triggered by PR, use PR number, otherwise use 9745 as fallback issue number for manual triggers
+          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 9745 }}
+          unique: false
+          comment: |
+              Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }}
+              Commit: ${{ github.sha }}
+              **Integration Tests Report (Haiku)**
+              Haiku LLM Test Results:
+              ${{ env.INTEGRATION_TEST_REPORT_HAIKU }}
+              ---
+              **Integration Tests Report (DeepSeek)**
+              DeepSeek LLM Test Results:
+              ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
+              ---
+              **Integration Tests Report VisualBrowsing (DeepSeek)**
+              ${{ env.INTEGRATION_TEST_REPORT_VISUALBROWSING_DEEPSEEK }}
+              ---
+              Download testing outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -72,3 +72,34 @@ jobs:
      - name: Run pre-commit hooks
        working-directory: ./enterprise
        run: pre-commit run --all-files --show-diff-on-failure --config ./dev_config/python/.pre-commit-config.yaml
+
+  lint-cli-python:
+    name: Lint CLI python
+    runs-on: blacksmith-4vcpu-ubuntu-2204
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Set up python
+        uses: useblacksmith/setup-python@v6
+        with:
+          python-version: 3.12
+          cache: "pip"
+      - name: Install pre-commit
+        run: pip install pre-commit==4.2.0
+      - name: Run pre-commit hooks
+        working-directory: ./openhands-cli
+        run: pre-commit run --all-files --config ./dev_config/python/.pre-commit-config.yaml
+
+  # Check version consistency across documentation
+  check-version-consistency:
+    name: Check version consistency
+    runs-on: blacksmith-4vcpu-ubuntu-2204
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up python
+        uses: useblacksmith/setup-python@v6
+        with:
+          python-version: 3.12
+      - name: Run version consistency check
+        run: .github/scripts/check_version_consistency.py
--- a/.github/workflows/mdx-lint.yml
+++ b/.github/workflows/mdx-lint.yml
@@ -0,0 +1,70 @@
+# Workflow that checks MDX format in docs/ folder
+name: MDX Lint
+
+# Run on pushes to main and on pull requests that modify docs/ files
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - 'docs/**/*.mdx'
+  pull_request:
+    paths:
+      - 'docs/**/*.mdx'
+
+# If triggered by a PR, it will be in the same group. However, each commit on main will be in its own unique group
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  mdx-lint:
+    name: Lint MDX files
+    runs-on: blacksmith-4vcpu-ubuntu-2204
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install Node.js 22
+        uses: useblacksmith/setup-node@v5
+        with:
+          node-version: 22
+
+      - name: Install MDX dependencies
+        run: |
+          npm install @mdx-js/mdx@3 glob@10
+
+      - name: Validate MDX files
+        run: |
+          node -e "
+          const {compile} = require('@mdx-js/mdx');
+          const fs = require('fs');
+          const path = require('path');
+          const glob = require('glob');
+
+          async function validateMDXFiles() {
+            const files = glob.sync('docs/**/*.mdx');
+            console.log('Found', files.length, 'MDX files to validate');
+
+            let hasErrors = false;
+
+            for (const file of files) {
+              try {
+                const content = fs.readFileSync(file, 'utf8');
+                await compile(content);
+                console.log('✅ MDX parsing successful for', file);
+              } catch (err) {
+                console.error('❌ MDX parsing failed for', file, ':', err.message);
+                hasErrors = true;
+              }
+            }
+
+            if (hasErrors) {
+              console.error('\\n❌ Some MDX files have parsing errors. Please fix them before merging.');
+              process.exit(1);
+            } else {
+              console.log('\\n✅ All MDX files are valid!');
+            }
+          }
+
+          validateMDXFiles();
+          "
--- a/.github/workflows/py-tests.yml
+++ b/.github/workflows/py-tests.yml
@@ -48,10 +48,7 @@ jobs:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"
      - name: Install Python dependencies using Poetry
-        run: |
-          poetry install --with dev,test,runtime
-          poetry run pip install pytest-xdist
-          poetry run pip install pytest-rerunfailures
+        run: poetry install --with dev,test,runtime
      - name: Build Environment
        run: make build
      - name: Run Unit Tests
@@ -59,7 +56,7 @@ jobs:
        env:
          COVERAGE_FILE: ".coverage.${{ matrix.python_version }}"
      - name: Run Runtime Tests with CLIRuntime
-        run: PYTHONPATH=".:$PYTHONPATH" TEST_RUNTIME=cli poetry run pytest -n 5 --reruns 2 --reruns-delay 3 -s tests/runtime/test_bash.py --cov=openhands --cov-branch
+        run: PYTHONPATH=".:$PYTHONPATH" TEST_RUNTIME=cli poetry run pytest -s tests/runtime/test_bash.py --cov=openhands --cov-branch
        env:
          COVERAGE_FILE: ".coverage.runtime.${{ matrix.python_version }}"
      - name: Store coverage file
@@ -70,7 +67,37 @@ jobs:
            .coverage.${{ matrix.python_version }}
            .coverage.runtime.${{ matrix.python_version }}
          include-hidden-files: true
-
+  # Run specific Windows python tests
+  test-on-windows:
+    name: Python Tests on Windows
+    runs-on: windows-latest
+    strategy:
+      matrix:
+        python-version: ["3.12"]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install pipx
+        run: pip install pipx
+      - name: Install poetry via pipx
+        run: pipx install poetry
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: "poetry"
+      - name: Install Python dependencies using Poetry
+        run: poetry install --with dev,test,runtime
+      - name: Run Windows unit tests
+        run: poetry run pytest -svv tests/unit/runtime/utils/test_windows_bash.py
+        env:
+          PYTHONPATH: ".;$env:PYTHONPATH"
+          DEBUG: "1"
+      - name: Run Windows runtime tests with LocalRuntime
+        run: $env:TEST_RUNTIME="local"; poetry run pytest -svv tests/runtime/test_bash.py
+        env:
+          PYTHONPATH: ".;$env:PYTHONPATH"
+          TEST_RUNTIME: local
+          DEBUG: "1"
  test-enterprise:
    name: Enterprise Python Unit Tests
    runs-on: blacksmith-4vcpu-ubuntu-2404
@@ -101,11 +128,56 @@ jobs:
          path: ".coverage.enterprise.${{ matrix.python_version }}"
          include-hidden-files: true

+  # Run CLI unit tests
+  test-cli-python:
+    name: CLI Unit Tests
+    runs-on: blacksmith-4vcpu-ubuntu-2404
+    strategy:
+      matrix:
+        python-version: ["3.12"]
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: useblacksmith/setup-python@v6
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+        with:
+          version: "latest"
+
+      - name: Install dependencies
+        working-directory: ./openhands-cli
+        run: |
+          uv sync --group dev
+
+      - name: Run CLI unit tests
+        working-directory: ./openhands-cli
+        env:
+          # write coverage to repo root so the merge step finds it
+          COVERAGE_FILE: "${{ github.workspace }}/.coverage.openhands-cli.${{ matrix.python-version }}"
+        run: |
+          uv run pytest --forked -n auto -s \
+            -p no:ddtrace -p no:ddtrace.pytest_bdd -p no:ddtrace.pytest_benchmark \
+            tests --cov=openhands_cli --cov-branch
+
+      - name: Store coverage file
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-openhands-cli
+          path: ".coverage.openhands-cli.${{ matrix.python-version }}"
+          include-hidden-files: true
+
+
  coverage-comment:
    name: Coverage Comment
-    if: github.event_name == 'pull_request'
    runs-on: ubuntu-latest
-    needs: [test-on-linux, test-enterprise]
+    needs: [test-on-linux, test-enterprise, test-cli-python]

    permissions:
      pull-requests: write
@@ -119,9 +191,27 @@ jobs:
          pattern: coverage-*
          merge-multiple: true

+      - name: Create symlink for CLI source files
+        run: ln -sf openhands-cli/openhands_cli openhands_cli
+
      - name: Coverage comment
+        # In PR mode leaves a comment, otherwise records coverage in branch python-coverage-comment-action-data.
        id: coverage_comment
        uses: py-cov-action/python-coverage-comment-action@v3
        with:
          GITHUB_TOKEN: ${{ github.token }}
          MERGE_COVERAGE_FILES: true
+
+      - name: Enforce coverage
+        # Fail if on PR AND there are uncovered lines AND diff coverage is less than total coverage.
+        # To debug, try a step to log outputs like: `echo ${{ toJSON(steps.coverage_comment.outputs) }}`
+        # Once we track base branch, reference_percent_covered will be better to use than new_percent_covered.
+        if: ${{ github.event_name == 'pull_request' && fromJSON(steps.coverage_comment.outputs.diff_total_num_violations) > 0 && steps.coverage_comment.outputs.diff_total_percent_covered < steps.coverage_comment.outputs.new_percent_covered }}
+        run: |
+          echo "Coverage decreased, which is not allowed."
+          echo "Please add some unit tests for the modified code."
+          echo
+          echo "  diff_total_num_violations: ${{ steps.coverage_comment.outputs.diff_total_num_violations }}"
+          echo "  diff_total_percent_covered: ${{ steps.coverage_comment.outputs.diff_total_percent_covered}}"
+          echo "  new_percent_covered: ${{ steps.coverage_comment.outputs.new_percent_covered}}"
+          exit 1
--- a/.github/workflows/pypi-release.yml
+++ b/.github/workflows/pypi-release.yml
@@ -10,6 +10,7 @@ on:
        type: choice
        options:
          - app server
+          - cli
        default: app server
  push:
    tags:
@@ -38,3 +39,36 @@ jobs:
        run: ./build.sh
      - name: publish
        run: poetry publish -u __token__ -p ${{ secrets.PYPI_TOKEN }}
+
+  release-cli:
+    name: Publish CLI to PyPI
+    runs-on: ubuntu-latest
+    # Run when manually dispatched for "cli" OR for tag pushes that contain '-cli'
+    if: |
+      (github.event_name == 'workflow_dispatch' && github.event.inputs.reason == 'cli')
+      || (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') && contains(github.ref, '-cli'))
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.12
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+        with:
+          version: "latest"
+
+      - name: Build CLI package
+        working-directory: openhands-cli
+        run: |
+          # Clean dist directory to avoid conflicts with binary builds
+          rm -rf dist/
+          uv build
+
+      - name: Publish CLI to PyPI
+        working-directory: openhands-cli
+        run: |
+          uv publish --token ${{ secrets.PYPI_TOKEN_OPENHANDS }}
--- a/.github/workflows/run-eval.yml
+++ b/.github/workflows/run-eval.yml
@@ -0,0 +1,135 @@
+# Run evaluation on a PR, after releases, or manually
+name: Run Eval
+
+# Runs when a PR is labeled with one of the "run-eval-" labels, after releases, or when triggered manually
+on:
+  pull_request:
+    types: [labeled]
+  release:
+    types: [published]
+  workflow_dispatch:
+    inputs:
+      branch:
+        description: 'Branch to evaluate'
+        required: true
+        default: 'main'
+      eval_instances:
+        description: 'Number of evaluation instances'
+        required: true
+        default: '50'
+        type: choice
+        options:
+          - '1'
+          - '2'
+          - '50'
+          - '100'
+      reason:
+        description: 'Reason for manual trigger'
+        required: false
+        default: ''
+
+env:
+  # Environment variable for the master GitHub issue number where all evaluation results will be posted as comments.
+  # This should be set to the issue number where you want all evaluation results to be posted.
+  MASTER_EVAL_ISSUE_NUMBER: ${{ vars.MASTER_EVAL_ISSUE_NUMBER || '0' }}
+
+jobs:
+  trigger-job:
+    name: Trigger remote eval job
+    if: ${{ (github.event_name == 'pull_request' && (github.event.label.name == 'run-eval-1' || github.event.label.name == 'run-eval-2' || github.event.label.name == 'run-eval-50' || github.event.label.name == 'run-eval-100')) || github.event_name == 'release' || github.event_name == 'workflow_dispatch' }}
+    runs-on: blacksmith-4vcpu-ubuntu-2204
+
+    steps:
+      - name: Checkout branch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.head_ref || (github.event_name == 'workflow_dispatch' && github.event.inputs.branch) || github.ref }}
+
+      - name: Set evaluation parameters
+        id: eval_params
+        run: |
+          REPO_URL="https://github.com/${{ github.repository }}"
+          echo "Repository URL: $REPO_URL"
+
+          # Determine branch based on trigger type
+          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
+            EVAL_BRANCH="${{ github.head_ref }}"
+            echo "PR Branch: $EVAL_BRANCH"
+          elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+            EVAL_BRANCH="${{ github.event.inputs.branch }}"
+            echo "Manual Branch: $EVAL_BRANCH"
+          else
+            # For release events, use the tag name or main branch
+            EVAL_BRANCH="${{ github.ref_name }}"
+            echo "Release Branch/Tag: $EVAL_BRANCH"
+          fi
+
+          # Determine evaluation instances based on trigger type
+          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
+            if [[ "${{ github.event.label.name }}" == "run-eval-1" ]]; then
+              EVAL_INSTANCES="1"
+            elif [[ "${{ github.event.label.name }}" == "run-eval-2" ]]; then
+              EVAL_INSTANCES="2"
+            elif [[ "${{ github.event.label.name }}" == "run-eval-50" ]]; then
+              EVAL_INSTANCES="50"
+            elif [[ "${{ github.event.label.name }}" == "run-eval-100" ]]; then
+              EVAL_INSTANCES="100"
+            fi
+          elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+            EVAL_INSTANCES="${{ github.event.inputs.eval_instances }}"
+          else
+            # For release events, default to 50 instances
+            EVAL_INSTANCES="50"
+          fi
+
+          echo "Evaluation instances: $EVAL_INSTANCES"
+          echo "repo_url=$REPO_URL" >> $GITHUB_OUTPUT
+          echo "eval_branch=$EVAL_BRANCH" >> $GITHUB_OUTPUT
+          echo "eval_instances=$EVAL_INSTANCES" >> $GITHUB_OUTPUT
+
+      - name: Trigger remote job
+        run: |
+          # Determine PR number for the remote evaluation system
+          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
+            PR_NUMBER="${{ github.event.pull_request.number }}"
+          else
+            # For non-PR triggers, use the master issue number as PR number
+            PR_NUMBER="${{ env.MASTER_EVAL_ISSUE_NUMBER }}"
+          fi
+
+          curl -X POST \
+            -H "Authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
+            -H "Accept: application/vnd.github+json" \
+            -d "{\"ref\": \"main\", \"inputs\": {\"github-repo\": \"${{ steps.eval_params.outputs.repo_url }}\", \"github-branch\": \"${{ steps.eval_params.outputs.eval_branch }}\", \"pr-number\": \"${PR_NUMBER}\", \"eval-instances\": \"${{ steps.eval_params.outputs.eval_instances }}\"}}" \
+            https://api.github.com/repos/OpenHands/evaluation/actions/workflows/create-branch.yml/dispatches
+
+          # Send Slack message
+          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
+            TRIGGER_URL="https://github.com/${{ github.repository }}/pull/${{ github.event.pull_request.number }}"
+            slack_text="PR $TRIGGER_URL has triggered evaluation on ${{ steps.eval_params.outputs.eval_instances }} instances..."
+          elif [[ "${{ github.event_name }}" == "release" ]]; then
+            TRIGGER_URL="https://github.com/${{ github.repository }}/releases/tag/${{ github.ref_name }}"
+            slack_text="Release $TRIGGER_URL has triggered evaluation on ${{ steps.eval_params.outputs.eval_instances }} instances..."
+          else
+            TRIGGER_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+            slack_text="Manual trigger (${{ github.event.inputs.reason || 'No reason provided' }}) has triggered evaluation on ${{ steps.eval_params.outputs.eval_instances }} instances for branch ${{ steps.eval_params.outputs.eval_branch }}..."
+          fi
+
+          curl -X POST -H 'Content-type: application/json' --data '{"text":"'"$slack_text"'"}' \
+            https://hooks.slack.com/services/${{ secrets.SLACK_TOKEN }}
+
+      - name: Comment on issue/PR
+        uses: KeisukeYamashita/create-comment@v1
+        with:
+          # For PR triggers, comment on the PR. For other triggers, comment on the master issue
+          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || env.MASTER_EVAL_ISSUE_NUMBER }}
+          unique: false
+          comment: |
+            **Evaluation Triggered**
+
+            **Trigger:** ${{ github.event_name == 'pull_request' && format('Pull Request #{0}', github.event.pull_request.number) || (github.event_name == 'release' && 'Release') || format('Manual Trigger: {0}', github.event.inputs.reason || 'No reason provided') }}
+            **Branch:** ${{ steps.eval_params.outputs.eval_branch }}
+            **Instances:** ${{ steps.eval_params.outputs.eval_instances }}
+            **Commit:** ${{ github.sha }}
+
+            Running evaluation on the specified branch. Once eval is done, the results will be posted here.
--- a/.gitignore
+++ b/.gitignore
@@ -185,9 +185,6 @@ cython_debug/
 .repomix
 repomix-output.txt

-# Emacs backup
-*~
-
 # evaluation
 evaluation/evaluation_outputs
 evaluation/outputs
--- a/.openhands/microagents/repo.md
+++ b/.openhands/microagents/repo.md
@@ -63,7 +63,7 @@ Frontend:
  - We use TanStack Query (fka React Query) for data fetching and cache management
  - Data Access Layer: API client methods are located in `frontend/src/api` and should never be called directly from UI components - they must always be wrapped with TanStack Query
  - Custom hooks are located in `frontend/src/hooks/query/` and `frontend/src/hooks/mutation/`
-  - Query hooks should follow the pattern use[Resource] (e.g., `useConversationSkills`)
+  - Query hooks should follow the pattern use[Resource] (e.g., `useConversationMicroagents`)
  - Mutation hooks should follow the pattern use[Action] (e.g., `useDeleteConversation`)
  - Architecture rule: UI components → TanStack Query hooks → Data Access Layer (`frontend/src/api`) → API endpoints

--- a/1
+++ b/1
@@ -1 +0,0 @@
-docs.all-hands.dev
--- a/COMMUNITY.md
+++ b/COMMUNITY.md
@@ -1,45 +1,43 @@
-# The OpenHands Community
+# 🙌 The OpenHands Community

-OpenHands is a community of engineers, academics, and enthusiasts reimagining software development for an AI-powered world.
+The OpenHands community is built around the belief that (1) AI and AI agents are going to fundamentally change the way
+we build software, and (2) if this is true, we should do everything we can to make sure that the benefits provided by
+such powerful technology are accessible to everyone.

-## Mission
+If this resonates with you, we'd love to have you join us in our quest!

-It’s very clear that AI is changing software development. We want the developer community to drive that change organically, through open source.
+## 🤝 How to Join

-So we’re not just building friendly interfaces for AI-driven development. We’re publishing _building blocks_ that empower developers to create new experiences, tailored to your own habits, needs, and imagination.
+Check out our [How to Join the Community](https://github.com/OpenHands/OpenHands?tab=readme-ov-file#-how-to-join-the-community) section.

-## Ethos
+## 💪 Becoming a Contributor

-We have two core values: **high openness** and **high agency**. While we don’t expect everyone in the community to embody these values, we want to establish them as norms.
+We welcome contributions from everyone! Whether you're a developer, a researcher, or simply enthusiastic about advancing
+the field of software engineering with AI, there are many ways to get involved:

-### High Openness
+- **Code Contributions:** Help us develop new core functionality, improve our agents, improve the frontend and other
+interfaces, or anything else that would help make OpenHands better.
+- **Research and Evaluation:** Contribute to our understanding of LLMs in software engineering, participate in
+evaluating the models, or suggest improvements.
+- **Feedback and Testing:** Use the OpenHands toolset, report bugs, suggest features, or provide feedback on usability.

-We welcome anyone and everyone into our community by default. You don’t have to be a software developer to help us build. You don’t have to be pro-AI to help us learn.
+For details, please check [CONTRIBUTING.md](./CONTRIBUTING.md).

-Our plans, our work, our successes, and our failures are all public record. We want the world to see not just the fruits of our work, but the whole process of growing it.
+## Code of Conduct

-We welcome thoughtful criticism, whether it’s a comment on a PR or feedback on the community as a whole.
+We have a [Code of Conduct](./CODE_OF_CONDUCT.md) that we expect all contributors to adhere to.
+Long story short, we are aiming for an open, welcoming, diverse, inclusive, and healthy community.
+All contributors are expected to contribute to building this sort of community.

-### High Agency
+## 🛠️ Becoming a Maintainer

-Everyone should feel empowered to contribute to OpenHands. Whether it’s by making a PR, hosting an event, sharing feedback, or just asking a question, don’t hold back!
+For contributors who have made significant and sustained contributions to the project, there is a possibility of joining
+the maintainer team. The process for this is as follows:

-OpenHands gives everyone the building blocks to create state-of-the-art developer experiences. We experiment constantly and love building new things.
+1. Any contributor who has made sustained and high-quality contributions to the codebase can be nominated by any
+maintainer. If you feel that you may qualify, you can reach out to any of the maintainers who have reviewed your PRs and ask if you can be nominated.
+2. Once a maintainer nominates a new maintainer, there will be a discussion period among the maintainers for at least 3 days.
+3. If no concerns are raised, the nomination will be accepted by acclamation; if concerns are raised, there will be a discussion and a possible vote.

-Coding, development practices, and communities are changing rapidly. We won’t hesitate to change direction and make big bets.
-
-## Relationship to All Hands
-
-OpenHands is supported by the for-profit organization [All Hands AI, Inc](https://www.all-hands.dev/).
-
-All Hands was founded by three of the first major contributors to OpenHands:
-
- Xingyao Wang, a UIUC PhD candidate who got OpenHands to the top of the SWE-bench leaderboards
- Graham Neubig, a CMU Professor who rallied the academic community around OpenHands
- Robert Brennan, a software engineer who architected the user-facing features of OpenHands
-
-All Hands is an important part of the OpenHands ecosystem. We’ve raised over $20M--mainly to hire developers and researchers who can work on OpenHands full-time, and to provide them with expensive infrastructure. ([Join us!](https://allhandsai.applytojob.com/apply/))
-
-But we see OpenHands as much larger, and ultimately more important, than All Hands. When our financial responsibility to investors is at odds with our social responsibility to the community—as it inevitably will be, from time to time—we promise to navigate that conflict thoughtfully and transparently.
-
-At some point, we may transfer custody of OpenHands to an open source foundation. But for now, the [Benevolent Dictator approach](http://www.catb.org/~esr/writings/cathedral-bazaar/homesteading/ar01s16.html) helps us move forward with speed and intention. If we ever forget the “benevolent” part, please: fork us.
+Note that just making many PRs does not immediately imply that you will become a maintainer. We will be looking
+at sustained high-quality contributions over a period of time, as well as good teamwork and adherence to our [Code of Conduct](./CODE_OF_CONDUCT.md).
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -58,7 +58,7 @@ by implementing the [interface specified here](https://github.com/OpenHands/Open

 #### Testing
 When you write code, it is also good to write tests. Please navigate to the [`./tests`](./tests) folder to see existing test suites.
-At the moment, we have these kinds of tests: [`unit`](./tests/unit), [`runtime`](./tests/runtime), and [`end-to-end (e2e)`](./tests/e2e). Please refer to the README for each test suite. These tests also run on GitHub's continuous integration to ensure quality of the project.
+At the moment, we have two kinds of tests: [`unit`](./tests/unit) and [`integration`](./evaluation/integration_tests). Please refer to the README for each test suite. These tests also run on GitHub's continuous integration to ensure quality of the project.

 ## Sending Pull Requests to OpenHands

--- a/Development.md
+++ b/Development.md
@@ -91,14 +91,14 @@ make run
 #### Option B: Individual Server Startup

 - **Start the Backend Server:** If you prefer, you can start the backend server independently to focus on
-  backend-related tasks or configurations.
+backend-related tasks or configurations.

  ```bash
  make start-backend
  ```

 - **Start the Frontend Server:** Similarly, you can start the frontend server on its own to work on frontend-related
-  components or interface enhancements.
+components or interface enhancements.
  ```bash
  make start-frontend
  ```
@@ -110,7 +110,6 @@ You can use OpenHands to develop and improve OpenHands itself! This is a powerfu
 #### Quick Start

 1. **Build and run OpenHands:**
-
   ```bash
   export INSTALL_DOCKER=0
   export RUNTIME=local
@@ -118,7 +117,6 @@ You can use OpenHands to develop and improve OpenHands itself! This is a powerfu
   ```

 2. **Access the interface:**
-
   - Local development: http://localhost:3001
   - Remote/cloud environments: Use the appropriate external URL

@@ -161,7 +159,7 @@ poetry run pytest ./tests/unit/test_*.py
 To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker
 container image by setting the SANDBOX_RUNTIME_CONTAINER_IMAGE environment variable to the desired Docker image.

-Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/openhands/runtime:1.0-nikolaik`
+Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/openhands/runtime:0.59-nikolaik`

 ## Develop inside Docker container

@@ -201,6 +199,6 @@ Here's a guide to the important documentation files in the repository:
 - [/containers/README.md](./containers/README.md): Information about Docker containers and deployment
 - [/tests/unit/README.md](./tests/unit/README.md): Guide to writing and running unit tests
 - [/evaluation/README.md](./evaluation/README.md): Documentation for the evaluation framework and benchmarks
- [/skills/README.md](./skills/README.md): Information about the skills architecture and implementation
+- [/microagents/README.md](./microagents/README.md): Information about the microagents architecture and implementation
 - [/openhands/server/README.md](./openhands/server/README.md): Server implementation details and API documentation
 - [/openhands/runtime/README.md](./openhands/runtime/README.md): Documentation for the runtime environment and execution model
--- a/README.md
+++ b/README.md
@@ -1,18 +1,22 @@
 <a name="readme-top"></a>

 <div align="center">
-  <img src="https://raw.githubusercontent.com/OpenHands/docs/main/openhands/static/img/logo.png" alt="Logo" width="200">
-  <h1 align="center" style="border-bottom: none">OpenHands: AI-Driven Development</h1>
+  <img src="https://raw.githubusercontent.com/All-Hands-AI/docs/main/openhands/static/img/logo.png" alt="Logo" width="200">
+  <h1 align="center">OpenHands: Code Less, Make More</h1>
 </div>


 <div align="center">
-  <a href="https://github.com/OpenHands/OpenHands/blob/main/LICENSE"><img src="https://img.shields.io/badge/LICENSE-MIT-20B2AA?style=for-the-badge" alt="MIT License"></a>
-  <a href="https://docs.google.com/spreadsheets/d/1wOUdFCMyY6Nt0AIqF705KN4JKOWgeI4wUGUP60krXXs/edit?gid=811504672#gid=811504672"><img src="https://img.shields.io/badge/SWEBench-77.6-00cc00?logoColor=FFE165&style=for-the-badge" alt="Benchmark Score"></a>
+  <a href="https://github.com/OpenHands/OpenHands/graphs/contributors"><img src="https://img.shields.io/github/contributors/OpenHands/OpenHands?style=for-the-badge&color=blue" alt="Contributors"></a>
+  <a href="https://github.com/OpenHands/OpenHands/stargazers"><img src="https://img.shields.io/github/stars/OpenHands/OpenHands?style=for-the-badge&color=blue" alt="Stargazers"></a>
+  <a href="https://github.com/OpenHands/OpenHands/blob/main/LICENSE"><img src="https://img.shields.io/github/license/OpenHands/OpenHands?style=for-the-badge&color=blue" alt="MIT License"></a>
  <br/>
-  <a href="https://docs.openhands.dev/sdk"><img src="https://img.shields.io/badge/Documentation-000?logo=googledocs&logoColor=FFE165&style=for-the-badge" alt="Check out the documentation"></a>
-  <a href="https://arxiv.org/abs/2511.03690"><img src="https://img.shields.io/badge/Paper-000?logoColor=FFE165&logo=arxiv&style=for-the-badge" alt="Tech Report"></a>
-
+  <a href="https://all-hands.dev/joinslack"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
+  <a href="https://github.com/OpenHands/OpenHands/blob/main/CREDITS.md"><img src="https://img.shields.io/badge/Project-Credits-blue?style=for-the-badge&color=FFE165&logo=github&logoColor=white" alt="Credits"></a>
+  <br/>
+  <a href="https://docs.all-hands.dev/usage/getting-started"><img src="https://img.shields.io/badge/Documentation-000?logo=googledocs&logoColor=FFE165&style=for-the-badge" alt="Check out the documentation"></a>
+  <a href="https://arxiv.org/abs/2407.16741"><img src="https://img.shields.io/badge/Paper%20on%20Arxiv-000?logoColor=FFE165&logo=arxiv&style=for-the-badge" alt="Paper on Arxiv"></a>
+  <a href="https://docs.google.com/spreadsheets/d/1wOUdFCMyY6Nt0AIqF705KN4JKOWgeI4wUGUP60krXXs/edit?gid=0#gid=0"><img src="https://img.shields.io/badge/Benchmark%20score-000?logoColor=FFE165&logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark Score"></a>

  <!-- Keep these links. Translations will automatically update with the README. -->
  <a href="https://www.readme-i18n.com/OpenHands/OpenHands?lang=de">Deutsch</a> |
@@ -24,63 +28,157 @@
  <a href="https://www.readme-i18n.com/OpenHands/OpenHands?lang=ru">Русский</a> |
  <a href="https://www.readme-i18n.com/OpenHands/OpenHands?lang=zh">中文</a>

+  <hr>
 </div>

-<hr>
+Welcome to OpenHands (formerly OpenDevin), a platform for software development agents powered by AI.

-🙌 Welcome to OpenHands, a [community](COMMUNITY.md) focused on AI-driven development. We’d love for you to [join us on Slack](https://dub.sh/openhands).
+OpenHands agents can do anything a human developer can: modify code, run commands, browse the web,
+call APIs, and yes—even copy code snippets from Stack Overflow.

-There are a few ways to work with OpenHands:
+Learn more at [docs.all-hands.dev](https://docs.all-hands.dev), or [sign up for OpenHands Cloud](https://app.all-hands.dev) to get started.

-### OpenHands Software Agent SDK
-The SDK is a composable Python library that contains all of our agentic tech. It's the engine that powers everything else below.

-Define agents in code, then run them locally, or scale to 1000s of agents in the cloud.
+> [!IMPORTANT]
+> **Upcoming change**: We are renaming our GitHub Org from `All-Hands-AI` to `OpenHands` on October 20th, 2025.
+> Check the [tracking issue](https://github.com/All-Hands-AI/OpenHands/issues/11376) for more information.

-[Check out the docs](https://docs.openhands.dev/sdk) or [view the source](https://github.com/OpenHands/software-agent-sdk/)

-### OpenHands CLI
-The CLI is the easiest way to start using OpenHands. The experience will be familiar to anyone who has worked
-with e.g. Claude Code or Codex. You can power it with Claude, GPT, or any other LLM.
+> [!IMPORTANT]
+> Using OpenHands for work? We'd love to chat! Fill out
+> [this short form](https://docs.google.com/forms/d/e/1FAIpQLSet3VbGaz8z32gW9Wm-Grl4jpt5WgMXPgJ4EDPVmCETCBpJtQ/viewform)
+> to join our Design Partner program, where you'll get early access to commercial features and the opportunity to provide input on our product roadmap.

-[Check out the docs](https://docs.openhands.dev/openhands/usage/run-openhands/cli-mode) or [view the source](https://github.com/OpenHands/OpenHands-CLI)
+## ☁️ OpenHands Cloud
+The easiest way to get started with OpenHands is on [OpenHands Cloud](https://app.all-hands.dev),
+which comes with $20 in free credits for new users.

-### OpenHands Local GUI
-Use the Local GUI for running agents on your laptop. It comes with a REST API and a single-page React application.
-The experience will be familiar to anyone who has used Devin or Jules.
+## 💻 Running OpenHands Locally

-[Check out the docs](https://docs.openhands.dev/openhands/usage/run-openhands/local-setup) or view the source in this repo.
+### Option 1: CLI Launcher (Recommended)

-### OpenHands Cloud
-This is a deployment of OpenHands GUI, running on hosted infrastructure.
+The easiest way to run OpenHands locally is using the CLI launcher with [uv](https://docs.astral.sh/uv/). This provides better isolation from your current project's virtual environment and is required for OpenHands' default MCP servers.

-You can try it with a free $10 credit by [signing in with your GitHub account](https://app.all-hands.dev).
+**Install uv** (if you haven't already):

-OpenHands Cloud comes with source-available features and integrations:
- Integrations with Slack, Jira, and Linear
- Multi-user support
- RBAC and permissions
- Collaboration features (e.g., conversation sharing)
+See the [uv installation guide](https://docs.astral.sh/uv/getting-started/installation/) for the latest installation instructions for your platform.

-### OpenHands Enterprise
-Large enterprises can work with us to self-host OpenHands Cloud in their own VPC, via Kubernetes.
-OpenHands Enterprise can also work with the CLI and SDK above.
+**Launch OpenHands**:
+```bash
+# Launch the GUI server
+uvx --python 3.12 --from openhands-ai openhands serve

-OpenHands Enterprise is source-available--you can see all the source code here in the enterprise/ directory,
-but you'll need to purchase a license if you want to run it for more than one month.
+# Or launch the CLI
+uvx --python 3.12 --from openhands-ai openhands
+```

-Enterprise contracts also come with extended support and access to our research team.
+You'll find OpenHands running at [http://localhost:3000](http://localhost:3000) (for GUI mode)!

-Learn more at [openhands.dev/enterprise](https://openhands.dev/enterprise)
+### Option 2: Docker

-### Everything Else
+<details>
+<summary>Click to expand Docker command</summary>

-Check out our [Product Roadmap](https://github.com/orgs/openhands/projects/1), and feel free to
-[open up an issue](https://github.com/OpenHands/OpenHands/issues) if there's something you'd like to see!
+You can also run OpenHands directly with Docker:

-You might also be interested in our [evaluation infrastructure](https://github.com/OpenHands/benchmarks), our [chrome extension](https://github.com/OpenHands/openhands-chrome-extension/), or our [Theory-of-Mind module](https://github.com/OpenHands/ToM-SWE).
+```bash
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.59-nikolaik

-All our work is available under the MIT license, except for the `enterprise/` directory in this repository (see the [enterprise license](enterprise/LICENSE) for details).
-The core `openhands` and `agent-server` Docker images are fully MIT-licensed as well.
+docker run -it --rm --pull=always \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.59-nikolaik \
+    -e LOG_ALL_EVENTS=true \
+    -v /var/run/docker.sock:/var/run/docker.sock \
+    -v ~/.openhands:/.openhands \
+    -p 3000:3000 \
+    --add-host host.docker.internal:host-gateway \
+    --name openhands-app \
+    docker.all-hands.dev/all-hands-ai/openhands:0.59
+```

-If you need help with anything, or just want to chat, [come find us on Slack](https://dub.sh/openhands).
+</details>
+
+> **Note**: If you used OpenHands before version 0.44, you may want to run `mv ~/.openhands-state ~/.openhands` to migrate your conversation history to the new location.
+
+> [!WARNING]
+> On a public network? See our [Hardened Docker Installation Guide](https://docs.all-hands.dev/usage/runtimes/docker#hardened-docker-installation)
+> to secure your deployment by restricting network binding and implementing additional security measures.
+
+### Getting Started
+
+When you open the application, you'll be asked to choose an LLM provider and add an API key.
+[Anthropic's Claude Sonnet 4.5](https://www.anthropic.com/api) (`anthropic/claude-sonnet-4-5-20250929`)
+works best, but you have [many options](https://docs.all-hands.dev/usage/llms).
+
+See the [Running OpenHands](https://docs.all-hands.dev/usage/installation) guide for
+system requirements and more information.
+
+## 💡 Other ways to run OpenHands
+
+> [!WARNING]
+> OpenHands is meant to be run by a single user on their local workstation.
+> It is not appropriate for multi-tenant deployments where multiple users share the same instance. There is no built-in authentication, isolation, or scalability.
+>
+> If you're interested in running OpenHands in a multi-tenant environment, check out the source-available, commercially-licensed
+> [OpenHands Cloud Helm Chart](https://github.com/openHands/OpenHands-cloud).
+
+You can [connect OpenHands to your local filesystem](https://docs.all-hands.dev/usage/runtimes/docker#connecting-to-your-filesystem),
+interact with it via a [friendly CLI](https://docs.all-hands.dev/usage/how-to/cli-mode),
+run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/usage/how-to/headless-mode),
+or run it on tagged issues with [a GitHub Action](https://docs.all-hands.dev/usage/how-to/github-action).
+
+Visit [Running OpenHands](https://docs.all-hands.dev/usage/installation) for more information and setup instructions.
+
+If you want to modify the OpenHands source code, check out [Development.md](https://github.com/OpenHands/OpenHands/blob/main/Development.md).
+
+Having issues? The [Troubleshooting Guide](https://docs.all-hands.dev/usage/troubleshooting) can help.
+
+## 📖 Documentation
+
+To learn more about the project, and for tips on using OpenHands,
+check out our [documentation](https://docs.all-hands.dev/usage/getting-started).
+
+There you'll find resources on how to use different LLM providers,
+troubleshooting resources, and advanced configuration options.
+
+## 🤝 How to Join the Community
+
+OpenHands is a community-driven project, and we welcome contributions from everyone. We do most of our communication
+through Slack, so this is the best place to start, but we are also happy to have you contact us on GitHub:
+
+- [Join our Slack workspace](https://all-hands.dev/joinslack) - Here we talk about research, architecture, and future development.
+- [Read or post GitHub Issues](https://github.com/OpenHands/OpenHands/issues) - Check out the issues we're working on, or add your own ideas.
+
+See more about the community in [COMMUNITY.md](./COMMUNITY.md) or find details on contributing in [CONTRIBUTING.md](./CONTRIBUTING.md).
+
+## 📈 Progress
+
+See the monthly OpenHands roadmap [here](https://github.com/orgs/OpenHands/projects/1) (updated at the maintainers' meeting at the end of each month).
+
+<p align="center">
+  <a href="https://star-history.com/#OpenHands/OpenHands&Date">
+    <img src="https://api.star-history.com/svg?repos=OpenHands/OpenHands&type=Date" width="500" alt="Star History Chart">
+  </a>
+</p>
+
+## 📜 License
+
+Distributed under the MIT License, with the exception of the `enterprise/` folder. See [`LICENSE`](./LICENSE) for more information.
+
+## 🙏 Acknowledgements
+
+OpenHands is built by a large number of contributors, and every contribution is greatly appreciated! We also build upon other open source projects, and we are deeply thankful for their work.
+
+For a list of open source projects and licenses used in OpenHands, please see our [CREDITS.md](./CREDITS.md) file.
+
+## 📚 Cite
+
+```
+@inproceedings{
+  wang2025openhands,
+  title={OpenHands: An Open Platform for {AI} Software Developers as Generalist Agents},
+  author={Xingyao Wang and Boxuan Li and Yufan Song and Frank F. Xu and Xiangru Tang and Mingchen Zhuge and Jiayi Pan and Yueqi Song and Bowen Li and Jaskirat Singh and Hoang H. Tran and Fuqiang Li and Ren Ma and Mingzhang Zheng and Bill Qian and Yanjun Shao and Niklas Muennighoff and Yizhe Zhang and Binyuan Hui and Junyang Lin and Robert Brennan and Hao Peng and Heng Ji and Graham Neubig},
+  booktitle={The Thirteenth International Conference on Learning Representations},
+  year={2025},
+  url={https://openreview.net/forum?id=OJd3ayDDoF}
+}
+```
--- a/containers/app/Dockerfile
+++ b/containers/app/Dockerfile
@@ -73,7 +73,7 @@ ENV VIRTUAL_ENV=/app/.venv \

 COPY --chown=openhands:openhands --chmod=770 --from=backend-builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}

-COPY --chown=openhands:openhands --chmod=770 ./skills ./skills
+COPY --chown=openhands:openhands --chmod=770 ./microagents ./microagents
 COPY --chown=openhands:openhands --chmod=770 ./openhands ./openhands
 COPY --chown=openhands:openhands --chmod=777 ./openhands/runtime/plugins ./openhands/runtime/plugins
 COPY --chown=openhands:openhands pyproject.toml poetry.lock README.md MANIFEST.in LICENSE ./
--- a/containers/dev/README.md
+++ b/containers/dev/README.md
@@ -1,7 +1,7 @@
 # Develop in Docker

 > [!WARNING]
-> This way of running OpenHands is not officially supported. It is maintained by the community and may not work.
+> This is not officially supported and may not work.

 Install [Docker](https://docs.docker.com/engine/install/) on your host machine and run:

--- a/containers/dev/compose.yml
+++ b/containers/dev/compose.yml
@@ -12,7 +12,7 @@ services:
      - SANDBOX_API_HOSTNAME=host.docker.internal
      - DOCKER_HOST_ADDR=host.docker.internal
      #
-      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/openhands/runtime:1.0-nikolaik}
+      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/openhands/runtime:0.59-nikolaik}
      - SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
      - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
    ports:
--- a/dev_config/python/.pre-commit-config.yaml
+++ b/dev_config/python/.pre-commit-config.yaml
@@ -3,9 +3,9 @@ repos:
    rev: v5.0.0
    hooks:
      - id: trailing-whitespace
-        exclude: ^(docs/|modules/|python/|openhands-ui/|third_party/|enterprise/)
+        exclude: ^(docs/|modules/|python/|openhands-ui/|third_party/|enterprise/|openhands-cli/)
      - id: end-of-file-fixer
-        exclude: ^(docs/|modules/|python/|openhands-ui/|third_party/|enterprise/)
+        exclude: ^(docs/|modules/|python/|openhands-ui/|third_party/|enterprise/|openhands-cli/)
      - id: check-yaml
        args: ["--allow-multiple-documents"]
      - id: debug-statements
@@ -28,12 +28,12 @@ repos:
        entry: ruff check --config dev_config/python/ruff.toml
        types_or: [python, pyi, jupyter]
        args: [--fix, --unsafe-fixes]
-        exclude: ^(third_party/|enterprise/)
+        exclude: ^(third_party/|enterprise/|openhands-cli/)
      # Run the formatter.
      - id: ruff-format
        entry: ruff format --config dev_config/python/ruff.toml
        types_or: [python, pyi, jupyter]
-        exclude: ^(third_party/|enterprise/)
+        exclude: ^(third_party/|enterprise/|openhands-cli/)

  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.15.0
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -7,7 +7,7 @@ services:
    image: openhands:latest
    container_name: openhands-app-${DATE:-}
    environment:
-      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-docker.openhands.dev/openhands/runtime:1.0-nikolaik}
+      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-docker.all-hands.dev/openhands/runtime:0.59-nikolaik}
      #- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234} # enable this only if you want a specific non-root sandbox user but you will have to manually adjust permissions of ~/.openhands for this user
      - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
    ports:
--- a/enterprise/Dockerfile
+++ b/enterprise/Dockerfile
@@ -31,8 +31,9 @@ RUN pip install alembic psycopg2-binary cloud-sql-python-connector pg8000 gsprea
        "pillow>=11.3.0"

 WORKDIR /app
-COPY --chown=openhands:openhands --chmod=770 enterprise .
+COPY enterprise .

+RUN chown -R openhands:openhands /app && chmod -R 770 /app
 USER openhands

 # Command will be overridden by Kubernetes deployment template
--- a/enterprise/allhands-realm-github-provider.json.tmpl
+++ b/enterprise/allhands-realm-github-provider.json.tmpl
@@ -721,7 +721,6 @@
        "https://$WEB_HOST/oauth/keycloak/callback",
        "https://$WEB_HOST/oauth/keycloak/offline/callback",
        "https://$WEB_HOST/slack/keycloak-callback",
-        "https://$WEB_HOST/oauth/device/keycloak-callback",
        "https://$WEB_HOST/api/email/verified",
        "/realms/$KEYCLOAK_REALM_NAME/$KEYCLOAK_CLIENT_ID/*"
      ],
--- a/enterprise/enterprise_local/convert_to_env.py
+++ b/enterprise/enterprise_local/convert_to_env.py
@@ -116,7 +116,7 @@ lines.append('POSTHOG_CLIENT_KEY=test')
 lines.append('ENABLE_PROACTIVE_CONVERSATION_STARTERS=true')
 lines.append('MAX_CONCURRENT_CONVERSATIONS=10')
 lines.append('LITE_LLM_API_URL=https://llm-proxy.eval.all-hands.dev')
-lines.append('LITELLM_DEFAULT_MODEL=litellm_proxy/claude-opus-4-5-20251101')
+lines.append('LITELLM_DEFAULT_MODEL=litellm_proxy/claude-sonnet-4-20250514')
 lines.append(f'LITE_LLM_API_KEY={lite_llm_api_key}')
 lines.append('LOCAL_DEPLOYMENT=true')
 lines.append('DB_HOST=localhost')
--- a/enterprise/experiments/experiment_manager.py
+++ b/enterprise/experiments/experiment_manager.py
@@ -5,8 +5,12 @@ from experiments.constants import (
    EXPERIMENT_SYSTEM_PROMPT_EXPERIMENT,
 )
 from experiments.experiment_versions import (
+    handle_condenser_max_step_experiment,
    handle_system_prompt_experiment,
 )
+from experiments.experiment_versions._004_condenser_max_step_experiment import (
+    handle_condenser_max_step_experiment__v1,
+)

 from openhands.core.config.openhands_config import OpenHandsConfig
 from openhands.core.logger import openhands_logger as logger
@@ -27,6 +31,10 @@ class SaaSExperimentManager(ExperimentManager):
            )
            return agent

+        agent = handle_condenser_max_step_experiment__v1(
+            user_id, conversation_id, agent
+        )
+
        if EXPERIMENT_SYSTEM_PROMPT_EXPERIMENT:
            agent = agent.model_copy(
                update={'system_prompt_filename': 'system_prompt_long_horizon.j2'}
@@ -52,7 +60,20 @@ class SaaSExperimentManager(ExperimentManager):
        """
        logger.debug(
            'experiment_manager:run_conversation_variant_test:started',
-            extra={'user_id': user_id, 'conversation_id': conversation_id},
+            extra={'user_id': user_id},
+        )
+
+        # Skip all experiment processing if the experiment manager is disabled
+        if not ENABLE_EXPERIMENT_MANAGER:
+            logger.info(
+                'experiment_manager:run_conversation_variant_test:skipped',
+                extra={'reason': 'experiment_manager_disabled'},
+            )
+            return conversation_settings
+
+        # Apply conversation-scoped experiments
+        conversation_settings = handle_condenser_max_step_experiment(
+            user_id, conversation_id, conversation_settings
        )

        return conversation_settings
--- a/enterprise/integrations/github/github_manager.py
+++ b/enterprise/integrations/github/github_manager.py
@@ -22,7 +22,6 @@ from integrations.utils import (
    HOST_URL,
    OPENHANDS_RESOLVER_TEMPLATES_DIR,
 )
-from integrations.v1_utils import get_saas_user_auth
 from jinja2 import Environment, FileSystemLoader
 from pydantic import SecretStr
 from server.auth.constants import GITHUB_APP_CLIENT_ID, GITHUB_APP_PRIVATE_KEY
@@ -165,13 +164,8 @@ class GithubManager(Manager):
            )

        if await self.is_job_requested(message):
-            payload = message.message.get('payload', {})
-            user_id = payload['sender']['id']
-            keycloak_user_id = await self.token_manager.get_user_id_from_idp_user_id(
-                user_id, ProviderType.GITHUB
-            )
            github_view = await GithubFactory.create_github_view_from_payload(
-                message, keycloak_user_id
+                message, self.token_manager
            )
            logger.info(
                f'[GitHub] Creating job for {github_view.user_info.username} in {github_view.full_repo_name}#{github_view.issue_number}'
@@ -288,15 +282,8 @@ class GithubManager(Manager):
                        f'[Github]: Error summarizing issue solvability: {str(e)}'
                    )

-                saas_user_auth = await get_saas_user_auth(
-                    github_view.user_info.keycloak_user_id, self.token_manager
-                )
-
                await github_view.create_new_conversation(
-                    self.jinja_env,
-                    secret_store.provider_tokens,
-                    convo_metadata,
-                    saas_user_auth,
+                    self.jinja_env, secret_store.provider_tokens, convo_metadata
                )

                conversation_id = github_view.conversation_id
@@ -305,19 +292,18 @@ class GithubManager(Manager):
                    f'[GitHub] Created conversation {conversation_id} for user {user_info.username}'
                )

-                if not github_view.v1:
-                    # Create a GithubCallbackProcessor
-                    processor = GithubCallbackProcessor(
-                        github_view=github_view,
-                        send_summary_instruction=True,
-                    )
+                # Create a GithubCallbackProcessor
+                processor = GithubCallbackProcessor(
+                    github_view=github_view,
+                    send_summary_instruction=True,
+                )

-                    # Register the callback processor
-                    register_callback_processor(conversation_id, processor)
+                # Register the callback processor
+                register_callback_processor(conversation_id, processor)

-                    logger.info(
-                        f'[Github] Registered callback processor for conversation {conversation_id}'
-                    )
+                logger.info(
+                    f'[Github] Registered callback processor for conversation {conversation_id}'
+                )

                # Send message with conversation link
                conversation_link = CONVERSATION_URL.format(conversation_id)
--- a/enterprise/integrations/github/github_view.py
+++ b/enterprise/integrations/github/github_view.py
@@ -1,5 +1,4 @@
-from dataclasses import dataclass
-from uuid import UUID, uuid4
+from uuid import uuid4

 from github import Github, GithubIntegration
 from github.Issue import Issue
@@ -9,17 +8,16 @@ from integrations.github.github_types import (
    WorkflowRunStatus,
 )
 from integrations.models import Message
-from integrations.resolver_context import ResolverUserContext
 from integrations.types import ResolverViewInterface, UserData
 from integrations.utils import (
    ENABLE_PROACTIVE_CONVERSATION_STARTERS,
-    ENABLE_V1_GITHUB_RESOLVER,
    HOST,
    HOST_URL,
    get_oh_labels,
    has_exact_mention,
 )
 from jinja2 import Environment
+from pydantic.dataclasses import dataclass
 from server.auth.constants import GITHUB_APP_CLIENT_ID, GITHUB_APP_PRIVATE_KEY
 from server.auth.token_manager import TokenManager
 from server.config import get_config
@@ -28,24 +26,14 @@ from storage.proactive_conversation_store import ProactiveConversationStore
 from storage.saas_secrets_store import SaasSecretsStore
 from storage.saas_settings_store import SaasSettingsStore

-from openhands.agent_server.models import SendMessageRequest
-from openhands.app_server.app_conversation.app_conversation_models import (
-    AppConversationStartRequest,
-    AppConversationStartTaskStatus,
-)
-from openhands.app_server.config import get_app_conversation_service
-from openhands.app_server.services.injector import InjectorState
-from openhands.app_server.user.specifiy_user_context import USER_CONTEXT_ATTR
 from openhands.core.logger import openhands_logger as logger
 from openhands.integrations.github.github_service import GithubServiceImpl
 from openhands.integrations.provider import PROVIDER_TOKEN_TYPE, ProviderType
 from openhands.integrations.service_types import Comment
-from openhands.sdk import TextContent
 from openhands.server.services.conversation_service import (
    initialize_conversation,
    start_conversation,
 )
-from openhands.server.user_auth.user_auth import UserAuth
 from openhands.storage.data_models.conversation_metadata import (
    ConversationMetadata,
    ConversationTrigger,
@@ -88,38 +76,6 @@ async def get_user_proactive_conversation_setting(user_id: str | None) -> bool:
    return settings.enable_proactive_conversation_starters


-async def get_user_v1_enabled_setting(user_id: str) -> bool:
-    """Get the user's V1 conversation API setting.
-
-    Args:
-        user_id: The keycloak user ID
-
-    Returns:
-        True if V1 conversations are enabled for this user, False otherwise
-
-    Note:
-        This function checks both the global environment variable kill switch AND
-        the user's individual setting. Both must be true for the function to return true.
-    """
-    # Check the global environment variable first
-    if not ENABLE_V1_GITHUB_RESOLVER:
-        return False
-
-    config = get_config()
-    settings_store = SaasSettingsStore(
-        user_id=user_id, session_maker=session_maker, config=config
-    )
-
-    settings = await call_sync_from_async(
-        settings_store.get_user_settings_by_keycloak_id, user_id
-    )
-
-    if not settings or settings.v1_enabled is None:
-        return False
-
-    return settings.v1_enabled
-
-
 # =================================================
 # SECTION: Github view types
 # =================================================
@@ -140,7 +96,6 @@ class GithubIssue(ResolverViewInterface):
    title: str
    description: str
    previous_comments: list[Comment]
-    v1: bool

    async def _load_resolver_context(self):
        github_service = GithubServiceImpl(
@@ -187,19 +142,6 @@ class GithubIssue(ResolverViewInterface):

    async def initialize_new_conversation(self) -> ConversationMetadata:
        # FIXME: Handle if initialize_conversation returns None
-
-        v1_enabled = await get_user_v1_enabled_setting(self.user_info.keycloak_user_id)
-        logger.info(
-            f'[GitHub V1]: User flag found for {self.user_info.keycloak_user_id} is {v1_enabled}'
-        )
-        if v1_enabled:
-            # Create dummy conversationm metadata
-            # Don't save to conversation store
-            # V1 conversations are stored in a separate table
-            return ConversationMetadata(
-                conversation_id=uuid4().hex, selected_repository=self.full_repo_name
-            )
-
        conversation_metadata: ConversationMetadata = await initialize_conversation(  # type: ignore[assignment]
            user_id=self.user_info.keycloak_user_id,
            conversation_id=None,
@@ -216,36 +158,7 @@ class GithubIssue(ResolverViewInterface):
        jinja_env: Environment,
        git_provider_tokens: PROVIDER_TOKEN_TYPE,
        conversation_metadata: ConversationMetadata,
-        saas_user_auth: UserAuth,
    ):
-        v1_enabled = await get_user_v1_enabled_setting(self.user_info.keycloak_user_id)
-        logger.info(
-            f'[GitHub V1]: User flag found for {self.user_info.keycloak_user_id} is {v1_enabled}'
-        )
-        if v1_enabled:
-            try:
-                # Use V1 app conversation service
-                await self._create_v1_conversation(
-                    jinja_env, saas_user_auth, conversation_metadata
-                )
-                return
-
-            except Exception as e:
-                logger.warning(f'Error checking V1 settings, falling back to V0: {e}')
-
-        # Use existing V0 conversation service
-        await self._create_v0_conversation(
-            jinja_env, git_provider_tokens, conversation_metadata
-        )
-
-    async def _create_v0_conversation(
-        self,
-        jinja_env: Environment,
-        git_provider_tokens: PROVIDER_TOKEN_TYPE,
-        conversation_metadata: ConversationMetadata,
-    ):
-        """Create conversation using the legacy V0 system."""
-        logger.info('[GitHub]: Creating V0 conversation')
        custom_secrets = await self._get_user_secrets()

        user_instructions, conversation_instructions = await self._get_instructions(
@@ -264,78 +177,6 @@ class GithubIssue(ResolverViewInterface):
            conversation_instructions=conversation_instructions,
        )

-    async def _create_v1_conversation(
-        self,
-        jinja_env: Environment,
-        saas_user_auth: UserAuth,
-        conversation_metadata: ConversationMetadata,
-    ):
-        """Create conversation using the new V1 app conversation system."""
-        logger.info('[GitHub V1]: Creating V1 conversation')
-
-        user_instructions, conversation_instructions = await self._get_instructions(
-            jinja_env
-        )
-
-        # Create the initial message request
-        initial_message = SendMessageRequest(
-            role='user', content=[TextContent(text=user_instructions)]
-        )
-
-        # Create the GitHub V1 callback processor
-        github_callback_processor = self._create_github_v1_callback_processor()
-
-        # Get the app conversation service and start the conversation
-        injector_state = InjectorState()
-
-        # Create the V1 conversation start request with the callback processor
-        start_request = AppConversationStartRequest(
-            conversation_id=UUID(conversation_metadata.conversation_id),
-            system_message_suffix=conversation_instructions,
-            initial_message=initial_message,
-            selected_repository=self.full_repo_name,
-            git_provider=ProviderType.GITHUB,
-            title=f'GitHub Issue #{self.issue_number}: {self.title}',
-            trigger=ConversationTrigger.RESOLVER,
-            processors=[
-                github_callback_processor
-            ],  # Pass the callback processor directly
-        )
-
-        # Set up the GitHub user context for the V1 system
-        github_user_context = ResolverUserContext(saas_user_auth=saas_user_auth)
-        setattr(injector_state, USER_CONTEXT_ATTR, github_user_context)
-
-        async with get_app_conversation_service(
-            injector_state
-        ) as app_conversation_service:
-            async for task in app_conversation_service.start_app_conversation(
-                start_request
-            ):
-                if task.status == AppConversationStartTaskStatus.ERROR:
-                    logger.error(f'Failed to start V1 conversation: {task.detail}')
-                    raise RuntimeError(
-                        f'Failed to start V1 conversation: {task.detail}'
-                    )
-
-        self.v1 = True
-
-    def _create_github_v1_callback_processor(self):
-        """Create a V1 callback processor for GitHub integration."""
-        from openhands.app_server.event_callback.github_v1_callback_processor import (
-            GithubV1CallbackProcessor,
-        )
-
-        # Create and return the GitHub V1 callback processor
-        return GithubV1CallbackProcessor(
-            github_view_data={
-                'issue_number': self.issue_number,
-                'full_repo_name': self.full_repo_name,
-                'installation_id': self.installation_id,
-            },
-            send_summary_instruction=self.send_summary_instruction,
-        )
-

@dataclass
 class GithubIssueComment(GithubIssue):
@@ -391,18 +232,7 @@ class GithubPRComment(GithubIssueComment):
        return user_instructions, conversation_instructions

    async def initialize_new_conversation(self) -> ConversationMetadata:
-        v1_enabled = await get_user_v1_enabled_setting(self.user_info.keycloak_user_id)
-        logger.info(
-            f'[GitHub V1]: User flag found for {self.user_info.keycloak_user_id} is {v1_enabled}'
-        )
-        if v1_enabled:
-            # Create dummy conversationm metadata
-            # Don't save to conversation store
-            # V1 conversations are stored in a separate table
-            return ConversationMetadata(
-                conversation_id=uuid4().hex, selected_repository=self.full_repo_name
-            )
-
+        # FIXME: Handle if initialize_conversation returns None
        conversation_metadata: ConversationMetadata = await initialize_conversation(  # type: ignore[assignment]
            user_id=self.user_info.keycloak_user_id,
            conversation_id=None,
@@ -462,24 +292,6 @@ class GithubInlinePRComment(GithubPRComment):

        return user_instructions, conversation_instructions

-    def _create_github_v1_callback_processor(self):
-        """Create a V1 callback processor for GitHub integration."""
-        from openhands.app_server.event_callback.github_v1_callback_processor import (
-            GithubV1CallbackProcessor,
-        )
-
-        # Create and return the GitHub V1 callback processor
-        return GithubV1CallbackProcessor(
-            github_view_data={
-                'issue_number': self.issue_number,
-                'full_repo_name': self.full_repo_name,
-                'installation_id': self.installation_id,
-                'comment_id': self.comment_id,
-            },
-            inline_pr_comment=True,
-            send_summary_instruction=self.send_summary_instruction,
-        )
-

@dataclass
 class GithubFailingAction:
@@ -793,7 +605,7 @@ class GithubFactory:

    @staticmethod
    async def create_github_view_from_payload(
-        message: Message, keycloak_user_id: str
+        message: Message, token_manager: TokenManager
    ) -> ResolverViewInterface:
        """Create the appropriate class (GithubIssue or GithubPRComment) based on the payload.
        Also return metadata about the event (e.g., action type).
@@ -803,10 +615,17 @@ class GithubFactory:
        user_id = payload['sender']['id']
        username = payload['sender']['login']

+        keyloak_user_id = await token_manager.get_user_id_from_idp_user_id(
+            user_id, ProviderType.GITHUB
+        )
+
+        if keyloak_user_id is None:
+            logger.warning(f'Got invalid keyloak user id for GitHub User {user_id} ')
+
        selected_repo = GithubFactory.get_full_repo_name(repo_obj)
        is_public_repo = not repo_obj.get('private', True)
        user_info = UserData(
-            user_id=user_id, username=username, keycloak_user_id=keycloak_user_id
+            user_id=user_id, username=username, keycloak_user_id=keyloak_user_id
        )

        installation_id = message.message['installation']
@@ -830,7 +649,6 @@ class GithubFactory:
                title='',
                description='',
                previous_comments=[],
-                v1=False,
            )

        elif GithubFactory.is_issue_comment(message):
@@ -856,7 +674,6 @@ class GithubFactory:
                title='',
                description='',
                previous_comments=[],
-                v1=False,
            )

        elif GithubFactory.is_pr_comment(message):
@@ -898,7 +715,6 @@ class GithubFactory:
                title='',
                description='',
                previous_comments=[],
-                v1=False,
            )

        elif GithubFactory.is_inline_pr_comment(message):
@@ -932,7 +748,6 @@ class GithubFactory:
                title='',
                description='',
                previous_comments=[],
-                v1=False,
            )

        else:
--- a/enterprise/integrations/resolver_context.py
+++ b/enterprise/integrations/resolver_context.py
@@ -1,63 +0,0 @@
-from openhands.app_server.user.user_context import UserContext
-from openhands.app_server.user.user_models import UserInfo
-from openhands.integrations.provider import PROVIDER_TOKEN_TYPE
-from openhands.integrations.service_types import ProviderType
-from openhands.sdk.secret import SecretSource, StaticSecret
-from openhands.server.user_auth.user_auth import UserAuth
-
-
-class ResolverUserContext(UserContext):
-    """User context for resolver operations that inherits from UserContext."""
-
-    def __init__(
-        self,
-        saas_user_auth: UserAuth,
-    ):
-        self.saas_user_auth = saas_user_auth
-
-    async def get_user_id(self) -> str | None:
-        return await self.saas_user_auth.get_user_id()
-
-    async def get_user_info(self) -> UserInfo:
-        user_settings = await self.saas_user_auth.get_user_settings()
-        user_id = await self.saas_user_auth.get_user_id()
-        if user_settings:
-            return UserInfo(
-                id=user_id,
-                **user_settings.model_dump(context={'expose_secrets': True}),
-            )
-
-        return UserInfo(id=user_id)
-
-    async def get_authenticated_git_url(self, repository: str) -> str:
-        # This would need to be implemented based on the git provider tokens
-        # For now, return a basic HTTPS URL
-        return f'https://github.com/{repository}.git'
-
-    async def get_latest_token(self, provider_type: ProviderType) -> str | None:
-        # Return the appropriate token from git_provider_tokens
-
-        provider_tokens = await self.saas_user_auth.get_provider_tokens()
-        if provider_tokens:
-            return provider_tokens.get(provider_type)
-        return None
-
-    async def get_provider_tokens(self) -> PROVIDER_TOKEN_TYPE | None:
-        return await self.saas_user_auth.get_provider_tokens()
-
-    async def get_secrets(self) -> dict[str, SecretSource]:
-        """Get secrets for the user, including custom secrets."""
-        secrets = await self.saas_user_auth.get_secrets()
-        if secrets:
-            # Convert custom secrets to StaticSecret objects for SDK compatibility
-            # secrets.custom_secrets is of type Mapping[str, CustomSecret]
-            converted_secrets = {}
-            for key, custom_secret in secrets.custom_secrets.items():
-                # Extract the secret value from CustomSecret and convert to StaticSecret
-                secret_value = custom_secret.secret.get_secret_value()
-                converted_secrets[key] = StaticSecret(value=secret_value)
-            return converted_secrets
-        return {}
-
-    async def get_mcp_api_key(self) -> str | None:
-        return await self.saas_user_auth.get_mcp_api_key()
--- a/enterprise/integrations/types.py
+++ b/enterprise/integrations/types.py
@@ -19,7 +19,7 @@ class PRStatus(Enum):
 class UserData(BaseModel):
    user_id: int
    username: str
-    keycloak_user_id: str
+    keycloak_user_id: str | None


@dataclass
--- a/enterprise/integrations/utils.py
+++ b/enterprise/integrations/utils.py
@@ -51,11 +51,6 @@ ENABLE_SOLVABILITY_ANALYSIS = (
    os.getenv('ENABLE_SOLVABILITY_ANALYSIS', 'false').lower() == 'true'
 )

-# Toggle for V1 GitHub resolver feature
-ENABLE_V1_GITHUB_RESOLVER = (
-    os.getenv('ENABLE_V1_GITHUB_RESOLVER', 'false').lower() == 'true'
-)
-

 OPENHANDS_RESOLVER_TEMPLATES_DIR = 'openhands/integrations/templates/resolver/'
 jinja_env = Environment(loader=FileSystemLoader(OPENHANDS_RESOLVER_TEMPLATES_DIR))
--- a/enterprise/integrations/v1_utils.py
+++ b/enterprise/integrations/v1_utils.py
@@ -1,20 +0,0 @@
-from pydantic import SecretStr
-from server.auth.saas_user_auth import SaasUserAuth
-from server.auth.token_manager import TokenManager
-
-from openhands.core.logger import openhands_logger as logger
-from openhands.server.user_auth.user_auth import UserAuth
-
-
-async def get_saas_user_auth(
-    keycloak_user_id: str, token_manager: TokenManager
-) -> UserAuth:
-    offline_token = await token_manager.load_offline_token(keycloak_user_id)
-    if offline_token is None:
-        logger.info('no_offline_token_found')
-
-    user_auth = SaasUserAuth(
-        user_id=keycloak_user_id,
-        refresh_token=SecretStr(offline_token),
-    )
-    return user_auth
--- a/enterprise/migrations/versions/080_add_status_and_updated_at_to_callback.py
+++ b/enterprise/migrations/versions/080_add_status_and_updated_at_to_callback.py
@@ -1,71 +0,0 @@
-"""add status and updated_at to callback
-
-Revision ID: 080
-Revises: 079
-Create Date: 2025-11-05 00:00:00.000000
-
-"""
-
-from enum import Enum
-from typing import Sequence, Union
-
-import sqlalchemy as sa
-from alembic import op
-
-# revision identifiers, used by Alembic.
-revision: str = '080'
-down_revision: Union[str, None] = '079'
-branch_labels: Union[str, Sequence[str], None] = None
-depends_on: Union[str, Sequence[str], None] = None
-
-
-class EventCallbackStatus(Enum):
-    ACTIVE = 'ACTIVE'
-    DISABLED = 'DISABLED'
-    COMPLETED = 'COMPLETED'
-    ERROR = 'ERROR'
-
-
-def upgrade() -> None:
-    """Upgrade schema."""
-    status = sa.Enum(EventCallbackStatus, name='eventcallbackstatus')
-    status.create(op.get_bind(), checkfirst=True)
-    op.add_column(
-        'event_callback',
-        sa.Column('status', status, nullable=False, server_default='ACTIVE'),
-    )
-    op.add_column(
-        'event_callback',
-        sa.Column(
-            'updated_at', sa.DateTime, nullable=False, server_default=sa.func.now()
-        ),
-    )
-    op.drop_index('ix_event_callback_result_event_id')
-    op.drop_column('event_callback_result', 'event_id')
-    op.add_column(
-        'event_callback_result', sa.Column('event_id', sa.String, nullable=True)
-    )
-    op.create_index(
-        op.f('ix_event_callback_result_event_id'),
-        'event_callback_result',
-        ['event_id'],
-        unique=False,
-    )
-
-
-def downgrade() -> None:
-    """Downgrade schema."""
-    op.drop_column('event_callback', 'status')
-    op.drop_column('event_callback', 'updated_at')
-    op.drop_index('ix_event_callback_result_event_id')
-    op.drop_column('event_callback_result', 'event_id')
-    op.add_column(
-        'event_callback_result', sa.Column('event_id', sa.UUID, nullable=True)
-    )
-    op.create_index(
-        op.f('ix_event_callback_result_event_id'),
-        'event_callback_result',
-        ['event_id'],
-        unique=False,
-    )
-    op.execute('DROP TYPE eventcallbackstatus')
--- a/enterprise/migrations/versions/081_add_parent_conversation_id.py
+++ b/enterprise/migrations/versions/081_add_parent_conversation_id.py
@@ -1,41 +0,0 @@
-"""add parent_conversation_id to conversation_metadata
-
-Revision ID: 081
-Revises: 080
-Create Date: 2025-11-06 00:00:00.000000
-
-"""
-
-from typing import Sequence, Union
-
-import sqlalchemy as sa
-from alembic import op
-
-# revision identifiers, used by Alembic.
-revision: str = '081'
-down_revision: Union[str, None] = '080'
-branch_labels: Union[str, Sequence[str], None] = None
-depends_on: Union[str, Sequence[str], None] = None
-
-
-def upgrade() -> None:
-    """Upgrade schema."""
-    op.add_column(
-        'conversation_metadata',
-        sa.Column('parent_conversation_id', sa.String(), nullable=True),
-    )
-    op.create_index(
-        op.f('ix_conversation_metadata_parent_conversation_id'),
-        'conversation_metadata',
-        ['parent_conversation_id'],
-        unique=False,
-    )
-
-
-def downgrade() -> None:
-    """Downgrade schema."""
-    op.drop_index(
-        op.f('ix_conversation_metadata_parent_conversation_id'),
-        table_name='conversation_metadata',
-    )
-    op.drop_column('conversation_metadata', 'parent_conversation_id')
--- a/enterprise/migrations/versions/082_add_setting_up_skills_enum_value.py
+++ b/enterprise/migrations/versions/082_add_setting_up_skills_enum_value.py
@@ -1,51 +0,0 @@
-"""Add SETTING_UP_SKILLS to appconversationstarttaskstatus enum
-
-Revision ID: 082
-Revises: 081
-Create Date: 2025-11-19 12:00:00.000000
-
-"""
-
-from typing import Sequence, Union
-
-from alembic import op
-from sqlalchemy import text
-
-# revision identifiers, used by Alembic.
-revision: str = '082'
-down_revision: Union[str, Sequence[str], None] = '081'
-branch_labels: Union[str, Sequence[str], None] = None
-depends_on: Union[str, Sequence[str], None] = None
-
-
-def upgrade() -> None:
-    """Add SETTING_UP_SKILLS enum value to appconversationstarttaskstatus."""
-    # Check if the enum value already exists before adding it
-    # This handles the case where the enum was created with the value already included
-    connection = op.get_bind()
-    result = connection.execute(
-        text(
-            "SELECT 1 FROM pg_enum WHERE enumlabel = 'SETTING_UP_SKILLS' "
-            "AND enumtypid = (SELECT oid FROM pg_type WHERE typname = 'appconversationstarttaskstatus')"
-        )
-    )
-
-    if not result.fetchone():
-        # Add the new enum value only if it doesn't already exist
-        op.execute(
-            "ALTER TYPE appconversationstarttaskstatus ADD VALUE 'SETTING_UP_SKILLS'"
-        )
-
-
-def downgrade() -> None:
-    """Remove SETTING_UP_SKILLS enum value from appconversationstarttaskstatus.
-
-    Note: PostgreSQL doesn't support removing enum values directly.
-    This would require recreating the enum type and updating all references.
-    For safety, this downgrade is not implemented.
-    """
-    # PostgreSQL doesn't support removing enum values directly
-    # This would require a complex migration to recreate the enum
-    # For now, we'll leave this as a no-op since removing enum values
-    # is rarely needed and can be dangerous
-    pass
--- a/enterprise/migrations/versions/083_add_v1_enabled_to_user_settings.py
+++ b/enterprise/migrations/versions/083_add_v1_enabled_to_user_settings.py
@@ -1,35 +0,0 @@
-"""Add v1_enabled column to user_settings
-
-Revision ID: 083
-Revises: 082
-Create Date: 2025-11-18 00:00:00.000000
-
-"""
-
-from typing import Sequence, Union
-
-import sqlalchemy as sa
-from alembic import op
-
-# revision identifiers, used by Alembic.
-revision: str = '083'
-down_revision: Union[str, None] = '082'
-branch_labels: Union[str, Sequence[str], None] = None
-depends_on: Union[str, Sequence[str], None] = None
-
-
-def upgrade() -> None:
-    """Add v1_enabled column to user_settings table."""
-    op.add_column(
-        'user_settings',
-        sa.Column(
-            'v1_enabled',
-            sa.Boolean(),
-            nullable=True,
-        ),
-    )
-
-
-def downgrade() -> None:
-    """Remove v1_enabled column from user_settings table."""
-    op.drop_column('user_settings', 'v1_enabled')
--- a/enterprise/migrations/versions/084_create_device_codes_table.py
+++ b/enterprise/migrations/versions/084_create_device_codes_table.py
@@ -1,49 +0,0 @@
-"""Create device_codes table for OAuth 2.0 Device Flow
-
-Revision ID: 084
-Revises: 083
-Create Date: 2024-12-10 12:00:00.000000
-
-"""
-
-import sqlalchemy as sa
-from alembic import op
-
-# revision identifiers, used by Alembic.
-revision = '084'
-down_revision = '083'
-branch_labels = None
-depends_on = None
-
-
-def upgrade():
-    """Create device_codes table for OAuth 2.0 Device Flow."""
-    op.create_table(
-        'device_codes',
-        sa.Column('id', sa.Integer(), autoincrement=True, nullable=False),
-        sa.Column('device_code', sa.String(length=128), nullable=False),
-        sa.Column('user_code', sa.String(length=16), nullable=False),
-        sa.Column('status', sa.String(length=32), nullable=False),
-        sa.Column('keycloak_user_id', sa.String(length=255), nullable=True),
-        sa.Column('expires_at', sa.DateTime(timezone=True), nullable=False),
-        sa.Column('authorized_at', sa.DateTime(timezone=True), nullable=True),
-        # Rate limiting fields for RFC 8628 section 3.5 compliance
-        sa.Column('last_poll_time', sa.DateTime(timezone=True), nullable=True),
-        sa.Column('current_interval', sa.Integer(), nullable=False, default=5),
-        sa.PrimaryKeyConstraint('id'),
-    )
-
-    # Create indexes for efficient lookups
-    op.create_index(
-        'ix_device_codes_device_code', 'device_codes', ['device_code'], unique=True
-    )
-    op.create_index(
-        'ix_device_codes_user_code', 'device_codes', ['user_code'], unique=True
-    )
-
-
-def downgrade():
-    """Drop device_codes table."""
-    op.drop_index('ix_device_codes_user_code', table_name='device_codes')
-    op.drop_index('ix_device_codes_device_code', table_name='device_codes')
-    op.drop_table('device_codes')
--- a/enterprise/poetry.lock
+++ b/enterprise/poetry.lock
@@ -201,20 +201,19 @@ files = [

 [[package]]
 name = "anthropic"
-version = "0.75.0"
+version = "0.65.0"
 description = "The official Python library for the anthropic API"
 optional = false
-python-versions = ">=3.9"
+python-versions = ">=3.8"
 groups = ["main"]
 files = [
-    {file = "anthropic-0.75.0-py3-none-any.whl", hash = "sha256:ea8317271b6c15d80225a9f3c670152746e88805a7a61e14d4a374577164965b"},
-    {file = "anthropic-0.75.0.tar.gz", hash = "sha256:e8607422f4ab616db2ea5baacc215dd5f028da99ce2f022e33c7c535b29f3dfb"},
+    {file = "anthropic-0.65.0-py3-none-any.whl", hash = "sha256:ba9d9f82678046c74ddf5698ca06d9f5b0f599cfac922ab0d5921638eb448d98"},
+    {file = "anthropic-0.65.0.tar.gz", hash = "sha256:6b6b6942574e54342050dfd42b8d856a8366b171daec147df3b80be4722733b9"},
 ]

 [package.dependencies]
 anyio = ">=3.5.0,<5"
 distro = ">=1.7.0,<2"
-docstring-parser = ">=0.15,<1"
 google-auth = {version = ">=2,<3", extras = ["requests"], optional = true, markers = "extra == \"vertex\""}
 httpx = ">=0.25.0,<1"
 jiter = ">=0.4.0,<1"
@@ -223,7 +222,7 @@ sniffio = "*"
 typing-extensions = ">=4.10,<5"

 [package.extras]
-aiohttp = ["aiohttp", "httpx-aiohttp (>=0.1.9)"]
+aiohttp = ["aiohttp", "httpx-aiohttp (>=0.1.8)"]
 bedrock = ["boto3 (>=1.28.57)", "botocore (>=1.31.57)"]
 vertex = ["google-auth[requests] (>=2,<3)"]

@@ -682,37 +681,34 @@ crt = ["awscrt (==0.27.6)"]

 [[package]]
 name = "browser-use"
-version = "0.10.1"
+version = "0.7.10"
 description = "Make websites accessible for AI agents"
 optional = false
 python-versions = "<4.0,>=3.11"
 groups = ["main"]
 files = [
-    {file = "browser_use-0.10.1-py3-none-any.whl", hash = "sha256:96e603bfc71098175342cdcb0592519e6f244412e740f0254e4389fdd82a977f"},
-    {file = "browser_use-0.10.1.tar.gz", hash = "sha256:5f211ecfdf1f9fd186160f10df70dedd661821231e30f1bce40939787abab223"},
+    {file = "browser_use-0.7.10-py3-none-any.whl", hash = "sha256:669e12571a0c0c4c93e5fd26abf9e2534eb9bacbc510328aedcab795bd8906a9"},
+    {file = "browser_use-0.7.10.tar.gz", hash = "sha256:f93ce59e06906c12d120360dee4aa33d83618ddf7c9a575dd0ac517d2de7ccbc"},
 ]

 [package.dependencies]
 aiohttp = "3.12.15"
-anthropic = ">=0.72.1,<1.0.0"
+anthropic = ">=0.58.2,<1.0.0"
 anyio = ">=4.9.0"
 authlib = ">=1.6.0"
 bubus = ">=1.5.6"
-cdp-use = ">=1.4.4"
-click = ">=8.1.8"
-cloudpickle = ">=3.1.1"
+cdp-use = ">=1.4.0"
 google-api-core = ">=2.25.0"
 google-api-python-client = ">=2.174.0"
 google-auth = ">=2.40.3"
 google-auth-oauthlib = ">=1.2.2"
-google-genai = ">=1.50.0,<2.0.0"
+google-genai = ">=1.29.0,<2.0.0"
 groq = ">=0.30.0"
+html2text = ">=2025.4.15"
 httpx = ">=0.28.1"
-inquirerpy = ">=0.3.4"
-markdownify = ">=1.2.0"
 mcp = ">=1.10.1"
 ollama = ">=0.5.1"
-openai = ">=2.7.2,<3.0.0"
+openai = ">=1.99.2,<2.0.0"
 pillow = ">=11.2.1"
 portalocker = ">=2.7.0,<3.0.0"
 posthog = ">=3.7.0"
@@ -721,24 +717,19 @@ pydantic = ">=2.11.5"
 pyobjc = {version = ">=11.0", markers = "platform_system == \"darwin\""}
 pyotp = ">=2.9.0"
 pypdf = ">=5.7.0"
-python-docx = ">=1.2.0"
 python-dotenv = ">=1.0.1"
 reportlab = ">=4.0.0"
 requests = ">=2.32.3"
-rich = ">=14.0.0"
 screeninfo = {version = ">=0.8.1", markers = "platform_system != \"darwin\""}
 typing-extensions = ">=4.12.2"
 uuid7 = ">=0.1.0"

 [package.extras]
-all = ["agentmail (==0.0.59)", "boto3 (>=1.38.45)", "botocore (>=1.37.23)", "imgcat (>=0.6.0)", "langchain-openai (>=0.3.26)", "oci (>=2.126.4)", "textual (>=3.2.0)"]
+all = ["agentmail (>=0.0.53)", "boto3 (>=1.38.45)", "botocore (>=1.37.23)", "click (>=8.1.8)", "imgcat (>=0.6.0)", "langchain-openai (>=0.3.26)", "rich (>=14.0.0)", "textual (>=3.2.0)"]
 aws = ["boto3 (>=1.38.45)"]
-cli = ["textual (>=3.2.0)"]
-cli-oci = ["oci (>=2.126.4)", "textual (>=3.2.0)"]
-code = ["matplotlib (>=3.9.0)", "numpy (>=2.3.2)", "pandas (>=2.2.0)", "tabulate (>=0.9.0)"]
-eval = ["anyio (>=4.9.0)", "datamodel-code-generator (>=0.26.0)", "lmnr[all] (==0.7.17)", "psutil (>=7.0.0)"]
-examples = ["agentmail (==0.0.59)", "botocore (>=1.37.23)", "imgcat (>=0.6.0)", "langchain-openai (>=0.3.26)"]
-oci = ["oci (>=2.126.4)"]
+cli = ["click (>=8.1.8)", "rich (>=14.0.0)", "textual (>=3.2.0)"]
+eval = ["anyio (>=4.9.0)", "browserbase (==1.4.0)", "datamodel-code-generator (>=0.26.0)", "hyperbrowser (==0.47.0)", "lmnr[all] (==0.7.10)", "psutil (>=7.0.0)"]
+examples = ["agentmail (>=0.0.53)", "botocore (>=1.37.23)", "imgcat (>=0.6.0)", "langchain-openai (>=0.3.26)"]
 video = ["imageio[ffmpeg] (>=2.37.0)", "numpy (>=2.3.2)"]

 [[package]]
@@ -851,14 +842,14 @@ files = [

 [[package]]
 name = "cdp-use"
-version = "1.4.4"
+version = "1.4.3"
 description = "Type safe generator/client library for CDP"
 optional = false
 python-versions = ">=3.11"
 groups = ["main"]
 files = [
-    {file = "cdp_use-1.4.4-py3-none-any.whl", hash = "sha256:e37e80e067db2653d6fdf953d4ff9e5d80d75daa27b7c6d48c0261cccbef73e1"},
-    {file = "cdp_use-1.4.4.tar.gz", hash = "sha256:330a848b517006eb9ad1dc468aa6434d913cf0c6918610760c36c3fdfdba0fab"},
+    {file = "cdp_use-1.4.3-py3-none-any.whl", hash = "sha256:c48664604470c2579aa1e677c3e3e7e24c4f300c54804c093d935abb50479ecd"},
+    {file = "cdp_use-1.4.3.tar.gz", hash = "sha256:9029c04bdc49fbd3939d2bf1988ad8d88e260729c7d5e35c2f6c87591f5a10e9"},
 ]

 [package.dependencies]
@@ -2979,29 +2970,28 @@ testing = ["pytest"]

 [[package]]
 name = "google-genai"
-version = "1.53.0"
+version = "1.32.0"
 description = "GenAI Python SDK"
 optional = false
-python-versions = ">=3.10"
+python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "google_genai-1.53.0-py3-none-any.whl", hash = "sha256:65a3f99e5c03c372d872cda7419f5940e723374bb12a2f3ffd5e3e56e8eb2094"},
-    {file = "google_genai-1.53.0.tar.gz", hash = "sha256:938a26d22f3fd32c6eeeb4276ef204ef82884e63af9842ce3eac05ceb39cbd8d"},
+    {file = "google_genai-1.32.0-py3-none-any.whl", hash = "sha256:c0c4b1d45adf3aa99501050dd73da2f0dea09374002231052d81a6765d15e7f6"},
+    {file = "google_genai-1.32.0.tar.gz", hash = "sha256:349da3f5ff0e981066bd508585fcdd308d28fc4646f318c8f6d1aa6041f4c7e3"},
 ]

 [package.dependencies]
 anyio = ">=4.8.0,<5.0.0"
-google-auth = {version = ">=2.14.1,<3.0.0", extras = ["requests"]}
+google-auth = ">=2.14.1,<3.0.0"
 httpx = ">=0.28.1,<1.0.0"
-pydantic = ">=2.9.0,<3.0.0"
+pydantic = ">=2.0.0,<3.0.0"
 requests = ">=2.28.1,<3.0.0"
 tenacity = ">=8.2.3,<9.2.0"
 typing-extensions = ">=4.11.0,<5.0.0"
 websockets = ">=13.0.0,<15.1.0"

 [package.extras]
-aiohttp = ["aiohttp (<3.13.3)"]
-local-tokenizer = ["protobuf", "sentencepiece (>=0.2.0)"]
+aiohttp = ["aiohttp (<4.0.0)"]

 [[package]]
 name = "google-resumable-media"
@@ -3057,8 +3047,6 @@ files = [
    {file = "greenlet-3.2.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2ca18a03a8cfb5b25bc1cbe20f3d9a4c80d8c3b13ba3df49ac3961af0b1018d"},
    {file = "greenlet-3.2.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9fe0a28a7b952a21e2c062cd5756d34354117796c6d9215a87f55e38d15402c5"},
    {file = "greenlet-3.2.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8854167e06950ca75b898b104b63cc646573aa5fef1353d4508ecdd1ee76254f"},
-    {file = "greenlet-3.2.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f47617f698838ba98f4ff4189aef02e7343952df3a615f847bb575c3feb177a7"},
-    {file = "greenlet-3.2.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af41be48a4f60429d5cad9d22175217805098a9ef7c40bfef44f7669fb9d74d8"},
    {file = "greenlet-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:73f49b5368b5359d04e18d15828eecc1806033db5233397748f4ca813ff1056c"},
    {file = "greenlet-3.2.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:96378df1de302bc38e99c3a9aa311967b7dc80ced1dcc6f171e99842987882a2"},
    {file = "greenlet-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1ee8fae0519a337f2329cb78bd7a8e128ec0f881073d43f023c7b8d4831d5246"},
@@ -3068,8 +3056,6 @@ files = [
    {file = "greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8"},
    {file = "greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52"},
    {file = "greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa"},
-    {file = "greenlet-3.2.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9c6de1940a7d828635fbd254d69db79e54619f165ee7ce32fda763a9cb6a58c"},
-    {file = "greenlet-3.2.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03c5136e7be905045160b1b9fdca93dd6727b180feeafda6818e6496434ed8c5"},
    {file = "greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9"},
    {file = "greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd"},
    {file = "greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb"},
@@ -3079,8 +3065,6 @@ files = [
    {file = "greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0"},
    {file = "greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0"},
    {file = "greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f"},
-    {file = "greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0"},
-    {file = "greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d"},
    {file = "greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02"},
    {file = "greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31"},
    {file = "greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945"},
@@ -3090,8 +3074,6 @@ files = [
    {file = "greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671"},
    {file = "greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b"},
    {file = "greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae"},
-    {file = "greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b"},
-    {file = "greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929"},
    {file = "greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b"},
    {file = "greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0"},
    {file = "greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f"},
@@ -3099,8 +3081,6 @@ files = [
    {file = "greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1"},
    {file = "greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735"},
    {file = "greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337"},
-    {file = "greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269"},
-    {file = "greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681"},
    {file = "greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01"},
    {file = "greenlet-3.2.4-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:b6a7c19cf0d2742d0809a4c05975db036fdff50cd294a93632d6a310bf9ac02c"},
    {file = "greenlet-3.2.4-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:27890167f55d2387576d1f41d9487ef171849ea0359ce1510ca6e06c8bece11d"},
@@ -3110,8 +3090,6 @@ files = [
    {file = "greenlet-3.2.4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9913f1a30e4526f432991f89ae263459b1c64d1608c0d22a5c79c287b3c70df"},
    {file = "greenlet-3.2.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b90654e092f928f110e0007f572007c9727b5265f7632c2fa7415b4689351594"},
    {file = "greenlet-3.2.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:81701fd84f26330f0d5f4944d4e92e61afe6319dcd9775e39396e39d7c3e5f98"},
-    {file = "greenlet-3.2.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:28a3c6b7cd72a96f61b0e4b2a36f681025b60ae4779cc73c1535eb5f29560b10"},
-    {file = "greenlet-3.2.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:52206cd642670b0b320a1fd1cbfd95bca0e043179c1d8a045f2c6109dfe973be"},
    {file = "greenlet-3.2.4-cp39-cp39-win32.whl", hash = "sha256:65458b409c1ed459ea899e939f0e1cdb14f58dbc803f2f93c5eab5694d32671b"},
    {file = "greenlet-3.2.4-cp39-cp39-win_amd64.whl", hash = "sha256:d2e685ade4dafd447ede19c31277a224a239a0a1a4eca4e6390efedf20260cfb"},
    {file = "greenlet-3.2.4.tar.gz", hash = "sha256:0dca0d95ff849f9a364385f36ab49f50065d76964944638be9691e1832e9f86d"},
@@ -3180,87 +3158,83 @@ protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4

 [[package]]
 name = "grpcio"
-version = "1.67.1"
+version = "1.74.0"
 description = "HTTP/2-based RPC framework"
 optional = false
-python-versions = ">=3.8"
+python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "grpcio-1.67.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:8b0341d66a57f8a3119b77ab32207072be60c9bf79760fa609c5609f2deb1f3f"},
-    {file = "grpcio-1.67.1-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:f5a27dddefe0e2357d3e617b9079b4bfdc91341a91565111a21ed6ebbc51b22d"},
-    {file = "grpcio-1.67.1-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:43112046864317498a33bdc4797ae6a268c36345a910de9b9c17159d8346602f"},
-    {file = "grpcio-1.67.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c9b929f13677b10f63124c1a410994a401cdd85214ad83ab67cc077fc7e480f0"},
-    {file = "grpcio-1.67.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7d1797a8a3845437d327145959a2c0c47c05947c9eef5ff1a4c80e499dcc6fa"},
-    {file = "grpcio-1.67.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:0489063974d1452436139501bf6b180f63d4977223ee87488fe36858c5725292"},
-    {file = "grpcio-1.67.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9fd042de4a82e3e7aca44008ee2fb5da01b3e5adb316348c21980f7f58adc311"},
-    {file = "grpcio-1.67.1-cp310-cp310-win32.whl", hash = "sha256:638354e698fd0c6c76b04540a850bf1db27b4d2515a19fcd5cf645c48d3eb1ed"},
-    {file = "grpcio-1.67.1-cp310-cp310-win_amd64.whl", hash = "sha256:608d87d1bdabf9e2868b12338cd38a79969eaf920c89d698ead08f48de9c0f9e"},
-    {file = "grpcio-1.67.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:7818c0454027ae3384235a65210bbf5464bd715450e30a3d40385453a85a70cb"},
-    {file = "grpcio-1.67.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ea33986b70f83844cd00814cee4451055cd8cab36f00ac64a31f5bb09b31919e"},
-    {file = "grpcio-1.67.1-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:c7a01337407dd89005527623a4a72c5c8e2894d22bead0895306b23c6695698f"},
-    {file = "grpcio-1.67.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:80b866f73224b0634f4312a4674c1be21b2b4afa73cb20953cbbb73a6b36c3cc"},
-    {file = "grpcio-1.67.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9fff78ba10d4250bfc07a01bd6254a6d87dc67f9627adece85c0b2ed754fa96"},
-    {file = "grpcio-1.67.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:8a23cbcc5bb11ea7dc6163078be36c065db68d915c24f5faa4f872c573bb400f"},
-    {file = "grpcio-1.67.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1a65b503d008f066e994f34f456e0647e5ceb34cfcec5ad180b1b44020ad4970"},
-    {file = "grpcio-1.67.1-cp311-cp311-win32.whl", hash = "sha256:e29ca27bec8e163dca0c98084040edec3bc49afd10f18b412f483cc68c712744"},
-    {file = "grpcio-1.67.1-cp311-cp311-win_amd64.whl", hash = "sha256:786a5b18544622bfb1e25cc08402bd44ea83edfb04b93798d85dca4d1a0b5be5"},
-    {file = "grpcio-1.67.1-cp312-cp312-linux_armv7l.whl", hash = "sha256:267d1745894200e4c604958da5f856da6293f063327cb049a51fe67348e4f953"},
-    {file = "grpcio-1.67.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:85f69fdc1d28ce7cff8de3f9c67db2b0ca9ba4449644488c1e0303c146135ddb"},
-    {file = "grpcio-1.67.1-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:f26b0b547eb8d00e195274cdfc63ce64c8fc2d3e2d00b12bf468ece41a0423a0"},
-    {file = "grpcio-1.67.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4422581cdc628f77302270ff839a44f4c24fdc57887dc2a45b7e53d8fc2376af"},
-    {file = "grpcio-1.67.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d7616d2ded471231c701489190379e0c311ee0a6c756f3c03e6a62b95a7146e"},
-    {file = "grpcio-1.67.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8a00efecde9d6fcc3ab00c13f816313c040a28450e5e25739c24f432fc6d3c75"},
-    {file = "grpcio-1.67.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:699e964923b70f3101393710793289e42845791ea07565654ada0969522d0a38"},
-    {file = "grpcio-1.67.1-cp312-cp312-win32.whl", hash = "sha256:4e7b904484a634a0fff132958dabdb10d63e0927398273917da3ee103e8d1f78"},
-    {file = "grpcio-1.67.1-cp312-cp312-win_amd64.whl", hash = "sha256:5721e66a594a6c4204458004852719b38f3d5522082be9061d6510b455c90afc"},
-    {file = "grpcio-1.67.1-cp313-cp313-linux_armv7l.whl", hash = "sha256:aa0162e56fd10a5547fac8774c4899fc3e18c1aa4a4759d0ce2cd00d3696ea6b"},
-    {file = "grpcio-1.67.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:beee96c8c0b1a75d556fe57b92b58b4347c77a65781ee2ac749d550f2a365dc1"},
-    {file = "grpcio-1.67.1-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:a93deda571a1bf94ec1f6fcda2872dad3ae538700d94dc283c672a3b508ba3af"},
-    {file = "grpcio-1.67.1-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0e6f255980afef598a9e64a24efce87b625e3e3c80a45162d111a461a9f92955"},
-    {file = "grpcio-1.67.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e838cad2176ebd5d4a8bb03955138d6589ce9e2ce5d51c3ada34396dbd2dba8"},
-    {file = "grpcio-1.67.1-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:a6703916c43b1d468d0756c8077b12017a9fcb6a1ef13faf49e67d20d7ebda62"},
-    {file = "grpcio-1.67.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:917e8d8994eed1d86b907ba2a61b9f0aef27a2155bca6cbb322430fc7135b7bb"},
-    {file = "grpcio-1.67.1-cp313-cp313-win32.whl", hash = "sha256:e279330bef1744040db8fc432becc8a727b84f456ab62b744d3fdb83f327e121"},
-    {file = "grpcio-1.67.1-cp313-cp313-win_amd64.whl", hash = "sha256:fa0c739ad8b1996bd24823950e3cb5152ae91fca1c09cc791190bf1627ffefba"},
-    {file = "grpcio-1.67.1-cp38-cp38-linux_armv7l.whl", hash = "sha256:178f5db771c4f9a9facb2ab37a434c46cb9be1a75e820f187ee3d1e7805c4f65"},
-    {file = "grpcio-1.67.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:0f3e49c738396e93b7ba9016e153eb09e0778e776df6090c1b8c91877cc1c426"},
-    {file = "grpcio-1.67.1-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:24e8a26dbfc5274d7474c27759b54486b8de23c709d76695237515bc8b5baeab"},
-    {file = "grpcio-1.67.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3b6c16489326d79ead41689c4b84bc40d522c9a7617219f4ad94bc7f448c5085"},
-    {file = "grpcio-1.67.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:60e6a4dcf5af7bbc36fd9f81c9f372e8ae580870a9e4b6eafe948cd334b81cf3"},
-    {file = "grpcio-1.67.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:95b5f2b857856ed78d72da93cd7d09b6db8ef30102e5e7fe0961fe4d9f7d48e8"},
-    {file = "grpcio-1.67.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b49359977c6ec9f5d0573ea4e0071ad278ef905aa74e420acc73fd28ce39e9ce"},
-    {file = "grpcio-1.67.1-cp38-cp38-win32.whl", hash = "sha256:f5b76ff64aaac53fede0cc93abf57894ab2a7362986ba22243d06218b93efe46"},
-    {file = "grpcio-1.67.1-cp38-cp38-win_amd64.whl", hash = "sha256:804c6457c3cd3ec04fe6006c739579b8d35c86ae3298ffca8de57b493524b771"},
-    {file = "grpcio-1.67.1-cp39-cp39-linux_armv7l.whl", hash = "sha256:a25bdea92b13ff4d7790962190bf6bf5c4639876e01c0f3dda70fc2769616335"},
-    {file = "grpcio-1.67.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:cdc491ae35a13535fd9196acb5afe1af37c8237df2e54427be3eecda3653127e"},
-    {file = "grpcio-1.67.1-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:85f862069b86a305497e74d0dc43c02de3d1d184fc2c180993aa8aa86fbd19b8"},
-    {file = "grpcio-1.67.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ec74ef02010186185de82cc594058a3ccd8d86821842bbac9873fd4a2cf8be8d"},
-    {file = "grpcio-1.67.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01f616a964e540638af5130469451cf580ba8c7329f45ca998ab66e0c7dcdb04"},
-    {file = "grpcio-1.67.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:299b3d8c4f790c6bcca485f9963b4846dd92cf6f1b65d3697145d005c80f9fe8"},
-    {file = "grpcio-1.67.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:60336bff760fbb47d7e86165408126f1dded184448e9a4c892189eb7c9d3f90f"},
-    {file = "grpcio-1.67.1-cp39-cp39-win32.whl", hash = "sha256:5ed601c4c6008429e3d247ddb367fe8c7259c355757448d7c1ef7bd4a6739e8e"},
-    {file = "grpcio-1.67.1-cp39-cp39-win_amd64.whl", hash = "sha256:5db70d32d6703b89912af16d6d45d78406374a8b8ef0d28140351dd0ec610e98"},
-    {file = "grpcio-1.67.1.tar.gz", hash = "sha256:3dc2ed4cabea4dc14d5e708c2b426205956077cc5de419b4d4079315017e9732"},
+    {file = "grpcio-1.74.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:85bd5cdf4ed7b2d6438871adf6afff9af7096486fcf51818a81b77ef4dd30907"},
+    {file = "grpcio-1.74.0-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:68c8ebcca945efff9d86d8d6d7bfb0841cf0071024417e2d7f45c5e46b5b08eb"},
+    {file = "grpcio-1.74.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:e154d230dc1bbbd78ad2fdc3039fa50ad7ffcf438e4eb2fa30bce223a70c7486"},
+    {file = "grpcio-1.74.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e8978003816c7b9eabe217f88c78bc26adc8f9304bf6a594b02e5a49b2ef9c11"},
+    {file = "grpcio-1.74.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3d7bd6e3929fd2ea7fbc3f562e4987229ead70c9ae5f01501a46701e08f1ad9"},
+    {file = "grpcio-1.74.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:136b53c91ac1d02c8c24201bfdeb56f8b3ac3278668cbb8e0ba49c88069e1bdc"},
+    {file = "grpcio-1.74.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fe0f540750a13fd8e5da4b3eaba91a785eea8dca5ccd2bc2ffe978caa403090e"},
+    {file = "grpcio-1.74.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4e4181bfc24413d1e3a37a0b7889bea68d973d4b45dd2bc68bb766c140718f82"},
+    {file = "grpcio-1.74.0-cp310-cp310-win32.whl", hash = "sha256:1733969040989f7acc3d94c22f55b4a9501a30f6aaacdbccfaba0a3ffb255ab7"},
+    {file = "grpcio-1.74.0-cp310-cp310-win_amd64.whl", hash = "sha256:9e912d3c993a29df6c627459af58975b2e5c897d93287939b9d5065f000249b5"},
+    {file = "grpcio-1.74.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:69e1a8180868a2576f02356565f16635b99088da7df3d45aaa7e24e73a054e31"},
+    {file = "grpcio-1.74.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:8efe72fde5500f47aca1ef59495cb59c885afe04ac89dd11d810f2de87d935d4"},
+    {file = "grpcio-1.74.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:a8f0302f9ac4e9923f98d8e243939a6fb627cd048f5cd38595c97e38020dffce"},
+    {file = "grpcio-1.74.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2f609a39f62a6f6f05c7512746798282546358a37ea93c1fcbadf8b2fed162e3"},
+    {file = "grpcio-1.74.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c98e0b7434a7fa4e3e63f250456eaef52499fba5ae661c58cc5b5477d11e7182"},
+    {file = "grpcio-1.74.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:662456c4513e298db6d7bd9c3b8df6f75f8752f0ba01fb653e252ed4a59b5a5d"},
+    {file = "grpcio-1.74.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3d14e3c4d65e19d8430a4e28ceb71ace4728776fd6c3ce34016947474479683f"},
+    {file = "grpcio-1.74.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1bf949792cee20d2078323a9b02bacbbae002b9e3b9e2433f2741c15bdeba1c4"},
+    {file = "grpcio-1.74.0-cp311-cp311-win32.whl", hash = "sha256:55b453812fa7c7ce2f5c88be3018fb4a490519b6ce80788d5913f3f9d7da8c7b"},
+    {file = "grpcio-1.74.0-cp311-cp311-win_amd64.whl", hash = "sha256:86ad489db097141a907c559988c29718719aa3e13370d40e20506f11b4de0d11"},
+    {file = "grpcio-1.74.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8"},
+    {file = "grpcio-1.74.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6"},
+    {file = "grpcio-1.74.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5"},
+    {file = "grpcio-1.74.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3601274bc0523f6dc07666c0e01682c94472402ac2fd1226fd96e079863bfa49"},
+    {file = "grpcio-1.74.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:176d60a5168d7948539def20b2a3adcce67d72454d9ae05969a2e73f3a0feee7"},
+    {file = "grpcio-1.74.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e759f9e8bc908aaae0412642afe5416c9f983a80499448fcc7fab8692ae044c3"},
+    {file = "grpcio-1.74.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:9e7c4389771855a92934b2846bd807fc25a3dfa820fd912fe6bd8136026b2707"},
+    {file = "grpcio-1.74.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:cce634b10aeab37010449124814b05a62fb5f18928ca878f1bf4750d1f0c815b"},
+    {file = "grpcio-1.74.0-cp312-cp312-win32.whl", hash = "sha256:885912559974df35d92219e2dc98f51a16a48395f37b92865ad45186f294096c"},
+    {file = "grpcio-1.74.0-cp312-cp312-win_amd64.whl", hash = "sha256:42f8fee287427b94be63d916c90399ed310ed10aadbf9e2e5538b3e497d269bc"},
+    {file = "grpcio-1.74.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:2bc2d7d8d184e2362b53905cb1708c84cb16354771c04b490485fa07ce3a1d89"},
+    {file = "grpcio-1.74.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:c14e803037e572c177ba54a3e090d6eb12efd795d49327c5ee2b3bddb836bf01"},
+    {file = "grpcio-1.74.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:f6ec94f0e50eb8fa1744a731088b966427575e40c2944a980049798b127a687e"},
+    {file = "grpcio-1.74.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:566b9395b90cc3d0d0c6404bc8572c7c18786ede549cdb540ae27b58afe0fb91"},
+    {file = "grpcio-1.74.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249"},
+    {file = "grpcio-1.74.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:64229c1e9cea079420527fa8ac45d80fc1e8d3f94deaa35643c381fa8d98f362"},
+    {file = "grpcio-1.74.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:0f87bddd6e27fc776aacf7ebfec367b6d49cad0455123951e4488ea99d9b9b8f"},
+    {file = "grpcio-1.74.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:3b03d8f2a07f0fea8c8f74deb59f8352b770e3900d143b3d1475effcb08eec20"},
+    {file = "grpcio-1.74.0-cp313-cp313-win32.whl", hash = "sha256:b6a73b2ba83e663b2480a90b82fdae6a7aa6427f62bf43b29912c0cfd1aa2bfa"},
+    {file = "grpcio-1.74.0-cp313-cp313-win_amd64.whl", hash = "sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24"},
+    {file = "grpcio-1.74.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:4bc5fca10aaf74779081e16c2bcc3d5ec643ffd528d9e7b1c9039000ead73bae"},
+    {file = "grpcio-1.74.0-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:6bab67d15ad617aff094c382c882e0177637da73cbc5532d52c07b4ee887a87b"},
+    {file = "grpcio-1.74.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:655726919b75ab3c34cdad39da5c530ac6fa32696fb23119e36b64adcfca174a"},
+    {file = "grpcio-1.74.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1a2b06afe2e50ebfd46247ac3ba60cac523f54ec7792ae9ba6073c12daf26f0a"},
+    {file = "grpcio-1.74.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f251c355167b2360537cf17bea2cf0197995e551ab9da6a0a59b3da5e8704f9"},
+    {file = "grpcio-1.74.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:8f7b5882fb50632ab1e48cb3122d6df55b9afabc265582808036b6e51b9fd6b7"},
+    {file = "grpcio-1.74.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:834988b6c34515545b3edd13e902c1acdd9f2465d386ea5143fb558f153a7176"},
+    {file = "grpcio-1.74.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:22b834cef33429ca6cc28303c9c327ba9a3fafecbf62fae17e9a7b7163cc43ac"},
+    {file = "grpcio-1.74.0-cp39-cp39-win32.whl", hash = "sha256:7d95d71ff35291bab3f1c52f52f474c632db26ea12700c2ff0ea0532cb0b5854"},
+    {file = "grpcio-1.74.0-cp39-cp39-win_amd64.whl", hash = "sha256:ecde9ab49f58433abe02f9ed076c7b5be839cf0153883a6d23995937a82392fa"},
+    {file = "grpcio-1.74.0.tar.gz", hash = "sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1"},
 ]

 [package.extras]
-protobuf = ["grpcio-tools (>=1.67.1)"]
+protobuf = ["grpcio-tools (>=1.74.0)"]

 [[package]]
 name = "grpcio-status"
-version = "1.67.1"
+version = "1.71.2"
 description = "Status proto mapping for gRPC"
 optional = false
-python-versions = ">=3.8"
+python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "grpcio_status-1.67.1-py3-none-any.whl", hash = "sha256:16e6c085950bdacac97c779e6a502ea671232385e6e37f258884d6883392c2bd"},
-    {file = "grpcio_status-1.67.1.tar.gz", hash = "sha256:2bf38395e028ceeecfd8866b081f61628114b384da7d51ae064ddc8d766a5d11"},
+    {file = "grpcio_status-1.71.2-py3-none-any.whl", hash = "sha256:803c98cb6a8b7dc6dbb785b1111aed739f241ab5e9da0bba96888aa74704cfd3"},
+    {file = "grpcio_status-1.71.2.tar.gz", hash = "sha256:c7a97e176df71cdc2c179cd1847d7fc86cca5832ad12e9798d7fed6b7a1aab50"},
 ]

 [package.dependencies]
 googleapis-common-protos = ">=1.5.5"
-grpcio = ">=1.67.1"
+grpcio = ">=1.71.2"
 protobuf = ">=5.26.1,<6.0dev"

 [[package]]
@@ -3551,25 +3525,6 @@ files = [
    {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"},
 ]

-[[package]]
-name = "inquirerpy"
-version = "0.3.4"
-description = "Python port of Inquirer.js (A collection of common interactive command-line user interfaces)"
-optional = false
-python-versions = ">=3.7,<4.0"
-groups = ["main"]
-files = [
-    {file = "InquirerPy-0.3.4-py3-none-any.whl", hash = "sha256:c65fdfbac1fa00e3ee4fb10679f4d3ed7a012abf4833910e63c295827fe2a7d4"},
-    {file = "InquirerPy-0.3.4.tar.gz", hash = "sha256:89d2ada0111f337483cb41ae31073108b2ec1e618a49d7110b0d7ade89fc197e"},
-]
-
-[package.dependencies]
-pfzy = ">=0.3.1,<0.4.0"
-prompt-toolkit = ">=3.0.1,<4.0.0"
-
-[package.extras]
-docs = ["Sphinx (>=4.1.2,<5.0.0)", "furo (>=2021.8.17-beta.43,<2022.0.0)", "myst-parser (>=0.15.1,<0.16.0)", "sphinx-autobuild (>=2021.3.14,<2022.0.0)", "sphinx-copybutton (>=0.4.0,<0.5.0)"]
-
 [[package]]
 name = "installer"
 version = "0.7.0"
@@ -4558,39 +4513,42 @@ valkey = ["valkey (>=6)"]

 [[package]]
 name = "litellm"
-version = "1.80.7"
+version = "1.77.7"
 description = "Library to easily interface with LLM API providers"
 optional = false
-python-versions = "<4.0,>=3.9"
+python-versions = ">=3.8.1,<4.0, !=3.9.7"
 groups = ["main"]
-files = [
-    {file = "litellm-1.80.7-py3-none-any.whl", hash = "sha256:f7d993f78c1e0e4e1202b2a925cc6540b55b6e5fb055dd342d88b145ab3102ed"},
-    {file = "litellm-1.80.7.tar.gz", hash = "sha256:3977a8d195aef842d01c18bf9e22984829363c6a4b54daf9a43c9dd9f190b42c"},
-]
+files = []
+develop = false

 [package.dependencies]
 aiohttp = ">=3.10"
 click = "*"
 fastuuid = ">=0.13.0"
-grpcio = ">=1.62.3,<1.68.0"
 httpx = ">=0.23.0"
 importlib-metadata = ">=6.8.0"
-jinja2 = ">=3.1.2,<4.0.0"
-jsonschema = ">=4.22.0,<5.0.0"
-openai = ">=2.8.0"
-pydantic = ">=2.5.0,<3.0.0"
+jinja2 = "^3.1.2"
+jsonschema = "^4.22.0"
+openai = ">=1.99.5"
+pydantic = "^2.5.0"
 python-dotenv = ">=0.2.0"
 tiktoken = ">=0.7.0"
 tokenizers = "*"

 [package.extras]
 caching = ["diskcache (>=5.6.1,<6.0.0)"]
-extra-proxy = ["azure-identity (>=1.15.0,<2.0.0) ; python_version >= \"3.9\"", "azure-keyvault-secrets (>=4.8.0,<5.0.0)", "google-cloud-iam (>=2.19.1,<3.0.0)", "google-cloud-kms (>=2.21.3,<3.0.0)", "prisma (==0.11.0)", "redisvl (>=0.4.1,<0.5.0) ; python_version >= \"3.9\" and python_version < \"3.14\"", "resend (>=0.8.0)"]
+extra-proxy = ["azure-identity (>=1.15.0,<2.0.0)", "azure-keyvault-secrets (>=4.8.0,<5.0.0)", "google-cloud-iam (>=2.19.1,<3.0.0)", "google-cloud-kms (>=2.21.3,<3.0.0)", "prisma (==0.11.0)", "redisvl (>=0.4.1,<0.5.0) ; python_version >= \"3.9\" and python_version < \"3.14\"", "resend (>=0.8.0,<0.9.0)"]
 mlflow = ["mlflow (>3.1.4) ; python_version >= \"3.10\""]
-proxy = ["PyJWT (>=2.10.1,<3.0.0) ; python_version >= \"3.9\"", "apscheduler (>=3.10.4,<4.0.0)", "azure-identity (>=1.15.0,<2.0.0) ; python_version >= \"3.9\"", "azure-storage-blob (>=12.25.1,<13.0.0)", "backoff", "boto3 (==1.36.0)", "cryptography", "fastapi (>=0.120.1)", "fastapi-sso (>=0.16.0,<0.17.0)", "gunicorn (>=23.0.0,<24.0.0)", "litellm-enterprise (==0.1.22)", "litellm-proxy-extras (==0.4.9)", "mcp (>=1.21.2,<2.0.0) ; python_version >= \"3.10\"", "orjson (>=3.9.7,<4.0.0)", "polars (>=1.31.0,<2.0.0) ; python_version >= \"3.10\"", "pynacl (>=1.5.0,<2.0.0)", "python-multipart (>=0.0.18,<0.0.19)", "pyyaml (>=6.0.1,<7.0.0)", "rich (==13.7.1)", "rq", "soundfile (>=0.12.1,<0.13.0)", "uvicorn (>=0.31.1,<0.32.0)", "uvloop (>=0.21.0,<0.22.0) ; sys_platform != \"win32\"", "websockets (>=15.0.1,<16.0.0)"]
-semantic-router = ["semantic-router (>=0.1.12) ; python_version >= \"3.9\" and python_version < \"3.14\""]
+proxy = ["PyJWT (>=2.8.0,<3.0.0)", "apscheduler (>=3.10.4,<4.0.0)", "azure-identity (>=1.15.0,<2.0.0)", "azure-storage-blob (>=12.25.1,<13.0.0)", "backoff", "boto3 (==1.36.0)", "cryptography", "fastapi (>=0.115.5,<0.116.0)", "fastapi-sso (>=0.16.0,<0.17.0)", "gunicorn (>=23.0.0,<24.0.0)", "litellm-enterprise (==0.1.20)", "litellm-proxy-extras (==0.2.25)", "mcp (>=1.10.0,<2.0.0) ; python_version >= \"3.10\"", "orjson (>=3.9.7,<4.0.0)", "polars (>=1.31.0,<2.0.0) ; python_version >= \"3.10\"", "pynacl (>=1.5.0,<2.0.0)", "python-multipart (>=0.0.18,<0.0.19)", "pyyaml (>=6.0.1,<7.0.0)", "rich (==13.7.1)", "rq", "uvicorn (>=0.29.0,<0.30.0)", "uvloop (>=0.21.0,<0.22.0) ; sys_platform != \"win32\"", "websockets (>=13.1.0,<14.0.0)"]
+semantic-router = ["semantic-router ; python_version >= \"3.9\""]
 utils = ["numpydoc"]

+[package.source]
+type = "git"
+url = "https://github.com/BerriAI/litellm.git"
+reference = "v1.77.7.dev9"
+resolved_reference = "763d2f8ccdd8412dbe6d4ac0e136d9ac34dcd4c0"
+
 [[package]]
 name = "llvmlite"
 version = "0.44.0"
@@ -4622,63 +4580,6 @@ files = [
    {file = "llvmlite-0.44.0.tar.gz", hash = "sha256:07667d66a5d150abed9157ab6c0b9393c9356f229784a4385c02f99e94fc94d4"},
 ]

-[[package]]
-name = "lmnr"
-version = "0.7.24"
-description = "Python SDK for Laminar"
-optional = false
-python-versions = "<4,>=3.10"
-groups = ["main"]
-files = [
-    {file = "lmnr-0.7.24-py3-none-any.whl", hash = "sha256:ad780d4a62ece897048811f3368639c240a9329ab31027da8c96545137a3a08a"},
-    {file = "lmnr-0.7.24.tar.gz", hash = "sha256:aa6973f46fc4ba95c9061c1feceb58afc02eb43c9376c21e32545371ff6123d7"},
-]
-
-[package.dependencies]
-grpcio = ">=1"
-httpx = ">=0.24.0"
-opentelemetry-api = ">=1.33.0"
-opentelemetry-exporter-otlp-proto-grpc = ">=1.33.0"
-opentelemetry-exporter-otlp-proto-http = ">=1.33.0"
-opentelemetry-instrumentation = ">=0.54b0"
-opentelemetry-instrumentation-threading = ">=0.57b0"
-opentelemetry-sdk = ">=1.33.0"
-opentelemetry-semantic-conventions = ">=0.54b0"
-opentelemetry-semantic-conventions-ai = ">=0.4.13"
-orjson = ">=3.0.0"
-packaging = ">=22.0"
-pydantic = ">=2.0.3,<3.0.0"
-python-dotenv = ">=1.0"
-tenacity = ">=8.0"
-tqdm = ">=4.0"
-
-[package.extras]
-alephalpha = ["opentelemetry-instrumentation-alephalpha (>=0.47.1)"]
-all = ["opentelemetry-instrumentation-alephalpha (>=0.47.1)", "opentelemetry-instrumentation-bedrock (>=0.47.1)", "opentelemetry-instrumentation-chromadb (>=0.47.1)", "opentelemetry-instrumentation-cohere (>=0.47.1)", "opentelemetry-instrumentation-crewai (>=0.47.1)", "opentelemetry-instrumentation-haystack (>=0.47.1)", "opentelemetry-instrumentation-lancedb (>=0.47.1)", "opentelemetry-instrumentation-langchain (>=0.47.1,<0.48.0)", "opentelemetry-instrumentation-llamaindex (>=0.47.1)", "opentelemetry-instrumentation-marqo (>=0.47.1)", "opentelemetry-instrumentation-mcp (>=0.47.1)", "opentelemetry-instrumentation-milvus (>=0.47.1)", "opentelemetry-instrumentation-mistralai (>=0.47.1)", "opentelemetry-instrumentation-ollama (>=0.47.1)", "opentelemetry-instrumentation-pinecone (>=0.47.1)", "opentelemetry-instrumentation-qdrant (>=0.47.1)", "opentelemetry-instrumentation-replicate (>=0.47.1)", "opentelemetry-instrumentation-sagemaker (>=0.47.1)", "opentelemetry-instrumentation-together (>=0.47.1)", "opentelemetry-instrumentation-transformers (>=0.47.1)", "opentelemetry-instrumentation-vertexai (>=0.47.1)", "opentelemetry-instrumentation-watsonx (>=0.47.1)", "opentelemetry-instrumentation-weaviate (>=0.47.1)"]
-bedrock = ["opentelemetry-instrumentation-bedrock (>=0.47.1)"]
-chromadb = ["opentelemetry-instrumentation-chromadb (>=0.47.1)"]
-claude-agent-sdk = ["lmnr-claude-code-proxy (>=0.1.0a5)"]
-cohere = ["opentelemetry-instrumentation-cohere (>=0.47.1)"]
-crewai = ["opentelemetry-instrumentation-crewai (>=0.47.1)"]
-haystack = ["opentelemetry-instrumentation-haystack (>=0.47.1)"]
-lancedb = ["opentelemetry-instrumentation-lancedb (>=0.47.1)"]
-langchain = ["opentelemetry-instrumentation-langchain (>=0.47.1,<0.48.0)"]
-llamaindex = ["opentelemetry-instrumentation-llamaindex (>=0.47.1)"]
-marqo = ["opentelemetry-instrumentation-marqo (>=0.47.1)"]
-mcp = ["opentelemetry-instrumentation-mcp (>=0.47.1)"]
-milvus = ["opentelemetry-instrumentation-milvus (>=0.47.1)"]
-mistralai = ["opentelemetry-instrumentation-mistralai (>=0.47.1)"]
-ollama = ["opentelemetry-instrumentation-ollama (>=0.47.1)"]
-pinecone = ["opentelemetry-instrumentation-pinecone (>=0.47.1)"]
-qdrant = ["opentelemetry-instrumentation-qdrant (>=0.47.1)"]
-replicate = ["opentelemetry-instrumentation-replicate (>=0.47.1)"]
-sagemaker = ["opentelemetry-instrumentation-sagemaker (>=0.47.1)"]
-together = ["opentelemetry-instrumentation-together (>=0.47.1)"]
-transformers = ["opentelemetry-instrumentation-transformers (>=0.47.1)"]
-vertexai = ["opentelemetry-instrumentation-vertexai (>=0.47.1)"]
-watsonx = ["opentelemetry-instrumentation-watsonx (>=0.47.1)"]
-weaviate = ["opentelemetry-instrumentation-weaviate (>=0.47.1)"]
-
 [[package]]
 name = "lxml"
 version = "6.0.1"
@@ -5660,28 +5561,28 @@ pydantic = ">=2.9"

 [[package]]
 name = "openai"
-version = "2.8.0"
+version = "1.99.9"
 description = "The official Python library for the openai API"
 optional = false
-python-versions = ">=3.9"
+python-versions = ">=3.8"
 groups = ["main", "test"]
 files = [
-    {file = "openai-2.8.0-py3-none-any.whl", hash = "sha256:ba975e347f6add2fe13529ccb94d54a578280e960765e5224c34b08d7e029ddf"},
-    {file = "openai-2.8.0.tar.gz", hash = "sha256:4851908f6d6fcacbd47ba659c5ac084f7725b752b6bfa1e948b6fbfc111a6bad"},
+    {file = "openai-1.99.9-py3-none-any.whl", hash = "sha256:9dbcdb425553bae1ac5d947147bebbd630d91bbfc7788394d4c4f3a35682ab3a"},
+    {file = "openai-1.99.9.tar.gz", hash = "sha256:f2082d155b1ad22e83247c3de3958eb4255b20ccf4a1de2e6681b6957b554e92"},
 ]

 [package.dependencies]
 anyio = ">=3.5.0,<5"
 distro = ">=1.7.0,<2"
 httpx = ">=0.23.0,<1"
-jiter = ">=0.10.0,<1"
+jiter = ">=0.4.0,<1"
 pydantic = ">=1.9.0,<3"
 sniffio = "*"
 tqdm = ">4"
 typing-extensions = ">=4.11,<5"

 [package.extras]
-aiohttp = ["aiohttp", "httpx-aiohttp (>=0.1.9)"]
+aiohttp = ["aiohttp", "httpx-aiohttp (>=0.1.8)"]
 datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
 realtime = ["websockets (>=13,<16)"]
 voice-helpers = ["numpy (>=2.0.2)", "sounddevice (>=0.5.1)"]
@@ -5836,31 +5737,35 @@ llama = ["llama-index (>=0.12.29,<0.13.0)", "llama-index-core (>=0.12.29,<0.13.0

 [[package]]
 name = "openhands-agent-server"
-version = "1.6.0"
+version = "1.0.0a4"
 description = "OpenHands Agent Server - REST/WebSocket interface for OpenHands AI Agent"
 optional = false
 python-versions = ">=3.12"
 groups = ["main"]
-files = [
-    {file = "openhands_agent_server-1.6.0-py3-none-any.whl", hash = "sha256:e6ae865ac3e7a96b234e10a0faad23f6210e025bbf7721cb66bc7a71d160848c"},
-    {file = "openhands_agent_server-1.6.0.tar.gz", hash = "sha256:44ce7694ae2d4bb0666d318ef13e6618bd4dc73022c60354839fe6130e67d02a"},
-]
+files = []
+develop = false

 [package.dependencies]
 aiosqlite = ">=0.19"
 alembic = ">=1.13"
 docker = ">=7.1,<8"
 fastapi = ">=0.104"
-openhands-sdk = "*"
 pydantic = ">=2"
 sqlalchemy = ">=2"
 uvicorn = ">=0.31.1"
 websockets = ">=12"
 wsproto = ">=1.2.0"

+[package.source]
+type = "git"
+url = "https://github.com/OpenHands/agent-sdk.git"
+reference = "ce0a71af55dfce101f7419fbdb0116178f01e109"
+resolved_reference = "ce0a71af55dfce101f7419fbdb0116178f01e109"
+subdirectory = "openhands-agent-server"
+
 [[package]]
 name = "openhands-ai"
-version = "0.0.0-post.5687+7853b41ad"
+version = "0.59.0"
 description = "OpenHands: Code Less, Make More"
 optional = false
 python-versions = "^3.12,<3.14"
@@ -5877,7 +5782,6 @@ bashlex = "^0.18"
 boto3 = "*"
 browsergym-core = "0.13.3"
 deprecated = "*"
-deprecation = "^2.1.0"
 dirhash = "*"
 docker = "*"
 fastapi = "*"
@@ -5896,15 +5800,14 @@ json-repair = "*"
 jupyter_kernel_gateway = "*"
 kubernetes = "^33.1.0"
 libtmux = ">=0.46.2"
-litellm = ">=1.74.3, <=1.80.7, !=1.64.4, !=1.67.*"
-lmnr = "^0.7.20"
+litellm = ">=1.74.3, <1.78.0, !=1.64.4, !=1.67.*"
 memory-profiler = "^0.61.0"
 numpy = "*"
-openai = "2.8.0"
+openai = "1.99.9"
 openhands-aci = "0.3.2"
-openhands-agent-server = "1.6.0"
-openhands-sdk = "1.6.0"
-openhands-tools = "1.6.0"
+openhands-agent-server = {git = "https://github.com/OpenHands/agent-sdk.git", rev = "ce0a71af55dfce101f7419fbdb0116178f01e109", subdirectory = "openhands-agent-server"}
+openhands-sdk = {git = "https://github.com/OpenHands/agent-sdk.git", rev = "ce0a71af55dfce101f7419fbdb0116178f01e109", subdirectory = "openhands-sdk"}
+openhands-tools = {git = "https://github.com/OpenHands/agent-sdk.git", rev = "ce0a71af55dfce101f7419fbdb0116178f01e109", subdirectory = "openhands-tools"}
 opentelemetry-api = "^1.33.1"
 opentelemetry-exporter-otlp-proto-grpc = "^1.33.1"
 pathspec = "^0.12.1"
@@ -5960,22 +5863,18 @@ url = ".."

 [[package]]
 name = "openhands-sdk"
-version = "1.6.0"
+version = "1.0.0a4"
 description = "OpenHands SDK - Core functionality for building AI agents"
 optional = false
 python-versions = ">=3.12"
 groups = ["main"]
-files = [
-    {file = "openhands_sdk-1.6.0-py3-none-any.whl", hash = "sha256:94d2f87fb35406373da6728ae2d88584137f9e9b67fa0e940444c72f2e44e7d3"},
-    {file = "openhands_sdk-1.6.0.tar.gz", hash = "sha256:f45742350e3874a7f5b08befc4a9d5adc7e4454f7ab5f8391c519eee3116090f"},
-]
+files = []
+develop = false

 [package.dependencies]
-deprecation = ">=2.1.0"
 fastmcp = ">=2.11.3"
 httpx = ">=0.27.0"
-litellm = ">=1.80.7"
-lmnr = ">=0.7.24"
+litellm = ">=1.77.7.dev9"
 pydantic = ">=2.11.7"
 python-frontmatter = ">=1.1.0"
 python-json-logger = ">=3.3.0"
@@ -5985,28 +5884,39 @@ websockets = ">=12"
 [package.extras]
 boto3 = ["boto3 (>=1.35.0)"]

+[package.source]
+type = "git"
+url = "https://github.com/OpenHands/agent-sdk.git"
+reference = "ce0a71af55dfce101f7419fbdb0116178f01e109"
+resolved_reference = "ce0a71af55dfce101f7419fbdb0116178f01e109"
+subdirectory = "openhands-sdk"
+
 [[package]]
 name = "openhands-tools"
-version = "1.6.0"
+version = "1.0.0a4"
 description = "OpenHands Tools - Runtime tools for AI agents"
 optional = false
 python-versions = ">=3.12"
 groups = ["main"]
-files = [
-    {file = "openhands_tools-1.6.0-py3-none-any.whl", hash = "sha256:176556d44186536751b23fe052d3505492cc2afb8d52db20fb7a2cc0169cd57a"},
-    {file = "openhands_tools-1.6.0.tar.gz", hash = "sha256:d07ba31050fd4a7891a4c48388aa53ce9f703e17064ddbd59146d6c77e5980b3"},
-]
+files = []
+develop = false

 [package.dependencies]
 bashlex = ">=0.18"
 binaryornot = ">=0.4.4"
-browser-use = ">=0.8.0"
+browser-use = ">=0.7.7"
 cachetools = "*"
 func-timeout = ">=4.3.5"
 libtmux = ">=0.46.2"
 openhands-sdk = "*"
 pydantic = ">=2.11.7"
-tom-swe = ">=1.0.3"
+
+[package.source]
+type = "git"
+url = "https://github.com/OpenHands/agent-sdk.git"
+reference = "ce0a71af55dfce101f7419fbdb0116178f01e109"
+resolved_reference = "ce0a71af55dfce101f7419fbdb0116178f01e109"
+subdirectory = "openhands-tools"

 [[package]]
 name = "openpyxl"
@@ -6078,62 +5988,6 @@ opentelemetry-proto = "1.36.0"
 opentelemetry-sdk = ">=1.36.0,<1.37.0"
 typing-extensions = ">=4.6.0"

-[[package]]
-name = "opentelemetry-exporter-otlp-proto-http"
-version = "1.36.0"
-description = "OpenTelemetry Collector Protobuf over HTTP Exporter"
-optional = false
-python-versions = ">=3.9"
-groups = ["main"]
-files = [
-    {file = "opentelemetry_exporter_otlp_proto_http-1.36.0-py3-none-any.whl", hash = "sha256:3d769f68e2267e7abe4527f70deb6f598f40be3ea34c6adc35789bea94a32902"},
-    {file = "opentelemetry_exporter_otlp_proto_http-1.36.0.tar.gz", hash = "sha256:dd3637f72f774b9fc9608ab1ac479f8b44d09b6fb5b2f3df68a24ad1da7d356e"},
-]
-
-[package.dependencies]
-googleapis-common-protos = ">=1.52,<2.0"
-opentelemetry-api = ">=1.15,<2.0"
-opentelemetry-exporter-otlp-proto-common = "1.36.0"
-opentelemetry-proto = "1.36.0"
-opentelemetry-sdk = ">=1.36.0,<1.37.0"
-requests = ">=2.7,<3.0"
-typing-extensions = ">=4.5.0"
-
-[[package]]
-name = "opentelemetry-instrumentation"
-version = "0.57b0"
-description = "Instrumentation Tools & Auto Instrumentation for OpenTelemetry Python"
-optional = false
-python-versions = ">=3.9"
-groups = ["main"]
-files = [
-    {file = "opentelemetry_instrumentation-0.57b0-py3-none-any.whl", hash = "sha256:9109280f44882e07cec2850db28210b90600ae9110b42824d196de357cbddf7e"},
-    {file = "opentelemetry_instrumentation-0.57b0.tar.gz", hash = "sha256:f2a30135ba77cdea2b0e1df272f4163c154e978f57214795d72f40befd4fcf05"},
-]
-
-[package.dependencies]
-opentelemetry-api = ">=1.4,<2.0"
-opentelemetry-semantic-conventions = "0.57b0"
-packaging = ">=18.0"
-wrapt = ">=1.0.0,<2.0.0"
-
-[[package]]
-name = "opentelemetry-instrumentation-threading"
-version = "0.57b0"
-description = "Thread context propagation support for OpenTelemetry"
-optional = false
-python-versions = ">=3.9"
-groups = ["main"]
-files = [
-    {file = "opentelemetry_instrumentation_threading-0.57b0-py3-none-any.whl", hash = "sha256:adfd64857c8c78d6111cf80552311e1713bad64272dd81abdd61f07b892a161b"},
-    {file = "opentelemetry_instrumentation_threading-0.57b0.tar.gz", hash = "sha256:06fa4c98d6bfe4670e7532497670ac202db42afa647ff770aedce0e422421c6e"},
-]
-
-[package.dependencies]
-opentelemetry-api = ">=1.12,<2.0"
-opentelemetry-instrumentation = "0.57b0"
-wrapt = ">=1.0.0,<2.0.0"
-
 [[package]]
 name = "opentelemetry-proto"
 version = "1.36.0"
@@ -6182,115 +6036,6 @@ files = [
 opentelemetry-api = "1.36.0"
 typing-extensions = ">=4.5.0"

-[[package]]
-name = "opentelemetry-semantic-conventions-ai"
-version = "0.4.13"
-description = "OpenTelemetry Semantic Conventions Extension for Large Language Models"
-optional = false
-python-versions = "<4,>=3.9"
-groups = ["main"]
-files = [
-    {file = "opentelemetry_semantic_conventions_ai-0.4.13-py3-none-any.whl", hash = "sha256:883a30a6bb5deaec0d646912b5f9f6dcbb9f6f72557b73d0f2560bf25d13e2d5"},
-    {file = "opentelemetry_semantic_conventions_ai-0.4.13.tar.gz", hash = "sha256:94efa9fb4ffac18c45f54a3a338ffeb7eedb7e1bb4d147786e77202e159f0036"},
-]
-
-[[package]]
-name = "orjson"
-version = "3.11.4"
-description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy"
-optional = false
-python-versions = ">=3.9"
-groups = ["main"]
-files = [
-    {file = "orjson-3.11.4-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:e3aa2118a3ece0d25489cbe48498de8a5d580e42e8d9979f65bf47900a15aba1"},
-    {file = "orjson-3.11.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a69ab657a4e6733133a3dca82768f2f8b884043714e8d2b9ba9f52b6efef5c44"},
-    {file = "orjson-3.11.4-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3740bffd9816fc0326ddc406098a3a8f387e42223f5f455f2a02a9f834ead80c"},
-    {file = "orjson-3.11.4-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:65fd2f5730b1bf7f350c6dc896173d3460d235c4be007af73986d7cd9a2acd23"},
-    {file = "orjson-3.11.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fdc3ae730541086158d549c97852e2eea6820665d4faf0f41bf99df41bc11ea"},
-    {file = "orjson-3.11.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e10b4d65901da88845516ce9f7f9736f9638d19a1d483b3883dc0182e6e5edba"},
-    {file = "orjson-3.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb6a03a678085f64b97f9d4a9ae69376ce91a3a9e9b56a82b1580d8e1d501aff"},
-    {file = "orjson-3.11.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:2c82e4f0b1c712477317434761fbc28b044c838b6b1240d895607441412371ac"},
-    {file = "orjson-3.11.4-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:d58c166a18f44cc9e2bad03a327dc2d1a3d2e85b847133cfbafd6bfc6719bd79"},
-    {file = "orjson-3.11.4-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:94f206766bf1ea30e1382e4890f763bd1eefddc580e08fec1ccdc20ddd95c827"},
-    {file = "orjson-3.11.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:41bf25fb39a34cf8edb4398818523277ee7096689db352036a9e8437f2f3ee6b"},
-    {file = "orjson-3.11.4-cp310-cp310-win32.whl", hash = "sha256:fa9627eba4e82f99ca6d29bc967f09aba446ee2b5a1ea728949ede73d313f5d3"},
-    {file = "orjson-3.11.4-cp310-cp310-win_amd64.whl", hash = "sha256:23ef7abc7fca96632d8174ac115e668c1e931b8fe4dde586e92a500bf1914dcc"},
-    {file = "orjson-3.11.4-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:5e59d23cd93ada23ec59a96f215139753fbfe3a4d989549bcb390f8c00370b39"},
-    {file = "orjson-3.11.4-cp311-cp311-macosx_15_0_arm64.whl", hash = "sha256:5c3aedecfc1beb988c27c79d52ebefab93b6c3921dbec361167e6559aba2d36d"},
-    {file = "orjson-3.11.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da9e5301f1c2caa2a9a4a303480d79c9ad73560b2e7761de742ab39fe59d9175"},
-    {file = "orjson-3.11.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8873812c164a90a79f65368f8f96817e59e35d0cc02786a5356f0e2abed78040"},
-    {file = "orjson-3.11.4-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5d7feb0741ebb15204e748f26c9638e6665a5fa93c37a2c73d64f1669b0ddc63"},
-    {file = "orjson-3.11.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:01ee5487fefee21e6910da4c2ee9eef005bee568a0879834df86f888d2ffbdd9"},
-    {file = "orjson-3.11.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d40d46f348c0321df01507f92b95a377240c4ec31985225a6668f10e2676f9a"},
-    {file = "orjson-3.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95713e5fc8af84d8edc75b785d2386f653b63d62b16d681687746734b4dfc0be"},
-    {file = "orjson-3.11.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ad73ede24f9083614d6c4ca9a85fe70e33be7bf047ec586ee2363bc7418fe4d7"},
-    {file = "orjson-3.11.4-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:842289889de515421f3f224ef9c1f1efb199a32d76d8d2ca2706fa8afe749549"},
-    {file = "orjson-3.11.4-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:3b2427ed5791619851c52a1261b45c233930977e7de8cf36de05636c708fa905"},
-    {file = "orjson-3.11.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3c36e524af1d29982e9b190573677ea02781456b2e537d5840e4538a5ec41907"},
-    {file = "orjson-3.11.4-cp311-cp311-win32.whl", hash = "sha256:87255b88756eab4a68ec61837ca754e5d10fa8bc47dc57f75cedfeaec358d54c"},
-    {file = "orjson-3.11.4-cp311-cp311-win_amd64.whl", hash = "sha256:e2d5d5d798aba9a0e1fede8d853fa899ce2cb930ec0857365f700dffc2c7af6a"},
-    {file = "orjson-3.11.4-cp311-cp311-win_arm64.whl", hash = "sha256:6bb6bb41b14c95d4f2702bce9975fda4516f1db48e500102fc4d8119032ff045"},
-    {file = "orjson-3.11.4-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:d4371de39319d05d3f482f372720b841c841b52f5385bd99c61ed69d55d9ab50"},
-    {file = "orjson-3.11.4-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:e41fd3b3cac850eaae78232f37325ed7d7436e11c471246b87b2cd294ec94853"},
-    {file = "orjson-3.11.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:600e0e9ca042878c7fdf189cf1b028fe2c1418cc9195f6cb9824eb6ed99cb938"},
-    {file = "orjson-3.11.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7bbf9b333f1568ef5da42bc96e18bf30fd7f8d54e9ae066d711056add508e415"},
-    {file = "orjson-3.11.4-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4806363144bb6e7297b8e95870e78d30a649fdc4e23fc84daa80c8ebd366ce44"},
-    {file = "orjson-3.11.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad355e8308493f527d41154e9053b86a5be892b3b359a5c6d5d95cda23601cb2"},
-    {file = "orjson-3.11.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c8a7517482667fb9f0ff1b2f16fe5829296ed7a655d04d68cd9711a4d8a4e708"},
-    {file = "orjson-3.11.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:97eb5942c7395a171cbfecc4ef6701fc3c403e762194683772df4c54cfbb2210"},
-    {file = "orjson-3.11.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:149d95d5e018bdd822e3f38c103b1a7c91f88d38a88aada5c4e9b3a73a244241"},
-    {file = "orjson-3.11.4-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:624f3951181eb46fc47dea3d221554e98784c823e7069edb5dbd0dc826ac909b"},
-    {file = "orjson-3.11.4-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:03bfa548cf35e3f8b3a96c4e8e41f753c686ff3d8e182ce275b1751deddab58c"},
-    {file = "orjson-3.11.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:525021896afef44a68148f6ed8a8bf8375553d6066c7f48537657f64823565b9"},
-    {file = "orjson-3.11.4-cp312-cp312-win32.whl", hash = "sha256:b58430396687ce0f7d9eeb3dd47761ca7d8fda8e9eb92b3077a7a353a75efefa"},
-    {file = "orjson-3.11.4-cp312-cp312-win_amd64.whl", hash = "sha256:c6dbf422894e1e3c80a177133c0dda260f81428f9de16d61041949f6a2e5c140"},
-    {file = "orjson-3.11.4-cp312-cp312-win_arm64.whl", hash = "sha256:d38d2bc06d6415852224fcc9c0bfa834c25431e466dc319f0edd56cca81aa96e"},
-    {file = "orjson-3.11.4-cp313-cp313-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:2d6737d0e616a6e053c8b4acc9eccea6b6cce078533666f32d140e4f85002534"},
-    {file = "orjson-3.11.4-cp313-cp313-macosx_15_0_arm64.whl", hash = "sha256:afb14052690aa328cc118a8e09f07c651d301a72e44920b887c519b313d892ff"},
-    {file = "orjson-3.11.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38aa9e65c591febb1b0aed8da4d469eba239d434c218562df179885c94e1a3ad"},
-    {file = "orjson-3.11.4-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f2cf4dfaf9163b0728d061bebc1e08631875c51cd30bf47cb9e3293bfbd7dcd5"},
-    {file = "orjson-3.11.4-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:89216ff3dfdde0e4070932e126320a1752c9d9a758d6a32ec54b3b9334991a6a"},
-    {file = "orjson-3.11.4-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9daa26ca8e97fae0ce8aa5d80606ef8f7914e9b129b6b5df9104266f764ce436"},
-    {file = "orjson-3.11.4-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c8b2769dc31883c44a9cd126560327767f848eb95f99c36c9932f51090bfce9"},
-    {file = "orjson-3.11.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1469d254b9884f984026bd9b0fa5bbab477a4bfe558bba6848086f6d43eb5e73"},
-    {file = "orjson-3.11.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:68e44722541983614e37117209a194e8c3ad07838ccb3127d96863c95ec7f1e0"},
-    {file = "orjson-3.11.4-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:8e7805fda9672c12be2f22ae124dcd7b03928d6c197544fe12174b86553f3196"},
-    {file = "orjson-3.11.4-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:04b69c14615fb4434ab867bf6f38b2d649f6f300af30a6705397e895f7aec67a"},
-    {file = "orjson-3.11.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:639c3735b8ae7f970066930e58cf0ed39a852d417c24acd4a25fc0b3da3c39a6"},
-    {file = "orjson-3.11.4-cp313-cp313-win32.whl", hash = "sha256:6c13879c0d2964335491463302a6ca5ad98105fc5db3565499dcb80b1b4bd839"},
-    {file = "orjson-3.11.4-cp313-cp313-win_amd64.whl", hash = "sha256:09bf242a4af98732db9f9a1ec57ca2604848e16f132e3f72edfd3c5c96de009a"},
-    {file = "orjson-3.11.4-cp313-cp313-win_arm64.whl", hash = "sha256:a85f0adf63319d6c1ba06fb0dbf997fced64a01179cf17939a6caca662bf92de"},
-    {file = "orjson-3.11.4-cp314-cp314-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:42d43a1f552be1a112af0b21c10a5f553983c2a0938d2bbb8ecd8bc9fb572803"},
-    {file = "orjson-3.11.4-cp314-cp314-macosx_15_0_arm64.whl", hash = "sha256:26a20f3fbc6c7ff2cb8e89c4c5897762c9d88cf37330c6a117312365d6781d54"},
-    {file = "orjson-3.11.4-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e3f20be9048941c7ffa8fc523ccbd17f82e24df1549d1d1fe9317712d19938e"},
-    {file = "orjson-3.11.4-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:aac364c758dc87a52e68e349924d7e4ded348dedff553889e4d9f22f74785316"},
-    {file = "orjson-3.11.4-cp314-cp314-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d5c54a6d76e3d741dcc3f2707f8eeb9ba2a791d3adbf18f900219b62942803b1"},
-    {file = "orjson-3.11.4-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f28485bdca8617b79d44627f5fb04336897041dfd9fa66d383a49d09d86798bc"},
-    {file = "orjson-3.11.4-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bfc2a484cad3585e4ba61985a6062a4c2ed5c7925db6d39f1fa267c9d166487f"},
-    {file = "orjson-3.11.4-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e34dbd508cb91c54f9c9788923daca129fe5b55c5b4eebe713bf5ed3791280cf"},
-    {file = "orjson-3.11.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b13c478fa413d4b4ee606ec8e11c3b2e52683a640b006bb586b3041c2ca5f606"},
-    {file = "orjson-3.11.4-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:724ca721ecc8a831b319dcd72cfa370cc380db0bf94537f08f7edd0a7d4e1780"},
-    {file = "orjson-3.11.4-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:977c393f2e44845ce1b540e19a786e9643221b3323dae190668a98672d43fb23"},
-    {file = "orjson-3.11.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:1e539e382cf46edec157ad66b0b0872a90d829a6b71f17cb633d6c160a223155"},
-    {file = "orjson-3.11.4-cp314-cp314-win32.whl", hash = "sha256:d63076d625babab9db5e7836118bdfa086e60f37d8a174194ae720161eb12394"},
-    {file = "orjson-3.11.4-cp314-cp314-win_amd64.whl", hash = "sha256:0a54d6635fa3aaa438ae32e8570b9f0de36f3f6562c308d2a2a452e8b0592db1"},
-    {file = "orjson-3.11.4-cp314-cp314-win_arm64.whl", hash = "sha256:78b999999039db3cf58f6d230f524f04f75f129ba3d1ca2ed121f8657e575d3d"},
-    {file = "orjson-3.11.4-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:405261b0a8c62bcbd8e2931c26fdc08714faf7025f45531541e2b29e544b545b"},
-    {file = "orjson-3.11.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:af02ff34059ee9199a3546f123a6ab4c86caf1708c79042caf0820dc290a6d4f"},
-    {file = "orjson-3.11.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0b2eba969ea4203c177c7b38b36c69519e6067ee68c34dc37081fac74c796e10"},
-    {file = "orjson-3.11.4-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0baa0ea43cfa5b008a28d3c07705cf3ada40e5d347f0f44994a64b1b7b4b5350"},
-    {file = "orjson-3.11.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80fd082f5dcc0e94657c144f1b2a3a6479c44ad50be216cf0c244e567f5eae19"},
-    {file = "orjson-3.11.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1e3704d35e47d5bee811fb1cbd8599f0b4009b14d451c4c57be5a7e25eb89a13"},
-    {file = "orjson-3.11.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:caa447f2b5356779d914658519c874cf3b7629e99e63391ed519c28c8aea4919"},
-    {file = "orjson-3.11.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:bba5118143373a86f91dadb8df41d9457498226698ebdf8e11cbb54d5b0e802d"},
-    {file = "orjson-3.11.4-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:622463ab81d19ef3e06868b576551587de8e4d518892d1afab71e0fbc1f9cffc"},
-    {file = "orjson-3.11.4-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:3e0a700c4b82144b72946b6629968df9762552ee1344bfdb767fecdd634fbd5a"},
-    {file = "orjson-3.11.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6e18a5c15e764e5f3fc569b47872450b4bcea24f2a6354c0a0e95ad21045d5a9"},
-    {file = "orjson-3.11.4-cp39-cp39-win32.whl", hash = "sha256:fb1c37c71cad991ef4d89c7a634b5ffb4447dbd7ae3ae13e8f5ee7f1775e7ab1"},
-    {file = "orjson-3.11.4-cp39-cp39-win_amd64.whl", hash = "sha256:e2985ce8b8c42d00492d0ed79f2bd2b6460d00f2fa671dfde4bf2e02f49bf5c6"},
-    {file = "orjson-3.11.4.tar.gz", hash = "sha256:39485f4ab4c9b30a3943cfe99e1a213c4776fb69e8abd68f66b83d5a0b0fdc6d"},
-]
-
 [[package]]
 name = "packaging"
 version = "25.0"
@@ -6507,21 +6252,6 @@ files = [
 [package.dependencies]
 ptyprocess = ">=0.5"

-[[package]]
-name = "pfzy"
-version = "0.3.4"
-description = "Python port of the fzy fuzzy string matching algorithm"
-optional = false
-python-versions = ">=3.7,<4.0"
-groups = ["main"]
-files = [
-    {file = "pfzy-0.3.4-py3-none-any.whl", hash = "sha256:5f50d5b2b3207fa72e7ec0ef08372ef652685470974a107d0d4999fc5a903a96"},
-    {file = "pfzy-0.3.4.tar.gz", hash = "sha256:717ea765dd10b63618e7298b2d98efd819e0b30cd5905c9707223dceeb94b3f1"},
-]
-
-[package.extras]
-docs = ["Sphinx (>=4.1.2,<5.0.0)", "furo (>=2021.8.17-beta.43,<2022.0.0)", "myst-parser (>=0.15.1,<0.16.0)", "sphinx-autobuild (>=2021.3.14,<2022.0.0)", "sphinx-copybutton (>=0.4.0,<0.5.0)"]
-
 [[package]]
 name = "pg8000"
 version = "1.31.5"
@@ -13323,31 +13053,6 @@ dev = ["tokenizers[testing]"]
 docs = ["setuptools-rust", "sphinx", "sphinx-rtd-theme"]
 testing = ["black (==22.3)", "datasets", "numpy", "pytest", "pytest-asyncio", "requests", "ruff"]

-[[package]]
-name = "tom-swe"
-version = "1.0.3"
-description = "Theory of Mind modeling for Software Engineering assistants"
-optional = false
-python-versions = ">=3.10"
-groups = ["main"]
-files = [
-    {file = "tom_swe-1.0.3-py3-none-any.whl", hash = "sha256:7b1172b29eb5c8fb7f1975016e7b6a238511b9ac2a7a980bd400dcb4e29773f2"},
-    {file = "tom_swe-1.0.3.tar.gz", hash = "sha256:57c97d0104e563f15bd39edaf2aa6ac4c3e9444afd437fb92458700d22c6c0f5"},
-]
-
-[package.dependencies]
-jinja2 = ">=3.0.0"
-json-repair = ">=0.1.0"
-litellm = ">=1.0.0"
-pydantic = ">=2.0.0"
-python-dotenv = ">=1.0.0"
-tiktoken = ">=0.8.0"
-tqdm = ">=4.65.0"
-
-[package.extras]
-dev = ["aiofiles (>=23.0.0)", "black (>=22.0.0)", "datasets (>=2.0.0)", "fastapi (>=0.104.0)", "httpx (>=0.25.0)", "huggingface-hub (>=0.0.0)", "isort (>=5.0.0)", "mypy (>=1.0.0)", "numpy (>=1.24.0)", "pandas (>=2.0.0)", "pre-commit (>=3.6.0)", "pytest (>=7.0.0)", "pytest-cov (>=6.2.1)", "rich (>=13.0.0)", "ruff (>=0.3.0)", "typing-extensions (>=4.0.0)", "uvicorn (>=0.24.0)"]
-search = ["bm25s (>=0.2.0)", "pystemmer (>=2.2.0)"]
-
 [[package]]
 name = "toml"
 version = "0.10.2"
--- a/enterprise/saas_server.py
+++ b/enterprise/saas_server.py
@@ -34,7 +34,6 @@ from server.routes.integration.jira_dc import jira_dc_integration_router  # noqa
 from server.routes.integration.linear import linear_integration_router  # noqa: E402
 from server.routes.integration.slack import slack_router  # noqa: E402
 from server.routes.mcp_patch import patch_mcp_server  # noqa: E402
-from server.routes.oauth_device import oauth_device_router  # noqa: E402
 from server.routes.readiness import readiness_router  # noqa: E402
 from server.routes.user import saas_user_router  # noqa: E402

@@ -61,7 +60,6 @@ base_app.mount('/internal/metrics', metrics_app())
 base_app.include_router(readiness_router)  # Add routes for readiness checks
 base_app.include_router(api_router)  # Add additional route for github auth
 base_app.include_router(oauth_router)  # Add additional route for oauth callback
-base_app.include_router(oauth_device_router)  # Add OAuth 2.0 Device Flow routes
 base_app.include_router(saas_user_router)  # Add additional route SAAS user calls
 base_app.include_router(
    billing_router
--- a/enterprise/server/auth/constants.py
+++ b/enterprise/server/auth/constants.py
@@ -30,11 +30,3 @@ JIRA_DC_CLIENT_SECRET = os.getenv('JIRA_DC_CLIENT_SECRET', '').strip()
 JIRA_DC_BASE_URL = os.getenv('JIRA_DC_BASE_URL', '').strip()
 JIRA_DC_ENABLE_OAUTH = os.getenv('JIRA_DC_ENABLE_OAUTH', '1') in ('1', 'true')
 AUTH_URL = os.getenv('AUTH_URL', '').rstrip('/')
-ROLE_CHECK_ENABLED = os.getenv('ROLE_CHECK_ENABLED', 'false').lower() in (
-    '1',
-    'true',
-    't',
-    'yes',
-    'y',
-    'on',
-)
--- a/enterprise/server/auth/saas_user_auth.py
+++ b/enterprise/server/auth/saas_user_auth.py
@@ -203,15 +203,6 @@ class SaasUserAuth(UserAuth):
        self.settings_store = settings_store
        return settings_store

-    async def get_mcp_api_key(self) -> str:
-        api_key_store = ApiKeyStore.get_instance()
-        mcp_api_key = api_key_store.retrieve_mcp_api_key(self.user_id)
-        if not mcp_api_key:
-            mcp_api_key = api_key_store.create_api_key(
-                self.user_id, 'MCP_API_KEY', None
-            )
-        return mcp_api_key
-
    @classmethod
    async def get_instance(cls, request: Request) -> UserAuth:
        logger.debug('saas_user_auth_get_instance')
@@ -252,12 +243,7 @@ def get_api_key_from_header(request: Request):
    # This is a temp hack
    # Streamable HTTP MCP Client works via redirect requests, but drops the Authorization header for reason
    # We include `X-Session-API-Key` header by default due to nested runtimes, so it used as a drop in replacement here
-    session_api_key = request.headers.get('X-Session-API-Key')
-    if session_api_key:
-        return session_api_key
-
-    # Fallback to X-Access-Token header as an additional option
-    return request.headers.get('X-Access-Token')
+    return request.headers.get('X-Session-API-Key')


 async def saas_user_auth_from_bearer(request: Request) -> SaasUserAuth | None:
--- a/enterprise/server/constants.py
+++ b/enterprise/server/constants.py
@@ -25,7 +25,6 @@ USER_SETTINGS_VERSION_TO_MODEL = {
    2: 'claude-3-7-sonnet-20250219',
    3: 'claude-sonnet-4-20250514',
    4: 'claude-sonnet-4-20250514',
-    5: 'claude-opus-4-5-20251101',
 }

 LITELLM_DEFAULT_MODEL = os.getenv('LITELLM_DEFAULT_MODEL')
@@ -51,7 +50,7 @@ SUBSCRIPTION_PRICE_DATA = {
    },
 }

-DEFAULT_INITIAL_BUDGET = float(os.environ.get('DEFAULT_INITIAL_BUDGET', '10'))
+DEFAULT_INITIAL_BUDGET = float(os.environ.get('DEFAULT_INITIAL_BUDGET', '20'))
 STRIPE_API_KEY = os.environ.get('STRIPE_API_KEY', None)
 STRIPE_WEBHOOK_SECRET = os.environ.get('STRIPE_WEBHOOK_SECRET', None)
 REQUIRE_PAYMENT = os.environ.get('REQUIRE_PAYMENT', '0') in ('1', 'true')
--- a/enterprise/server/legacy_conversation_manager.py
+++ b/enterprise/server/legacy_conversation_manager.py
@@ -0,0 +1,331 @@
+from __future__ import annotations
+
+import time
+from dataclasses import dataclass, field
+
+import socketio
+from server.clustered_conversation_manager import ClusteredConversationManager
+from server.saas_nested_conversation_manager import SaasNestedConversationManager
+
+from openhands.core.config import LLMConfig, OpenHandsConfig
+from openhands.events.action import MessageAction
+from openhands.server.config.server_config import ServerConfig
+from openhands.server.conversation_manager.conversation_manager import (
+    ConversationManager,
+)
+from openhands.server.data_models.agent_loop_info import AgentLoopInfo
+from openhands.server.monitoring import MonitoringListener
+from openhands.server.session.conversation import ServerConversation
+from openhands.storage.data_models.settings import Settings
+from openhands.storage.files import FileStore
+from openhands.utils.async_utils import wait_all
+
+_LEGACY_ENTRY_TIMEOUT_SECONDS = 3600
+
+
+@dataclass
+class LegacyCacheEntry:
+    """Cache entry for legacy mode status."""
+
+    # The cached legacy-mode status.
+    is_legacy: bool
+    # Compared against time.time() (seconds) when expired entries are removed.
+    timestamp: float
+
+
+@dataclass
+class LegacyConversationManager(ConversationManager):
+    """
+    Conversation manager for use while migrating - since existing conversations are not nested!
+    Separate class from SaasNestedConversationManager so it can be easily removed in a few weeks.
+    (As of 2025-07-23)
+    """
+
+    sio: socketio.AsyncServer
+    config: OpenHandsConfig
+    server_config: ServerConfig
+    file_store: FileStore
+    conversation_manager: SaasNestedConversationManager
+    legacy_conversation_manager: ClusteredConversationManager
+    _legacy_cache: dict[str, LegacyCacheEntry] = field(default_factory=dict)
+
+    async def __aenter__(self):
+        """Enter both wrapped conversation managers, then return this manager."""
+        nested_enter = self.conversation_manager.__aenter__()
+        legacy_enter = self.legacy_conversation_manager.__aenter__()
+        await wait_all([nested_enter, legacy_enter])
+        return self
+
+    async def __aexit__(self, exc_type, exc_value, traceback):
+        """Exit both wrapped conversation managers, forwarding the exception details."""
+        exc_info = (exc_type, exc_value, traceback)
+        nested_exit = self.conversation_manager.__aexit__(*exc_info)
+        legacy_exit = self.legacy_conversation_manager.__aexit__(*exc_info)
+        await wait_all([nested_exit, legacy_exit])
+
+    async def request_llm_completion(
+        self,
+        sid: str,
+        service_id: str,
+        llm_config: LLMConfig,
+        messages: list[dict[str, str]],
+    ) -> str:
+        """Request a completion through the LLM registry of the session for `sid`.
+
+        The session is looked up with `get_agent_session`, and the value returned
+        by the registry's `request_extraneous_completion` is passed back as-is.
+        """
+        registry = self.get_agent_session(sid).llm_registry
+        return registry.request_extraneous_completion(service_id, llm_config, messages)
+
+    async def attach_to_conversation(
+        self, sid: str, user_id: str | None = None
+    ) -> ServerConversation | None:
+        """Attach to `sid` via the legacy or nested manager, whichever applies."""
+        legacy = await self.should_start_in_legacy_mode(sid)
+        manager = (
+            self.legacy_conversation_manager if legacy else self.conversation_manager
+        )
+        return await manager.attach_to_conversation(sid, user_id)
+
+    async def detach_from_conversation(self, conversation: ServerConversation):
+        """Detach `conversation` using the manager selected for its `sid`."""
+        if not await self.should_start_in_legacy_mode(conversation.sid):
+            return await self.conversation_manager.detach_from_conversation(
+                conversation
+            )
+        return await self.legacy_conversation_manager.detach_from_conversation(
+            conversation
+        )
+
+    async def join_conversation(
+        self,
+        sid: str,
+        connection_id: str,
+        settings: Settings,
+        user_id: str | None,
+    ) -> AgentLoopInfo:
+        """Join conversation `sid` through the legacy or nested manager."""
+        legacy = await self.should_start_in_legacy_mode(sid)
+        manager = (
+            self.legacy_conversation_manager if legacy else self.conversation_manager
+        )
+        return await manager.join_conversation(sid, connection_id, settings, user_id)
+
+    def get_agent_session(self, sid: str):
+        """Return the agent session for `sid`, preferring the legacy manager.
+
+        The nested manager is consulted only when the legacy manager returns None.
+        """
+        legacy_session = self.legacy_conversation_manager.get_agent_session(sid)
+        if legacy_session is not None:
+            return legacy_session
+        return self.conversation_manager.get_agent_session(sid)
+
+    async def get_running_agent_loops(
+        self, user_id: str | None = None, filter_to_sids: set[str] | None = None
+    ) -> set[str]:
+        """Return the ids of running agent loops across both managers.
+
+        A filter holding exactly one sid is answered by the single manager chosen
+        for that sid. Otherwise both managers are queried and a sid is kept only
+        when it was reported by the manager matching its legacy-mode status.
+        """
+        if filter_to_sids and len(filter_to_sids) == 1:
+            (only_sid,) = filter_to_sids
+            legacy = await self.should_start_in_legacy_mode(only_sid)
+            manager = (
+                self.legacy_conversation_manager
+                if legacy
+                else self.conversation_manager
+            )
+            return await manager.get_running_agent_loops(user_id, filter_to_sids)
+
+        nested_lookup = self.conversation_manager.get_running_agent_loops(
+            user_id, filter_to_sids
+        )
+        legacy_lookup = self.legacy_conversation_manager.get_running_agent_loops(
+            user_id, filter_to_sids
+        )
+        nested_sids, legacy_sids = await wait_all([nested_lookup, legacy_lookup])
+
+        # Legacy-reported sids are checked first, then the nested-reported ones.
+        kept_legacy = {
+            sid for sid in legacy_sids if await self.should_start_in_legacy_mode(sid)
+        }
+        kept_nested = {
+            sid
+            for sid in nested_sids
+            if not await self.should_start_in_legacy_mode(sid)
+        }
+        return kept_legacy | kept_nested
+
+    async def is_agent_loop_running(self, sid: str) -> bool:
+        """Return True when `get_running_agent_loops` reports anything for `sid`."""
+        running = await self.get_running_agent_loops(filter_to_sids={sid})
+        return bool(running)
+
+    async def get_connections(
+        self, user_id: str | None = None, filter_to_sids: set[str] | None = None
+    ) -> dict[str, str]:
+        """Return the connections known to the legacy and nested managers.
+
+        A filter holding exactly one sid is answered by the single manager chosen
+        for that sid. Otherwise both managers are queried and the results merged.
+        """
+        if filter_to_sids and len(filter_to_sids) == 1:
+            (only_sid,) = filter_to_sids
+            if not await self.should_start_in_legacy_mode(only_sid):
+                return await self.conversation_manager.get_connections(
+                    user_id, filter_to_sids
+                )
+            return await self.legacy_conversation_manager.get_connections(
+                user_id, filter_to_sids
+            )
+
+        nested_lookup = self.conversation_manager.get_connections(
+            user_id, filter_to_sids
+        )
+        legacy_lookup = self.legacy_conversation_manager.get_connections(
+            user_id, filter_to_sids
+        )
+        connections, legacy_connections = await wait_all([nested_lookup, legacy_lookup])
+        # The legacy dict is updated in place; nested entries win on a key clash.
+        legacy_connections.update(connections)
+        return legacy_connections
+
+    async def maybe_start_agent_loop(
+        self,
+        sid: str,
+        settings: Settings,
+        user_id: str,  # type: ignore[override]
+        initial_user_msg: MessageAction | None = None,
+        replay_json: str | None = None,
+    ) -> AgentLoopInfo:
+        """Delegate `maybe_start_agent_loop` to the legacy or nested manager."""
+        legacy = await self.should_start_in_legacy_mode(sid)
+        manager = (
+            self.legacy_conversation_manager if legacy else self.conversation_manager
+        )
+        return await manager.maybe_start_agent_loop(
+            sid, settings, user_id, initial_user_msg, replay_json
+        )
+
+    async def send_to_event_stream(self, connection_id: str, data: dict):
+        """Forward `data` for `connection_id` to the legacy manager's event stream."""
+        legacy = self.legacy_conversation_manager
+        return await legacy.send_to_event_stream(connection_id, data)
+
+    async def send_event_to_conversation(self, sid: str, data: dict):
+        """Deliver `data` to conversation `sid` through exactly one manager.
+
+        Conversations in legacy mode are sent to the legacy manager only; all
+        others are sent to the nested manager. Without the early return the
+        legacy branch fell through and the same event was also sent to the
+        nested manager, unlike every other routing method in this class.
+        """
+        if await self.should_start_in_legacy_mode(sid):
+            await self.legacy_conversation_manager.send_event_to_conversation(sid, data)
+            return
+        await self.conversation_manager.send_event_to_conversation(sid, data)
+
+    async def disconnect_from_session(self, connection_id: str):
+        """Disconnect `connection_id` via the legacy manager."""
+        legacy = self.legacy_conversation_manager
+        return await legacy.disconnect_from_session(connection_id)
+
+    async def close_session(self, sid: str):
+        """Close session `sid` on exactly one manager.
+
+        Sessions in legacy mode are closed on the legacy manager only; all
+        others are closed on the nested manager. Without the early return the
+        legacy branch fell through and the nested manager was asked to close
+        the same session as well.
+        """
+        if await self.should_start_in_legacy_mode(sid):
+            await self.legacy_conversation_manager.close_session(sid)
+            return
+        await self.conversation_manager.close_session(sid)
+
+    async def get_agent_loop_info(
+        self, user_id: str | None = None, filter_to_sids: set[str] | None = None
+    ) -> list[AgentLoopInfo]:
+        """Return agent loop info gathered from the legacy and nested managers.
+
+        A filter holding exactly one sid is delegated to the single manager
+        chosen for that sid. Otherwise both managers are queried: legacy results
+        are kept when their conversation is in legacy mode, and nested results
+        are kept when their conversation is neither already listed as legacy nor
+        in legacy mode. Legacy entries come first in the returned list.
+        """
+        if filter_to_sids and len(filter_to_sids) == 1:
+            (only_sid,) = filter_to_sids
+            legacy = await self.should_start_in_legacy_mode(only_sid)
+            manager = (
+                self.legacy_conversation_manager
+                if legacy
+                else self.conversation_manager
+            )
+            return await manager.get_agent_loop_info(user_id, filter_to_sids)
+
+        nested_infos, legacy_infos = await wait_all(
+            [
+                self.conversation_manager.get_agent_loop_info(user_id, filter_to_sids),
+                self.legacy_conversation_manager.get_agent_loop_info(
+                    user_id, filter_to_sids
+                ),
+            ]
+        )
+
+        kept_legacy = [
+            info
+            for info in legacy_infos
+            if await self.should_start_in_legacy_mode(info.conversation_id)
+        ]
+        legacy_sids = {info.conversation_id for info in kept_legacy}
+        # The membership test comes first so already-listed ids skip the mode check.
+        kept_nested = [
+            info
+            for info in nested_infos
+            if info.conversation_id not in legacy_sids
+            and not await self.should_start_in_legacy_mode(info.conversation_id)
+        ]
+        return kept_legacy + kept_nested
+
+    def _cleanup_expired_cache_entries(self):
+        """Drop every local cache entry older than the legacy entry timeout."""
+        now = time.time()
+        # Iterate over a snapshot so entries can be deleted while looping
+        for conversation_id, entry in list(self._legacy_cache.items()):
+            if now - entry.timestamp > _LEGACY_ENTRY_TIMEOUT_SECONDS:
+                del self._legacy_cache[conversation_id]
+
+    async def should_start_in_legacy_mode(self, conversation_id: str) -> bool:
+        """
+        Decide whether a conversation runs in legacy mode, using a timed local cache.
+
+        The runtime is looked up per conversation rather than taken from /list:
+        /list omits stopped conversations even though their PVC may not have been
+        deleted yet, so /sessions/{session_id} has to be checked directly.
+        """
+        # Evict stale entries on every call before consulting the cache
+        self._cleanup_expired_cache_entries()
+
+        cached = self._legacy_cache.get(conversation_id)
+        if (
+            cached is not None
+            and time.time() - cached.timestamp <= _LEGACY_ENTRY_TIMEOUT_SECONDS
+        ):
+            return cached.is_legacy
+
+        # Cache miss or expired entry: ask the nested manager for the runtime
+        runtime = await self.conversation_manager._get_runtime(conversation_id)
+        verdict = self.is_legacy_runtime(runtime)
+
+        # Remember the answer together with the time it was obtained
+        self._legacy_cache[conversation_id] = LegacyCacheEntry(verdict, time.time())
+        return verdict
+
+    def is_legacy_runtime(self, runtime: dict | None) -> bool:
+        """
+        Classify a runtime as legacy or not from its ``command`` entry.
+
+        Args:
+            runtime: The runtime dictionary or None if not found
+
+        Returns:
+            bool: False when no runtime was found; otherwise True exactly when
+            'openhands.server' does not occur in the runtime's command
+        """
+        return runtime is not None and 'openhands.server' not in runtime['command']
+
+    @classmethod
+    def get_instance(
+        cls,
+        sio: socketio.AsyncServer,
+        config: OpenHandsConfig,
+        file_store: FileStore,
+        server_config: ServerConfig,
+        monitoring_listener: MonitoringListener,
+    ) -> ConversationManager:
+        """Build a LegacyConversationManager wrapping one manager of each kind.
+
+        The nested (SaaS) manager and the clustered legacy manager are each
+        created through their own ``get_instance`` with the same arguments,
+        nested first.
+        """
+        shared_args = (sio, config, file_store, server_config, monitoring_listener)
+        nested_manager = SaasNestedConversationManager.get_instance(*shared_args)
+        legacy_manager = ClusteredConversationManager.get_instance(*shared_args)
+        return LegacyConversationManager(
+            sio=sio,
+            config=config,
+            server_config=server_config,
+            file_store=file_store,
+            conversation_manager=nested_manager,
+            legacy_conversation_manager=legacy_manager,
+        )
--- a/enterprise/server/middleware.py
+++ b/enterprise/server/middleware.py
@@ -152,22 +152,17 @@ class SetAuthCookieMiddleware:
            return False
        path = request.url.path

-        ignore_paths = (
+        is_api_that_should_attach = path.startswith('/api') and path not in (
            '/api/options/config',
            '/api/keycloak/callback',
            '/api/billing/success',
            '/api/billing/cancel',
            '/api/billing/customer-setup-success',
            '/api/billing/stripe-webhook',
-            '/oauth/device/authorize',
-            '/oauth/device/token',
        )
-        if path in ignore_paths:
-            return False

        is_mcp = path.startswith('/mcp')
-        is_api_route = path.startswith('/api')
-        return is_api_route or is_mcp
+        return is_api_that_should_attach or is_mcp

    async def _logout(self, request: Request):
        # Log out of keycloak - this prevents issues where you did not log in with the idp you believe you used
--- a/enterprise/server/routes/auth.py
+++ b/enterprise/server/routes/auth.py
@@ -12,7 +12,6 @@ from server.auth.constants import (
    KEYCLOAK_CLIENT_ID,
    KEYCLOAK_REALM_NAME,
    KEYCLOAK_SERVER_URL_EXT,
-    ROLE_CHECK_ENABLED,
 )
 from server.auth.gitlab_sync import schedule_gitlab_repo_sync
 from server.auth.saas_user_auth import SaasUserAuth
@@ -133,12 +132,6 @@ async def keycloak_callback(

    user_info = await token_manager.get_user_info(keycloak_access_token)
    logger.debug(f'user_info: {user_info}')
-    if ROLE_CHECK_ENABLED and 'roles' not in user_info:
-        return JSONResponse(
-            status_code=status.HTTP_401_UNAUTHORIZED,
-            content={'error': 'Missing required role'},
-        )
-
    if 'sub' not in user_info or 'preferred_username' not in user_info:
        return JSONResponse(
            status_code=status.HTTP_400_BAD_REQUEST,
--- a/enterprise/server/routes/integration/github.py
+++ b/enterprise/server/routes/integration/github.py
@@ -1,4 +1,3 @@
-import asyncio
 import hashlib
 import hmac
 import os
@@ -59,8 +58,7 @@ async def github_events(
        )

    try:
-        # Add timeout to prevent hanging on slow/stalled clients
-        payload = await asyncio.wait_for(request.body(), timeout=15.0)
+        payload = await request.body()
        verify_github_signature(payload, x_hub_signature_256)

        payload_data = await request.json()
@@ -80,12 +78,6 @@ async def github_events(
            status_code=200,
            content={'message': 'GitHub events endpoint reached successfully.'},
        )
-    except asyncio.TimeoutError:
-        logger.warning('GitHub webhook request timed out waiting for request body')
-        return JSONResponse(
-            status_code=408,
-            content={'error': 'Request timeout - client took too long to send data.'},
-        )
    except Exception as e:
        logger.exception(f'Error processing GitHub event: {e}')
        return JSONResponse(status_code=400, content={'error': 'Invalid payload.'})
--- a/enterprise/server/routes/oauth_device.py
+++ b/enterprise/server/routes/oauth_device.py
@@ -1,324 +0,0 @@
-"""OAuth 2.0 Device Flow endpoints for CLI authentication."""
-
-from datetime import UTC, datetime, timedelta
-from typing import Optional
-
-from fastapi import APIRouter, Depends, Form, HTTPException, Request, status
-from fastapi.responses import JSONResponse
-from pydantic import BaseModel
-from storage.api_key_store import ApiKeyStore
-from storage.database import session_maker
-from storage.device_code_store import DeviceCodeStore
-
-from openhands.core.logger import openhands_logger as logger
-from openhands.server.user_auth import get_user_id
-
-# ---------------------------------------------------------------------------
-# Constants
-# ---------------------------------------------------------------------------
-
-DEVICE_CODE_EXPIRES_IN = 600  # 10 minutes
-DEVICE_TOKEN_POLL_INTERVAL = 5  # seconds
-
-API_KEY_NAME = 'Device Link Access Key'
-KEY_EXPIRATION_TIME = timedelta(days=1)  # Key expires in 24 hours
-
-# ---------------------------------------------------------------------------
-# Models
-# ---------------------------------------------------------------------------
-
-
-class DeviceAuthorizationResponse(BaseModel):
-    device_code: str
-    user_code: str
-    verification_uri: str
-    verification_uri_complete: str
-    expires_in: int
-    interval: int
-
-
-class DeviceTokenResponse(BaseModel):
-    access_token: str  # This will be the user's API key
-    token_type: str = 'Bearer'
-    expires_in: Optional[int] = None  # API keys may not have expiration
-
-
-class DeviceTokenErrorResponse(BaseModel):
-    error: str
-    error_description: Optional[str] = None
-    interval: Optional[int] = None  # Required for slow_down error
-
-
-# ---------------------------------------------------------------------------
-# Router + stores
-# ---------------------------------------------------------------------------
-
-oauth_device_router = APIRouter(prefix='/oauth/device')
-device_code_store = DeviceCodeStore(session_maker)
-
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-
-def _oauth_error(
-    status_code: int,
-    error: str,
-    description: str,
-    interval: Optional[int] = None,
-) -> JSONResponse:
-    """Return a JSON OAuth-style error response."""
-    return JSONResponse(
-        status_code=status_code,
-        content=DeviceTokenErrorResponse(
-            error=error,
-            error_description=description,
-            interval=interval,
-        ).model_dump(),
-    )
-
-
-# ---------------------------------------------------------------------------
-# Endpoints
-# ---------------------------------------------------------------------------
-
-
-@oauth_device_router.post('/authorize', response_model=DeviceAuthorizationResponse)
-async def device_authorization(
-    http_request: Request,
-) -> DeviceAuthorizationResponse:
-    """Start device flow by generating device and user codes."""
-    try:
-        device_code_entry = device_code_store.create_device_code(
-            expires_in=DEVICE_CODE_EXPIRES_IN,
-        )
-
-        base_url = str(http_request.base_url).rstrip('/')
-        verification_uri = f'{base_url}/oauth/device/verify'
-        verification_uri_complete = (
-            f'{verification_uri}?user_code={device_code_entry.user_code}'
-        )
-
-        logger.info(
-            'Device authorization initiated',
-            extra={'user_code': device_code_entry.user_code},
-        )
-
-        return DeviceAuthorizationResponse(
-            device_code=device_code_entry.device_code,
-            user_code=device_code_entry.user_code,
-            verification_uri=verification_uri,
-            verification_uri_complete=verification_uri_complete,
-            expires_in=DEVICE_CODE_EXPIRES_IN,
-            interval=device_code_entry.current_interval,
-        )
-    except Exception as e:
-        logger.exception('Error in device authorization: %s', str(e))
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail='Internal server error',
-        ) from e
-
-
-@oauth_device_router.post('/token')
-async def device_token(device_code: str = Form(...)):
-    """Poll for a token until the user authorizes or the code expires."""
-    try:
-        device_code_entry = device_code_store.get_by_device_code(device_code)
-
-        if not device_code_entry:
-            return _oauth_error(
-                status.HTTP_400_BAD_REQUEST,
-                'invalid_grant',
-                'Invalid device code',
-            )
-
-        # Check rate limiting (RFC 8628 section 3.5)
-        is_too_fast, current_interval = device_code_entry.check_rate_limit()
-        if is_too_fast:
-            # Update poll time and increase interval
-            device_code_store.update_poll_time(device_code, increase_interval=True)
-            logger.warning(
-                'Client polling too fast, returning slow_down error',
-                extra={
-                    'device_code': device_code[:8] + '...',  # Log partial for privacy
-                    'new_interval': current_interval,
-                },
-            )
-            return _oauth_error(
-                status.HTTP_400_BAD_REQUEST,
-                'slow_down',
-                f'Polling too frequently. Wait at least {current_interval} seconds between requests.',
-                interval=current_interval,
-            )
-
-        # Update poll time for successful rate limit check
-        device_code_store.update_poll_time(device_code, increase_interval=False)
-
-        if device_code_entry.is_expired():
-            return _oauth_error(
-                status.HTTP_400_BAD_REQUEST,
-                'expired_token',
-                'Device code has expired',
-            )
-
-        if device_code_entry.status == 'denied':
-            return _oauth_error(
-                status.HTTP_400_BAD_REQUEST,
-                'access_denied',
-                'User denied the authorization request',
-            )
-
-        if device_code_entry.status == 'pending':
-            return _oauth_error(
-                status.HTTP_400_BAD_REQUEST,
-                'authorization_pending',
-                'User has not yet completed authorization',
-            )
-
-        if device_code_entry.status == 'authorized':
-            # Retrieve the specific API key for this device using the user_code
-            api_key_store = ApiKeyStore.get_instance()
-            device_key_name = f'{API_KEY_NAME} ({device_code_entry.user_code})'
-            device_api_key = api_key_store.retrieve_api_key_by_name(
-                device_code_entry.keycloak_user_id, device_key_name
-            )
-
-            if not device_api_key:
-                logger.error(
-                    'No device API key found for authorized device',
-                    extra={
-                        'user_id': device_code_entry.keycloak_user_id,
-                        'user_code': device_code_entry.user_code,
-                    },
-                )
-                return _oauth_error(
-                    status.HTTP_500_INTERNAL_SERVER_ERROR,
-                    'server_error',
-                    'API key not found',
-                )
-
-            # Return the API key as access_token
-            return DeviceTokenResponse(
-                access_token=device_api_key,
-            )
-
-        # Fallback for unexpected status values
-        logger.error(
-            'Unknown device code status',
-            extra={'status': device_code_entry.status},
-        )
-        return _oauth_error(
-            status.HTTP_500_INTERNAL_SERVER_ERROR,
-            'server_error',
-            'Unknown device code status',
-        )
-
-    except Exception as e:
-        logger.exception('Error in device token: %s', str(e))
-        return _oauth_error(
-            status.HTTP_500_INTERNAL_SERVER_ERROR,
-            'server_error',
-            'Internal server error',
-        )
-
-
-@oauth_device_router.post('/verify-authenticated')
-async def device_verification_authenticated(
-    user_code: str = Form(...),
-    user_id: str = Depends(get_user_id),
-):
-    """Process device verification for authenticated users (called by frontend)."""
-    try:
-        if not user_id:
-            raise HTTPException(
-                status_code=status.HTTP_401_UNAUTHORIZED,
-                detail='Authentication required',
-            )
-
-        # Validate device code
-        device_code_entry = device_code_store.get_by_user_code(user_code)
-        if not device_code_entry:
-            raise HTTPException(
-                status_code=status.HTTP_400_BAD_REQUEST,
-                detail='The device code is invalid or has expired.',
-            )
-
-        if not device_code_entry.is_pending():
-            raise HTTPException(
-                status_code=status.HTTP_400_BAD_REQUEST,
-                detail='This device code has already been processed.',
-            )
-
-        # First, authorize the device code
-        success = device_code_store.authorize_device_code(
-            user_code=user_code,
-            user_id=user_id,
-        )
-
-        if not success:
-            logger.error(
-                'Failed to authorize device code',
-                extra={'user_code': user_code, 'user_id': user_id},
-            )
-            raise HTTPException(
-                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-                detail='Failed to authorize the device. Please try again.',
-            )
-
-        # Only create API key AFTER successful authorization
-        api_key_store = ApiKeyStore.get_instance()
-        try:
-            # Create a unique API key for this device using user_code in the name
-            device_key_name = f'{API_KEY_NAME} ({user_code})'
-            api_key_store.create_api_key(
-                user_id,
-                name=device_key_name,
-                expires_at=datetime.now(UTC) + KEY_EXPIRATION_TIME,
-            )
-            logger.info(
-                'Created new device API key for user after successful authorization',
-                extra={'user_id': user_id, 'user_code': user_code},
-            )
-        except Exception as e:
-            logger.exception(
-                'Failed to create device API key after authorization: %s', str(e)
-            )
-
-            # Clean up: revert the device authorization since API key creation failed
-            # This prevents the device from being in an authorized state without an API key
-            try:
-                device_code_store.deny_device_code(user_code)
-                logger.info(
-                    'Reverted device authorization due to API key creation failure',
-                    extra={'user_code': user_code, 'user_id': user_id},
-                )
-            except Exception as cleanup_error:
-                logger.exception(
-                    'Failed to revert device authorization during cleanup: %s',
-                    str(cleanup_error),
-                )
-
-            raise HTTPException(
-                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-                detail='Failed to create API key for device access.',
-            )
-
-        logger.info(
-            'Device code authorized with API key successfully',
-            extra={'user_code': user_code, 'user_id': user_id},
-        )
-        return JSONResponse(
-            status_code=status.HTTP_200_OK,
-            content={'message': 'Device authorized successfully!'},
-        )
-
-    except HTTPException:
-        raise
-    except Exception as e:
-        logger.exception('Error in device verification: %s', str(e))
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail='An unexpected error occurred. Please try again.',
-        )
--- a/enterprise/server/saas_nested_conversation_manager.py
+++ b/enterprise/server/saas_nested_conversation_manager.py
@@ -31,7 +31,6 @@ from openhands.events.event_store import EventStore
 from openhands.events.serialization.event import event_to_dict
 from openhands.integrations.provider import PROVIDER_TOKEN_TYPE, ProviderHandler
 from openhands.runtime.impl.remote.remote_runtime import RemoteRuntime
-from openhands.runtime.plugins.vscode import VSCodeRequirement
 from openhands.runtime.runtime_status import RuntimeStatus
 from openhands.server.config.server_config import ServerConfig
 from openhands.server.constants import ROOM_KEY
@@ -71,14 +70,6 @@ RUNTIME_CONVERSATION_URL = RUNTIME_URL_PATTERN + (
    else '/api/conversations/{conversation_id}'
 )

-RUNTIME_USERNAME = os.getenv('RUNTIME_USERNAME')
-
-SU_TO_USER = os.getenv('SU_TO_USER', 'false')
-truthy = {'1', 'true', 't', 'yes', 'y', 'on'}
-SU_TO_USER = str(SU_TO_USER.lower() in truthy).lower()
-
-DISABLE_VSCODE_PLUGIN = os.getenv('DISABLE_VSCODE_PLUGIN', 'false').lower() == 'true'
-
 # Time in seconds before a Redis entry is considered expired if not refreshed
 _REDIS_ENTRY_TIMEOUT_SECONDS = 300

@@ -781,11 +772,7 @@ class SaasNestedConversationManager(ConversationManager):
        env_vars['SERVE_FRONTEND'] = '0'
        env_vars['RUNTIME'] = 'local'
        # TODO: In the long term we may come up with a more secure strategy for user management within the nested runtime.
-        env_vars['USER'] = (
-            RUNTIME_USERNAME
-            if RUNTIME_USERNAME
-            else ('openhands' if config.run_as_openhands else 'root')
-        )
+        env_vars['USER'] = 'openhands' if config.run_as_openhands else 'root'
        env_vars['PERMITTED_CORS_ORIGINS'] = ','.join(PERMITTED_CORS_ORIGINS)
        env_vars['port'] = '60000'
        # TODO: These values are static in the runtime-api project, but do not get copied into the runtime ENV
@@ -802,8 +789,6 @@ class SaasNestedConversationManager(ConversationManager):
        env_vars['INITIAL_NUM_WARM_SERVERS'] = '1'
        env_vars['INIT_GIT_IN_EMPTY_WORKSPACE'] = '1'
        env_vars['ENABLE_V1'] = '0'
-        env_vars['SU_TO_USER'] = SU_TO_USER
-        env_vars['DISABLE_VSCODE_PLUGIN'] = str(DISABLE_VSCODE_PLUGIN).lower()

        # We need this for LLM traces tracking to identify the source of the LLM calls
        env_vars['WEB_HOST'] = WEB_HOST
@@ -819,18 +804,11 @@ class SaasNestedConversationManager(ConversationManager):
        if self._runtime_container_image:
            config.sandbox.runtime_container_image = self._runtime_container_image

-        plugins = [
-            plugin
-            for plugin in agent.sandbox_plugins
-            if not (DISABLE_VSCODE_PLUGIN and isinstance(plugin, VSCodeRequirement))
-        ]
-        logger.info(f'Loaded plugins for runtime {sid}: {plugins}')
-
        runtime = RemoteRuntime(
            config=config,
            event_stream=None,  # type: ignore[arg-type]
            sid=sid,
-            plugins=plugins,
+            plugins=agent.sandbox_plugins,
            # env_vars=env_vars,
            # status_callback: Callable[..., None] | None = None,
            attach_to_existing=False,
--- a/enterprise/storage/api_key_store.py
+++ b/enterprise/storage/api_key_store.py
@@ -17,13 +17,10 @@ from openhands.core.logger import openhands_logger as logger
 class ApiKeyStore:
    session_maker: sessionmaker

-    API_KEY_PREFIX = 'sk-oh-'
-
    def generate_api_key(self, length: int = 32) -> str:
-        """Generate a random API key with the sk-oh- prefix."""
+        """Generate a random API key."""
        alphabet = string.ascii_letters + string.digits
-        random_part = ''.join(secrets.choice(alphabet) for _ in range(length))
-        return f'{self.API_KEY_PREFIX}{random_part}'
+        return ''.join(secrets.choice(alphabet) for _ in range(length))

    def create_api_key(
        self, user_id: str, name: str | None = None, expires_at: datetime | None = None
@@ -60,15 +57,9 @@ class ApiKeyStore:
                return None

            # Check if the key has expired
-            if key_record.expires_at:
-                # Handle timezone-naive datetime from database by assuming it's UTC
-                expires_at = key_record.expires_at
-                if expires_at.tzinfo is None:
-                    expires_at = expires_at.replace(tzinfo=UTC)
-
-                if expires_at < now:
-                    logger.info(f'API key has expired: {key_record.id}')
-                    return None
+            if key_record.expires_at and key_record.expires_at < now:
+                logger.info(f'API key has expired: {key_record.id}')
+                return None

            # Update last_used_at timestamp
            session.execute(
@@ -134,33 +125,6 @@ class ApiKeyStore:

        return None

-    def retrieve_api_key_by_name(self, user_id: str, name: str) -> str | None:
-        """Retrieve an API key by name for a specific user."""
-        with self.session_maker() as session:
-            key_record = (
-                session.query(ApiKey)
-                .filter(ApiKey.user_id == user_id, ApiKey.name == name)
-                .first()
-            )
-            return key_record.key if key_record else None
-
-    def delete_api_key_by_name(self, user_id: str, name: str) -> bool:
-        """Delete an API key by name for a specific user."""
-        with self.session_maker() as session:
-            key_record = (
-                session.query(ApiKey)
-                .filter(ApiKey.user_id == user_id, ApiKey.name == name)
-                .first()
-            )
-
-            if not key_record:
-                return False
-
-            session.delete(key_record)
-            session.commit()
-
-            return True
-
    @classmethod
    def get_instance(cls) -> ApiKeyStore:
        """Get an instance of the ApiKeyStore."""
--- a/enterprise/storage/device_code.py
+++ b/enterprise/storage/device_code.py
@@ -1,109 +0,0 @@
-"""Device code storage model for OAuth 2.0 Device Flow."""
-
-from datetime import datetime, timezone
-from enum import Enum
-
-from sqlalchemy import Column, DateTime, Integer, String
-from storage.base import Base
-
-
-class DeviceCodeStatus(Enum):
-    """Status of a device code authorization request."""
-
-    PENDING = 'pending'
-    AUTHORIZED = 'authorized'
-    EXPIRED = 'expired'
-    DENIED = 'denied'
-
-
-class DeviceCode(Base):
-    """Device code for OAuth 2.0 Device Flow.
-
-    This stores the device codes issued during the device authorization flow,
-    along with their status and associated user information once authorized.
-    """
-
-    __tablename__ = 'device_codes'
-
-    id = Column(Integer, primary_key=True, autoincrement=True)
-    device_code = Column(String(128), unique=True, nullable=False, index=True)
-    user_code = Column(String(16), unique=True, nullable=False, index=True)
-    status = Column(String(32), nullable=False, default=DeviceCodeStatus.PENDING.value)
-
-    # Keycloak user ID who authorized the device (set during verification)
-    keycloak_user_id = Column(String(255), nullable=True)
-
-    # Timestamps
-    expires_at = Column(DateTime(timezone=True), nullable=False)
-    authorized_at = Column(DateTime(timezone=True), nullable=True)
-
-    # Rate limiting fields for RFC 8628 section 3.5 compliance
-    last_poll_time = Column(DateTime(timezone=True), nullable=True)
-    current_interval = Column(Integer, nullable=False, default=5)
-
-    def __repr__(self) -> str:
-        return f"<DeviceCode(user_code='{self.user_code}', status='{self.status}')>"
-
-    def is_expired(self) -> bool:
-        """Check if the device code has expired."""
-        now = datetime.now(timezone.utc)
-        return now > self.expires_at
-
-    def is_pending(self) -> bool:
-        """Check if the device code is still pending authorization."""
-        return self.status == DeviceCodeStatus.PENDING.value and not self.is_expired()
-
-    def is_authorized(self) -> bool:
-        """Check if the device code has been authorized."""
-        return self.status == DeviceCodeStatus.AUTHORIZED.value
-
-    def authorize(self, user_id: str) -> None:
-        """Mark the device code as authorized."""
-        self.status = DeviceCodeStatus.AUTHORIZED.value
-        self.keycloak_user_id = user_id  # Set the Keycloak user ID during authorization
-        self.authorized_at = datetime.now(timezone.utc)
-
-    def deny(self) -> None:
-        """Mark the device code as denied."""
-        self.status = DeviceCodeStatus.DENIED.value
-
-    def expire(self) -> None:
-        """Mark the device code as expired."""
-        self.status = DeviceCodeStatus.EXPIRED.value
-
-    def check_rate_limit(self) -> tuple[bool, int]:
-        """Check if the client is polling too fast.
-
-        Returns:
-            tuple: (is_too_fast, current_interval)
-                - is_too_fast: True if client should receive slow_down error
-                - current_interval: Current polling interval to use
-        """
-        now = datetime.now(timezone.utc)
-
-        # If this is the first poll, allow it
-        if self.last_poll_time is None:
-            return False, self.current_interval
-
-        # Calculate time since last poll
-        time_since_last_poll = (now - self.last_poll_time).total_seconds()
-
-        # Check if polling too fast
-        if time_since_last_poll < self.current_interval:
-            # Increase interval for slow_down (RFC 8628 section 3.5)
-            new_interval = min(self.current_interval + 5, 60)  # Cap at 60 seconds
-            return True, new_interval
-
-        return False, self.current_interval
-
-    def update_poll_time(self, increase_interval: bool = False) -> None:
-        """Update the last poll time and optionally increase the interval.
-
-        Args:
-            increase_interval: If True, increase the current interval for slow_down
-        """
-        self.last_poll_time = datetime.now(timezone.utc)
-
-        if increase_interval:
-            # Increase interval by 5 seconds, cap at 60 seconds (RFC 8628)
-            self.current_interval = min(self.current_interval + 5, 60)
--- a/enterprise/storage/device_code_store.py
+++ b/enterprise/storage/device_code_store.py
@@ -1,167 +0,0 @@
-"""Device code store for OAuth 2.0 Device Flow."""
-
-import secrets
-import string
-from datetime import datetime, timedelta, timezone
-
-from sqlalchemy.exc import IntegrityError
-from storage.device_code import DeviceCode
-
-
-class DeviceCodeStore:
-    """Store for managing OAuth 2.0 device codes."""
-
-    def __init__(self, session_maker):
-        self.session_maker = session_maker
-
-    def generate_user_code(self) -> str:
-        """Generate a human-readable user code (8 characters, uppercase letters and digits)."""
-        # Use a mix of uppercase letters and digits, avoiding confusing characters
-        alphabet = 'ABCDEFGHJKLMNPQRSTUVWXYZ23456789'  # No I, O, 0, 1
-        return ''.join(secrets.choice(alphabet) for _ in range(8))
-
-    def generate_device_code(self) -> str:
-        """Generate a secure device code (128 characters)."""
-        alphabet = string.ascii_letters + string.digits
-        return ''.join(secrets.choice(alphabet) for _ in range(128))
-
-    def create_device_code(
-        self,
-        expires_in: int = 600,  # 10 minutes default
-        max_attempts: int = 10,
-    ) -> DeviceCode:
-        """Create a new device code entry.
-
-        Uses database constraints to ensure uniqueness, avoiding TOCTOU race conditions.
-        Retries on constraint violations until unique codes are generated.
-
-        Args:
-            expires_in: Expiration time in seconds
-            max_attempts: Maximum number of attempts to generate unique codes
-
-        Returns:
-            The created DeviceCode instance
-
-        Raises:
-            RuntimeError: If unable to generate unique codes after max_attempts
-        """
-        for attempt in range(max_attempts):
-            user_code = self.generate_user_code()
-            device_code = self.generate_device_code()
-            expires_at = datetime.now(timezone.utc) + timedelta(seconds=expires_in)
-
-            device_code_entry = DeviceCode(
-                device_code=device_code,
-                user_code=user_code,
-                keycloak_user_id=None,  # Will be set during authorization
-                expires_at=expires_at,
-            )
-
-            try:
-                with self.session_maker() as session:
-                    session.add(device_code_entry)
-                    session.commit()
-                    session.refresh(device_code_entry)
-                    session.expunge(device_code_entry)  # Detach from session cleanly
-                    return device_code_entry
-            except IntegrityError:
-                # Constraint violation - codes already exist, retry with new codes
-                continue
-
-        raise RuntimeError(
-            f'Failed to generate unique device codes after {max_attempts} attempts'
-        )
-
-    def get_by_device_code(self, device_code: str) -> DeviceCode | None:
-        """Get device code entry by device code."""
-        with self.session_maker() as session:
-            result = (
-                session.query(DeviceCode).filter_by(device_code=device_code).first()
-            )
-            if result:
-                session.expunge(result)  # Detach from session cleanly
-            return result
-
-    def get_by_user_code(self, user_code: str) -> DeviceCode | None:
-        """Get device code entry by user code."""
-        with self.session_maker() as session:
-            result = session.query(DeviceCode).filter_by(user_code=user_code).first()
-            if result:
-                session.expunge(result)  # Detach from session cleanly
-            return result
-
-    def authorize_device_code(self, user_code: str, user_id: str) -> bool:
-        """Authorize a device code.
-
-        Args:
-            user_code: The user code to authorize
-            user_id: The user ID from Keycloak
-
-        Returns:
-            True if authorization was successful, False otherwise
-        """
-        with self.session_maker() as session:
-            device_code_entry = (
-                session.query(DeviceCode).filter_by(user_code=user_code).first()
-            )
-
-            if not device_code_entry:
-                return False
-
-            if not device_code_entry.is_pending():
-                return False
-
-            device_code_entry.authorize(user_id)
-            session.commit()
-
-            return True
-
-    def deny_device_code(self, user_code: str) -> bool:
-        """Deny a device code authorization.
-
-        Args:
-            user_code: The user code to deny
-
-        Returns:
-            True if denial was successful, False otherwise
-        """
-        with self.session_maker() as session:
-            device_code_entry = (
-                session.query(DeviceCode).filter_by(user_code=user_code).first()
-            )
-
-            if not device_code_entry:
-                return False
-
-            if not device_code_entry.is_pending():
-                return False
-
-            device_code_entry.deny()
-            session.commit()
-
-            return True
-
-    def update_poll_time(
-        self, device_code: str, increase_interval: bool = False
-    ) -> bool:
-        """Update the poll time for a device code and optionally increase interval.
-
-        Args:
-            device_code: The device code to update
-            increase_interval: If True, increase the polling interval for slow_down
-
-        Returns:
-            True if update was successful, False otherwise
-        """
-        with self.session_maker() as session:
-            device_code_entry = (
-                session.query(DeviceCode).filter_by(device_code=device_code).first()
-            )
-
-            if not device_code_entry:
-                return False
-
-            device_code_entry.update_poll_time(increase_interval)
-            session.commit()
-
-            return True
--- a/enterprise/storage/saas_conversation_store.py
+++ b/enterprise/storage/saas_conversation_store.py
@@ -35,7 +35,6 @@ class SaasConversationStore(ConversationStore):
            session.query(StoredConversationMetadata)
            .filter(StoredConversationMetadata.user_id == self.user_id)
            .filter(StoredConversationMetadata.conversation_id == conversation_id)
-            .filter(StoredConversationMetadata.conversation_version == 'V0')
        )

    def _to_external_model(self, conversation_metadata: StoredConversationMetadata):
@@ -60,7 +59,6 @@ class SaasConversationStore(ConversationStore):
        kwargs.pop('reasoning_tokens', None)
        kwargs.pop('context_window', None)
        kwargs.pop('per_turn_token', None)
-        kwargs.pop('parent_conversation_id', None)

        return ConversationMetadata(**kwargs)

@@ -125,7 +123,6 @@ class SaasConversationStore(ConversationStore):
                conversations = (
                    session.query(StoredConversationMetadata)
                    .filter(StoredConversationMetadata.user_id == self.user_id)
-                    .filter(StoredConversationMetadata.conversation_version == 'V0')
                    .order_by(StoredConversationMetadata.created_at.desc())
                    .offset(offset)
                    .limit(limit + 1)
--- a/enterprise/storage/saas_settings_store.py
+++ b/enterprise/storage/saas_settings_store.py
@@ -94,14 +94,9 @@ class SaasSettingsStore(SettingsStore):
            }
            self._decrypt_kwargs(kwargs)
            settings = Settings(**kwargs)
-
            return settings

    async def store(self, item: Settings):
-        # Check if provider is OpenHands and generate API key if needed
-        if item and self._is_openhands_provider(item):
-            await self._ensure_openhands_api_key(item)
-
        with self.session_maker() as session:
            existing = None
            kwargs = {}
@@ -373,30 +368,6 @@ class SaasSettingsStore(SettingsStore):
    def _should_encrypt(self, key: str) -> bool:
        return key in ('llm_api_key', 'llm_api_key_for_byor', 'search_api_key')

-    def _is_openhands_provider(self, item: Settings) -> bool:
-        """Check if the settings use the OpenHands provider."""
-        return bool(item.llm_model and item.llm_model.startswith('openhands/'))
-
-    async def _ensure_openhands_api_key(self, item: Settings) -> None:
-        """Generate and set the OpenHands API key for the given settings.
-
-        First checks if an existing key with the OpenHands alias exists,
-        and reuses it if found. Otherwise, generates a new key.
-        """
-        # Generate new key if none exists
-        generated_key = await self._generate_openhands_key()
-        if generated_key:
-            item.llm_api_key = SecretStr(generated_key)
-            logger.info(
-                'saas_settings_store:store:generated_openhands_key',
-                extra={'user_id': self.user_id},
-            )
-        else:
-            logger.warning(
-                'saas_settings_store:store:failed_to_generate_openhands_key',
-                extra={'user_id': self.user_id},
-            )
-
    async def _create_user_in_lite_llm(
        self, client: httpx.AsyncClient, email: str | None, max_budget: int, spend: int
    ):
@@ -419,55 +390,3 @@ class SaasSettingsStore(SettingsStore):
            },
        )
        return response
-
-    async def _generate_openhands_key(self) -> str | None:
-        """Generate a new OpenHands provider key for a user."""
-        if not (LITE_LLM_API_KEY and LITE_LLM_API_URL):
-            logger.warning(
-                'saas_settings_store:_generate_openhands_key:litellm_config_not_found',
-                extra={'user_id': self.user_id},
-            )
-            return None
-
-        try:
-            async with httpx.AsyncClient(
-                verify=httpx_verify_option(),
-                headers={
-                    'x-goog-api-key': LITE_LLM_API_KEY,
-                },
-            ) as client:
-                response = await client.post(
-                    f'{LITE_LLM_API_URL}/key/generate',
-                    json={
-                        'user_id': self.user_id,
-                        'metadata': {'type': 'openhands'},
-                    },
-                )
-                response.raise_for_status()
-                response_json = response.json()
-                key = response_json.get('key')
-
-                if key:
-                    logger.info(
-                        'saas_settings_store:_generate_openhands_key:success',
-                        extra={
-                            'user_id': self.user_id,
-                            'key_length': len(key) if key else 0,
-                            'key_prefix': (
-                                key[:10] + '...' if key and len(key) > 10 else key
-                            ),
-                        },
-                    )
-                    return key
-                else:
-                    logger.error(
-                        'saas_settings_store:_generate_openhands_key:no_key_in_response',
-                        extra={'user_id': self.user_id, 'response_json': response_json},
-                    )
-                    return None
-        except Exception as e:
-            logger.exception(
-                'saas_settings_store:_generate_openhands_key:error',
-                extra={'user_id': self.user_id, 'error': str(e)},
-            )
-            return None
--- a/enterprise/storage/user_settings.py
+++ b/enterprise/storage/user_settings.py
@@ -38,4 +38,3 @@ class UserSettings(Base):  # type: ignore
    email_verified = Column(Boolean, nullable=True)
    git_user_name = Column(String, nullable=True)
    git_user_email = Column(String, nullable=True)
-    v1_enabled = Column(Boolean, nullable=True)
--- a/enterprise/tests/unit/conftest.py
+++ b/enterprise/tests/unit/conftest.py
@@ -12,7 +12,6 @@ from storage.base import Base
 # Anything not loaded here may not have a table created for it.
 from storage.billing_session import BillingSession
 from storage.conversation_work import ConversationWork
-from storage.device_code import DeviceCode  # noqa: F401
 from storage.feedback import Feedback
 from storage.github_app_installation import GithubAppInstallation
 from storage.maintenance_task import MaintenanceTask, MaintenanceTaskStatus
--- a/enterprise/tests/unit/experiments/test_saas_experiment_manager.py
+++ b/enterprise/tests/unit/experiments/test_saas_experiment_manager.py
@@ -92,8 +92,11 @@ def test_unknown_variant_returns_original_agent_without_changes(monkeypatch):
    assert getattr(result, 'condenser', None) is None


+@patch('experiments.experiment_manager.handle_condenser_max_step_experiment__v1')
@patch('experiments.experiment_manager.ENABLE_EXPERIMENT_MANAGER', False)
-def test_run_agent_variant_tests_v1_noop_when_manager_disabled():
+def test_run_agent_variant_tests_v1_noop_when_manager_disabled(
+    mock_handle_condenser,
+):
    """If ENABLE_EXPERIMENT_MANAGER is False, the method returns the exact same agent and does not call the handler."""
    agent = make_agent()
    conv_id = uuid4()
@@ -106,6 +109,8 @@ def test_run_agent_variant_tests_v1_noop_when_manager_disabled():

    # Same object returned (no copy)
    assert result is agent
+    # Handler should not have been called
+    mock_handle_condenser.assert_not_called()


@patch('experiments.experiment_manager.ENABLE_EXPERIMENT_MANAGER', True)
@@ -126,3 +131,7 @@ def test_run_agent_variant_tests_v1_calls_handler_and_sets_system_prompt(monkeyp
    # Should be a different instance than the original (copied after handler runs)
    assert result is not agent
    assert result.system_prompt_filename == 'system_prompt_long_horizon.j2'
+
+    # The condenser returned by the handler must be preserved after the system-prompt override copy
+    assert isinstance(result.condenser, LLMSummarizingCondenser)
+    assert result.condenser.max_size == 80
--- a/enterprise/tests/unit/integrations/test_resolver_context.py
+++ b/enterprise/tests/unit/integrations/test_resolver_context.py
@@ -1,133 +0,0 @@
-"""Test for ResolverUserContext get_secrets conversion logic.
-
-This test focuses on testing the actual ResolverUserContext implementation.
-"""
-
-from types import MappingProxyType
-from unittest.mock import AsyncMock
-
-import pytest
-from pydantic import SecretStr
-
-from enterprise.integrations.resolver_context import ResolverUserContext
-
-# Import the real classes we want to test
-from openhands.integrations.provider import CustomSecret
-
-# Import the SDK types we need for testing
-from openhands.sdk.secret import SecretSource, StaticSecret
-from openhands.storage.data_models.secrets import Secrets
-
-
-@pytest.fixture
-def mock_saas_user_auth():
-    """Mock SaasUserAuth for testing."""
-    return AsyncMock()
-
-
-@pytest.fixture
-def resolver_context(mock_saas_user_auth):
-    """Create a ResolverUserContext instance for testing."""
-    return ResolverUserContext(saas_user_auth=mock_saas_user_auth)
-
-
-def create_custom_secret(value: str, description: str = 'Test secret') -> CustomSecret:
-    """Helper to create CustomSecret instances."""
-    return CustomSecret(secret=SecretStr(value), description=description)
-
-
-def create_secrets(custom_secrets_dict: dict[str, CustomSecret]) -> Secrets:
-    """Helper to create Secrets instances."""
-    return Secrets(custom_secrets=MappingProxyType(custom_secrets_dict))
-
-
-@pytest.mark.asyncio
-async def test_get_secrets_converts_custom_to_static(
-    resolver_context, mock_saas_user_auth
-):
-    """Test that get_secrets correctly converts CustomSecret objects to StaticSecret objects."""
-    # Arrange
-    secrets = create_secrets(
-        {
-            'TEST_SECRET_1': create_custom_secret('secret_value_1'),
-            'TEST_SECRET_2': create_custom_secret('secret_value_2'),
-        }
-    )
-    mock_saas_user_auth.get_secrets.return_value = secrets
-
-    # Act
-    result = await resolver_context.get_secrets()
-
-    # Assert
-    assert len(result) == 2
-    assert all(isinstance(secret, StaticSecret) for secret in result.values())
-    assert result['TEST_SECRET_1'].value.get_secret_value() == 'secret_value_1'
-    assert result['TEST_SECRET_2'].value.get_secret_value() == 'secret_value_2'
-
-
-@pytest.mark.asyncio
-async def test_get_secrets_with_special_characters(
-    resolver_context, mock_saas_user_auth
-):
-    """Test that secret values with special characters are preserved during conversion."""
-    # Arrange
-    special_value = 'very_secret_password_123!@#$%^&*()'
-    secrets = create_secrets({'SPECIAL_SECRET': create_custom_secret(special_value)})
-    mock_saas_user_auth.get_secrets.return_value = secrets
-
-    # Act
-    result = await resolver_context.get_secrets()
-
-    # Assert
-    assert len(result) == 1
-    assert isinstance(result['SPECIAL_SECRET'], StaticSecret)
-    assert result['SPECIAL_SECRET'].value.get_secret_value() == special_value
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    'secrets_input,expected_result',
-    [
-        (None, {}),  # No secrets available
-        (create_secrets({}), {}),  # Empty custom secrets
-    ],
-)
-async def test_get_secrets_empty_cases(
-    resolver_context, mock_saas_user_auth, secrets_input, expected_result
-):
-    """Test that get_secrets handles empty cases correctly."""
-    # Arrange
-    mock_saas_user_auth.get_secrets.return_value = secrets_input
-
-    # Act
-    result = await resolver_context.get_secrets()
-
-    # Assert
-    assert result == expected_result
-
-
-def test_static_secret_is_valid_secret_source():
-    """Test that StaticSecret is a valid SecretSource for SDK validation."""
-    # Arrange & Act
-    static_secret = StaticSecret(value='test_secret_123')
-
-    # Assert
-    assert isinstance(static_secret, StaticSecret)
-    assert isinstance(static_secret, SecretSource)
-    assert static_secret.value.get_secret_value() == 'test_secret_123'
-
-
-def test_custom_to_static_conversion():
-    """Test the complete conversion flow from CustomSecret to StaticSecret."""
-    # Arrange
-    secret_value = 'conversion_test_secret'
-    custom_secret = create_custom_secret(secret_value, 'Conversion test')
-
-    # Act - simulate the conversion logic from the actual method
-    extracted_value = custom_secret.secret.get_secret_value()
-    static_secret = StaticSecret(value=extracted_value)
-
-    # Assert
-    assert isinstance(static_secret, StaticSecret)
-    assert isinstance(static_secret, SecretSource)
-    assert static_secret.value.get_secret_value() == secret_value
--- a/enterprise/tests/unit/server/routes/test_oauth_device.py
+++ b/enterprise/tests/unit/server/routes/test_oauth_device.py
@@ -1,610 +0,0 @@
-"""Unit tests for OAuth2 Device Flow endpoints."""
-
-from datetime import UTC, datetime, timedelta
-from unittest.mock import MagicMock, patch
-
-import pytest
-from fastapi import HTTPException, Request
-from fastapi.responses import JSONResponse
-from server.routes.oauth_device import (
-    device_authorization,
-    device_token,
-    device_verification_authenticated,
-)
-from storage.device_code import DeviceCode
-
-
-@pytest.fixture
-def mock_device_code_store():
-    """Mock device code store."""
-    return MagicMock()
-
-
-@pytest.fixture
-def mock_api_key_store():
-    """Mock API key store."""
-    return MagicMock()
-
-
-@pytest.fixture
-def mock_token_manager():
-    """Mock token manager."""
-    return MagicMock()
-
-
-@pytest.fixture
-def mock_request():
-    """Mock FastAPI request."""
-    request = MagicMock(spec=Request)
-    request.base_url = 'https://test.example.com/'
-    return request
-
-
-class TestDeviceAuthorization:
-    """Test device authorization endpoint."""
-
-    @patch('server.routes.oauth_device.device_code_store')
-    async def test_device_authorization_success(self, mock_store, mock_request):
-        """Test successful device authorization."""
-        mock_device = DeviceCode(
-            device_code='test-device-code-123',
-            user_code='ABC12345',
-            expires_at=datetime.now(UTC) + timedelta(minutes=10),
-            current_interval=5,  # Default interval
-        )
-        mock_store.create_device_code.return_value = mock_device
-
-        result = await device_authorization(mock_request)
-
-        assert result.device_code == 'test-device-code-123'
-        assert result.user_code == 'ABC12345'
-        assert result.expires_in == 600
-        assert result.interval == 5  # Should match device's current_interval
-        assert 'verify' in result.verification_uri
-        assert 'ABC12345' in result.verification_uri_complete
-
-    @patch('server.routes.oauth_device.device_code_store')
-    async def test_device_authorization_with_increased_interval(
-        self, mock_store, mock_request
-    ):
-        """Test device authorization returns increased interval from rate limiting."""
-        mock_device = DeviceCode(
-            device_code='test-device-code-456',
-            user_code='XYZ98765',
-            expires_at=datetime.now(UTC) + timedelta(minutes=10),
-            current_interval=15,  # Increased interval from previous rate limiting
-        )
-        mock_store.create_device_code.return_value = mock_device
-
-        result = await device_authorization(mock_request)
-
-        assert result.device_code == 'test-device-code-456'
-        assert result.user_code == 'XYZ98765'
-        assert result.expires_in == 600
-        assert result.interval == 15  # Should match device's increased current_interval
-        assert 'verify' in result.verification_uri
-        assert 'XYZ98765' in result.verification_uri_complete
-
-
-class TestDeviceToken:
-    """Test device token endpoint."""
-
-    @pytest.mark.parametrize(
-        'device_exists,status,expected_error',
-        [
-            (False, None, 'invalid_grant'),
-            (True, 'expired', 'expired_token'),
-            (True, 'denied', 'access_denied'),
-            (True, 'pending', 'authorization_pending'),
-        ],
-    )
-    @patch('server.routes.oauth_device.device_code_store')
-    async def test_device_token_error_cases(
-        self, mock_store, device_exists, status, expected_error
-    ):
-        """Test various error cases for device token endpoint."""
-        device_code = 'test-device-code'
-
-        if device_exists:
-            mock_device = MagicMock()
-            mock_device.is_expired.return_value = status == 'expired'
-            mock_device.status = status
-            # Mock rate limiting - return False (not too fast) and default interval
-            mock_device.check_rate_limit.return_value = (False, 5)
-            mock_store.get_by_device_code.return_value = mock_device
-            mock_store.update_poll_time.return_value = True
-        else:
-            mock_store.get_by_device_code.return_value = None
-
-        result = await device_token(device_code=device_code)
-
-        assert isinstance(result, JSONResponse)
-        assert result.status_code == 400
-        # Check error in response content
-        content = result.body.decode()
-        assert expected_error in content
-
-    @patch('server.routes.oauth_device.ApiKeyStore')
-    @patch('server.routes.oauth_device.device_code_store')
-    async def test_device_token_success(self, mock_store, mock_api_key_class):
-        """Test successful device token retrieval."""
-        device_code = 'test-device-code'
-
-        # Mock authorized device
-        mock_device = MagicMock()
-        mock_device.is_expired.return_value = False
-        mock_device.status = 'authorized'
-        mock_device.keycloak_user_id = 'user-123'
-        mock_device.user_code = (
-            'ABC12345'  # Add user_code for device-specific API key lookup
-        )
-        # Mock rate limiting - return False (not too fast) and default interval
-        mock_device.check_rate_limit.return_value = (False, 5)
-        mock_store.get_by_device_code.return_value = mock_device
-        mock_store.update_poll_time.return_value = True
-
-        # Mock API key retrieval
-        mock_api_key_store = MagicMock()
-        mock_api_key_store.retrieve_api_key_by_name.return_value = 'test-api-key'
-        mock_api_key_class.get_instance.return_value = mock_api_key_store
-
-        result = await device_token(device_code=device_code)
-
-        # Check that result is a DeviceTokenResponse
-        assert result.access_token == 'test-api-key'
-        assert result.token_type == 'Bearer'
-
-        # Verify that the correct device-specific API key name was used
-        mock_api_key_store.retrieve_api_key_by_name.assert_called_once_with(
-            'user-123', 'Device Link Access Key (ABC12345)'
-        )
-
-
-class TestDeviceVerificationAuthenticated:
-    """Test device verification authenticated endpoint."""
-
-    async def test_verification_unauthenticated_user(self):
-        """Test verification with unauthenticated user."""
-        with pytest.raises(HTTPException):
-            await device_verification_authenticated(user_code='ABC12345', user_id=None)
-
-    @patch('server.routes.oauth_device.ApiKeyStore')
-    @patch('server.routes.oauth_device.device_code_store')
-    async def test_verification_invalid_device_code(
-        self, mock_store, mock_api_key_class
-    ):
-        """Test verification with invalid device code."""
-        mock_store.get_by_user_code.return_value = None
-
-        with pytest.raises(HTTPException):
-            await device_verification_authenticated(
-                user_code='INVALID', user_id='user-123'
-            )
-
-    @patch('server.routes.oauth_device.ApiKeyStore')
-    @patch('server.routes.oauth_device.device_code_store')
-    async def test_verification_already_processed(self, mock_store, mock_api_key_class):
-        """Test verification with already processed device code."""
-        mock_device = MagicMock()
-        mock_device.is_pending.return_value = False
-        mock_store.get_by_user_code.return_value = mock_device
-
-        with pytest.raises(HTTPException):
-            await device_verification_authenticated(
-                user_code='ABC12345', user_id='user-123'
-            )
-
-    @patch('server.routes.oauth_device.ApiKeyStore')
-    @patch('server.routes.oauth_device.device_code_store')
-    async def test_verification_success(self, mock_store, mock_api_key_class):
-        """Test successful device verification."""
-        # Mock device code
-        mock_device = MagicMock()
-        mock_device.is_pending.return_value = True
-        mock_store.get_by_user_code.return_value = mock_device
-        mock_store.authorize_device_code.return_value = True
-
-        # Mock API key store
-        mock_api_key_store = MagicMock()
-        mock_api_key_class.get_instance.return_value = mock_api_key_store
-
-        result = await device_verification_authenticated(
-            user_code='ABC12345', user_id='user-123'
-        )
-
-        assert isinstance(result, JSONResponse)
-        assert result.status_code == 200
-        # Should NOT delete existing API keys (multiple devices allowed)
-        mock_api_key_store.delete_api_key_by_name.assert_not_called()
-        # Should create a new API key with device-specific name
-        mock_api_key_store.create_api_key.assert_called_once()
-        call_args = mock_api_key_store.create_api_key.call_args
-        assert call_args[1]['name'] == 'Device Link Access Key (ABC12345)'
-        mock_store.authorize_device_code.assert_called_once_with(
-            user_code='ABC12345', user_id='user-123'
-        )
-
-    @patch('server.routes.oauth_device.ApiKeyStore')
-    @patch('server.routes.oauth_device.device_code_store')
-    async def test_multiple_device_authentication(self, mock_store, mock_api_key_class):
-        """Test that multiple devices can authenticate simultaneously."""
-        # Mock API key store
-        mock_api_key_store = MagicMock()
-        mock_api_key_class.get_instance.return_value = mock_api_key_store
-
-        # Simulate two different devices
-        device1_code = 'ABC12345'
-        device2_code = 'XYZ67890'
-        user_id = 'user-123'
-
-        # Mock device codes
-        mock_device1 = MagicMock()
-        mock_device1.is_pending.return_value = True
-        mock_device2 = MagicMock()
-        mock_device2.is_pending.return_value = True
-
-        # Configure mock store to return appropriate device for each user_code
-        def get_by_user_code_side_effect(user_code):
-            if user_code == device1_code:
-                return mock_device1
-            elif user_code == device2_code:
-                return mock_device2
-            return None
-
-        mock_store.get_by_user_code.side_effect = get_by_user_code_side_effect
-        mock_store.authorize_device_code.return_value = True
-
-        # Authenticate first device
-        result1 = await device_verification_authenticated(
-            user_code=device1_code, user_id=user_id
-        )
-
-        # Authenticate second device
-        result2 = await device_verification_authenticated(
-            user_code=device2_code, user_id=user_id
-        )
-
-        # Both should succeed
-        assert isinstance(result1, JSONResponse)
-        assert result1.status_code == 200
-        assert isinstance(result2, JSONResponse)
-        assert result2.status_code == 200
-
-        # Should create two separate API keys with different names
-        assert mock_api_key_store.create_api_key.call_count == 2
-
-        # Check that each device got a unique API key name
-        call_args_list = mock_api_key_store.create_api_key.call_args_list
-        device1_name = call_args_list[0][1]['name']
-        device2_name = call_args_list[1][1]['name']
-
-        assert device1_name == f'Device Link Access Key ({device1_code})'
-        assert device2_name == f'Device Link Access Key ({device2_code})'
-        assert device1_name != device2_name  # Ensure they're different
-
-        # Should NOT delete any existing API keys
-        mock_api_key_store.delete_api_key_by_name.assert_not_called()
-
-
-class TestDeviceTokenRateLimiting:
-    """Test rate limiting for device token polling (RFC 8628 section 3.5)."""
-
-    @patch('server.routes.oauth_device.device_code_store')
-    async def test_first_poll_allowed(self, mock_store):
-        """Test that the first poll is always allowed."""
-        # Create a device code with no previous poll time
-        mock_device = DeviceCode(
-            device_code='test_device_code',
-            user_code='ABC123',
-            status='pending',
-            expires_at=datetime.now(UTC) + timedelta(minutes=10),
-            last_poll_time=None,  # First poll
-            current_interval=5,
-        )
-        mock_store.get_by_device_code.return_value = mock_device
-        mock_store.update_poll_time.return_value = True
-
-        device_code = 'test_device_code'
-        result = await device_token(device_code=device_code)
-
-        # Should return authorization_pending, not slow_down
-        assert isinstance(result, JSONResponse)
-        assert result.status_code == 400
-        content = result.body.decode()
-        assert 'authorization_pending' in content
-        assert 'slow_down' not in content
-
-        # Should update poll time without increasing interval
-        mock_store.update_poll_time.assert_called_with(
-            'test_device_code', increase_interval=False
-        )
-
-    @patch('server.routes.oauth_device.device_code_store')
-    async def test_normal_polling_allowed(self, mock_store):
-        """Test that normal polling (respecting interval) is allowed."""
-        # Create a device code with last poll time 6 seconds ago (interval is 5)
-        last_poll = datetime.now(UTC) - timedelta(seconds=6)
-        mock_device = DeviceCode(
-            device_code='test_device_code',
-            user_code='ABC123',
-            status='pending',
-            expires_at=datetime.now(UTC) + timedelta(minutes=10),
-            last_poll_time=last_poll,
-            current_interval=5,
-        )
-        mock_store.get_by_device_code.return_value = mock_device
-        mock_store.update_poll_time.return_value = True
-
-        device_code = 'test_device_code'
-        result = await device_token(device_code=device_code)
-
-        # Should return authorization_pending, not slow_down
-        assert isinstance(result, JSONResponse)
-        assert result.status_code == 400
-        content = result.body.decode()
-        assert 'authorization_pending' in content
-        assert 'slow_down' not in content
-
-        # Should update poll time without increasing interval
-        mock_store.update_poll_time.assert_called_with(
-            'test_device_code', increase_interval=False
-        )
-
-    @patch('server.routes.oauth_device.device_code_store')
-    async def test_fast_polling_returns_slow_down(self, mock_store):
-        """Test that polling too fast returns slow_down error."""
-        # Create a device code with last poll time 2 seconds ago (interval is 5)
-        last_poll = datetime.now(UTC) - timedelta(seconds=2)
-        mock_device = DeviceCode(
-            device_code='test_device_code',
-            user_code='ABC123',
-            status='pending',
-            expires_at=datetime.now(UTC) + timedelta(minutes=10),
-            last_poll_time=last_poll,
-            current_interval=5,
-        )
-        mock_store.get_by_device_code.return_value = mock_device
-        mock_store.update_poll_time.return_value = True
-
-        device_code = 'test_device_code'
-        result = await device_token(device_code=device_code)
-
-        # Should return slow_down error
-        assert isinstance(result, JSONResponse)
-        assert result.status_code == 400
-        content = result.body.decode()
-        assert 'slow_down' in content
-        assert 'interval' in content
-        assert '10' in content  # New interval should be 5 + 5 = 10
-
-        # Should update poll time and increase interval
-        mock_store.update_poll_time.assert_called_with(
-            'test_device_code', increase_interval=True
-        )
-
-    @patch('server.routes.oauth_device.device_code_store')
-    async def test_interval_increases_with_repeated_fast_polling(self, mock_store):
-        """Test that interval increases with repeated fast polling."""
-        # Create a device code with higher current interval from previous slow_down
-        last_poll = datetime.now(UTC) - timedelta(seconds=5)  # 5 seconds ago
-        mock_device = DeviceCode(
-            device_code='test_device_code',
-            user_code='ABC123',
-            status='pending',
-            expires_at=datetime.now(UTC) + timedelta(minutes=10),
-            last_poll_time=last_poll,
-            current_interval=15,  # Already increased from previous slow_down
-        )
-        mock_store.get_by_device_code.return_value = mock_device
-        mock_store.update_poll_time.return_value = True
-
-        device_code = 'test_device_code'
-        result = await device_token(device_code=device_code)
-
-        # Should return slow_down error with increased interval
-        assert isinstance(result, JSONResponse)
-        assert result.status_code == 400
-        content = result.body.decode()
-        assert 'slow_down' in content
-        assert '20' in content  # New interval should be 15 + 5 = 20
-
-        # Should update poll time and increase interval
-        mock_store.update_poll_time.assert_called_with(
-            'test_device_code', increase_interval=True
-        )
-
-    @patch('server.routes.oauth_device.device_code_store')
-    async def test_interval_caps_at_maximum(self, mock_store):
-        """Test that interval is capped at maximum value."""
-        # Create a device code with interval near maximum
-        last_poll = datetime.now(UTC) - timedelta(seconds=30)
-        mock_device = DeviceCode(
-            device_code='test_device_code',
-            user_code='ABC123',
-            status='pending',
-            expires_at=datetime.now(UTC) + timedelta(minutes=10),
-            last_poll_time=last_poll,
-            current_interval=58,  # Near maximum of 60
-        )
-        mock_store.get_by_device_code.return_value = mock_device
-        mock_store.update_poll_time.return_value = True
-
-        device_code = 'test_device_code'
-        result = await device_token(device_code=device_code)
-
-        # Should return slow_down error with capped interval
-        assert isinstance(result, JSONResponse)
-        assert result.status_code == 400
-        content = result.body.decode()
-        assert 'slow_down' in content
-        assert '60' in content  # Should be capped at 60, not 63
-
-    @patch('server.routes.oauth_device.device_code_store')
-    async def test_rate_limiting_with_authorized_device(self, mock_store):
-        """Test that rate limiting still applies to authorized devices."""
-        # Create an authorized device code with recent poll
-        last_poll = datetime.now(UTC) - timedelta(seconds=2)
-        mock_device = DeviceCode(
-            device_code='test_device_code',
-            user_code='ABC123',
-            status='authorized',  # Device is authorized
-            keycloak_user_id='user123',
-            expires_at=datetime.now(UTC) + timedelta(minutes=10),
-            last_poll_time=last_poll,
-            current_interval=5,
-        )
-        mock_store.get_by_device_code.return_value = mock_device
-        mock_store.update_poll_time.return_value = True
-
-        device_code = 'test_device_code'
-        result = await device_token(device_code=device_code)
-
-        # Should still return slow_down error even for authorized device
-        assert isinstance(result, JSONResponse)
-        assert result.status_code == 400
-        content = result.body.decode()
-        assert 'slow_down' in content
-
-        # Should update poll time and increase interval
-        mock_store.update_poll_time.assert_called_with(
-            'test_device_code', increase_interval=True
-        )
-
-
-class TestDeviceVerificationTransactionIntegrity:
-    """Test transaction integrity for device verification to prevent orphaned API keys."""
-
-    @patch('server.routes.oauth_device.ApiKeyStore')
-    @patch('server.routes.oauth_device.device_code_store')
-    async def test_authorization_failure_prevents_api_key_creation(
-        self, mock_store, mock_api_key_class
-    ):
-        """Test that if device authorization fails, no API key is created."""
-        # Mock device code
-        mock_device = MagicMock()
-        mock_device.is_pending.return_value = True
-        mock_store.get_by_user_code.return_value = mock_device
-        mock_store.authorize_device_code.return_value = False  # Authorization fails
-
-        # Mock API key store
-        mock_api_key_store = MagicMock()
-        mock_api_key_class.get_instance.return_value = mock_api_key_store
-
-        # Should raise HTTPException due to authorization failure
-        with pytest.raises(HTTPException) as exc_info:
-            await device_verification_authenticated(
-                user_code='ABC12345', user_id='user-123'
-            )
-
-        assert exc_info.value.status_code == 500
-        assert 'Failed to authorize the device' in exc_info.value.detail
-
-        # API key should NOT be created since authorization failed
-        mock_api_key_store.create_api_key.assert_not_called()
-        mock_store.authorize_device_code.assert_called_once_with(
-            user_code='ABC12345', user_id='user-123'
-        )
-
-    @patch('server.routes.oauth_device.ApiKeyStore')
-    @patch('server.routes.oauth_device.device_code_store')
-    async def test_api_key_creation_failure_reverts_authorization(
-        self, mock_store, mock_api_key_class
-    ):
-        """Test that if API key creation fails after authorization, the authorization is reverted."""
-        # Mock device code
-        mock_device = MagicMock()
-        mock_device.is_pending.return_value = True
-        mock_store.get_by_user_code.return_value = mock_device
-        mock_store.authorize_device_code.return_value = True  # Authorization succeeds
-        mock_store.deny_device_code.return_value = True  # Cleanup succeeds
-
-        # Mock API key store to fail on creation
-        mock_api_key_store = MagicMock()
-        mock_api_key_store.create_api_key.side_effect = Exception('Database error')
-        mock_api_key_class.get_instance.return_value = mock_api_key_store
-
-        # Should raise HTTPException due to API key creation failure
-        with pytest.raises(HTTPException) as exc_info:
-            await device_verification_authenticated(
-                user_code='ABC12345', user_id='user-123'
-            )
-
-        assert exc_info.value.status_code == 500
-        assert 'Failed to create API key for device access' in exc_info.value.detail
-
-        # Authorization should have been attempted first
-        mock_store.authorize_device_code.assert_called_once_with(
-            user_code='ABC12345', user_id='user-123'
-        )
-
-        # API key creation should have been attempted after authorization
-        mock_api_key_store.create_api_key.assert_called_once()
-
-        # Authorization should be reverted due to API key creation failure
-        mock_store.deny_device_code.assert_called_once_with('ABC12345')
-
-    @patch('server.routes.oauth_device.ApiKeyStore')
-    @patch('server.routes.oauth_device.device_code_store')
-    async def test_api_key_creation_failure_cleanup_failure_logged(
-        self, mock_store, mock_api_key_class
-    ):
-        """Test that cleanup failure is logged but doesn't prevent the main error from being raised."""
-        # Mock device code
-        mock_device = MagicMock()
-        mock_device.is_pending.return_value = True
-        mock_store.get_by_user_code.return_value = mock_device
-        mock_store.authorize_device_code.return_value = True  # Authorization succeeds
-        mock_store.deny_device_code.side_effect = Exception(
-            'Cleanup failed'
-        )  # Cleanup fails
-
-        # Mock API key store to fail on creation
-        mock_api_key_store = MagicMock()
-        mock_api_key_store.create_api_key.side_effect = Exception('Database error')
-        mock_api_key_class.get_instance.return_value = mock_api_key_store
-
-        # Should still raise HTTPException for the original API key creation failure
-        with pytest.raises(HTTPException) as exc_info:
-            await device_verification_authenticated(
-                user_code='ABC12345', user_id='user-123'
-            )
-
-        assert exc_info.value.status_code == 500
-        assert 'Failed to create API key for device access' in exc_info.value.detail
-
-        # Both operations should have been attempted
-        mock_store.authorize_device_code.assert_called_once()
-        mock_api_key_store.create_api_key.assert_called_once()
-        mock_store.deny_device_code.assert_called_once_with('ABC12345')
-
-    @patch('server.routes.oauth_device.ApiKeyStore')
-    @patch('server.routes.oauth_device.device_code_store')
-    async def test_successful_flow_creates_api_key_after_authorization(
-        self, mock_store, mock_api_key_class
-    ):
-        """Test that in the successful flow, API key is created only after authorization."""
-        # Mock device code
-        mock_device = MagicMock()
-        mock_device.is_pending.return_value = True
-        mock_store.get_by_user_code.return_value = mock_device
-        mock_store.authorize_device_code.return_value = True  # Authorization succeeds
-
-        # Mock API key store
-        mock_api_key_store = MagicMock()
-        mock_api_key_class.get_instance.return_value = mock_api_key_store
-
-        result = await device_verification_authenticated(
-            user_code='ABC12345', user_id='user-123'
-        )
-
-        assert isinstance(result, JSONResponse)
-        assert result.status_code == 200
-
-        # Verify the order: authorization first, then API key creation
-        mock_store.authorize_device_code.assert_called_once_with(
-            user_code='ABC12345', user_id='user-123'
-        )
-        mock_api_key_store.create_api_key.assert_called_once()
-
-        # No cleanup should be needed in successful case
-        mock_store.deny_device_code.assert_not_called()
--- a/enterprise/tests/unit/storage/test_device_code.py
+++ b/enterprise/tests/unit/storage/test_device_code.py
@@ -1,83 +0,0 @@
-"""Unit tests for DeviceCode model."""
-
-from datetime import datetime, timedelta, timezone
-
-import pytest
-from storage.device_code import DeviceCode, DeviceCodeStatus
-
-
-class TestDeviceCode:
-    """Test cases for DeviceCode model."""
-
-    @pytest.fixture
-    def device_code(self):
-        """Create a test device code."""
-        return DeviceCode(
-            device_code='test-device-code-123',
-            user_code='ABC12345',
-            expires_at=datetime.now(timezone.utc) + timedelta(minutes=10),
-        )
-
-    @pytest.mark.parametrize(
-        'expires_delta,expected',
-        [
-            (timedelta(minutes=5), False),  # Future expiry
-            (timedelta(minutes=-5), True),  # Past expiry
-            (timedelta(seconds=1), False),  # Just future (not expired)
-        ],
-    )
-    def test_is_expired(self, expires_delta, expected):
-        """Test expiration check with various time deltas."""
-        device_code = DeviceCode(
-            device_code='test-device-code',
-            user_code='ABC12345',
-            expires_at=datetime.now(timezone.utc) + expires_delta,
-        )
-        assert device_code.is_expired() == expected
-
-    @pytest.mark.parametrize(
-        'status,expired,expected',
-        [
-            (DeviceCodeStatus.PENDING.value, False, True),
-            (DeviceCodeStatus.PENDING.value, True, False),
-            (DeviceCodeStatus.AUTHORIZED.value, False, False),
-            (DeviceCodeStatus.DENIED.value, False, False),
-        ],
-    )
-    def test_is_pending(self, status, expired, expected):
-        """Test pending status check."""
-        expires_at = (
-            datetime.now(timezone.utc) - timedelta(minutes=1)
-            if expired
-            else datetime.now(timezone.utc) + timedelta(minutes=10)
-        )
-        device_code = DeviceCode(
-            device_code='test-device-code',
-            user_code='ABC12345',
-            status=status,
-            expires_at=expires_at,
-        )
-        assert device_code.is_pending() == expected
-
-    def test_authorize(self, device_code):
-        """Test device authorization."""
-        user_id = 'test-user-123'
-
-        device_code.authorize(user_id)
-
-        assert device_code.status == DeviceCodeStatus.AUTHORIZED.value
-        assert device_code.keycloak_user_id == user_id
-        assert device_code.authorized_at is not None
-        assert isinstance(device_code.authorized_at, datetime)
-
-    @pytest.mark.parametrize(
-        'method,expected_status',
-        [
-            ('deny', DeviceCodeStatus.DENIED.value),
-            ('expire', DeviceCodeStatus.EXPIRED.value),
-        ],
-    )
-    def test_status_changes(self, device_code, method, expected_status):
-        """Test status change methods."""
-        getattr(device_code, method)()
-        assert device_code.status == expected_status
--- a/enterprise/tests/unit/storage/test_device_code_store.py
+++ b/enterprise/tests/unit/storage/test_device_code_store.py
@@ -1,193 +0,0 @@
-"""Unit tests for DeviceCodeStore."""
-
-from unittest.mock import MagicMock
-
-import pytest
-from sqlalchemy.exc import IntegrityError
-from storage.device_code import DeviceCode
-from storage.device_code_store import DeviceCodeStore
-
-
-@pytest.fixture
-def mock_session():
-    """Mock database session."""
-    session = MagicMock()
-    return session
-
-
-@pytest.fixture
-def mock_session_maker(mock_session):
-    """Mock session maker."""
-    session_maker = MagicMock()
-    session_maker.return_value.__enter__.return_value = mock_session
-    session_maker.return_value.__exit__.return_value = None
-    return session_maker
-
-
-@pytest.fixture
-def device_code_store(mock_session_maker):
-    """Create DeviceCodeStore instance."""
-    return DeviceCodeStore(mock_session_maker)
-
-
-class TestDeviceCodeStore:
-    """Test cases for DeviceCodeStore."""
-
-    def test_generate_user_code(self, device_code_store):
-        """Test user code generation."""
-        code = device_code_store.generate_user_code()
-
-        assert len(code) == 8
-        assert code.isupper()
-        # Should not contain confusing characters
-        assert not any(char in code for char in 'IO01')
-
-    def test_generate_device_code(self, device_code_store):
-        """Test device code generation."""
-        code = device_code_store.generate_device_code()
-
-        assert len(code) == 128
-        assert code.isalnum()
-
-    def test_create_device_code_success(self, device_code_store, mock_session):
-        """Test successful device code creation."""
-        # Mock successful creation (no IntegrityError)
-        mock_device_code = MagicMock(spec=DeviceCode)
-        mock_device_code.device_code = 'test-device-code-123'
-        mock_device_code.user_code = 'TESTCODE'
-
-        # Mock the session to return our mock device code after refresh
-        def mock_refresh(obj):
-            obj.device_code = mock_device_code.device_code
-            obj.user_code = mock_device_code.user_code
-
-        mock_session.refresh.side_effect = mock_refresh
-
-        result = device_code_store.create_device_code(expires_in=600)
-
-        assert isinstance(result, DeviceCode)
-        mock_session.add.assert_called_once()
-        mock_session.commit.assert_called_once()
-        mock_session.refresh.assert_called_once()
-        mock_session.expunge.assert_called_once()
-
-    def test_create_device_code_with_retries(
-        self, device_code_store, mock_session_maker
-    ):
-        """Test device code creation with constraint violation retries."""
-        mock_session = MagicMock()
-        mock_session_maker.return_value.__enter__.return_value = mock_session
-        mock_session_maker.return_value.__exit__.return_value = None
-
-        # First attempt fails with IntegrityError, second succeeds
-        mock_session.commit.side_effect = [IntegrityError('', '', ''), None]
-
-        mock_device_code = MagicMock(spec=DeviceCode)
-        mock_device_code.device_code = 'test-device-code-456'
-        mock_device_code.user_code = 'TESTCD2'
-
-        def mock_refresh(obj):
-            obj.device_code = mock_device_code.device_code
-            obj.user_code = mock_device_code.user_code
-
-        mock_session.refresh.side_effect = mock_refresh
-
-        store = DeviceCodeStore(mock_session_maker)
-        result = store.create_device_code(expires_in=600)
-
-        assert isinstance(result, DeviceCode)
-        assert mock_session.add.call_count == 2  # Two attempts
-        assert mock_session.commit.call_count == 2  # Two attempts
-
-    def test_create_device_code_max_attempts_exceeded(
-        self, device_code_store, mock_session_maker
-    ):
-        """Test device code creation failure after max attempts."""
-        mock_session = MagicMock()
-        mock_session_maker.return_value.__enter__.return_value = mock_session
-        mock_session_maker.return_value.__exit__.return_value = None
-
-        # All attempts fail with IntegrityError
-        mock_session.commit.side_effect = IntegrityError('', '', '')
-
-        store = DeviceCodeStore(mock_session_maker)
-
-        with pytest.raises(
-            RuntimeError,
-            match='Failed to generate unique device codes after 3 attempts',
-        ):
-            store.create_device_code(expires_in=600, max_attempts=3)
-
-    @pytest.mark.parametrize(
-        'lookup_method,lookup_field',
-        [
-            ('get_by_device_code', 'device_code'),
-            ('get_by_user_code', 'user_code'),
-        ],
-    )
-    def test_lookup_methods(
-        self, device_code_store, mock_session, lookup_method, lookup_field
-    ):
-        """Test device code lookup methods."""
-        test_code = 'test-code-123'
-        mock_device_code = MagicMock()
-        mock_session.query.return_value.filter_by.return_value.first.return_value = (
-            mock_device_code
-        )
-
-        result = getattr(device_code_store, lookup_method)(test_code)
-
-        assert result == mock_device_code
-        mock_session.query.assert_called_once_with(DeviceCode)
-        mock_session.query.return_value.filter_by.assert_called_once_with(
-            **{lookup_field: test_code}
-        )
-
-    @pytest.mark.parametrize(
-        'device_exists,is_pending,expected_result',
-        [
-            (True, True, True),  # Success case
-            (False, True, False),  # Device not found
-            (True, False, False),  # Device not pending
-        ],
-    )
-    def test_authorize_device_code(
-        self,
-        device_code_store,
-        mock_session,
-        device_exists,
-        is_pending,
-        expected_result,
-    ):
-        """Test device code authorization."""
-        user_code = 'ABC12345'
-        user_id = 'test-user-123'
-
-        if device_exists:
-            mock_device = MagicMock()
-            mock_device.is_pending.return_value = is_pending
-            mock_session.query.return_value.filter_by.return_value.first.return_value = mock_device
-        else:
-            mock_session.query.return_value.filter_by.return_value.first.return_value = None
-
-        result = device_code_store.authorize_device_code(user_code, user_id)
-
-        assert result == expected_result
-        if expected_result:
-            mock_device.authorize.assert_called_once_with(user_id)
-            mock_session.commit.assert_called_once()
-
-    def test_deny_device_code(self, device_code_store, mock_session):
-        """Test device code denial."""
-        user_code = 'ABC12345'
-        mock_device = MagicMock()
-        mock_device.is_pending.return_value = True
-        mock_session.query.return_value.filter_by.return_value.first.return_value = (
-            mock_device
-        )
-
-        result = device_code_store.deny_device_code(user_code)
-
-        assert result is True
-        mock_device.deny.assert_called_once()
-        mock_session.commit.assert_called_once()
--- a/enterprise/tests/unit/test_api_key_store.py
+++ b/enterprise/tests/unit/test_api_key_store.py
@@ -25,12 +25,10 @@ def api_key_store(mock_session_maker):


 def test_generate_api_key(api_key_store):
-    """Test that generate_api_key returns a string with sk-oh- prefix and expected length."""
+    """Test that generate_api_key returns a string of the expected length."""
    key = api_key_store.generate_api_key(length=32)
    assert isinstance(key, str)
-    assert key.startswith('sk-oh-')
-    # Total length should be prefix (6 chars) + random part (32 chars) = 38 chars
-    assert len(key) == len('sk-oh-') + 32
+    assert len(key) == 32


 def test_create_api_key(api_key_store, mock_session):
@@ -92,50 +90,6 @@ def test_validate_api_key_expired(api_key_store, mock_session):
    mock_session.commit.assert_not_called()


-def test_validate_api_key_expired_timezone_naive(api_key_store, mock_session):
-    """Test validating an expired API key with timezone-naive datetime from database."""
-    # Setup
-    api_key = 'test-api-key'
-    mock_key_record = MagicMock()
-    # Simulate timezone-naive datetime as returned from database
-    mock_key_record.expires_at = datetime.now() - timedelta(days=1)  # No UTC timezone
-    mock_key_record.id = 1
-    mock_session.query.return_value.filter.return_value.first.return_value = (
-        mock_key_record
-    )
-
-    # Execute
-    result = api_key_store.validate_api_key(api_key)
-
-    # Verify
-    assert result is None
-    mock_session.execute.assert_not_called()
-    mock_session.commit.assert_not_called()
-
-
-def test_validate_api_key_valid_timezone_naive(api_key_store, mock_session):
-    """Test validating a valid API key with timezone-naive datetime from database."""
-    # Setup
-    api_key = 'test-api-key'
-    user_id = 'test-user-123'
-    mock_key_record = MagicMock()
-    mock_key_record.user_id = user_id
-    # Simulate timezone-naive datetime as returned from database (future date)
-    mock_key_record.expires_at = datetime.now() + timedelta(days=1)  # No UTC timezone
-    mock_key_record.id = 1
-    mock_session.query.return_value.filter.return_value.first.return_value = (
-        mock_key_record
-    )
-
-    # Execute
-    result = api_key_store.validate_api_key(api_key)
-
-    # Verify
-    assert result == user_id
-    mock_session.execute.assert_called_once()
-    mock_session.commit.assert_called_once()
-
-
 def test_validate_api_key_not_found(api_key_store, mock_session):
    """Test validating a non-existent API key."""
    # Setup
--- a/enterprise/tests/unit/test_get_user_v1_enabled_setting.py
+++ b/enterprise/tests/unit/test_get_user_v1_enabled_setting.py
@@ -1,132 +0,0 @@
-"""Unit tests for get_user_v1_enabled_setting function."""
-
-import os
-from unittest.mock import AsyncMock, MagicMock, patch
-
-import pytest
-from integrations.github.github_view import get_user_v1_enabled_setting
-
-
-@pytest.fixture
-def mock_user_settings():
-    """Create a mock user settings object."""
-    settings = MagicMock()
-    settings.v1_enabled = True  # Default to True, can be overridden in tests
-    return settings
-
-
-@pytest.fixture
-def mock_settings_store(mock_user_settings):
-    """Create a mock settings store."""
-    store = MagicMock()
-    store.get_user_settings_by_keycloak_id = AsyncMock(return_value=mock_user_settings)
-    return store
-
-
-@pytest.fixture
-def mock_config():
-    """Create a mock config object."""
-    return MagicMock()
-
-
-@pytest.fixture
-def mock_session_maker():
-    """Create a mock session maker."""
-    return MagicMock()
-
-
-@pytest.fixture
-def mock_dependencies(
-    mock_settings_store, mock_config, mock_session_maker, mock_user_settings
-):
-    """Fixture that patches all the common dependencies."""
-    with patch(
-        'integrations.github.github_view.SaasSettingsStore',
-        return_value=mock_settings_store,
-    ) as mock_store_class, patch(
-        'integrations.github.github_view.get_config', return_value=mock_config
-    ) as mock_get_config, patch(
-        'integrations.github.github_view.session_maker', mock_session_maker
-    ), patch(
-        'integrations.github.github_view.call_sync_from_async',
-        return_value=mock_user_settings,
-    ) as mock_call_sync:
-        yield {
-            'store_class': mock_store_class,
-            'get_config': mock_get_config,
-            'session_maker': mock_session_maker,
-            'call_sync': mock_call_sync,
-            'settings_store': mock_settings_store,
-            'user_settings': mock_user_settings,
-        }
-
-
-class TestGetUserV1EnabledSetting:
-    """Test cases for get_user_v1_enabled_setting function."""
-
-    @pytest.mark.asyncio
-    @pytest.mark.parametrize(
-        'env_var_enabled,user_setting_enabled,expected_result',
-        [
-            (False, True, False),  # Env var disabled, user enabled -> False
-            (True, False, False),  # Env var enabled, user disabled -> False
-            (True, True, True),  # Both enabled -> True
-            (False, False, False),  # Both disabled -> False
-        ],
-    )
-    async def test_v1_enabled_combinations(
-        self, mock_dependencies, env_var_enabled, user_setting_enabled, expected_result
-    ):
-        """Test all combinations of environment variable and user setting values."""
-        mock_dependencies['user_settings'].v1_enabled = user_setting_enabled
-
-        with patch(
-            'integrations.github.github_view.ENABLE_V1_GITHUB_RESOLVER', env_var_enabled
-        ):
-            result = await get_user_v1_enabled_setting('test_user_id')
-            assert result is expected_result
-
-    @pytest.mark.asyncio
-    @pytest.mark.parametrize(
-        'env_var_value,env_var_bool,expected_result',
-        [
-            ('false', False, False),  # Environment variable 'false' -> False
-            ('true', True, True),  # Environment variable 'true' -> True
-        ],
-    )
-    async def test_environment_variable_integration(
-        self, mock_dependencies, env_var_value, env_var_bool, expected_result
-    ):
-        """Test that the function properly reads the ENABLE_V1_GITHUB_RESOLVER environment variable."""
-        mock_dependencies['user_settings'].v1_enabled = True
-
-        with patch.dict(
-            os.environ, {'ENABLE_V1_GITHUB_RESOLVER': env_var_value}
-        ), patch('integrations.utils.os.getenv', return_value=env_var_value), patch(
-            'integrations.github.github_view.ENABLE_V1_GITHUB_RESOLVER', env_var_bool
-        ):
-            result = await get_user_v1_enabled_setting('test_user_id')
-            assert result is expected_result
-
-    @pytest.mark.asyncio
-    async def test_function_calls_correct_methods(self, mock_dependencies):
-        """Test that the function calls the correct methods with correct parameters."""
-        mock_dependencies['user_settings'].v1_enabled = True
-
-        with patch('integrations.github.github_view.ENABLE_V1_GITHUB_RESOLVER', True):
-            result = await get_user_v1_enabled_setting('test_user_123')
-
-            # Verify the result
-            assert result is True
-
-            # Verify correct methods were called with correct parameters
-            mock_dependencies['get_config'].assert_called_once()
-            mock_dependencies['store_class'].assert_called_once_with(
-                user_id='test_user_123',
-                session_maker=mock_dependencies['session_maker'],
-                config=mock_dependencies['get_config'].return_value,
-            )
-            mock_dependencies['call_sync'].assert_called_once_with(
-                mock_dependencies['settings_store'].get_user_settings_by_keycloak_id,
-                'test_user_123',
-            )
--- a/enterprise/tests/unit/test_github_view.py
+++ b/enterprise/tests/unit/test_github_view.py
@@ -1,10 +1,7 @@
 from unittest import TestCase, mock
-from unittest.mock import MagicMock, patch

-import pytest
-from integrations.github.github_view import GithubFactory, GithubIssue, get_oh_labels
+from integrations.github.github_view import GithubFactory, get_oh_labels
 from integrations.models import Message, SourceType
-from integrations.types import UserData


 class TestGithubLabels(TestCase):
@@ -78,132 +75,3 @@ class TestGithubCommentCaseInsensitivity(TestCase):
        self.assertTrue(GithubFactory.is_issue_comment(message_lower))
        self.assertTrue(GithubFactory.is_issue_comment(message_upper))
        self.assertTrue(GithubFactory.is_issue_comment(message_mixed))
-
-
-class TestGithubV1ConversationRouting(TestCase):
-    """Test V1 conversation routing logic in GitHub integration."""
-
-    def setUp(self):
-        """Set up test fixtures."""
-        # Create a proper UserData instance instead of MagicMock
-        user_data = UserData(
-            user_id=123, username='testuser', keycloak_user_id='test-keycloak-id'
-        )
-
-        # Create a mock raw_payload
-        raw_payload = Message(
-            source=SourceType.GITHUB,
-            message={
-                'payload': {
-                    'action': 'opened',
-                    'issue': {'number': 123},
-                }
-            },
-        )
-
-        self.github_issue = GithubIssue(
-            user_info=user_data,
-            full_repo_name='test/repo',
-            issue_number=123,
-            installation_id=456,
-            conversation_id='test-conversation-id',
-            should_extract=True,
-            send_summary_instruction=False,
-            is_public_repo=True,
-            raw_payload=raw_payload,
-            uuid='test-uuid',
-            title='Test Issue',
-            description='Test issue description',
-            previous_comments=[],
-            v1=False,
-        )
-
-    @pytest.mark.asyncio
-    @patch('integrations.github.github_view.get_user_v1_enabled_setting')
-    @patch.object(GithubIssue, '_create_v0_conversation')
-    @patch.object(GithubIssue, '_create_v1_conversation')
-    async def test_create_new_conversation_routes_to_v0_when_disabled(
-        self, mock_create_v1, mock_create_v0, mock_get_v1_setting
-    ):
-        """Test that conversation creation routes to V0 when v1_enabled is False."""
-        # Mock v1_enabled as False
-        mock_get_v1_setting.return_value = False
-        mock_create_v0.return_value = None
-        mock_create_v1.return_value = None
-
-        # Mock parameters
-        jinja_env = MagicMock()
-        git_provider_tokens = MagicMock()
-        conversation_metadata = MagicMock()
-
-        # Call the method
-        await self.github_issue.create_new_conversation(
-            jinja_env, git_provider_tokens, conversation_metadata
-        )
-
-        # Verify V0 was called and V1 was not
-        mock_create_v0.assert_called_once_with(
-            jinja_env, git_provider_tokens, conversation_metadata
-        )
-        mock_create_v1.assert_not_called()
-
-    @pytest.mark.asyncio
-    @patch('integrations.github.github_view.get_user_v1_enabled_setting')
-    @patch.object(GithubIssue, '_create_v0_conversation')
-    @patch.object(GithubIssue, '_create_v1_conversation')
-    async def test_create_new_conversation_routes_to_v1_when_enabled(
-        self, mock_create_v1, mock_create_v0, mock_get_v1_setting
-    ):
-        """Test that conversation creation routes to V1 when v1_enabled is True."""
-        # Mock v1_enabled as True
-        mock_get_v1_setting.return_value = True
-        mock_create_v0.return_value = None
-        mock_create_v1.return_value = None
-
-        # Mock parameters
-        jinja_env = MagicMock()
-        git_provider_tokens = MagicMock()
-        conversation_metadata = MagicMock()
-
-        # Call the method
-        await self.github_issue.create_new_conversation(
-            jinja_env, git_provider_tokens, conversation_metadata
-        )
-
-        # Verify V1 was called and V0 was not
-        mock_create_v1.assert_called_once_with(
-            jinja_env, git_provider_tokens, conversation_metadata
-        )
-        mock_create_v0.assert_not_called()
-
-    @pytest.mark.asyncio
-    @patch('integrations.github.github_view.get_user_v1_enabled_setting')
-    @patch.object(GithubIssue, '_create_v0_conversation')
-    @patch.object(GithubIssue, '_create_v1_conversation')
-    async def test_create_new_conversation_fallback_on_v1_setting_error(
-        self, mock_create_v1, mock_create_v0, mock_get_v1_setting
-    ):
-        """Test that conversation creation falls back to V0 when _create_v1_conversation fails."""
-        # Mock v1_enabled as True so V1 is attempted
-        mock_get_v1_setting.return_value = True
-        # Mock _create_v1_conversation to raise an exception
-        mock_create_v1.side_effect = Exception('V1 conversation creation failed')
-        mock_create_v0.return_value = None
-
-        # Mock parameters
-        jinja_env = MagicMock()
-        git_provider_tokens = MagicMock()
-        conversation_metadata = MagicMock()
-
-        # Call the method
-        await self.github_issue.create_new_conversation(
-            jinja_env, git_provider_tokens, conversation_metadata
-        )
-
-        # Verify V1 was attempted first, then V0 was called as fallback
-        mock_create_v1.assert_called_once_with(
-            jinja_env, git_provider_tokens, conversation_metadata
-        )
-        mock_create_v0.assert_called_once_with(
-            jinja_env, git_provider_tokens, conversation_metadata
-        )
--- a/enterprise/tests/unit/test_legacy_conversation_manager.py
+++ b/enterprise/tests/unit/test_legacy_conversation_manager.py
@@ -0,0 +1,485 @@
+import time
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+from server.legacy_conversation_manager import (
+    _LEGACY_ENTRY_TIMEOUT_SECONDS,
+    LegacyCacheEntry,
+    LegacyConversationManager,
+)
+
+from openhands.core.config.openhands_config import OpenHandsConfig
+from openhands.server.config.server_config import ServerConfig
+from openhands.server.monitoring import MonitoringListener
+from openhands.storage.memory import InMemoryFileStore
+
+
+@pytest.fixture
+def mock_sio():
+    """Create a mock SocketIO server."""
+    return MagicMock()
+
+
+@pytest.fixture
+def mock_config():
+    """Create a mock OpenHands config."""
+    return MagicMock(spec=OpenHandsConfig)
+
+
+@pytest.fixture
+def mock_server_config():
+    """Create a mock server config."""
+    return MagicMock(spec=ServerConfig)
+
+
+@pytest.fixture
+def mock_file_store():
+    """Create a mock file store."""
+    return MagicMock(spec=InMemoryFileStore)
+
+
+@pytest.fixture
+def mock_monitoring_listener():
+    """Create a mock monitoring listener."""
+    return MagicMock(spec=MonitoringListener)
+
+
+@pytest.fixture
+def mock_conversation_manager():
+    """Create a mock SaasNestedConversationManager."""
+    mock_cm = MagicMock()
+    mock_cm._get_runtime = AsyncMock()
+    return mock_cm
+
+
+@pytest.fixture
+def mock_legacy_conversation_manager():
+    """Create a mock ClusteredConversationManager."""
+    return MagicMock()
+
+
+@pytest.fixture
+def legacy_manager(
+    mock_sio,
+    mock_config,
+    mock_server_config,
+    mock_file_store,
+    mock_conversation_manager,
+    mock_legacy_conversation_manager,
+):
+    """Create a LegacyConversationManager instance for testing."""
+    return LegacyConversationManager(
+        sio=mock_sio,
+        config=mock_config,
+        server_config=mock_server_config,
+        file_store=mock_file_store,
+        conversation_manager=mock_conversation_manager,
+        legacy_conversation_manager=mock_legacy_conversation_manager,
+    )
+
+
+class TestLegacyCacheEntry:
+    """Test the LegacyCacheEntry dataclass."""
+
+    def test_cache_entry_creation(self):
+        """Test creating a cache entry."""
+        timestamp = time.time()
+        entry = LegacyCacheEntry(is_legacy=True, timestamp=timestamp)
+
+        assert entry.is_legacy is True
+        assert entry.timestamp == timestamp
+
+    def test_cache_entry_false(self):
+        """Test creating a cache entry with False value."""
+        timestamp = time.time()
+        entry = LegacyCacheEntry(is_legacy=False, timestamp=timestamp)
+
+        assert entry.is_legacy is False
+        assert entry.timestamp == timestamp
+
+
+class TestLegacyConversationManagerCacheCleanup:
+    """Test cache cleanup functionality."""
+
+    def test_cleanup_expired_cache_entries_removes_expired(self, legacy_manager):
+        """Test that expired entries are removed from cache."""
+        current_time = time.time()
+        expired_time = current_time - _LEGACY_ENTRY_TIMEOUT_SECONDS - 1
+        valid_time = current_time - 100  # Well within timeout
+
+        # Add both expired and valid entries
+        legacy_manager._legacy_cache = {
+            'expired_conversation': LegacyCacheEntry(True, expired_time),
+            'valid_conversation': LegacyCacheEntry(False, valid_time),
+            'another_expired': LegacyCacheEntry(True, expired_time - 100),
+        }
+
+        legacy_manager._cleanup_expired_cache_entries()
+
+        # Only valid entry should remain
+        assert len(legacy_manager._legacy_cache) == 1
+        assert 'valid_conversation' in legacy_manager._legacy_cache
+        assert 'expired_conversation' not in legacy_manager._legacy_cache
+        assert 'another_expired' not in legacy_manager._legacy_cache
+
+    def test_cleanup_expired_cache_entries_keeps_valid(self, legacy_manager):
+        """Test that valid entries are kept during cleanup."""
+        current_time = time.time()
+        valid_time = current_time - 100  # Well within timeout
+
+        legacy_manager._legacy_cache = {
+            'valid_conversation_1': LegacyCacheEntry(True, valid_time),
+            'valid_conversation_2': LegacyCacheEntry(False, valid_time - 50),
+        }
+
+        legacy_manager._cleanup_expired_cache_entries()
+
+        # Both entries should remain
+        assert len(legacy_manager._legacy_cache) == 2
+        assert 'valid_conversation_1' in legacy_manager._legacy_cache
+        assert 'valid_conversation_2' in legacy_manager._legacy_cache
+
+    def test_cleanup_expired_cache_entries_empty_cache(self, legacy_manager):
+        """Test cleanup with empty cache."""
+        legacy_manager._legacy_cache = {}
+
+        legacy_manager._cleanup_expired_cache_entries()
+
+        assert len(legacy_manager._legacy_cache) == 0
+
+
+class TestIsLegacyRuntime:
+    """Test the is_legacy_runtime method."""
+
+    def test_is_legacy_runtime_none(self, legacy_manager):
+        """Test with None runtime."""
+        result = legacy_manager.is_legacy_runtime(None)
+        assert result is False
+
+    def test_is_legacy_runtime_legacy_command(self, legacy_manager):
+        """Test with legacy runtime command."""
+        runtime = {'command': 'some_old_legacy_command'}
+        result = legacy_manager.is_legacy_runtime(runtime)
+        assert result is True
+
+    def test_is_legacy_runtime_new_command(self, legacy_manager):
+        """Test with new runtime command containing openhands.server."""
+        runtime = {'command': 'python -m openhands.server.listen'}
+        result = legacy_manager.is_legacy_runtime(runtime)
+        assert result is False
+
+    def test_is_legacy_runtime_partial_match(self, legacy_manager):
+        """Test with command that partially matches but is still legacy."""
+        runtime = {'command': 'openhands.client.start'}
+        result = legacy_manager.is_legacy_runtime(runtime)
+        assert result is True
+
+    def test_is_legacy_runtime_empty_command(self, legacy_manager):
+        """Test with empty command."""
+        runtime = {'command': ''}
+        result = legacy_manager.is_legacy_runtime(runtime)
+        assert result is True
+
+    def test_is_legacy_runtime_missing_command_key(self, legacy_manager):
+        """Test with runtime missing command key."""
+        runtime = {'other_key': 'value'}
+        # This should raise a KeyError
+        with pytest.raises(KeyError):
+            legacy_manager.is_legacy_runtime(runtime)
+
+
+class TestShouldStartInLegacyMode:
+    """Test the should_start_in_legacy_mode method."""
+
+    @pytest.mark.asyncio
+    async def test_cache_hit_valid_entry_legacy(self, legacy_manager):
+        """Test cache hit with valid legacy entry."""
+        conversation_id = 'test_conversation'
+        current_time = time.time()
+
+        # Add valid cache entry
+        legacy_manager._legacy_cache[conversation_id] = LegacyCacheEntry(
+            True, current_time - 100
+        )
+
+        result = await legacy_manager.should_start_in_legacy_mode(conversation_id)
+
+        assert result is True
+        # Should not call _get_runtime since we hit cache
+        legacy_manager.conversation_manager._get_runtime.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_cache_hit_valid_entry_non_legacy(self, legacy_manager):
+        """Test cache hit with valid non-legacy entry."""
+        conversation_id = 'test_conversation'
+        current_time = time.time()
+
+        # Add valid cache entry
+        legacy_manager._legacy_cache[conversation_id] = LegacyCacheEntry(
+            False, current_time - 100
+        )
+
+        result = await legacy_manager.should_start_in_legacy_mode(conversation_id)
+
+        assert result is False
+        # Should not call _get_runtime since we hit cache
+        legacy_manager.conversation_manager._get_runtime.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_cache_miss_legacy_runtime(self, legacy_manager):
+        """Test cache miss with legacy runtime."""
+        conversation_id = 'test_conversation'
+        runtime = {'command': 'old_command'}
+
+        legacy_manager.conversation_manager._get_runtime.return_value = runtime
+
+        result = await legacy_manager.should_start_in_legacy_mode(conversation_id)
+
+        assert result is True
+        # Should call _get_runtime
+        legacy_manager.conversation_manager._get_runtime.assert_called_once_with(
+            conversation_id
+        )
+        # Should cache the result
+        assert conversation_id in legacy_manager._legacy_cache
+        assert legacy_manager._legacy_cache[conversation_id].is_legacy is True
+
+    @pytest.mark.asyncio
+    async def test_cache_miss_non_legacy_runtime(self, legacy_manager):
+        """Test cache miss with non-legacy runtime."""
+        conversation_id = 'test_conversation'
+        runtime = {'command': 'python -m openhands.server.listen'}
+
+        legacy_manager.conversation_manager._get_runtime.return_value = runtime
+
+        result = await legacy_manager.should_start_in_legacy_mode(conversation_id)
+
+        assert result is False
+        # Should call _get_runtime
+        legacy_manager.conversation_manager._get_runtime.assert_called_once_with(
+            conversation_id
+        )
+        # Should cache the result
+        assert conversation_id in legacy_manager._legacy_cache
+        assert legacy_manager._legacy_cache[conversation_id].is_legacy is False
+
+    @pytest.mark.asyncio
+    async def test_cache_expired_entry(self, legacy_manager):
+        """Test with expired cache entry."""
+        conversation_id = 'test_conversation'
+        expired_time = time.time() - _LEGACY_ENTRY_TIMEOUT_SECONDS - 1
+        runtime = {'command': 'python -m openhands.server.listen'}
+
+        # Add expired cache entry
+        legacy_manager._legacy_cache[conversation_id] = LegacyCacheEntry(
+            True,
+            expired_time,  # This should be considered expired
+        )
+
+        legacy_manager.conversation_manager._get_runtime.return_value = runtime
+
+        result = await legacy_manager.should_start_in_legacy_mode(conversation_id)
+
+        assert result is False  # Runtime indicates non-legacy
+        # Should call _get_runtime since cache is expired
+        legacy_manager.conversation_manager._get_runtime.assert_called_once_with(
+            conversation_id
+        )
+        # Should update cache with new result
+        assert legacy_manager._legacy_cache[conversation_id].is_legacy is False
+
+    @pytest.mark.asyncio
+    async def test_cache_exactly_at_timeout(self, legacy_manager):
+        """Test with cache entry exactly at timeout boundary."""
+        conversation_id = 'test_conversation'
+        timeout_time = time.time() - _LEGACY_ENTRY_TIMEOUT_SECONDS
+        runtime = {'command': 'python -m openhands.server.listen'}
+
+        # Add cache entry exactly at timeout
+        legacy_manager._legacy_cache[conversation_id] = LegacyCacheEntry(
+            True, timeout_time
+        )
+
+        legacy_manager.conversation_manager._get_runtime.return_value = runtime
+
+        result = await legacy_manager.should_start_in_legacy_mode(conversation_id)
+
+        # Should treat as expired and fetch from runtime
+        assert result is False
+        legacy_manager.conversation_manager._get_runtime.assert_called_once_with(
+            conversation_id
+        )
+
+    @pytest.mark.asyncio
+    async def test_runtime_returns_none(self, legacy_manager):
+        """Test when runtime returns None."""
+        conversation_id = 'test_conversation'
+
+        legacy_manager.conversation_manager._get_runtime.return_value = None
+
+        result = await legacy_manager.should_start_in_legacy_mode(conversation_id)
+
+        assert result is False
+        # Should cache the result
+        assert conversation_id in legacy_manager._legacy_cache
+        assert legacy_manager._legacy_cache[conversation_id].is_legacy is False
+
+    @pytest.mark.asyncio
+    async def test_cleanup_called_on_each_invocation(self, legacy_manager):
+        """Test that cleanup is called on each invocation."""
+        conversation_id = 'test_conversation'
+        runtime = {'command': 'test'}
+
+        legacy_manager.conversation_manager._get_runtime.return_value = runtime
+
+        # Mock the cleanup method to verify it's called
+        with patch.object(
+            legacy_manager, '_cleanup_expired_cache_entries'
+        ) as mock_cleanup:
+            await legacy_manager.should_start_in_legacy_mode(conversation_id)
+            mock_cleanup.assert_called_once()
+
+    @pytest.mark.asyncio
+    async def test_multiple_conversations_cached_independently(self, legacy_manager):
+        """Test that multiple conversations are cached independently."""
+        conv1 = 'conversation_1'
+        conv2 = 'conversation_2'
+
+        runtime1 = {'command': 'old_command'}  # Legacy
+        runtime2 = {'command': 'python -m openhands.server.listen'}  # Non-legacy
+
+        # Mock to return different runtimes based on conversation_id
+        def mock_get_runtime(conversation_id):
+            if conversation_id == conv1:
+                return runtime1
+            return runtime2
+
+        legacy_manager.conversation_manager._get_runtime.side_effect = mock_get_runtime
+
+        result1 = await legacy_manager.should_start_in_legacy_mode(conv1)
+        result2 = await legacy_manager.should_start_in_legacy_mode(conv2)
+
+        assert result1 is True
+        assert result2 is False
+
+        # Both should be cached
+        assert conv1 in legacy_manager._legacy_cache
+        assert conv2 in legacy_manager._legacy_cache
+        assert legacy_manager._legacy_cache[conv1].is_legacy is True
+        assert legacy_manager._legacy_cache[conv2].is_legacy is False
+
+    @pytest.mark.asyncio
+    async def test_cache_timestamp_updated_on_refresh(self, legacy_manager):
+        """Test that cache timestamp is updated when entry is refreshed."""
+        conversation_id = 'test_conversation'
+        old_time = time.time() - _LEGACY_ENTRY_TIMEOUT_SECONDS - 1
+        runtime = {'command': 'test'}
+
+        # Add expired entry
+        legacy_manager._legacy_cache[conversation_id] = LegacyCacheEntry(True, old_time)
+        legacy_manager.conversation_manager._get_runtime.return_value = runtime
+
+        # Record time before call
+        before_call = time.time()
+        await legacy_manager.should_start_in_legacy_mode(conversation_id)
+        after_call = time.time()
+
+        # Timestamp should be updated
+        cached_entry = legacy_manager._legacy_cache[conversation_id]
+        assert cached_entry.timestamp >= before_call
+        assert cached_entry.timestamp <= after_call
+
+
+class TestLegacyConversationManagerIntegration:
+    """Integration tests for LegacyConversationManager."""
+
+    @pytest.mark.asyncio
+    async def test_get_instance_creates_proper_manager(
+        self,
+        mock_sio,
+        mock_config,
+        mock_file_store,
+        mock_server_config,
+        mock_monitoring_listener,
+    ):
+        """Test that get_instance creates a properly configured manager."""
+        with patch(
+            'server.legacy_conversation_manager.SaasNestedConversationManager'
+        ) as mock_saas, patch(
+            'server.legacy_conversation_manager.ClusteredConversationManager'
+        ) as mock_clustered:
+            mock_saas.get_instance.return_value = MagicMock()
+            mock_clustered.get_instance.return_value = MagicMock()
+
+            manager = LegacyConversationManager.get_instance(
+                mock_sio,
+                mock_config,
+                mock_file_store,
+                mock_server_config,
+                mock_monitoring_listener,
+            )
+
+            assert isinstance(manager, LegacyConversationManager)
+            assert manager.sio == mock_sio
+            assert manager.config == mock_config
+            assert manager.file_store == mock_file_store
+            assert manager.server_config == mock_server_config
+
+            # Verify that both nested managers are created
+            mock_saas.get_instance.assert_called_once()
+            mock_clustered.get_instance.assert_called_once()
+
+    def test_legacy_cache_initialized_empty(self, legacy_manager):
+        """Test that legacy cache is initialized as empty dict."""
+        assert isinstance(legacy_manager._legacy_cache, dict)
+        assert len(legacy_manager._legacy_cache) == 0
+
+
+class TestEdgeCases:
+    """Test edge cases and error scenarios."""
+
+    @pytest.mark.asyncio
+    async def test_get_runtime_raises_exception(self, legacy_manager):
+        """Test behavior when _get_runtime raises an exception."""
+        conversation_id = 'test_conversation'
+
+        legacy_manager.conversation_manager._get_runtime.side_effect = Exception(
+            'Runtime error'
+        )
+
+        # Should propagate the exception
+        with pytest.raises(Exception, match='Runtime error'):
+            await legacy_manager.should_start_in_legacy_mode(conversation_id)
+
+    @pytest.mark.asyncio
+    async def test_very_large_cache(self, legacy_manager):
+        """Test behavior with a large number of cache entries."""
+        current_time = time.time()
+
+        # Add many cache entries
+        for i in range(1000):
+            legacy_manager._legacy_cache[f'conversation_{i}'] = LegacyCacheEntry(
+                i % 2 == 0, current_time - i
+            )
+
+        # This should work without issues
+        await legacy_manager.should_start_in_legacy_mode('new_conversation')
+
+        # Should have added one more entry
+        assert len(legacy_manager._legacy_cache) == 1001
+
+    def test_cleanup_with_concurrent_modifications(self, legacy_manager):
+        """Test cleanup behavior when cache is modified during cleanup."""
+        current_time = time.time()
+        expired_time = current_time - _LEGACY_ENTRY_TIMEOUT_SECONDS - 1
+
+        # Add expired entries
+        legacy_manager._legacy_cache = {
+            f'conversation_{i}': LegacyCacheEntry(True, expired_time) for i in range(10)
+        }
+
+        # This should work without raising exceptions
+        legacy_manager._cleanup_expired_cache_entries()
+
+        # All entries should be removed
+        assert len(legacy_manager._legacy_cache) == 0
--- a/enterprise/tests/unit/test_saas_settings_store.py
+++ b/enterprise/tests/unit/test_saas_settings_store.py
@@ -243,7 +243,7 @@ async def test_update_settings_with_litellm_default(
    # Check that the URL and most of the JSON payload match what we expect
    assert call_args['json']['user_email'] == 'testy@tester.com'
    assert call_args['json']['models'] == []
-    assert call_args['json']['max_budget'] == 10.0
+    assert call_args['json']['max_budget'] == 20.0
    assert call_args['json']['user_id'] == 'user-id'
    assert call_args['json']['teams'] == ['test_team']
    assert call_args['json']['auto_create_key'] is True
--- a/enterprise/tests/unit/test_saas_user_auth.py
+++ b/enterprise/tests/unit/test_saas_user_auth.py
@@ -535,115 +535,3 @@ def test_get_api_key_from_header_with_invalid_authorization_format():

    # Assert that None was returned
    assert api_key is None
-
-
-def test_get_api_key_from_header_with_x_access_token():
-    """Test that get_api_key_from_header extracts API key from X-Access-Token header."""
-    # Create a mock request with X-Access-Token header
-    mock_request = MagicMock(spec=Request)
-    mock_request.headers = {'X-Access-Token': 'access_token_key'}
-
-    # Call the function
-    api_key = get_api_key_from_header(mock_request)
-
-    # Assert that the API key was correctly extracted
-    assert api_key == 'access_token_key'
-
-
-def test_get_api_key_from_header_priority_authorization_over_x_access_token():
-    """Test that Authorization header takes priority over X-Access-Token header."""
-    # Create a mock request with both headers
-    mock_request = MagicMock(spec=Request)
-    mock_request.headers = {
-        'Authorization': 'Bearer auth_api_key',
-        'X-Access-Token': 'access_token_key',
-    }
-
-    # Call the function
-    api_key = get_api_key_from_header(mock_request)
-
-    # Assert that the API key from Authorization header was used
-    assert api_key == 'auth_api_key'
-
-
-def test_get_api_key_from_header_priority_x_session_over_x_access_token():
-    """Test that X-Session-API-Key header takes priority over X-Access-Token header."""
-    # Create a mock request with both headers
-    mock_request = MagicMock(spec=Request)
-    mock_request.headers = {
-        'X-Session-API-Key': 'session_api_key',
-        'X-Access-Token': 'access_token_key',
-    }
-
-    # Call the function
-    api_key = get_api_key_from_header(mock_request)
-
-    # Assert that the API key from X-Session-API-Key header was used
-    assert api_key == 'session_api_key'
-
-
-def test_get_api_key_from_header_all_three_headers():
-    """Test header priority when all three headers are present."""
-    # Create a mock request with all three headers
-    mock_request = MagicMock(spec=Request)
-    mock_request.headers = {
-        'Authorization': 'Bearer auth_api_key',
-        'X-Session-API-Key': 'session_api_key',
-        'X-Access-Token': 'access_token_key',
-    }
-
-    # Call the function
-    api_key = get_api_key_from_header(mock_request)
-
-    # Assert that the API key from Authorization header was used (highest priority)
-    assert api_key == 'auth_api_key'
-
-
-def test_get_api_key_from_header_invalid_authorization_fallback_to_x_access_token():
-    """Test that invalid Authorization header falls back to X-Access-Token."""
-    # Create a mock request with invalid Authorization header and X-Access-Token
-    mock_request = MagicMock(spec=Request)
-    mock_request.headers = {
-        'Authorization': 'InvalidFormat api_key',
-        'X-Access-Token': 'access_token_key',
-    }
-
-    # Call the function
-    api_key = get_api_key_from_header(mock_request)
-
-    # Assert that the API key from X-Access-Token header was used
-    assert api_key == 'access_token_key'
-
-
-def test_get_api_key_from_header_empty_headers():
-    """Test that empty header values are handled correctly."""
-    # Create a mock request with empty header values
-    mock_request = MagicMock(spec=Request)
-    mock_request.headers = {
-        'Authorization': '',
-        'X-Session-API-Key': '',
-        'X-Access-Token': 'access_token_key',
-    }
-
-    # Call the function
-    api_key = get_api_key_from_header(mock_request)
-
-    # Assert that the API key from X-Access-Token header was used
-    assert api_key == 'access_token_key'
-
-
-def test_get_api_key_from_header_bearer_with_empty_token():
-    """Test that Bearer header with empty token falls back to other headers."""
-    # Create a mock request with Bearer header with empty token
-    mock_request = MagicMock(spec=Request)
-    mock_request.headers = {
-        'Authorization': 'Bearer ',
-        'X-Access-Token': 'access_token_key',
-    }
-
-    # Call the function
-    api_key = get_api_key_from_header(mock_request)
-
-    # Assert that empty string from Bearer is returned (current behavior)
-    # This tests the current implementation behavior
-    assert api_key == ''
--- a/evaluation/README.md
+++ b/evaluation/README.md
@@ -1,10 +1,5 @@
 # Evaluation

-> [!WARNING]
-> **This directory is deprecated.** Our new benchmarks are located at [OpenHands/benchmarks](https://github.com/OpenHands/benchmarks).
->
-> If you have already implemented a benchmark in this directory and would like to contribute it, we are happy to have the contribution. However, if you are starting anew, please use the new location.
-
 This folder contains code and resources to run experiments and evaluations.

 ## For Benchmark Users
--- a/evaluation/benchmarks/multi_swe_bench/README.md
+++ b/evaluation/benchmarks/multi_swe_bench/README.md
@@ -15,7 +15,7 @@ python evaluation/benchmarks/multi_swe_bench/scripts/data/data_change.py

 ## Docker image download

-Please download the multi-swe-bench docker images from [here](https://github.com/multi-swe-bench/multi-swe-bench?tab=readme-ov-file#run-evaluation).
+Please download the multi-swe-bench docker images from [here](https://github.com/multi-swe-bench/multi-swe-bench?tab=readme-ov-file#run-evaluation).

 ## Generate patch

@@ -47,7 +47,7 @@ For debugging purposes, you can set `export EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED=t

 The results will be generated in evaluation/evaluation_outputs/outputs/XXX/CodeActAgent/YYY/output.jsonl, you can refer to the [example](examples/output.jsonl).

-## Running evaluation
+## Running evaluation

 First, install [multi-swe-bench](https://github.com/multi-swe-bench/multi-swe-bench).

--- a/evaluation/benchmarks/multi_swe_bench/compute_skip_ids.py
+++ b/evaluation/benchmarks/multi_swe_bench/compute_skip_ids.py
@@ -1,79 +0,0 @@
-import argparse
-import fnmatch
-import json
-from collections import Counter
-from pathlib import Path
-
-
-def find_final_reports(base_dir, pattern=None):
-    base_path = Path(base_dir)
-    if not base_path.exists():
-        raise FileNotFoundError(f'Base directory does not exist: {base_dir}')
-
-    # Find all final_report.json files
-    all_reports = list(base_path.rglob('final_report.json'))
-
-    if pattern is None:
-        return all_reports
-
-    # Filter by pattern
-    filtered_reports = []
-    for report in all_reports:
-        # Get relative path from base_dir for matching
-        rel_path = report.relative_to(base_path)
-        if fnmatch.fnmatch(str(rel_path), pattern):
-            filtered_reports.append(report)
-
-    return filtered_reports
-
-
-def collect_resolved_ids(report_files):
-    id_counter = Counter()
-
-    for report_file in report_files:
-        with open(report_file, 'r') as f:
-            data = json.load(f)
-            if 'resolved_ids' not in data:
-                raise KeyError(f"'resolved_ids' key not found in {report_file}")
-            resolved_ids = data['resolved_ids']
-            id_counter.update(resolved_ids)
-
-    return id_counter
-
-
-def get_skip_ids(id_counter, threshold):
-    return [id_str for id_str, count in id_counter.items() if count >= threshold]
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description='Compute SKIP_IDS from resolved IDs in final_report.json files'
-    )
-    parser.add_argument(
-        'threshold',
-        type=int,
-        help='Minimum number of times an ID must be resolved to be skipped',
-    )
-    parser.add_argument(
-        '--base-dir',
-        default='evaluation/evaluation_outputs/outputs',
-        help='Base directory to search for final_report.json files (default: evaluation/evaluation_outputs/outputs)',
-    )
-    parser.add_argument(
-        '--pattern',
-        default=None,
-        help='Glob pattern to filter paths (e.g., "*Multi-SWE-RL*/**/*gpt*")',
-    )
-
-    args = parser.parse_args()
-    report_files = find_final_reports(args.base_dir, args.pattern)
-    id_counter = collect_resolved_ids(report_files)
-
-    skip_ids = get_skip_ids(id_counter, args.threshold)
-    skip_ids = [s.replace('/', '__').replace(':pr-', '-') for s in skip_ids]
-    skip_ids = ','.join(sorted(skip_ids))
-    print(skip_ids)
-
-
-if __name__ == '__main__':
-    main()
--- a/evaluation/benchmarks/multi_swe_bench/run_infer.py
+++ b/evaluation/benchmarks/multi_swe_bench/run_infer.py
@@ -747,14 +747,10 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
                subset = dataset[dataset[filter_column].isin(selected_ids)]
                logger.info(f'Retained {subset.shape[0]} tasks after filtering')
                return subset
-    skip_ids = [id for id in os.environ.get('SKIP_IDS', '').split(',') if id]
+    skip_ids = os.environ.get('SKIP_IDS', '').split(',')
    if len(skip_ids) > 0:
-        logger.info(f'Dataset size before filtering: {dataset.shape[0]} tasks')
        logger.info(f'Filtering {len(skip_ids)} tasks from "SKIP_IDS"...')
-        logger.info(f'SKIP_IDS:\n{skip_ids}')
-        filtered_dataset = dataset[~dataset[filter_column].isin(skip_ids)]
-        logger.info(f'Dataset size after filtering: {filtered_dataset.shape[0]} tasks')
-        return filtered_dataset
+        return dataset[~dataset[filter_column].isin(skip_ids)]
    return dataset


@@ -772,11 +768,6 @@ if __name__ == '__main__':
        default='test',
        help='split to evaluate on',
    )
-    parser.add_argument(
-        '--filter_dataset_after_sampling',
-        action='store_true',
-        help='if provided, filter dataset after sampling instead of before',
-    )
    args, _ = parser.parse_known_args()

    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
@@ -786,24 +777,10 @@ if __name__ == '__main__':
    logger.info(f'Loading dataset {args.dataset} with split {args.split} ')
    dataset = load_dataset('json', data_files=args.dataset)
    dataset = dataset[args.split]
-    swe_bench_tests = dataset.to_pandas()
-
-    # Determine filter strategy based on flag
-    filter_func = None
-    if args.filter_dataset_after_sampling:
-        # Pass filter as callback to apply after sampling
-        def filter_func(df):
-            return filter_dataset(df, 'instance_id')
-
-        logger.info(
-            f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks (filtering will occur after sampling)'
-        )
-    else:
-        # Apply filter before sampling
-        swe_bench_tests = filter_dataset(swe_bench_tests, 'instance_id')
-        logger.info(
-            f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
-        )
+    swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
+    logger.info(
+        f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
+    )

    llm_config = None
    if args.llm_config:
@@ -833,9 +810,7 @@ if __name__ == '__main__':

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    print(f'### OUTPUT FILE: {output_file} ###')
-    instances = prepare_dataset(
-        swe_bench_tests, output_file, args.eval_n_limit, filter_func=filter_func
-    )
+    instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit)

    if len(instances) > 0 and not isinstance(
        instances['FAIL_TO_PASS'][instances['FAIL_TO_PASS'].index[0]], str
--- a/evaluation/benchmarks/multi_swe_bench/scripts/rollout_multi_swegym.sh
+++ b/evaluation/benchmarks/multi_swe_bench/scripts/rollout_multi_swegym.sh
@@ -8,14 +8,8 @@
 MODEL=$1  # eg your llm config name in config.toml (eg: "llm.claude-3-5-sonnet-20241022-t05")
 EXP_NAME=$2 # "train-t05"
 EVAL_DATASET=$3  # path to original dataset (jsonl file)
-MAX_ITER=$4
-N_WORKERS=${5:-64}
-N_RUNS=${6:-1}
-EVAL_LIMIT=${7:-}
-SKIP_IDS_THRESHOLD=$8
-SKIP_IDS_PATTERN=$9
-INPUT_SKIP_IDS=${10}
-FILTER_DATASET_AFTER_SAMPLING=${11:-}
+N_WORKERS=${4:-64}
+N_RUNS=${5:-1}

 export EXP_NAME=$EXP_NAME
 # use 2x resources for rollout since some codebases are pretty resource-intensive
@@ -23,7 +17,6 @@ export DEFAULT_RUNTIME_RESOURCE_FACTOR=2
 echo "MODEL: $MODEL"
 echo "EXP_NAME: $EXP_NAME"
 echo "EVAL_DATASET: $EVAL_DATASET"
-echo "INPUT_SKIP_IDS: $INPUT_SKIP_IDS"
 # Generate DATASET path by adding _with_runtime_ before .jsonl extension
 DATASET="${EVAL_DATASET%.jsonl}_with_runtime_.jsonl"  # path to converted dataset

@@ -42,6 +35,9 @@ else
    export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
 fi

+#EVAL_LIMIT=3000
+MAX_ITER=100
+

 # ===== Run inference =====
 source "evaluation/utils/version_control.sh"
@@ -73,52 +69,17 @@ function run_eval() {
    --dataset $DATASET \
    --split $SPLIT"

-  # Conditionally add filter flag
-  if [ "$FILTER_DATASET_AFTER_SAMPLING" = "true" ]; then
-    COMMAND="$COMMAND --filter_dataset_after_sampling"
-  fi
-
  echo "Running command: $COMMAND"
  if [ -n "$EVAL_LIMIT" ]; then
    echo "EVAL_LIMIT: $EVAL_LIMIT"
    COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
  fi

+  # Run the command
  eval $COMMAND
 }

 for run_idx in $(seq 1 $N_RUNS); do
-    if [ -n "$SKIP_IDS_THRESHOLD" ]; then
-        echo "Computing SKIP_IDS for run $run_idx..."
-        SKIP_CMD="poetry run python evaluation/benchmarks/multi_swe_bench/compute_skip_ids.py $SKIP_IDS_THRESHOLD"
-        if [ -n "$SKIP_IDS_PATTERN" ]; then
-            SKIP_CMD="$SKIP_CMD --pattern \"$SKIP_IDS_PATTERN\""
-        fi
-        COMPUTED_SKIP_IDS=$(eval $SKIP_CMD)
-        SKIP_STATUS=$?
-        if [ $SKIP_STATUS -ne 0 ]; then
-            echo "ERROR: Skip IDs computation failed with exit code $SKIP_STATUS"
-            exit $SKIP_STATUS
-        fi
-        echo "COMPUTED_SKIP_IDS: $COMPUTED_SKIP_IDS"
-    else
-        echo "SKIP_IDS_THRESHOLD not provided, skipping SKIP_IDS computation"
-        COMPUTED_SKIP_IDS=""
-    fi
-
-    # Concatenate COMPUTED_SKIP_IDS and INPUT_SKIP_IDS
-    if [ -n "$COMPUTED_SKIP_IDS" ] && [ -n "$INPUT_SKIP_IDS" ]; then
-        export SKIP_IDS="${COMPUTED_SKIP_IDS},${INPUT_SKIP_IDS}"
-    elif [ -n "$COMPUTED_SKIP_IDS" ]; then
-        export SKIP_IDS="$COMPUTED_SKIP_IDS"
-    elif [ -n "$INPUT_SKIP_IDS" ]; then
-        export SKIP_IDS="$INPUT_SKIP_IDS"
-    else
-        unset SKIP_IDS
-    fi
-
-    echo "FINAL SKIP_IDS: $SKIP_IDS"
-    echo ""

    while true; do
        echo "### Running inference... ###"
--- a/evaluation/benchmarks/swefficiency/README.md
+++ b/evaluation/benchmarks/swefficiency/README.md
@@ -1,65 +0,0 @@
-# SWE-fficiency Evaluation
-
-This folder contains the OpenHands inference generation of the [SWE-fficiency benchmark](https://swefficiency.com/) ([paper](https://arxiv.org/pdf/2507.12415v1)).
-
-The evaluation consists of three steps:
-
-1. Environment setup: [install python environment](../../README.md#development-environment) and [configure LLM config](../../README.md#configure-openhands-and-your-llm).
-2. [Run inference](#running-inference-locally-with-docker): Generate a edit patch for each Github issue
-3. [Evaluate patches](#evaluate-generated-patches)
-
-## Setup Environment and LLM Configuration
-
-Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
-
-## Running inference Locally with Docker
-
-Make sure your Docker daemon is running, and you have ample disk space (at least 200-500GB, depends on the SWE-PErf set you are running on) for the instance-level docker image.
-
-When the `run_infer.sh` script is started, it will automatically pull the relevant SWE-Perf images.
-For example, for instance ID `scikit-learn_scikit-learn-11674`, it will try to pull our pre-build docker image `betty1202/sweb.eval.x86_64.scikit-learn_s_scikit-learn-11674` from DockerHub.
-This image will be used create an OpenHands runtime image where the agent will operate on.
-
-```bash
-./evaluation/benchmarks/swefficiency/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] [n_runs] [mode]
-
-# Example
-./evaluation/benchmarks/swefficiency/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 500 100 1 swefficiency/swefficiency test
-```
-
-where `model_config` is mandatory, and the rest are optional.
-
- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
-LLM settings, as defined in your `config.toml`.
- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version you would
-like to evaluate. It could also be a release tag like `0.6.2`.
- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
-to `CodeActAgent`.
- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By
-default, the script evaluates the entire SWE-Perf test set (140 issues). Note:
-in order to use `eval_limit`, you must also set `agent`.
- `max_iter`, e.g. `20`, is the maximum number of iterations for the agent to run. By
-default, it is set to 100.
- `num_workers`, e.g. `3`, is the number of parallel workers to run the evaluation. By
-default, it is set to 1.
- `dataset`, a huggingface dataset name. e.g. `SWE-Perf/SWE-Perf`, specifies which dataset to evaluate on.
- `dataset_split`, split for the huggingface dataset. e.g., `test`, `dev`. Default to `test`.
-
- `n_runs`, e.g. `3`, is the number of times to run the evaluation. Default is 1.
- `mode`, e.g. `swt`, `swt-ci`, or `swe`, specifies the evaluation mode. Default is `swe`.
-
-> [!CAUTION]
-> Setting `num_workers` larger than 1 is not officially tested, YMMV.
-
-
-Let's say you'd like to run 10 instances using `llm.eval_gpt4_1106_preview` and CodeActAgent,
-
-then your command would be:
-
-```bash
-./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10
-```
-
-### 2. Run the SWE-fficiency benchmark official evaluation
-
-Once the output is converted, use the [official SWE-fficiency benchmark evaluation](https://github.com/swefficiency/swefficiency) to evaluate it.
--- a/evaluation/benchmarks/swefficiency/binary_patch_utils.py
+++ b/evaluation/benchmarks/swefficiency/binary_patch_utils.py
@@ -1,52 +0,0 @@
-"""
-Utilities for handling binary files and patch generation in SWE-bench evaluation.
-"""
-
-
-def remove_binary_diffs(patch_text):
-    """
-    Remove binary file diffs from a git patch.
-
-    Args:
-        patch_text (str): The git patch text
-
-    Returns:
-        str: The cleaned patch text with binary diffs removed
-    """
-    lines = patch_text.splitlines()
-    cleaned_lines = []
-    block = []
-    is_binary_block = False
-
-    for line in lines:
-        if line.startswith('diff --git '):
-            if block and not is_binary_block:
-                cleaned_lines.extend(block)
-            block = [line]
-            is_binary_block = False
-        elif 'Binary files' in line:
-            is_binary_block = True
-            block.append(line)
-        else:
-            block.append(line)
-
-    if block and not is_binary_block:
-        cleaned_lines.extend(block)
-    return '\n'.join(cleaned_lines)
-
-
-def remove_binary_files_from_git():
-    """
-    Generate a bash command to remove binary files from git staging.
-
-    Returns:
-        str: A bash command that removes binary files from git staging
-    """
-    return """
-    for file in $(git status --porcelain | grep -E "^(M| M|\\?\\?|A| A)" | cut -c4-); do
-        if [ -f "$file" ] && (file "$file" | grep -q "executable" || git check-attr binary "$file" | grep -q "binary: set"); then
-            git rm -f "$file" 2>/dev/null || rm -f "$file"
-            echo "Removed: $file"
-        fi
-    done
-    """.strip()
--- a/evaluation/benchmarks/swefficiency/run_infer.py
+++ b/evaluation/benchmarks/swefficiency/run_infer.py
@@ -1,960 +0,0 @@
-import asyncio
-import copy
-import functools
-import json
-import multiprocessing
-import os
-import tempfile
-from typing import Any, Literal
-
-import pandas as pd
-import toml
-from datasets import load_dataset
-
-import openhands.agenthub
-from evaluation.benchmarks.swe_bench.binary_patch_utils import (
-    remove_binary_diffs,
-    remove_binary_files_from_git,
-)
-from evaluation.utils.shared import (
-    EvalException,
-    EvalMetadata,
-    EvalOutput,
-    assert_and_raise,
-    codeact_user_response,
-    get_default_sandbox_config_for_eval,
-    get_metrics,
-    is_fatal_evaluation_error,
-    make_metadata,
-    prepare_dataset,
-    reset_logger_for_multiprocessing,
-    run_evaluation,
-    update_llm_config_for_completions_logging,
-)
-from openhands.controller.state.state import State
-from openhands.core.config import (
-    AgentConfig,
-    OpenHandsConfig,
-    get_evaluation_parser,
-    get_llm_config_arg,
-)
-from openhands.core.config.condenser_config import NoOpCondenserConfig
-from openhands.core.config.utils import get_condenser_config_arg
-from openhands.core.logger import openhands_logger as logger
-from openhands.core.main import create_runtime, run_controller
-from openhands.critic import AgentFinishedCritic
-from openhands.events.action import CmdRunAction, FileReadAction, MessageAction
-from openhands.events.observation import (
-    CmdOutputObservation,
-    ErrorObservation,
-    FileReadObservation,
-)
-from openhands.events.serialization.event import event_from_dict, event_to_dict
-from openhands.runtime.base import Runtime
-from openhands.utils.async_utils import call_async_from_sync
-from openhands.utils.shutdown_listener import sleep_if_should_continue
-
-USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
-RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
-BenchMode = Literal['swe', 'swt', 'swt-ci']
-
-
-AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
-    'CodeActAgent': codeact_user_response,
-}
-
-
-def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:
-    return f'{instance.repo}__{instance.version}'.replace('/', '__')
-
-
-def get_instruction(instance: pd.Series, metadata: EvalMetadata) -> MessageAction:
-    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
-
-    # TODO: Change to testbed?
-    instruction = f"""
-<uploaded_files>
-/workspace/{workspace_dir_name}
-</uploaded_files>
-
-I’ve uploaded a python code repository in the directory workspace_dir_name. Consider the following performance workload and `workload()` function showing an specific usage of the repository:
-<performance_workload>
-{instance.workload}
-</performance_workload>
-
-Can you help me implement the necessary changes to the repository so that the runtime of the `workload()` function is faster? Basic guidelines:
-1. Your task is to make changes to non-test files in the /workspace directory to improve the performance of the code running in `workload()`. Please do not directly change the implementation of the `workload()` function to optimize things: I want you to focus on making the workload AS IS run faster by only editing the repository containing code that the `workload()` function calls.
-2. Make changes while ensuring the repository is functionally equivalent to the original: your changes should not introduce new bugs or cause already-passing tests to begin failing after your changes. However, you do not need to worry about tests that already fail without any changes made. For relevant test files you find in the repository, you can run them via the bash command `{instance.test_cmd} <test_file>` to check for correctness. Note that running all the tests may take a long time, so you need to determine which tests are relevant to your changes.
-3. Make sure the `workload()` function improves in performance after you make changes to the repository. The workload can potentially take some time to run, so please allow it to finish and be generous with setting your timeout parameter (a timeout value of 3600 or larger here is encouraged): for faster iteration, you should adjust the workload script to use fewer iterations. Before you complete your task, please make sure to check that the **original performance workload** and `workload()` function runs successfully and the performance is improved.
-4. You may need to reinstall/rebuild the repo for your changes to take effect before testing if you made non-Python changes. Reinstalling may take a long time to run (a timeout value of 3600 or larger here is encouraged), so please be patient with running it and allow it to complete if possible. You can reinstall the repository by running the bash command `{instance.rebuild_cmd}` in the workspace directory.
-5. All the dependencies required to run the `workload()` function are already installed in the environment. You should not install or upgrade any dependencies.
-
-Follow these steps to improve performance:
-1. As a first step, explore the repository structure.
-2. Create a Python script to reproduce the performance workload, execute it with python <workload_file>, and examine the printed output metrics.
-3. Edit the source code of the repository to improve performance. Please do not change the contents of the `workload()` function itself, but focus on optimizing the code in the repository that the original `workload()` function uses.
-4. If non-Python changes were made, rebuild the repo to make sure the changes take effect.
-5. Rerun your script to confirm that performance has improved.
-6. If necessary, identify any relevant test files in the repository related to your changes and verify that test statuses did not change after your modifications.
-7. After each attempted change, please reflect on the changes attempted and the performance impact observed. If the performance did not improve, consider alternative approaches or optimizations.
-8. Once you are satisfied, please use the finish command to complete your task.
-
-Please remember that you should not change the implementation of the `workload()` function. The performance improvement should solely come from editing the source files in the code repository.
-"""
-
-    if RUN_WITH_BROWSING:
-        instruction += (
-            '<IMPORTANT!>\nYou SHOULD NEVER attempt to browse the web. </IMPORTANT!>\n'
-        )
-
-    return MessageAction(content=instruction)
-
-
-def get_instance_docker_image(
-    instance_id: str,
-) -> str:
-    return f'ghcr.io/swefficiency/swefficiency-images:{instance_id}'
-
-
-def get_config(
-    instance: pd.Series,
-    metadata: EvalMetadata,
-    cpu_group: list[int] | None = None,
-) -> OpenHandsConfig:
-    # We use a different instance image for the each instance of swe-bench eval
-    base_container_image = get_instance_docker_image(
-        instance['instance_id'],
-    )
-    logger.info(
-        f'Using instance container image: {base_container_image}. '
-        f'Please make sure this image exists. '
-        f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
-    )
-
-    sandbox_config = get_default_sandbox_config_for_eval()
-    sandbox_config.base_container_image = base_container_image
-    sandbox_config.enable_auto_lint = True
-    sandbox_config.use_host_network = False
-    sandbox_config.timeout = 3600
-
-    # Control container cleanup behavior via environment variable
-    # Default to False for multiprocessing stability to prevent cascade failures
-    sandbox_config.rm_all_containers = True
-
-    sandbox_config.platform = 'linux/amd64'
-    sandbox_config.remote_runtime_resource_factor = 4.0
-    sandbox_config.runtime_startup_env_vars.update(
-        {
-            'NO_CHANGE_TIMEOUT_SECONDS': '900',  # 15 minutes
-        }
-    )
-
-    if cpu_group is not None:
-        print(f'Configuring Docker runtime with CPU group: {cpu_group}')
-        sandbox_config.docker_runtime_kwargs = {
-            # HACK: Use the cpu_group if provided, otherwise use all available CPUs
-            'cpuset_cpus': ','.join(map(str, cpu_group)),
-            'nano_cpus': int(1e9 * len(cpu_group)),  # optional: hard cap to vCPU count
-            'mem_limit': '16g',
-        }
-
-    # Note: We keep rm_all_containers = False for worker process safety
-
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        max_iterations=metadata.max_iterations,
-        runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
-    )
-    config.set_llm_config(
-        update_llm_config_for_completions_logging(
-            metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
-        )
-    )
-    agent_config = AgentConfig(
-        enable_jupyter=False,
-        enable_browsing=RUN_WITH_BROWSING,
-        enable_llm_editor=False,
-        enable_mcp=False,
-        condenser=metadata.condenser_config,
-        enable_prompt_extensions=False,
-    )
-    config.set_agent_config(agent_config)
-    return config
-
-
-def initialize_runtime(
-    runtime: Runtime,
-    instance: pd.Series,  # this argument is not required
-    metadata: EvalMetadata,
-):
-    """Initialize the runtime for the agent.
-
-    This function is called before the runtime is used to run the agent.
-    """
-    logger.info('-' * 30)
-    logger.info('BEGIN Runtime Initialization Fn')
-    logger.info('-' * 30)
-    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
-    obs: CmdOutputObservation
-
-    # Set instance id and git configuration
-    action = CmdRunAction(
-        command=f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc && git config --global core.pager "" && git config --global diff.binary false"""
-    )
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        obs.exit_code == 0,
-        f'Failed to export SWE_INSTANCE_ID and configure git: {str(obs)}',
-    )
-
-    action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}')
-
-    # inject the init script
-    script_dir = os.path.dirname(__file__)
-
-    # inject the instance info
-    action = CmdRunAction(command='mkdir -p /swe_util/eval_data/instances')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        obs.exit_code == 0,
-        f'Failed to create /swe_util/eval_data/instances: {str(obs)}',
-    )
-
-    swe_instance_json_name = 'swe-bench-instance.json'
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Construct the full path for the desired file name within the temporary directory
-        temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
-        # Write to the file with the desired name within the temporary directory
-        with open(temp_file_path, 'w') as f:
-            if not isinstance(instance, dict):
-                json.dump([instance.to_dict()], f)
-            else:
-                json.dump([instance], f)
-
-        # Copy the file to the desired location
-        runtime.copy_to(temp_file_path, '/swe_util/eval_data/instances/')
-
-        # inject the instance swe entry
-        runtime.copy_to(
-            str(os.path.join(script_dir, 'scripts/setup/instance_swe_entry.sh')),
-            '/swe_util/',
-        )
-
-    action = CmdRunAction(command='cat ~/.bashrc')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(obs.exit_code == 0, f'Failed to cat ~/.bashrc: {str(obs)}')
-
-    action = CmdRunAction(command='source ~/.bashrc')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    if isinstance(obs, ErrorObservation):
-        logger.error(f'Failed to source ~/.bashrc: {str(obs)}')
-    assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}')
-
-    action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        obs.exit_code == 0,
-        f'Failed to source /swe_util/instance_swe_entry.sh: {str(obs)}',
-    )
-
-    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        obs.exit_code == 0,
-        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
-    )
-
-    action = CmdRunAction(command='git reset --hard')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(obs.exit_code == 0, f'Failed to git reset --hard: {str(obs)}')
-
-    action = CmdRunAction(
-        command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
-    )
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')
-
-    action = CmdRunAction(command='which python')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        obs.exit_code == 0 and 'testbed' in obs.content,
-        f'Expected to find python interpreter from testbed, but got: {str(obs)}',
-    )
-
-    logger.info('-' * 30)
-    logger.info('END Runtime Initialization Fn')
-    logger.info('-' * 30)
-
-
-def complete_runtime(
-    runtime: Runtime,
-    instance: pd.Series,  # this argument is not required, but it is used to get the workspace_dir_name
-) -> dict[str, Any]:
-    """Complete the runtime for the agent.
-
-    This function is called before the runtime is used to run the agent.
-    If you need to do something in the sandbox to get the correctness metric after
-    the agent has run, modify this function.
-    """
-    logger.info('-' * 30)
-    logger.info('BEGIN Runtime Completion Fn')
-    logger.info('-' * 30)
-    obs: CmdOutputObservation
-    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
-
-    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-    if obs.exit_code == -1:
-        # The previous command is still running
-        # We need to kill previous command
-        logger.info('The previous command is still running, trying to kill it...')
-        action = CmdRunAction(command='C-c')
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-        # Then run the command again
-        action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
-        action.set_hard_timeout(600)
-        logger.info(action, extra={'msg_type': 'ACTION'})
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-    if obs.exit_code == -1:
-        # The previous command is still running
-        # We need to kill previous command
-        logger.info('The previous command is still running, trying to ctrl+z it...')
-        action = CmdRunAction(command='C-z')
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-        # Then run the command again
-        action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
-        action.set_hard_timeout(600)
-        logger.info(action, extra={'msg_type': 'ACTION'})
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-    assert_and_raise(
-        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
-        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
-    )
-
-    action = CmdRunAction(command='git config --global core.pager ""')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
-        f'Failed to git config --global core.pager "": {str(obs)}',
-    )
-
-    # First check for any git repositories in subdirectories
-    action = CmdRunAction(command='find . -type d -name .git -not -path "./.git"')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
-        f'Failed to find git repositories: {str(obs)}',
-    )
-
-    git_dirs = [p for p in obs.content.strip().split('\n') if p]
-    if git_dirs:
-        # Remove all .git directories in subdirectories
-        for git_dir in git_dirs:
-            action = CmdRunAction(command=f'rm -rf "{git_dir}"')
-            action.set_hard_timeout(600)
-            logger.info(action, extra={'msg_type': 'ACTION'})
-            obs = runtime.run_action(action)
-            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-            assert_and_raise(
-                isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
-                f'Failed to remove git directory {git_dir}: {str(obs)}',
-            )
-
-    # add all files
-    action = CmdRunAction(command='git add -A')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
-        f'Failed to git add -A: {str(obs)}',
-    )
-
-    # Remove binary files from git staging
-    action = CmdRunAction(command=remove_binary_files_from_git())
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
-        f'Failed to remove binary files: {str(obs)}',
-    )
-
-    n_retries = 0
-    git_patch = None
-    while n_retries < 5:
-        action = CmdRunAction(
-            command=f'git diff --no-color --cached {instance["base_commit"]} > patch.diff'
-        )
-        action.set_hard_timeout(max(300 + 100 * n_retries, 600))
-        logger.info(action, extra={'msg_type': 'ACTION'})
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        n_retries += 1
-        if isinstance(obs, CmdOutputObservation):
-            if obs.exit_code == 0:
-                # Read the patch file
-                action = FileReadAction(path='patch.diff')
-                action.set_hard_timeout(max(300 + 100 * n_retries, 600))
-                logger.info(action, extra={'msg_type': 'ACTION'})
-                obs = runtime.run_action(action)
-                logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-                if isinstance(obs, FileReadObservation):
-                    git_patch = obs.content
-                    break
-                elif isinstance(obs, ErrorObservation):
-                    # Fall back to cat "patch.diff" to get the patch
-                    assert 'File could not be decoded as utf-8' in obs.content
-                    action = CmdRunAction(command='cat patch.diff')
-                    action.set_hard_timeout(max(300 + 100 * n_retries, 600))
-                    logger.info(action, extra={'msg_type': 'ACTION'})
-                    obs = runtime.run_action(action)
-                    assert isinstance(obs, CmdOutputObservation) and obs.exit_code == 0
-                    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-                    git_patch = obs.content
-                    break
-                else:
-                    assert_and_raise(False, f'Unexpected observation type: {str(obs)}')
-            else:
-                logger.info('Failed to get git diff, retrying...')
-                sleep_if_should_continue(10)
-        elif isinstance(obs, ErrorObservation):
-            logger.error(f'Error occurred: {obs.content}. Retrying...')
-            sleep_if_should_continue(10)
-        else:
-            assert_and_raise(False, f'Unexpected observation type: {str(obs)}')
-
-    assert_and_raise(git_patch is not None, 'Failed to get git diff (None)')
-
-    # Remove binary diffs from the patch
-    git_patch = remove_binary_diffs(git_patch)
-
-    logger.info('-' * 30)
-    logger.info('END Runtime Completion Fn')
-    logger.info('-' * 30)
-    return {'git_patch': git_patch}
-
-
-class CPUGroupManager:
-    def __init__(self, cpu_groups_queue: multiprocessing.Queue):
-        self.cpu_groups_queue = cpu_groups_queue
-
-    def __enter__(self):
-        # Get the current CPU group for this worker]
-        if self.cpu_groups_queue is not None:
-            self.cpu_group = self.cpu_groups_queue.get()
-            logger.info(f'Worker started with CPU group: {self.cpu_group}')
-            return self.cpu_group
-        return None
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        # Put the CPU group back into the queue for other workers to use
-        if self.cpu_groups_queue is not None:
-            self.cpu_groups_queue.put(self.cpu_group)
-            logger.info(f'Worker finished with CPU group: {self.cpu_group}')
-
-
-def cleanup_docker_resources_for_worker():
-    """Clean up Docker resources specific to this worker process.
-
-    This prevents cascade failures when one worker's container crashes.
-    Note: This only cleans up stale locks, not containers, to avoid
-    interfering with other workers. Container cleanup is handled
-    by the DockerRuntime.close() method based on configuration.
-    """
-
-    # Clean up any stale port locks from crashed processes
-    try:
-        from openhands.runtime.utils.port_lock import cleanup_stale_locks
-
-        cleanup_stale_locks(max_age_seconds=300)  # Clean up locks older than 5 minutes
-    except Exception as e:
-        logger.debug(f'Error cleaning up stale port locks: {e}')
-
-
-def process_instance(
-    instance: pd.Series,
-    metadata: EvalMetadata,
-    reset_logger: bool = True,
-    runtime_failure_count: int = 0,
-    cpu_groups_queue: multiprocessing.Queue = None,
-) -> EvalOutput:
-    # Clean up any Docker resources from previous failed runs
-    cleanup_docker_resources_for_worker()
-
-    # HACK: Use the global and get the cpu group for this worker.
-    with CPUGroupManager(cpu_groups_queue) as cpu_group:
-        config = get_config(instance, metadata, cpu_group=cpu_group)
-
-        # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
-        if reset_logger:
-            log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
-            reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
-        else:
-            logger.info(f'Starting evaluation for instance {instance.instance_id}.')
-
-        metadata = copy.deepcopy(metadata)
-        metadata.details['runtime_failure_count'] = runtime_failure_count
-        metadata.details['remote_runtime_resource_factor'] = (
-            config.sandbox.remote_runtime_resource_factor
-        )
-
-        runtime = create_runtime(config, sid=None)
-        call_async_from_sync(runtime.connect)
-
-        try:
-            initialize_runtime(runtime, instance, metadata)
-
-            message_action = get_instruction(instance, metadata)
-
-            # Here's how you can run the agent (similar to the `main` function) and get the final task state
-            state: State | None = asyncio.run(
-                run_controller(
-                    config=config,
-                    initial_user_action=message_action,
-                    runtime=runtime,
-                    fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
-                        metadata.agent_class
-                    ],
-                )
-            )
-
-            # if fatal error, throw EvalError to trigger re-run
-            if is_fatal_evaluation_error(state.last_error):
-                raise EvalException('Fatal error detected: ' + state.last_error)
-
-            # ======= THIS IS SWE-Bench specific =======
-            # Get git patch
-            return_val = complete_runtime(runtime, instance)
-            git_patch = return_val['git_patch']
-            logger.info(
-                f'Got git diff for instance {instance.instance_id}:\n--------\n{git_patch}\n--------'
-            )
-        except Exception as e:
-            # Log the error but don't let it crash other workers
-            logger.error(
-                f'Error in worker processing instance {instance.instance_id}: {str(e)}'
-            )
-            raise
-        finally:
-            # Ensure runtime is properly closed to prevent cascade failures
-            try:
-                runtime.close()
-            except Exception as e:
-                logger.warning(
-                    f'Error closing runtime for {instance.instance_id}: {str(e)}'
-                )
-                # Don't re-raise - we want to continue cleanup
-
-        # ==========================================
-
-        # ======= Attempt to evaluate the agent's edits =======
-        # we use eval_infer.sh to evaluate the agent's edits, not here
-        # because the agent may alter the environment / testcases
-        test_result = {
-            'git_patch': git_patch,
-        }
-
-        # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
-        # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
-        if state is None:
-            raise ValueError('State should not be None.')
-
-        # NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events
-        histories = [event_to_dict(event) for event in state.history]
-        metrics = get_metrics(state)
-
-        # Save the output
-        instruction = message_action.content
-        if message_action.image_urls:
-            instruction += (
-                '\n\n<image_urls>'
-                + '\n'.join(message_action.image_urls)
-                + '</image_urls>'
-            )
-        output = EvalOutput(
-            instance_id=instance.instance_id,
-            instruction=instruction,
-            instance=instance.to_dict(),  # SWE Bench specific
-            test_result=test_result,
-            metadata=metadata,
-            history=histories,
-            metrics=metrics,
-            error=state.last_error if state and state.last_error else None,
-        )
-        return output
-
-
-def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
-    file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.toml')
-    if os.path.exists(file_path):
-        with open(file_path, 'r') as file:
-            data = toml.load(file)
-            if 'selected_ids' in data:
-                selected_ids = data['selected_ids']
-                logger.info(
-                    f'Filtering {len(selected_ids)} tasks from "selected_ids"...'
-                )
-                subset = dataset[dataset[filter_column].isin(selected_ids)]
-                logger.info(f'Retained {subset.shape[0]} tasks after filtering')
-                return subset
-            if 'selected_repos' in data:
-                # repos for the swe-bench instances:
-                # ['astropy/astropy', 'django/django', 'matplotlib/matplotlib', 'mwaskom/seaborn', 'pallets/flask', 'psf/requests', 'pydata/xarray', 'pylint-dev/pylint', 'pytest-dev/pytest', 'scikit-learn/scikit-learn', 'sphinx-doc/sphinx', 'sympy/sympy']
-                selected_repos = data['selected_repos']
-                if isinstance(selected_repos, str):
-                    selected_repos = [selected_repos]
-                assert isinstance(selected_repos, list)
-                logger.info(
-                    f'Filtering {selected_repos} tasks from "selected_repos"...'
-                )
-                subset = dataset[dataset['repo'].isin(selected_repos)]
-                logger.info(f'Retained {subset.shape[0]} tasks after filtering')
-                return subset
-
-    skip_ids = os.environ.get('SKIP_IDS', '').split(',')
-    if len(skip_ids) > 0:
-        logger.info(f'Filtering {len(skip_ids)} tasks from "SKIP_IDS"...')
-        return dataset[~dataset[filter_column].isin(skip_ids)]
-    return dataset
-
-
-def divide_cpus_among_workers(num_workers, num_cpus_per_worker=4, num_to_skip=0):
-    """Divide CPUs among workers, with better error handling for multiprocessing."""
-    try:
-        current_cpus = list(os.sched_getaffinity(0))
-    except AttributeError:
-        # os.sched_getaffinity not available on all platforms
-        import multiprocessing
-
-        current_cpus = list(range(multiprocessing.cpu_count()))
-
-    num_cpus = len(current_cpus)
-    if num_workers <= 0:
-        raise ValueError('Number of workers must be greater than 0')
-
-    # Chec that num worers and num_cpus_per_worker fit into available CPUs
-    total_cpus_needed = num_workers * num_cpus_per_worker + num_to_skip
-    if total_cpus_needed > num_cpus:
-        raise ValueError(
-            f'Not enough CPUs available. Requested {total_cpus_needed} '
-            f'CPUs (num_workers={num_workers}, num_cpus_per_worker={num_cpus_per_worker}, '
-            f'num_to_skip={num_to_skip}), but only {num_cpus} CPUs are available.'
-        )
-
-    # Divide this into groups, skipping the first `num_to_skip` CPUs.
-    available_cpus = current_cpus[num_to_skip:]
-    cpu_groups = [
-        available_cpus[i * num_cpus_per_worker : (i + 1) * num_cpus_per_worker]
-        for i in range(num_workers)
-    ]
-    print(
-        f'Divided {num_cpus} CPUs into {num_workers} groups, each with {num_cpus_per_worker} CPUs.'
-    )
-    print(f'CPU groups: {cpu_groups}')
-
-    return cpu_groups
-
-
-if __name__ == '__main__':
-    parser = get_evaluation_parser()
-    parser.add_argument(
-        '--dataset',
-        type=str,
-        default=None,
-        help='data set to evaluate on, for now use local.',
-    )
-    parser.add_argument(
-        '--split',
-        type=str,
-        default='test',
-        help='split to evaluate on',
-    )
-    parser.add_argument(
-        '--mode',
-        type=str,
-        default='swe',
-        help='mode to evaluate on',
-    )
-
-    args, _ = parser.parse_known_args()
-
-    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
-    # so we don't need to manage file uploading to OpenHands's repo
-
-    # dataset = load_dataset(args.dataset, split=args.split)
-    # swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
-    dataset = load_dataset(args.dataset, split=args.split)
-
-    # Convert dataset to pandas DataFrame if it is not already.
-    if not isinstance(dataset, pd.DataFrame):
-        dataset = dataset.to_pandas()
-
-    dataset['version'] = dataset['version'].astype(str)
-
-    # Convert created_at column to string.
-    dataset['created_at'] = dataset['created_at'].astype(str)
-
-    swe_bench_tests = filter_dataset(dataset, 'instance_id')
-
-    logger.info(
-        f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
-    )
-
-    llm_config = None
-    if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
-        llm_config.log_completions = True
-        # modify_params must be False for evaluation purpose, for reproducibility and accurancy of results
-        llm_config.modify_params = False
-
-    if llm_config is None:
-        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
-
-    # Get condenser config from environment variable
-    condenser_name = os.environ.get('EVAL_CONDENSER')
-    if condenser_name:
-        condenser_config = get_condenser_config_arg(condenser_name)
-        if condenser_config is None:
-            raise ValueError(
-                f'Could not find Condenser config: EVAL_CONDENSER={condenser_name}'
-            )
-    else:
-        # If no specific condenser config is provided via env var, default to NoOpCondenser
-        condenser_config = NoOpCondenserConfig()
-        logger.debug(
-            'No Condenser config provided via EVAL_CONDENSER, using NoOpCondenser.'
-        )
-
-    details = {'mode': args.mode}
-    _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)
-
-    dataset_descrption = (
-        args.dataset.replace('/', '__') + '-' + args.split.replace('/', '__')
-    )
-    metadata = make_metadata(
-        llm_config,
-        dataset_descrption,
-        args.agent_cls,
-        args.max_iterations,
-        args.eval_note,
-        args.eval_output_dir,
-        details=details,
-        condenser_config=condenser_config,
-    )
-
-    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
-    print(f'### OUTPUT FILE: {output_file} ###')
-
-    # Run evaluation in iterative mode:
-    # If a rollout fails to output AgentFinishAction, we will try again until it succeeds OR total 3 attempts have been made.
-    ITERATIVE_EVAL_MODE = (
-        os.environ.get('ITERATIVE_EVAL_MODE', 'false').lower() == 'true'
-    )
-    ITERATIVE_EVAL_MODE_MAX_ATTEMPTS = int(
-        os.environ.get('ITERATIVE_EVAL_MODE_MAX_ATTEMPTS', '3')
-    )
-
-    # Get all CPUs and divide into groups of num_workers and put them into a multiprocessing.Queue.
-    cpu_groups_queue = None
-    cpu_groups_list = divide_cpus_among_workers(args.eval_num_workers, num_to_skip=8)
-    cpu_groups_queue = multiprocessing.Manager().Queue()
-    for cpu_group in cpu_groups_list:
-        cpu_groups_queue.put(cpu_group)
-
-    if not ITERATIVE_EVAL_MODE:
-        # load the dataset
-        instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit)
-
-        process_instance_with_cpu_groups = functools.partial(
-            process_instance,
-            cpu_groups_queue=cpu_groups_queue,
-        )
-
-        config = get_config(
-            instances.iloc[0],  # Use the first instance to get the config
-            metadata,
-            cpu_group=None,  # We will use the cpu_groups_queue to get the cpu group later
-        )
-
-        run_evaluation(
-            instances,
-            metadata,
-            output_file,
-            args.eval_num_workers,
-            process_instance_with_cpu_groups,
-            timeout_seconds=8
-            * 60
-            * 60,  # 8 hour PER instance should be more than enough
-            max_retries=3,
-        )
-    else:
-        critic = AgentFinishedCritic()
-
-        def get_cur_output_file_path(attempt: int) -> str:
-            return (
-                f'{output_file.removesuffix(".jsonl")}.critic_attempt_{attempt}.jsonl'
-            )
-
-        eval_ids = None
-        for attempt in range(1, ITERATIVE_EVAL_MODE_MAX_ATTEMPTS + 1):
-            cur_output_file = get_cur_output_file_path(attempt)
-            logger.info(
-                f'Running evaluation with critic {critic.__class__.__name__} for attempt {attempt} of {ITERATIVE_EVAL_MODE_MAX_ATTEMPTS}.'
-            )
-
-            # For deterministic eval, we set temperature to 0.1 for (>1) attempt
-            # so hopefully we get slightly different results
-            if attempt > 1 and metadata.llm_config.temperature == 0:
-                logger.info(
-                    f'Detected temperature is 0 for (>1) attempt {attempt}. Setting temperature to 0.1...'
-                )
-                metadata.llm_config.temperature = 0.1
-
-            # Load instances - at first attempt, we evaluate all instances
-            # On subsequent attempts, we only evaluate the instances that failed the previous attempt determined by critic
-            instances = prepare_dataset(
-                swe_bench_tests, cur_output_file, args.eval_n_limit, eval_ids=eval_ids
-            )
-            if len(instances) > 0 and not isinstance(
-                instances['PASS_TO_PASS'][instances['PASS_TO_PASS'].index[0]], str
-            ):
-                for col in ['PASS_TO_PASS', 'FAIL_TO_PASS']:
-                    instances[col] = instances[col].apply(lambda x: str(x))
-
-            # Run evaluation - but save them to cur_output_file
-            logger.info(
-                f'Evaluating {len(instances)} instances for attempt {attempt}...'
-            )
-            run_evaluation(
-                instances,
-                metadata,
-                cur_output_file,
-                args.eval_num_workers,
-                process_instance,
-                timeout_seconds=8
-                * 60
-                * 60,  # 8 hour PER instance should be more than enough
-                max_retries=1,
-            )
-
-            # When eval is done, we update eval_ids to the instances that failed the current attempt
-            instances_failed = []
-            logger.info(
-                f'Use critic {critic.__class__.__name__} to check {len(instances)} instances for attempt {attempt}...'
-            )
-            with open(cur_output_file, 'r') as f:
-                for line in f:
-                    instance = json.loads(line)
-                    try:
-                        history = [
-                            event_from_dict(event) for event in instance['history']
-                        ]
-                        critic_result = critic.evaluate(
-                            history, instance['test_result'].get('git_patch', '')
-                        )
-                        if not critic_result.success:
-                            instances_failed.append(instance['instance_id'])
-                    except Exception as e:
-                        logger.error(
-                            f'Error loading history for instance {instance["instance_id"]}: {e}'
-                        )
-                        instances_failed.append(instance['instance_id'])
-            logger.info(
-                f'{len(instances_failed)} instances failed the current attempt {attempt}: {instances_failed}'
-            )
-            eval_ids = instances_failed
-
-            # If no instances failed, we break
-            if len(instances_failed) == 0:
-                break
-
-        # Then we should aggregate the results from all attempts into the original output file
-        # and remove the intermediate files
-        logger.info(
-            'Aggregating results from all attempts into the original output file...'
-        )
-        fout = open(output_file, 'w')
-        added_instance_ids = set()
-        for attempt in reversed(range(1, ITERATIVE_EVAL_MODE_MAX_ATTEMPTS + 1)):
-            cur_output_file = get_cur_output_file_path(attempt)
-            if not os.path.exists(cur_output_file):
-                logger.warning(
-                    f'Intermediate output file {cur_output_file} does not exist. Skipping...'
-                )
-                continue
-
-            with open(cur_output_file, 'r') as f:
-                for line in f:
-                    instance = json.loads(line)
-                    # Also make sure git_patch is not empty - otherwise we fall back to previous attempt (empty patch is worse than anything else)
-                    if (
-                        instance['instance_id'] not in added_instance_ids
-                        and instance['test_result'].get('git_patch', '').strip()
-                    ):
-                        fout.write(line)
-                        added_instance_ids.add(instance['instance_id'])
-            logger.info(
-                f'Aggregated instances from {cur_output_file}. Total instances added so far: {len(added_instance_ids)}'
-            )
-        fout.close()
-        logger.info(
-            f'Done! Total {len(added_instance_ids)} instances added to {output_file}'
-        )
--- a/evaluation/benchmarks/swefficiency/scripts/run_infer.sh
+++ b/evaluation/benchmarks/swefficiency/scripts/run_infer.sh
@@ -1,148 +0,0 @@
-#!/usr/bin/env bash
-set -eo pipefail
-
-source "evaluation/utils/version_control.sh"
-
-MODEL_CONFIG=$1
-COMMIT_HASH=$2
-AGENT=$3
-EVAL_LIMIT=$4
-MAX_ITER=$5
-NUM_WORKERS=$6
-DATASET=$7
-SPLIT=$8
-N_RUNS=$9
-MODE=${10}
-
-
-if [ -z "$NUM_WORKERS" ]; then
-  NUM_WORKERS=1
-  echo "Number of workers not specified, use default $NUM_WORKERS"
-fi
-checkout_eval_branch
-
-if [ -z "$AGENT" ]; then
-  echo "Agent not specified, use default CodeActAgent"
-  AGENT="CodeActAgent"
-fi
-
-if [ -z "$MAX_ITER" ]; then
-  echo "MAX_ITER not specified, use default 100"
-  MAX_ITER=100
-fi
-
-if [ -z "$RUN_WITH_BROWSING" ]; then
-  echo "RUN_WITH_BROWSING not specified, use default false"
-  RUN_WITH_BROWSING=false
-fi
-
-
-if [ -z "$DATASET" ]; then
-  echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
-  DATASET="swefficiency/swefficiency"
-fi
-
-if [ -z "$SPLIT" ]; then
-  echo "SPLIT not specified, use default test"
-  SPLIT="test"
-fi
-
-if [ -z "$MODE" ]; then
-  MODE="swe"
-  echo "MODE not specified, use default $MODE"
-fi
-
-if [ -n "$EVAL_CONDENSER" ]; then
-  echo "Using Condenser Config: $EVAL_CONDENSER"
-else
-  echo "No Condenser Config provided via EVAL_CONDENSER, use default (NoOpCondenser)."
-fi
-
-export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
-echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
-
-get_openhands_version
-
-echo "AGENT: $AGENT"
-echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
-echo "MODEL_CONFIG: $MODEL_CONFIG"
-echo "DATASET: $DATASET"
-echo "SPLIT: $SPLIT"
-echo "MAX_ITER: $MAX_ITER"
-echo "NUM_WORKERS: $NUM_WORKERS"
-echo "COMMIT_HASH: $COMMIT_HASH"
-echo "MODE: $MODE"
-echo "EVAL_CONDENSER: $EVAL_CONDENSER"
-
-# Default to NOT use Hint
-if [ -z "$USE_HINT_TEXT" ]; then
-  export USE_HINT_TEXT=false
-fi
-echo "USE_HINT_TEXT: $USE_HINT_TEXT"
-EVAL_NOTE="$OPENHANDS_VERSION"
-# if not using Hint, add -no-hint to the eval note
-if [ "$USE_HINT_TEXT" = false ]; then
-  EVAL_NOTE="$EVAL_NOTE-no-hint"
-fi
-
-if [ "$RUN_WITH_BROWSING" = true ]; then
-  EVAL_NOTE="$EVAL_NOTE-with-browsing"
-fi
-
-if [ -n "$EXP_NAME" ]; then
-  EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
-fi
-# if mode != swe, add mode to the eval note
-if [ "$MODE" != "swe" ]; then
-  EVAL_NOTE="${EVAL_NOTE}-${MODE}"
-fi
-# Add condenser config to eval note if provided
-if [ -n "$EVAL_CONDENSER" ]; then
-  EVAL_NOTE="${EVAL_NOTE}-${EVAL_CONDENSER}"
-fi
-
-# export RUNTIME="remote"
-# export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
-export NO_CHANGE_TIMEOUT_SECONDS=900 # 15 minutes
-
-function run_eval() {
-  local eval_note="${1}"
-  COMMAND="poetry run python evaluation/benchmarks/swefficiency/run_infer.py \
-    --agent-cls $AGENT \
-    --llm-config $MODEL_CONFIG \
-    --max-iterations $MAX_ITER \
-    --eval-num-workers $NUM_WORKERS \
-    --eval-note $eval_note \
-    --dataset $DATASET \
-    --split $SPLIT \
-    --mode $MODE"
-
-  if [ -n "$EVAL_LIMIT" ]; then
-    echo "EVAL_LIMIT: $EVAL_LIMIT"
-    COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
-  fi
-
-  # Run the command
-  eval $COMMAND
-}
-
-unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
-if [ -z "$N_RUNS" ]; then
-  N_RUNS=1
-  echo "N_RUNS not specified, use default $N_RUNS"
-fi
-
-# Skip runs if the run number is in the SKIP_RUNS list
-# read from env variable SKIP_RUNS as a comma separated list of run numbers
-SKIP_RUNS=(${SKIP_RUNS//,/ })
-for i in $(seq 1 $N_RUNS); do
-  if [[ " ${SKIP_RUNS[@]} " =~ " $i " ]]; then
-    echo "Skipping run $i"
-    continue
-  fi
-  current_eval_note="$EVAL_NOTE-run_$i"
-  echo "EVAL_NOTE: $current_eval_note"
-  run_eval $current_eval_note
-done
-
-checkout_original_branch
--- a/evaluation/benchmarks/swefficiency/scripts/setup/instance_swe_entry.sh
+++ b/evaluation/benchmarks/swefficiency/scripts/setup/instance_swe_entry.sh
@@ -1,43 +0,0 @@
-#!/usr/bin/env bash
-
-source ~/.bashrc
-SWEUTIL_DIR=/swe_util
-
-# FIXME: Cannot read SWE_INSTANCE_ID from the environment variable
-# SWE_INSTANCE_ID=django__django-11099
-if [ -z "$SWE_INSTANCE_ID" ]; then
-    echo "Error: SWE_INSTANCE_ID is not set." >&2
-    exit 1
-fi
-
-# Read the swe-bench-test-lite.json file and extract the required item based on instance_id
-item=$(jq --arg INSTANCE_ID "$SWE_INSTANCE_ID" '.[] | select(.instance_id == $INSTANCE_ID)' $SWEUTIL_DIR/eval_data/instances/swe-bench-instance.json)
-
-if [[ -z "$item" ]]; then
-  echo "No item found for the provided instance ID."
-  exit 1
-fi
-
-
-WORKSPACE_NAME=$(echo "$item" | jq -r '(.repo | tostring) + "__" + (.version | tostring) | gsub("/"; "__")')
-
-echo "WORKSPACE_NAME: $WORKSPACE_NAME"
-
-# Clear the workspace
-if [ -d /workspace ]; then
-    rm -rf /workspace/*
-else
-    mkdir /workspace
-fi
-# Copy repo to workspace
-if [ -d /workspace/$WORKSPACE_NAME ]; then
-    rm -rf /workspace/$WORKSPACE_NAME
-fi
-mkdir -p /workspace
-cp -r /testbed /workspace/$WORKSPACE_NAME
-
-# Activate instance-specific environment
-if [ -d /opt/miniconda3 ]; then
-    . /opt/miniconda3/etc/profile.d/conda.sh
-    conda activate testbed
-fi
--- a/evaluation/benchmarks/swefficiency/scripts/setup/prepare_swe_utils.sh
+++ b/evaluation/benchmarks/swefficiency/scripts/setup/prepare_swe_utils.sh
@@ -1,27 +0,0 @@
-#!/usr/bin/env bash
-
-set -e
-EVAL_WORKSPACE="evaluation/benchmarks/swe_bench/eval_workspace"
-mkdir -p $EVAL_WORKSPACE
-
-# 1. Prepare REPO
-echo "==== Prepare SWE-bench repo ===="
-OH_SWE_BENCH_REPO_PATH="https://github.com/All-Hands-AI/SWE-bench.git"
-OH_SWE_BENCH_REPO_BRANCH="eval"
-git clone -b $OH_SWE_BENCH_REPO_BRANCH $OH_SWE_BENCH_REPO_PATH $EVAL_WORKSPACE/OH-SWE-bench
-
-# 2. Prepare DATA
-echo "==== Prepare SWE-bench data ===="
-EVAL_IMAGE=ghcr.io/all-hands-ai/eval-swe-bench:builder_with_conda
-EVAL_WORKSPACE=$(realpath $EVAL_WORKSPACE)
-chmod +x $EVAL_WORKSPACE/OH-SWE-bench/swebench/harness/prepare_data.sh
-if [ -d $EVAL_WORKSPACE/eval_data ]; then
-    rm -r $EVAL_WORKSPACE/eval_data
-fi
-docker run \
-    -v $EVAL_WORKSPACE:/workspace \
-    -w /workspace \
-    -u $(id -u):$(id -g) \
-    -e HF_DATASETS_CACHE="/tmp" \
-    --rm -it $EVAL_IMAGE \
-    bash -c "cd OH-SWE-bench/swebench/harness && /swe_util/miniforge3/bin/conda run -n swe-bench-eval ./prepare_data.sh && mv eval_data /workspace/"
--- a/evaluation/benchmarks/swefficiency/scripts/setup/swe_entry.sh
+++ b/evaluation/benchmarks/swefficiency/scripts/setup/swe_entry.sh
@@ -1,96 +0,0 @@
-#!/usr/bin/env bash
-
-set -e
-
-# assert user name is `root`
-if [ "$USER" != "root" ]; then
-    echo "Error: This script is intended to be run by the 'root' user only." >&2
-    exit 1
-fi
-
-source ~/.bashrc
-
-SWEUTIL_DIR=/swe_util
-
-# Create logs directory
-LOG_DIR=/openhands/logs
-mkdir -p $LOG_DIR && chmod 777 $LOG_DIR
-
-# FIXME: Cannot read SWE_INSTANCE_ID from the environment variable
-# SWE_INSTANCE_ID=django__django-11099
-if [ -z "$SWE_INSTANCE_ID" ]; then
-    echo "Error: SWE_INSTANCE_ID is not set." >&2
-    exit 1
-fi
-
-# Read the swe-bench-test-lite.json file and extract the required item based on instance_id
-item=$(jq --arg INSTANCE_ID "$SWE_INSTANCE_ID" '.[] | select(.instance_id == $INSTANCE_ID)' $SWEUTIL_DIR/eval_data/instances/swe-bench-test-lite.json)
-
-if [[ -z "$item" ]]; then
-  echo "No item found for the provided instance ID."
-  exit 1
-fi
-
-CONDA_ENV_NAME=$(echo "$item" | jq -r '.repo + "__" + .version | gsub("/"; "__")')
-
-echo "CONDA_ENV_NAME: $CONDA_ENV_NAME"
-
-SWE_TASK_DIR=/openhands/swe_tasks
-mkdir -p $SWE_TASK_DIR
-# Dump test_patch to /workspace/test.patch
-echo "$item" | jq -r '.test_patch' > $SWE_TASK_DIR/test.patch
-# Dump patch to /workspace/gold.patch
-echo "$item" | jq -r '.patch' > $SWE_TASK_DIR/gold.patch
-# Dump the item to /workspace/instance.json except for the "test_patch" and "patch" fields
-echo "$item" | jq 'del(.test_patch, .patch)' > $SWE_TASK_DIR/instance.json
-
-# Clear the workspace
-rm -rf /workspace/*
-# Copy repo to workspace
-if [ -d /workspace/$CONDA_ENV_NAME ]; then
-    rm -rf /workspace/$CONDA_ENV_NAME
-fi
-cp -r $SWEUTIL_DIR/eval_data/testbeds/$CONDA_ENV_NAME /workspace
-
-# Reset swe-bench testbed and install the repo
-. $SWEUTIL_DIR/miniforge3/etc/profile.d/conda.sh
-conda config --set changeps1 False
-conda config --append channels conda-forge
-conda activate swe-bench-eval
-
-mkdir -p $SWE_TASK_DIR/reset_testbed_temp
-mkdir -p $SWE_TASK_DIR/reset_testbed_log_dir
-SWE_BENCH_DIR=/swe_util/OH-SWE-bench
-output=$(
-    export PYTHONPATH=$SWE_BENCH_DIR && \
-    cd $SWE_BENCH_DIR && \
-    python swebench/harness/reset_swe_env.py \
-    --swe_bench_tasks $SWEUTIL_DIR/eval_data/instances/swe-bench-test.json \
-    --temp_dir $SWE_TASK_DIR/reset_testbed_temp \
-    --testbed /workspace \
-    --conda_path $SWEUTIL_DIR/miniforge3 \
-    --instance_id $SWE_INSTANCE_ID \
-    --log_dir $SWE_TASK_DIR/reset_testbed_log_dir \
-    --timeout 900 \
-    --verbose
-)
-
-REPO_PATH=$(echo "$output" | awk -F': ' '/repo_path:/ {print $2}')
-TEST_CMD=$(echo "$output" | awk -F': ' '/test_cmd:/ {print $2}')
-echo "Repo Path: $REPO_PATH"
-echo "Test Command: $TEST_CMD"
-
-echo "export SWE_BENCH_DIR=\"$SWE_BENCH_DIR\"" >> ~/.bashrc
-echo "export REPO_PATH=\"$REPO_PATH\"" >> ~/.bashrc
-echo "export TEST_CMD=\"$TEST_CMD\"" >> ~/.bashrc
-
-if [[ "$REPO_PATH" == "None" ]]; then
-    echo "Error: Failed to retrieve repository path. Tests may not have passed or output was not as expected." >&2
-    exit 1
-fi
-
-# Activate instance-specific environment
-. $SWEUTIL_DIR/miniforge3/etc/profile.d/conda.sh
-conda activate $CONDA_ENV_NAME
-
-set +e
--- a/evaluation/integration_tests/README.md
+++ b/evaluation/integration_tests/README.md
@@ -0,0 +1,69 @@
+# Integration tests
+
+This directory implements the integration tests that [were previously run in CI](https://github.com/OpenHands/OpenHands/tree/23d3becf1d6f5d07e592f7345750c314a826b4e9/tests/integration).
+
+[PR 3985](https://github.com/OpenHands/OpenHands/pull/3985) introduced LLM-based editing, which requires access to an LLM to perform edits. Hence, we removed the integration tests from CI and intend to run them as a nightly evaluation to ensure the quality of the OpenHands software.
+
+## To add new tests
+
+Each test is a file named like `tXX_testname.py` where `XX` is a number.
+Make sure each test's file name starts with `t` and ends with `.py`.
+
+Each test should be structured as a subclass of [`BaseIntegrationTest`](./tests/base.py), where you need to implement `initialize_runtime`, which sets up the runtime environment before the test, and `verify_result`, which takes a `Runtime` and a history of `Event`s and returns a `TestResult`. See [t01_fix_simple_typo.py](./tests/t01_fix_simple_typo.py) and [t05_simple_browsing.py](./tests/t05_simple_browsing.py) for two representative examples.
+
+```python
+class TestResult(BaseModel):
+    success: bool
+    reason: str | None = None
+
+
+class BaseIntegrationTest(ABC):
+    """Base class for integration tests."""
+
+    INSTRUCTION: str
+
+    @classmethod
+    @abstractmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        """Initialize the runtime for the test to run."""
+        pass
+
+    @classmethod
+    @abstractmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        """Verify the result of the test.
+
+        This method will be called after the agent performs the task on the runtime.
+        """
+        pass
+```
+
+
+## Setup Environment and LLM Configuration
+
+Please follow the instructions [here](../README.md#setup) to set up your local
+development environment and LLM.
+
+## Start the evaluation
+
+```bash
+./evaluation/integration_tests/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iterations] [eval-num-workers] [eval_ids]
+```
+
+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for
+    your LLM settings, as defined in your `config.toml`.
+- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version
+    you would like to evaluate. It could also be a release tag like `0.9.0`.
+- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks,
+    defaulting to `CodeActAgent`.
+- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit`
+    instances. By default, the script evaluates the entire integration test
+    set. Note: in order to use `eval_limit`, you must also set `agent`.
+- `max_iterations`, e.g. `10`, is passed to `run_infer.py` as
+    `--max-iterations`. Default: `10`.
+- `eval-num-workers`: the number of workers to use for evaluation. Default: `1`.
+- `eval_ids`, e.g. `"1,3,10"`, limits the evaluation to instances with the
+    given IDs (comma separated).
+
+Example:
+```bash
+./evaluation/integration_tests/scripts/run_infer.sh llm.claude-35-sonnet-eval HEAD CodeActAgent
+```
--- a/evaluation/benchmarks/swefficiency/init.py
+++ b/evaluation/benchmarks/swefficiency/init.py
--- a/evaluation/integration_tests/run_infer.py
+++ b/evaluation/integration_tests/run_infer.py
@@ -0,0 +1,251 @@
+import asyncio
+import importlib.util
+import os
+
+import pandas as pd
+
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from evaluation.utils.shared import (
+    EvalMetadata,
+    EvalOutput,
+    get_default_sandbox_config_for_eval,
+    get_metrics,
+    get_openhands_config_for_eval,
+    make_metadata,
+    prepare_dataset,
+    reset_logger_for_multiprocessing,
+    run_evaluation,
+    update_llm_config_for_completions_logging,
+)
+from evaluation.utils.shared import (
+    codeact_user_response as fake_user_response,
+)
+from openhands.controller.state.state import State
+from openhands.core.config import (
+    AgentConfig,
+    OpenHandsConfig,
+    get_evaluation_parser,
+    get_llm_config_arg,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import MessageAction
+from openhands.events.serialization.event import event_to_dict
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
+
+FAKE_RESPONSES = {
+    'CodeActAgent': fake_user_response,
+    'VisualBrowsingAgent': fake_user_response,
+}
+
+
+def get_config(
+    metadata: EvalMetadata,
+    instance_id: str,
+) -> OpenHandsConfig:
+    """Build the OpenHands configuration used to run one integration test.
+
+    Args:
+        metadata: Evaluation metadata; its ``llm_config`` and
+            ``eval_output_dir`` are used to set up completions logging.
+        instance_id: Identifier of the test instance, forwarded to the
+            completions-logging helper.
+
+    Returns:
+        An ``OpenHandsConfig`` with debug mode on, the per-instance LLM config
+        installed, and an agent config that enables Jupyter and browsing and
+        disables the LLM editor.
+    """
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.platform = 'linux/amd64'
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
+        # the runtime can be overridden through the RUNTIME env var
+        runtime=os.environ.get('RUNTIME', 'docker'),
+        sandbox_config=sandbox_config,
+    )
+    config.debug = True
+    config.set_llm_config(
+        update_llm_config_for_completions_logging(
+            metadata.llm_config, metadata.eval_output_dir, instance_id
+        )
+    )
+    agent_config = AgentConfig(
+        enable_jupyter=True,
+        enable_browsing=True,
+        enable_llm_editor=False,
+    )
+    config.set_agent_config(agent_config)
+    return config
+
+
+def process_instance(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+    reset_logger: bool = True,
+) -> EvalOutput:
+    """Run a single integration test end to end and collect its result.
+
+    The test module is loaded from ``instance.file_path``; it must expose a
+    ``Test`` class deriving from ``BaseIntegrationTest``. A runtime is created,
+    initialized by the test, driven by the agent with the test's instruction,
+    and finally inspected by ``Test.verify_result``.
+
+    Args:
+        instance: Row with at least ``instance_id`` and ``file_path``.
+        metadata: Evaluation metadata (agent class, LLM config, output dir).
+        reset_logger: When true, re-route logging to a per-instance log under
+            ``<eval_output_dir>/infer_logs``.
+
+    Returns:
+        An ``EvalOutput`` holding the serialized history, metrics, the last
+        error (if any) and the dumped ``TestResult``.
+
+    Raises:
+        AssertionError: If the module has no ``Test`` class, the class does not
+            derive from ``BaseIntegrationTest``, or the history is empty.
+        ValueError: If the controller returns no state.
+        KeyError: If ``metadata.agent_class`` has no entry in ``FAKE_RESPONSES``.
+    """
+    config = get_config(metadata, instance.instance_id)
+
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, str(instance.instance_id), log_dir)
+    else:
+        logger.info(
+            f'\nStarting evaluation for instance {str(instance.instance_id)}.\n'
+        )
+
+    # =============================================
+    # import test instance
+    # =============================================
+    instance_id = instance.instance_id
+    # load the test file as a standalone module named after the instance id
+    spec = importlib.util.spec_from_file_location(instance_id, instance.file_path)
+    test_module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(test_module)
+    assert hasattr(test_module, 'Test'), (
+        f'Test module {instance_id} does not have a Test class'
+    )
+
+    test_class: type[BaseIntegrationTest] = test_module.Test
+    assert issubclass(test_class, BaseIntegrationTest), (
+        f'Test class {instance_id} does not inherit from BaseIntegrationTest'
+    )
+
+    instruction = test_class.INSTRUCTION
+
+    # =============================================
+    # create sandbox and run the agent
+    # =============================================
+    runtime: Runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
+    try:
+        test_class.initialize_runtime(runtime)
+
+        # Here's how you can run the agent (similar to the `main` function) and get the final task state
+        state: State | None = asyncio.run(
+            run_controller(
+                config=config,
+                initial_user_action=MessageAction(content=instruction),
+                runtime=runtime,
+                fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
+            )
+        )
+        if state is None:
+            raise ValueError('State should not be None.')
+
+        # =============================================
+        # result evaluation
+        # =============================================
+
+        histories = state.history
+
+        # some basic check
+        logger.info(f'Total events in history: {len(histories)}')
+        assert len(histories) > 0, 'History should not be empty'
+
+        # verification runs while the runtime is still open, since tests may
+        # execute commands in it
+        test_result: TestResult = test_class.verify_result(runtime, histories)
+        metrics = get_metrics(state)
+    finally:
+        # always release the runtime, even when the run or verification raised
+        runtime.close()
+
+    # Save the output
+    output = EvalOutput(
+        instance_id=str(instance.instance_id),
+        instance=instance.to_dict(),
+        instruction=instruction,
+        metadata=metadata,
+        history=[event_to_dict(event) for event in histories],
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+        test_result=test_result.model_dump(),
+    )
+    return output
+
+
+def load_integration_tests() -> pd.DataFrame:
+    """Load the integration tests found under ``./tests``.
+
+    Every file in the ``tests`` directory next to this module whose name starts
+    with ``t`` and ends with ``.py`` is treated as one test instance.
+
+    Returns:
+        A DataFrame with one row per test file and two columns: ``file_path``
+        (path of the test file) and ``instance_id`` (the file name without its
+        ``.py`` extension).
+    """
+    cur_dir = os.path.dirname(os.path.abspath(__file__))
+    test_dir = os.path.join(cur_dir, 'tests')
+    test_files = [
+        os.path.join(test_dir, f)
+        for f in os.listdir(test_dir)
+        if f.startswith('t') and f.endswith('.py')
+    ]
+    df = pd.DataFrame(test_files, columns=['file_path'])
+    # Use splitext rather than str.rstrip('.py'): rstrip removes any trailing
+    # run of the characters '.', 'p' and 'y' (e.g. 't08_copy.py' -> 't08_co'),
+    # not the '.py' suffix.
+    df['instance_id'] = df['file_path'].apply(
+        lambda x: os.path.splitext(os.path.basename(x))[0]
+    )
+    return df
+
+
+# Script entry point: load the tests, run them through the shared evaluation
+# harness, then log a summary and write a Markdown report next to output.jsonl.
+if __name__ == '__main__':
+    parser = get_evaluation_parser()
+    args, _ = parser.parse_known_args()
+    integration_tests = load_integration_tests()
+
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
+
+    metadata = make_metadata(
+        llm_config,
+        'integration_tests',
+        args.agent_cls,
+        args.max_iterations,
+        args.eval_note,
+        args.eval_output_dir,
+    )
+    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
+
+    # Parse dataset IDs if provided
+    eval_ids = None
+    if args.eval_ids:
+        # comma-separated list of instance ids, e.g. "t01_fix_simple_typo,t02_add_bash_hello"
+        eval_ids = str(args.eval_ids).split(',')
+        logger.info(f'\nUsing specific dataset IDs: {eval_ids}\n')
+
+    instances = prepare_dataset(
+        integration_tests,
+        output_file,
+        args.eval_n_limit,
+        eval_ids=eval_ids,
+    )
+
+    run_evaluation(
+        instances,
+        metadata,
+        output_file,
+        args.eval_num_workers,
+        process_instance,
+    )
+
+    # read back the results file (one JSON record per line)
+    df = pd.read_json(output_file, lines=True, orient='records')
+
+    # record success and reason
+    df['success'] = df['test_result'].apply(lambda x: x['success'])
+    df['reason'] = df['test_result'].apply(lambda x: x['reason'])
+    logger.info('-' * 100)
+    logger.info(
+        f'Success rate: {df["success"].mean():.2%} ({df["success"].sum()}/{len(df)})'
+    )
+    logger.info(
+        '\nEvaluation Results:'
+        + '\n'
+        + df[['instance_id', 'success', 'reason']].to_string(index=False)
+    )
+    logger.info('-' * 100)
+
+    # record cost for each instance, with 3 decimal places
+    # we sum up all the "costs" from the metrics array
+    df['cost'] = df['metrics'].apply(
+        lambda m: round(sum(c['cost'] for c in m['costs']), 3)
+        if m and 'costs' in m
+        else 0.0
+    )
+
+    # capture the top-level error if present, per instance
+    # (falls back to None for every row when there is no 'error' column)
+    df['error_message'] = df.get('error', None)
+
+    logger.info(f'Total cost: USD {df["cost"].sum():.2f}')
+
+    # write the same summary, plus a per-instance table, as a Markdown report
+    report_file = os.path.join(metadata.eval_output_dir, 'report.md')
+    with open(report_file, 'w') as f:
+        f.write(
+            f'Success rate: {df["success"].mean():.2%}'
+            f' ({df["success"].sum()}/{len(df)})\n'
+        )
+        f.write(f'\nTotal cost: USD {df["cost"].sum():.2f}\n')
+        f.write(
+            df[
+                ['instance_id', 'success', 'reason', 'cost', 'error_message']
+            ].to_markdown(index=False)
+        )
--- a/evaluation/integration_tests/scripts/run_infer.sh
+++ b/evaluation/integration_tests/scripts/run_infer.sh
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+# Run the OpenHands integration tests through run_infer.py.
+#
+# Usage:
+#   run_infer.sh [model_config] [git-version] [agent] [eval_limit] \
+#                [max_iterations] [eval-num-workers] [eval_ids]
+#
+# Defaults: agent=CodeActAgent, max_iterations=10, eval-num-workers=1.
+# Set USE_UNIT_TESTS=true in the environment to tag the eval note with "-w-test".
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
+MODEL_CONFIG=$1
+# NOTE(review): COMMIT_HASH is not referenced below; presumably it is consumed
+# by checkout_eval_branch from version_control.sh - confirm.
+COMMIT_HASH=$2
+AGENT=$3
+EVAL_LIMIT=$4
+MAX_ITERATIONS=$5
+NUM_WORKERS=$6
+EVAL_IDS=$7
+
+if [ -z "$NUM_WORKERS" ]; then
+  NUM_WORKERS=1
+  echo "Number of workers not specified, use default $NUM_WORKERS"
+fi
+checkout_eval_branch
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+get_openhands_version
+
+echo "AGENT: $AGENT"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+EVAL_NOTE=$OPENHANDS_VERSION
+
+# Default to NOT use unit tests.
+if [ -z "$USE_UNIT_TESTS" ]; then
+  export USE_UNIT_TESTS=false
+fi
+echo "USE_UNIT_TESTS: $USE_UNIT_TESTS"
+# If unit tests are used, mark the run by appending "-w-test" to EVAL_NOTE
+if [ "$USE_UNIT_TESTS" = true ]; then
+  EVAL_NOTE=$EVAL_NOTE-w-test
+fi
+
+# export PYTHONPATH=evaluation/integration_tests:\$PYTHONPATH
+# Build the command as an array so that every argument is passed through
+# verbatim (no word splitting, globbing or re-expansion as with `eval`).
+COMMAND=(
+  poetry run python evaluation/integration_tests/run_infer.py
+  --agent-cls "$AGENT"
+  --llm-config "$MODEL_CONFIG"
+  --max-iterations "${MAX_ITERATIONS:-10}"
+  --eval-num-workers "$NUM_WORKERS"
+  --eval-note "$EVAL_NOTE"
+)
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND+=(--eval-n-limit "$EVAL_LIMIT")
+fi
+
+if [ -n "$EVAL_IDS" ]; then
+  echo "EVAL_IDS: $EVAL_IDS"
+  COMMAND+=(--eval-ids "$EVAL_IDS")
+fi
+
+# Run the command
+"${COMMAND[@]}"
--- a/evaluation/integration_tests/tests/init.py
+++ b/evaluation/integration_tests/tests/init.py
--- a/evaluation/integration_tests/tests/base.py
+++ b/evaluation/integration_tests/tests/base.py
@@ -0,0 +1,32 @@
+from abc import ABC, abstractmethod
+
+from pydantic import BaseModel
+
+from openhands.events.event import Event
+from openhands.runtime.base import Runtime
+
+
+class TestResult(BaseModel):
+    """Outcome of one integration test."""
+
+    # whether the test passed
+    success: bool
+    # optional explanation; the tests in this package set it on failure
+    reason: str | None = None
+
+
+class BaseIntegrationTest(ABC):
+    """Base class for integration tests.
+
+    Subclasses define ``INSTRUCTION`` and implement both class methods below.
+    """
+
+    # task description given to the agent
+    INSTRUCTION: str
+
+    @classmethod
+    @abstractmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        """Initialize the runtime for the test to run."""
+        pass
+
+    @classmethod
+    @abstractmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        """Verify the result of the test.
+
+        This method will be called after the agent performs the task on the runtime.
+
+        Args:
+            runtime: The runtime the agent worked in.
+            histories: The events recorded during the agent run.
+
+        Returns:
+            A ``TestResult`` describing whether the test passed.
+        """
+        pass
--- a/evaluation/integration_tests/tests/t01_fix_simple_typo.py
+++ b/evaluation/integration_tests/tests/t01_fix_simple_typo.py
@@ -0,0 +1,39 @@
+import os
+import tempfile
+
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from openhands.events.action import CmdRunAction
+from openhands.events.event import Event
+from openhands.runtime.base import Runtime
+
+
+class Test(BaseIntegrationTest):
+    """Integration test: the agent must fix the typos in /workspace/bad.txt."""
+
+    INSTRUCTION = 'Fix typos in bad.txt.'
+
+    @classmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        """Create bad.txt locally and copy it into /workspace in the runtime."""
+        # create a file with a typo in /workspace/bad.txt
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_file_path = os.path.join(temp_dir, 'bad.txt')
+            with open(temp_file_path, 'w') as f:
+                f.write('This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!')
+
+            # Copy the file to the desired location
+            runtime.copy_to(temp_file_path, '/workspace')
+
+    @classmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        """Pass when bad.txt exactly matches the corrected text."""
+        # read back /workspace/bad.txt from the runtime
+        action = CmdRunAction(command='cat /workspace/bad.txt')
+        obs = runtime.run_action(action)
+        if obs.exit_code != 0:
+            return TestResult(
+                success=False, reason=f'Failed to run command: {obs.content}'
+            )
+        # check if the file /workspace/bad.txt has been fixed
+        # (surrounding whitespace and CRLF line endings are ignored)
+        if (
+            obs.content.strip().replace('\r\n', '\n')
+            == 'This is a stupid typo.\nReally?\nNo more typos!\nEnjoy!'
+        ):
+            return TestResult(success=True)
+        return TestResult(success=False, reason=f'File not fixed: {obs.content}')
--- a/evaluation/integration_tests/tests/t02_add_bash_hello.py
+++ b/evaluation/integration_tests/tests/t02_add_bash_hello.py
@@ -0,0 +1,40 @@
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from evaluation.utils.shared import assert_and_raise
+from openhands.events.action import CmdRunAction
+from openhands.events.event import Event
+from openhands.runtime.base import Runtime
+
+
+class Test(BaseIntegrationTest):
+    """Integration test: the agent must write a shell script that prints 'hello'."""
+
+    INSTRUCTION = "Write a shell script '/workspace/hello.sh' that prints 'hello'."
+
+    @classmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        """Make sure /workspace exists before the agent starts."""
+        action = CmdRunAction(command='mkdir -p /workspace')
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+    @classmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        """Pass when hello.sh exists, runs successfully and prints exactly 'hello'."""
+        # check if the file /workspace/hello.sh exists
+        action = CmdRunAction(command='cat /workspace/hello.sh')
+        obs = runtime.run_action(action)
+        if obs.exit_code != 0:
+            return TestResult(
+                success=False,
+                reason=f'Failed to cat /workspace/hello.sh: {obs.content}.',
+            )
+
+        # execute the script
+        action = CmdRunAction(command='bash /workspace/hello.sh')
+        obs = runtime.run_action(action)
+        if obs.exit_code != 0:
+            return TestResult(
+                success=False,
+                reason=f'Failed to execute /workspace/hello.sh: {obs.content}.',
+            )
+        # the output must be exactly 'hello' once surrounding whitespace is stripped
+        if obs.content.strip() != 'hello':
+            return TestResult(
+                success=False, reason=f'Script did not print "hello": {obs.content}.'
+            )
+        return TestResult(success=True)
--- a/evaluation/integration_tests/tests/t03_jupyter_write_file.py
+++ b/evaluation/integration_tests/tests/t03_jupyter_write_file.py
@@ -0,0 +1,43 @@
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from evaluation.utils.shared import assert_and_raise
+from openhands.events.action import CmdRunAction
+from openhands.events.event import Event
+from openhands.runtime.base import Runtime
+
+
+class Test(BaseIntegrationTest):
+    """Integration test: the agent must write 'hello world' to /workspace/test.txt."""
+
+    INSTRUCTION = "Use Jupyter IPython to write a text file containing 'hello world' to '/workspace/test.txt'."
+
+    @classmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        """Make sure /workspace exists before the agent starts."""
+        action = CmdRunAction(command='mkdir -p /workspace')
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+    @classmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        """Pass when /workspace/test.txt is readable and contains 'hello world'.
+
+        Only the file content is checked; whether the agent actually used
+        IPython to write it is not verified here.
+        """
+        # a single `cat` both proves the file exists (exit code) and returns
+        # its content, so there is no need to run it twice
+        action = CmdRunAction(command='cat /workspace/test.txt')
+        obs = runtime.run_action(action)
+        if obs.exit_code != 0:
+            return TestResult(
+                success=False,
+                reason=f'Failed to cat /workspace/test.txt: {obs.content}.',
+            )
+
+        if 'hello world' not in obs.content.strip():
+            return TestResult(
+                success=False,
+                reason=f'File did not contain "hello world": {obs.content}.',
+            )
+        return TestResult(success=True)
--- a/evaluation/integration_tests/tests/t04_git_staging.py
+++ b/evaluation/integration_tests/tests/t04_git_staging.py
@@ -0,0 +1,57 @@
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from evaluation.utils.shared import assert_and_raise
+from openhands.events.action import CmdRunAction
+from openhands.events.event import Event
+from openhands.runtime.base import Runtime
+
+
+class Test(BaseIntegrationTest):
+    """Integration test: the agent must commit the staged change in a git repo."""
+
+    INSTRUCTION = 'Write a git commit message for the current staging area and commit the changes.'
+
+    @classmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        """Create a git repository with one staged file, hello.py."""
+        action = CmdRunAction(command='mkdir -p /workspace')
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+        # NOTE(review): the commands below run in the runtime's current working
+        # directory (no `cd`), while verify_result reads /workspace/hello.py;
+        # presumably the working directory is /workspace - confirm.
+        # git init
+        action = CmdRunAction(command='git init')
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+        # create file
+        action = CmdRunAction(command='echo \'print("hello world")\' > hello.py')
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+        # git add
+        cmd_str = 'git add hello.py'
+        action = CmdRunAction(command=cmd_str)
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+    @classmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        """Pass when hello.py still exists and `git status` reports a clean tree."""
+        # check if the file /workspace/hello.py exists
+        action = CmdRunAction(command='cat /workspace/hello.py')
+        obs = runtime.run_action(action)
+        if obs.exit_code != 0:
+            return TestResult(
+                success=False,
+                reason=f'Failed to cat /workspace/hello.py: {obs.content}.',
+            )
+
+        # check that nothing is left to commit (staging area and working tree clean)
+        action = CmdRunAction(command='git status')
+        obs = runtime.run_action(action)
+        if obs.exit_code != 0:
+            return TestResult(
+                success=False, reason=f'Failed to git status: {obs.content}.'
+            )
+        if 'nothing to commit, working tree clean' in obs.content.strip():
+            return TestResult(success=True)
+
+        return TestResult(
+            success=False,
+            reason=f'Failed to check for "nothing to commit, working tree clean": {obs.content}.',
+        )
--- a/evaluation/integration_tests/tests/t05_simple_browsing.py
+++ b/evaluation/integration_tests/tests/t05_simple_browsing.py
@@ -0,0 +1,145 @@
+import os
+import tempfile
+
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from evaluation.utils.shared import assert_and_raise
+from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
+from openhands.events.event import Event
+from openhands.events.observation import AgentDelegateObservation
+from openhands.runtime.base import Runtime
+
+HTML_FILE = """
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>The Ultimate Answer</title>
+    <style>
+        body {
+            display: flex;
+            justify-content: center;
+            align-items: center;
+            height: 100vh;
+            margin: 0;
+            background: linear-gradient(to right, #1e3c72, #2a5298);
+            color: #fff;
+            font-family: 'Arial', sans-serif;
+            text-align: center;
+        }
+        .container {
+            text-align: center;
+            padding: 20px;
+            background: rgba(255, 255, 255, 0.1);
+            border-radius: 10px;
+            box-shadow: 0 0 10px rgba(0, 0, 0, 0.2);
+        }
+        h1 {
+            font-size: 36px;
+            margin-bottom: 20px;
+        }
+        p {
+            font-size: 18px;
+            margin-bottom: 30px;
+        }
+        #showButton {
+            padding: 10px 20px;
+            font-size: 16px;
+            color: #1e3c72;
+            background: #fff;
+            border: none;
+            border-radius: 5px;
+            cursor: pointer;
+            transition: background 0.3s ease;
+        }
+        #showButton:hover {
+            background: #f0f0f0;
+        }
+        #result {
+            margin-top: 20px;
+            font-size: 24px;
+        }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>The Ultimate Answer</h1>
+        <p>Click the button to reveal the answer to life, the universe, and everything.</p>
+        <button id="showButton">Click me</button>
+        <div id="result"></div>
+    </div>
+    <script>
+        document.getElementById('showButton').addEventListener('click', function() {
+            document.getElementById('result').innerText = 'The answer is OpenHands is all you need!';
+        });
+    </script>
+</body>
+</html>
+"""
+
+
+class Test(BaseIntegrationTest):
+    """Integration test: the agent must browse a local page and report the answer."""
+
+    INSTRUCTION = 'Browse localhost:8000, and tell me the ultimate answer to life.'
+
+    @classmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        """Copy the HTML page into /tmp/server and serve it on port 8000."""
+        action = CmdRunAction(command='mkdir -p /workspace')
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+        action = CmdRunAction(command='mkdir -p /tmp/server')
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+        # write HTML_FILE to a local index.html and copy it to /tmp/server
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_file_path = os.path.join(temp_dir, 'index.html')
+            with open(temp_file_path, 'w') as f:
+                f.write(HTML_FILE)
+            # Copy the file to the desired location
+            runtime.copy_to(temp_file_path, '/tmp/server')
+
+        # start an HTTP server for /tmp/server on port 8000 in the background
+        # (the observation of this command is not checked)
+        action = CmdRunAction(
+            command='cd /tmp/server && nohup python3 -m http.server 8000 &'
+        )
+        obs = runtime.run_action(action)
+
+    @classmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        """Pass when any message-like event contains 'OpenHands is all you need!'."""
+        from openhands.core.logger import openhands_logger as logger
+
+        # check if the "The answer is OpenHands is all you need!" is in any message
+        message_actions = [
+            event
+            for event in histories
+            if isinstance(
+                event, (MessageAction, AgentFinishAction, AgentDelegateObservation)
+            )
+        ]
+        logger.debug(f'Total message-like events: {len(message_actions)}')
+
+        for event in message_actions:
+            try:
+                # extract the text of the event; where it lives depends on the event type
+                if isinstance(event, AgentDelegateObservation):
+                    content = event.content
+                elif isinstance(event, AgentFinishAction):
+                    content = event.outputs.get('content', '')
+                elif isinstance(event, MessageAction):
+                    content = event.content
+                else:
+                    logger.warning(f'Unexpected event type: {type(event)}')
+                    continue
+
+                if 'OpenHands is all you need!' in content:
+                    return TestResult(success=True)
+            except Exception as e:
+                # a malformed event is logged and skipped, not treated as a failure
+                logger.error(f'Error processing event: {e}')
+
+        logger.debug(
+            f'Total messages: {len(message_actions)}. Messages: {message_actions}'
+        )
+        return TestResult(
+            success=False,
+            reason=f'The answer is not found in any message. Total messages: {len(message_actions)}.',
+        )
--- a/evaluation/integration_tests/tests/t06_github_pr_browsing.py
+++ b/evaluation/integration_tests/tests/t06_github_pr_browsing.py
@@ -0,0 +1,58 @@
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from openhands.events.action import AgentFinishAction, MessageAction
+from openhands.events.event import Event
+from openhands.events.observation import AgentDelegateObservation
+from openhands.runtime.base import Runtime
+
+
+class Test(BaseIntegrationTest):
+    """Integration test: the agent must browse a GitHub PR and summarize it."""
+
+    INSTRUCTION = 'Look at https://github.com/OpenHands/OpenHands/pull/8, and tell me what is happening there and what did @asadm suggest.'
+
+    @classmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        """No runtime preparation is needed for this test."""
+        pass
+
+    @classmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        """Pass when any message-like event mentions one of the license keywords."""
+        from openhands.core.logger import openhands_logger as logger
+
+        # check if the license information is in any message
+        message_actions = [
+            event
+            for event in histories
+            if isinstance(
+                event, (MessageAction, AgentFinishAction, AgentDelegateObservation)
+            )
+        ]
+        logger.info(f'Total message-like events: {len(message_actions)}')
+
+        for event in message_actions:
+            try:
+                # extract the text of the event; where it lives depends on the event type
+                if isinstance(event, AgentDelegateObservation):
+                    content = event.content
+                elif isinstance(event, AgentFinishAction):
+                    content = event.outputs.get('content', '')
+                    # the finish action's thought is searched as well
+                    if event.thought:
+                        content += f'\n\n{event.thought}'
+                elif isinstance(event, MessageAction):
+                    content = event.content
+                else:
+                    logger.warning(f'Unexpected event type: {type(event)}')
+                    continue
+
+                # any one of these keywords counts as a correct answer
+                if (
+                    'non-commercial' in content
+                    or 'MIT' in content
+                    or 'Apache 2.0' in content
+                ):
+                    return TestResult(success=True)
+            except Exception as e:
+                # a malformed event is logged and skipped, not treated as a failure
+                logger.error(f'Error processing event: {e}')
+
+        logger.debug(
+            f'Total messages: {len(message_actions)}. Messages: {message_actions}'
+        )
+        return TestResult(
+            success=False,
+            reason=f'The answer is not found in any message. Total messages: {len(message_actions)}.',
+        )
--- a/evaluation/integration_tests/tests/t07_interactive_commands.py
+++ b/evaluation/integration_tests/tests/t07_interactive_commands.py
@@ -0,0 +1,73 @@
+import hashlib
+
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from openhands.events.action import (
+    AgentFinishAction,
+    FileWriteAction,
+    MessageAction,
+)
+from openhands.events.event import Event
+from openhands.events.observation import AgentDelegateObservation
+from openhands.runtime.base import Runtime
+
+
+class Test(BaseIntegrationTest):
+    """Integration test: the agent must run an interactive script and report its output."""
+
+    INSTRUCTION = 'Execute the python script /workspace/python_script.py with input "John" and "25" and tell me the secret number.'
+    # expected answer: same hash formula as the script written in
+    # initialize_runtime, evaluated for the age "25"
+    SECRET_NUMBER = int(hashlib.sha256(str(25).encode()).hexdigest()[:8], 16) % 1000
+
+    @classmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        """Write the interactive script to /workspace/python_script.py."""
+        from openhands.core.logger import openhands_logger as logger
+
+        # the script prompts for a name and an age, then prints a number
+        # derived from the SHA-256 hash of the age
+        action = FileWriteAction(
+            path='/workspace/python_script.py',
+            content=(
+                'name = input("Enter your name: "); age = input("Enter your age: "); '
+                'import hashlib; secret = int(hashlib.sha256(str(age).encode()).hexdigest()[:8], 16) % 1000; '
+                'print(f"Hello {name}, you are {age} years old. Tell you a secret number: {secret}")'
+            ),
+        )
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        observation = runtime.run_action(action)
+        logger.info(observation, extra={'msg_type': 'OBSERVATION'})
+
+    @classmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        """Pass when any message-like event contains the secret number."""
+        from openhands.core.logger import openhands_logger as logger
+
+        # check if the secret number is in any message
+        message_actions = [
+            event
+            for event in histories
+            if isinstance(
+                event, (MessageAction, AgentFinishAction, AgentDelegateObservation)
+            )
+        ]
+        logger.info(f'Total message-like events: {len(message_actions)}')
+
+        for event in message_actions:
+            try:
+                # extract the text of the event; where it lives depends on the event type
+                if isinstance(event, AgentDelegateObservation):
+                    content = event.content
+                elif isinstance(event, AgentFinishAction):
+                    content = event.outputs.get('content', '')
+                    # the finish action's thought is searched as well
+                    if event.thought:
+                        content += f'\n\n{event.thought}'
+                elif isinstance(event, MessageAction):
+                    content = event.content
+                else:
+                    logger.warning(f'Unexpected event type: {type(event)}')
+                    continue
+
+                # substring match on the decimal representation of the number
+                if str(cls.SECRET_NUMBER) in content:
+                    return TestResult(success=True)
+            except Exception as e:
+                # a malformed event is logged and skipped, not treated as a failure
+                logger.error(f'Error processing event: {e}')
+
+        logger.debug(
+            f'Total messages: {len(message_actions)}. Messages: {message_actions}'
+        )
+        return TestResult(
+            success=False,
+            reason=f'The answer is not found in any message. Total messages: {len(message_actions)}.',
+        )
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -9,7 +9,7 @@ import time
 import traceback
 from contextlib import contextmanager
 from inspect import signature
-from typing import Any, Awaitable, Callable, Optional, TextIO
+from typing import Any, Awaitable, Callable, TextIO

 import pandas as pd
 from pydantic import BaseModel
@@ -222,7 +222,6 @@ def prepare_dataset(
    eval_n_limit: int,
    eval_ids: list[str] | None = None,
    skip_num: int | None = None,
-    filter_func: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
 ):
    assert 'instance_id' in dataset.columns, (
        "Expected 'instance_id' column in the dataset. You should define your own unique identifier for each instance and use it as the 'instance_id' column."
@@ -266,12 +265,6 @@ def prepare_dataset(
            f'Randomly sampling {eval_n_limit} unique instances with random seed 42.'
        )

-    if filter_func is not None:
-        dataset = filter_func(dataset)
-        logger.info(
-            f'Applied filter after sampling: {len(dataset)} instances remaining'
-        )
-
    def make_serializable(instance_dict: dict) -> dict:
        import numpy as np

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Ray Myers	70fb1dd450	Updatre coverage enforcement calculation	2025-10-29 01:27:12 -05:00
Ray Myers	475664e62b	chore - Enforce python test coverage	2025-10-28 17:59:53 -05:00
				`@@ -1 +0,0 @@`
				`This way of running OpenHands is not officially supported. It is maintained by the community.`