test: fix unit tests

- Add missing dependency 'markdown' for CLI TUI rendering
- Prevent env var WORKSPACE_MOUNT_PATH_IN_SANDBOX from overriding default when SANDBOX_VOLUMES lacks /workspace

Co-authored-by: openhands <openhands@all-hands.dev>
Make setup process more friendly and welcoming
2026-04-29 03:00:45 -04:00 · 2025-08-28 18:41:13 +00:00 · 2025-08-11 18:15:52 +00:00 · 2025-08-11 09:01:17 -04:00 · 2025-08-09 14:37:18 -04:00 · 2025-08-08 20:28:36 -07:00
79 changed files with 1375 additions and 604 deletions
--- a/.github/scripts/update_pr_description.sh
+++ b/.github/scripts/update_pr_description.sh
@@ -1,33 +1,53 @@
 #!/bin/bash

+set -euxo pipefail
+
 # This script updates the PR description with commands to run the PR locally
 # It adds both Docker and uvx commands

 # Get the branch name for the PR
-BRANCH_NAME=$(gh pr view $PR_NUMBER --json headRefName --jq .headRefName)
+BRANCH_NAME=$(gh pr view "$PR_NUMBER" --json headRefName --jq .headRefName)

 # Define the Docker command
 DOCKER_RUN_COMMAND="docker run -it --rm \
  -p 3000:3000 \
  -v /var/run/docker.sock:/var/run/docker.sock \
  --add-host host.docker.internal:host-gateway \
-  -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:$SHORT_SHA-nikolaik \
-  --name openhands-app-$SHORT_SHA \
-  docker.all-hands.dev/all-hands-ai/openhands:$SHORT_SHA"
+  -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:${SHORT_SHA}-nikolaik \
+  --name openhands-app-${SHORT_SHA} \
+  docker.all-hands.dev/all-hands-ai/openhands:${SHORT_SHA}"

 # Define the uvx command
-UVX_RUN_COMMAND="uvx --python 3.12 --from git+https://github.com/All-Hands-AI/OpenHands@$BRANCH_NAME openhands"
+UVX_RUN_COMMAND="uvx --python 3.12 --from git+https://github.com/All-Hands-AI/OpenHands@${BRANCH_NAME} openhands"

 # Get the current PR body
-PR_BODY=$(gh pr view $PR_NUMBER --json body --jq .body)
+PR_BODY=$(gh pr view "$PR_NUMBER" --json body --jq .body)

 # Prepare the new PR body with both commands
 if echo "$PR_BODY" | grep -q "To run this PR locally, use the following command:"; then
-  # For existing PR descriptions, replace the command section
-  NEW_PR_BODY=$(echo "$PR_BODY" | sed "s|To run this PR locally, use the following command:.*\`\`\`|To run this PR locally, use the following command:\n\nGUI with Docker:\n\`\`\`\n$DOCKER_RUN_COMMAND\n\`\`\`\n\nCLI with uvx:\n\`\`\`\n$UVX_RUN_COMMAND\n\`\`\`|s")
+  # For existing PR descriptions, use a more robust approach
+  # Split the PR body at the "To run this PR locally" section and replace everything after it
+  BEFORE_SECTION=$(echo "$PR_BODY" | sed '/To run this PR locally, use the following command:/,$d')
+  NEW_PR_BODY=$(cat <<EOF
+${BEFORE_SECTION}
+
+To run this PR locally, use the following command:
+
+GUI with Docker:
+\`\`\`
+${DOCKER_RUN_COMMAND}
+\`\`\`
+
+CLI with uvx:
+\`\`\`
+${UVX_RUN_COMMAND}
+\`\`\`
+EOF
+)
 else
-  # For new PR descriptions
-  NEW_PR_BODY="${PR_BODY}
+  # For new PR descriptions: use heredoc safely without indentation
+  NEW_PR_BODY=$(cat <<EOF
+$PR_BODY

 ---

@@ -35,15 +55,17 @@ To run this PR locally, use the following command:

 GUI with Docker:
 \`\`\`
-$DOCKER_RUN_COMMAND
+${DOCKER_RUN_COMMAND}
 \`\`\`

 CLI with uvx:
 \`\`\`
-$UVX_RUN_COMMAND
-\`\`\`"
+${UVX_RUN_COMMAND}
+\`\`\`
+EOF
+)
 fi

 # Update the PR description
 echo "Updating PR description with Docker and uvx commands"
-gh pr edit $PR_NUMBER --body "$NEW_PR_BODY"
+gh pr edit "$PR_NUMBER" --body "$NEW_PR_BODY"
--- a/.github/workflows/py-tests.yml
+++ b/.github/workflows/py-tests.yml
@@ -48,11 +48,11 @@ jobs:
      - name: Build Environment
        run: make build
      - name: Run Unit Tests
-        run: poetry run pytest --forked -n auto -svv ./tests/unit
+        run: PYTHONPATH=".:$PYTHONPATH" poetry run pytest --forked -n auto -svv ./tests/unit
      - name: Run Runtime Tests with CLIRuntime
-        run: TEST_RUNTIME=cli poetry run pytest -svv tests/runtime/test_bash.py
+        run: PYTHONPATH=".:$PYTHONPATH" TEST_RUNTIME=cli poetry run pytest -svv tests/runtime/test_bash.py
      - name: Run E2E Tests
-        run: poetry run pytest -svv tests/e2e
+        run: PYTHONPATH=".:$PYTHONPATH" poetry run pytest -svv tests/e2e

  # Run specific Windows python tests
  test-on-windows:
@@ -77,9 +77,11 @@ jobs:
      - name: Run Windows unit tests
        run: poetry run pytest -svv tests/unit/test_windows_bash.py
        env:
+          PYTHONPATH: "."
          DEBUG: "1"
      - name: Run Windows runtime tests with LocalRuntime
        run: $env:TEST_RUNTIME="local"; poetry run pytest -svv tests/runtime/test_bash.py
        env:
+          PYTHONPATH: "."
          TEST_RUNTIME: local
          DEBUG: "1"
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -12,11 +12,11 @@ jobs:
    steps:
      - uses: actions/stale@v9
        with:
-          stale-issue-message: 'This issue is stale because it has been open for 30 days with no activity. Remove stale label or comment or this will be closed in 7 days.'
-          stale-pr-message: 'This PR is stale because it has been open for 30 days with no activity. Remove stale label or comment or this will be closed in 7 days.'
-          days-before-stale: 30
+          stale-issue-message: 'This issue is stale because it has been open for 40 days with no activity. Remove the stale label or leave a comment, otherwise it will be closed in 10 days.'
+          stale-pr-message: 'This PR is stale because it has been open for 40 days with no activity. Remove the stale label or leave a comment, otherwise it will be closed in 10 days.'
+          days-before-stale: 40
          exempt-issue-labels: 'roadmap'
-          close-issue-message: 'This issue was closed because it has been stalled for over 30 days with no activity.'
-          close-pr-message: 'This PR was closed because it has been stalled for over 30 days with no activity.'
-          days-before-close: 7
+          close-issue-message: 'This issue was automatically closed due to 50 days of inactivity. We do this to help keep the issues somewhat manageable and focus on active issues.'
+          close-pr-message: 'This PR was closed because it had no activity for 50 days. If you feel this was closed in error, and you would like to continue the PR, please resubmit or let us know.'
+          days-before-close: 10
          operations-per-run: 150
--- a/.openhands/setup.sh
+++ b/.openhands/setup.sh
@@ -1,13 +1,17 @@
 #! /bin/bash

-echo "Setting up the environment..."
+echo "🚀 Welcome to OpenHands! Let's get your development environment ready..."

 # Install pre-commit package
+echo "📦 Installing pre-commit to help maintain code quality..."
 python -m pip install pre-commit

 # Install pre-commit hooks if .git directory exists
 if [ -d ".git" ]; then
-    echo "Installing pre-commit hooks..."
+    echo "🔧 Setting up pre-commit hooks to keep your code clean..."
    pre-commit install
    make install-pre-commit-hooks
+    echo ""
+    echo "🎉 Setup complete! Your OpenHands development environment is ready!"
+    echo "💡 You can now start contributing to OpenHands. Happy coding! 🚀"
 fi
--- a/Makefile
+++ b/Makefile
@@ -23,16 +23,16 @@ RESET=$(shell tput -Txterm sgr0)

 # Build
 build:
-	@echo "$(GREEN)Building project...$(RESET)"
+	@echo "$(GREEN)🚀 Building OpenHands project...$(RESET)"
 	@$(MAKE) -s check-dependencies
 	@$(MAKE) -s install-python-dependencies
 	@$(MAKE) -s install-frontend-dependencies
 	@$(MAKE) -s install-pre-commit-hooks
 	@$(MAKE) -s build-frontend
-	@echo "$(GREEN)Build completed successfully.$(RESET)"
+	@echo "$(GREEN)🎉 Build completed successfully! You're ready to go!$(RESET)"

 check-dependencies:
-	@echo "$(YELLOW)Checking dependencies...$(RESET)"
+	@echo "$(YELLOW)🔍 Checking your development environment...$(RESET)"
 	@$(MAKE) -s check-system
 	@$(MAKE) -s check-python
 	@$(MAKE) -s check-npm
@@ -42,7 +42,7 @@ ifeq ($(INSTALL_DOCKER),)
 endif
 	@$(MAKE) -s check-poetry
 	@$(MAKE) -s check-tmux
-	@echo "$(GREEN)Dependencies checked successfully.$(RESET)"
+	@echo "$(GREEN)✅ All dependencies look great!$(RESET)"

 check-system:
 	@echo "$(YELLOW)Checking system...$(RESET)"
@@ -62,11 +62,11 @@ check-system:
 	fi

 check-python:
-	@echo "$(YELLOW)Checking Python installation...$(RESET)"
+	@echo "$(YELLOW)🐍 Checking Python installation...$(RESET)"
 	@if command -v python$(PYTHON_VERSION) > /dev/null; then \
-		echo "$(BLUE)$(shell python$(PYTHON_VERSION) --version) is already installed.$(RESET)"; \
+		echo "$(BLUE)✅ Great! $(shell python$(PYTHON_VERSION) --version) is ready to go.$(RESET)"; \
 	else \
-		echo "$(RED)Python $(PYTHON_VERSION) is not installed. Please install Python $(PYTHON_VERSION) to continue.$(RESET)"; \
+		echo "$(RED)❌ Oops! Python $(PYTHON_VERSION) is not installed. Please install Python $(PYTHON_VERSION) to continue.$(RESET)"; \
 		exit 1; \
 	fi

@@ -117,76 +117,76 @@ check-tmux:
 	fi

 check-poetry:
-	@echo "$(YELLOW)Checking Poetry installation...$(RESET)"
+	@echo "$(YELLOW)📝 Checking Poetry installation...$(RESET)"
 	@if command -v poetry > /dev/null; then \
 		POETRY_VERSION=$(shell poetry --version 2>&1 | sed -E 's/Poetry \(version ([0-9]+\.[0-9]+\.[0-9]+)\)/\1/'); \
 		IFS='.' read -r -a POETRY_VERSION_ARRAY <<< "$$POETRY_VERSION"; \
 		if [ $${POETRY_VERSION_ARRAY[0]} -gt 1 ] || ([ $${POETRY_VERSION_ARRAY[0]} -eq 1 ] && [ $${POETRY_VERSION_ARRAY[1]} -ge 8 ]); then \
-			echo "$(BLUE)$(shell poetry --version) is already installed.$(RESET)"; \
+			echo "$(BLUE)✅ Perfect! $(shell poetry --version) is ready to manage your dependencies.$(RESET)"; \
 		else \
-			echo "$(RED)Poetry 1.8 or later is required. You can install poetry by running the following command, then adding Poetry to your PATH:"; \
+			echo "$(RED)❌ We need Poetry 1.8 or later. You can install it by running:"; \
 			echo "$(RED) curl -sSL https://install.python-poetry.org | python$(PYTHON_VERSION) -$(RESET)"; \
-			echo "$(RED)More detail here: https://python-poetry.org/docs/#installing-with-the-official-installer$(RESET)"; \
+			echo "$(RED)📖 More details: https://python-poetry.org/docs/#installing-with-the-official-installer$(RESET)"; \
 			exit 1; \
 		fi; \
 	else \
-		echo "$(RED)Poetry is not installed. You can install poetry by running the following command, then adding Poetry to your PATH:"; \
+		echo "$(RED)❌ Poetry is not installed. You can install it by running:"; \
 		echo "$(RED) curl -sSL https://install.python-poetry.org | python$(PYTHON_VERSION) -$(RESET)"; \
-		echo "$(RED)More detail here: https://python-poetry.org/docs/#installing-with-the-official-installer$(RESET)"; \
+		echo "$(RED)📖 More details: https://python-poetry.org/docs/#installing-with-the-official-installer$(RESET)"; \
 		exit 1; \
 	fi

 install-python-dependencies:
-	@echo "$(GREEN)Installing Python dependencies...$(RESET)"
+	@echo "$(GREEN)📦 Installing Python dependencies...$(RESET)"
 	@if [ -z "${TZ}" ]; then \
-		echo "Defaulting TZ (timezone) to UTC"; \
+		echo "🌍 Defaulting timezone to UTC"; \
 		export TZ="UTC"; \
 	fi
 	poetry env use python$(PYTHON_VERSION)
 	@if [ "$(shell uname)" = "Darwin" ]; then \
-		echo "$(BLUE)Installing chroma-hnswlib...$(RESET)"; \
+		echo "$(BLUE)🍎 Installing macOS-specific dependencies...$(RESET)"; \
 		export HNSWLIB_NO_NATIVE=1; \
 		poetry run pip install chroma-hnswlib; \
 	fi
 	@if [ -n "${POETRY_GROUP}" ]; then \
-		echo "Installing only POETRY_GROUP=${POETRY_GROUP}"; \
+		echo "📋 Installing specific dependency group: ${POETRY_GROUP}"; \
 		poetry install --only $${POETRY_GROUP}; \
 	else \
 		poetry install --with dev,test,runtime; \
 	fi
 	@if [ "${INSTALL_PLAYWRIGHT}" != "false" ] && [ "${INSTALL_PLAYWRIGHT}" != "0" ]; then \
 		if [ -f "/etc/manjaro-release" ]; then \
-			echo "$(BLUE)Detected Manjaro Linux. Installing Playwright dependencies...$(RESET)"; \
+			echo "$(BLUE)🐧 Detected Manjaro Linux. Installing browser automation tools...$(RESET)"; \
 			poetry run pip install playwright; \
 			poetry run playwright install chromium; \
 		else \
 			if [ ! -f cache/playwright_chromium_is_installed.txt ]; then \
-				echo "Running playwright install --with-deps chromium..."; \
+				echo "🌐 Installing browser automation tools..."; \
 				poetry run playwright install --with-deps chromium; \
 				mkdir -p cache; \
 				touch cache/playwright_chromium_is_installed.txt; \
 			else \
-				echo "Setup already done. Skipping playwright installation."; \
+				echo "✅ Browser tools already set up. Skipping installation."; \
 			fi \
 		fi \
 	else \
-		echo "Skipping Playwright installation (INSTALL_PLAYWRIGHT=${INSTALL_PLAYWRIGHT})."; \
+		echo "⏭️  Skipping browser automation setup (INSTALL_PLAYWRIGHT=${INSTALL_PLAYWRIGHT})."; \
 	fi
-	@echo "$(GREEN)Python dependencies installed successfully.$(RESET)"
+	@echo "$(GREEN)🎉 Python dependencies installed successfully!$(RESET)"

 install-frontend-dependencies: check-npm check-nodejs
-	@echo "$(YELLOW)Setting up frontend environment...$(RESET)"
-	@echo "$(YELLOW)Detect Node.js version...$(RESET)"
+	@echo "$(YELLOW)🎨 Setting up frontend environment...$(RESET)"
+	@echo "$(YELLOW)🔍 Detecting Node.js version...$(RESET)"
 	@cd frontend && node ./scripts/detect-node-version.js
-	echo "$(BLUE)Installing frontend dependencies with npm...$(RESET)"
+	echo "$(BLUE)📦 Installing frontend dependencies with npm...$(RESET)"
 	@cd frontend && npm install
-	@echo "$(GREEN)Frontend dependencies installed successfully.$(RESET)"
+	@echo "$(GREEN)✨ Frontend dependencies installed successfully!$(RESET)"

 install-pre-commit-hooks: check-python check-poetry install-python-dependencies
-	@echo "$(YELLOW)Installing pre-commit hooks...$(RESET)"
+	@echo "$(YELLOW)🔧 Installing pre-commit hooks...$(RESET)"
 	@git config --unset-all core.hooksPath || true
 	@poetry run pre-commit install --config $(PRE_COMMIT_CONFIG_PATH)
-	@echo "$(GREEN)Pre-commit hooks installed successfully.$(RESET)"
+	@echo "$(GREEN)✨ Pre-commit hooks installed successfully!$(RESET)"

 lint-backend: install-pre-commit-hooks
 	@echo "$(YELLOW)Running linters...$(RESET)"
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -0,0 +1,15 @@
+# Security Policy
+
+**Please send all vulnerability reports to contact@all-hands.dev in addition to opening a security advisory on GitHub.**
+
+## Security/Bugfix Versions
+Security and bug fixes are generally provided only for the most recent version of OpenHands. Fixes are released either as part of the next minor version or as an on-demand patch version.
+Security fixes are given priority and might be enough to cause a new version to be released.
+
+## Reporting a Vulnerability
+We encourage responsible disclosure of security vulnerabilities. If you find something suspicious, we encourage and appreciate your report!
+
+### Ways to report
+In order for the vulnerability reports to reach maintainers as soon as possible, the preferred way is to use the "Report a vulnerability" button under the "Security" tab of the associated GitHub project. This creates a private communication channel between the reporter and the maintainers.
+
+In addition, please also reach out to the All Hands AI security team at contact@all-hands.dev.
--- a/build_vscode.py
+++ b/build_vscode.py
@@ -55,11 +55,11 @@ def build_vscode_extension():
            print(f'--- Using pre-built VS Code extension: {vsix_path} ---')
        return

-    print(f'--- Building VS Code extension in {VSCODE_EXTENSION_DIR} ---')
+    print(f'🔨 Building VS Code extension in {VSCODE_EXTENSION_DIR}')

    try:
        # Ensure npm dependencies are installed
-        print('--- Running npm install for VS Code extension ---')
+        print('📦 Installing dependencies for VS Code extension...')
        subprocess.run(
            ['npm', 'install'],
            cwd=VSCODE_EXTENSION_DIR,
@@ -68,7 +68,7 @@ def build_vscode_extension():
        )

        # Package the extension
-        print(f'--- Packaging VS Code extension ({VSIX_FILENAME}) ---')
+        print(f'📦 Packaging VS Code extension ({VSIX_FILENAME})...')
        subprocess.run(
            ['npm', 'run', 'package-vsix'],
            cwd=VSCODE_EXTENSION_DIR,
@@ -82,14 +82,14 @@ def build_vscode_extension():
                f'VS Code extension package not found after build: {vsix_path}'
            )

-        print(f'--- VS Code extension built successfully: {vsix_path} ---')
+        print(f'🎉 VS Code extension built successfully: {vsix_path}')

    except subprocess.CalledProcessError as e:
-        print(f'--- Warning: Failed to build VS Code extension: {e} ---')
-        print('--- Continuing without building extension ---')
+        print(f'⚠️  Warning: Failed to build VS Code extension: {e}')
+        print('⏭️  Continuing without building extension...')
        if not vsix_path.exists():
-            print('--- Warning: No pre-built VS Code extension found ---')
-            print('--- VS Code extension will not be available ---')
+            print('⚠️  Warning: No pre-built VS Code extension found')
+            print('❌ VS Code extension will not be available')


 def build(setup_kwargs):
@@ -97,7 +97,7 @@ def build(setup_kwargs):
    This function is called by Poetry during the build process.
    `setup_kwargs` is a dictionary that will be passed to `setuptools.setup()`.
    """
-    print('--- Running custom Poetry build script (build_vscode.py) ---')
+    print('🔧 Running custom Poetry build script for VS Code extension...')

    # Build the VS Code extension and place the .vsix file
    build_vscode_extension()
@@ -105,10 +105,10 @@ def build(setup_kwargs):
    # Poetry will handle including files based on pyproject.toml `include` patterns.
    # Ensure openhands/integrations/vscode/*.vsix is included there.

-    print('--- Custom Poetry build script (build_vscode.py) finished ---')
+    print('✅ Custom Poetry build script completed!')


 if __name__ == '__main__':
-    print('Running build_vscode.py directly for testing VS Code extension packaging...')
+    print('🧪 Testing VS Code extension packaging...')
    build_vscode_extension()
-    print('Direct execution of build_vscode.py finished.')
+    print('✅ VS Code extension packaging test completed!')
--- a/docs/usage/how-to/gui-mode.mdx
+++ b/docs/usage/how-to/gui-mode.mdx
@@ -7,6 +7,67 @@ description: High level overview of the Graphical User Interface (GUI) in OpenHa

 - [OpenHands is running](/usage/local-setup)

+## Launching the GUI Server
+
+### Using the CLI Command
+
+You can launch the OpenHands GUI server directly from the command line using the `serve` command:
+
+<Callout type="info">
+**Prerequisites**: You need to have the [OpenHands CLI installed](/usage/how-to/cli-mode) first, OR have `uv` installed and run `uvx --python 3.12 --from openhands-ai openhands serve`. Otherwise, you'll need to use Docker directly (see the [Docker section](#using-docker-directly) below).
+</Callout>
+
+```bash
+openhands serve
+```
+
+This command will:
+- Check that Docker is installed and running
+- Pull the required Docker images
+- Launch the OpenHands GUI server at http://localhost:3000
+- Use the same configuration directory (`~/.openhands`) as the CLI mode
+
+#### Mounting Your Current Directory
+
+To mount your current working directory into the GUI server container, use the `--mount-cwd` flag:
+
+```bash
+openhands serve --mount-cwd
+```
+
+This is useful when you want to work on files in your current directory through the GUI. The directory will be mounted at `/workspace` inside the container.
+
+#### Using GPU Support
+
+If you have NVIDIA GPUs and want to make them available to the OpenHands container, use the `--gpu` flag:
+
+```bash
+openhands serve --gpu
+```
+
+This will enable GPU support via nvidia-docker, mounting all available GPUs into the container. You can combine this with other flags:
+
+```bash
+openhands serve --gpu --mount-cwd
+```
+
+**Prerequisites for GPU support:**
+- NVIDIA GPU drivers must be installed on your host system
+- [NVIDIA Container Toolkit (nvidia-docker2)](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) must be installed and configured
+
+#### Requirements
+
+Before using the `openhands serve` command, ensure that:
+- Docker is installed and running on your system
+- You have internet access to pull the required Docker images
+- Port 3000 is available on your system
+
+The CLI will automatically check these requirements and provide helpful error messages if anything is missing.
+
+### Using Docker Directly
+
+Alternatively, you can run the GUI server using Docker directly. See the [local setup guide](/usage/local-setup) for detailed Docker instructions.
+
 ## Overview

 ### Initial Setup
--- a/docs/usage/llms/llms.mdx
+++ b/docs/usage/llms/llms.mdx
@@ -18,7 +18,7 @@ Based on these findings and community feedback, these are the latest models that
 ### Cloud / API-Based Models

 - [anthropic/claude-sonnet-4-20250514](https://www.anthropic.com/api) (recommended)
-- [openai/o4-mini](https://openai.com/index/introducing-o3-and-o4-mini/)
+- [openai/gpt-5-2025-08-07](https://openai.com/api/) (recommended)
 - [gemini/gemini-2.5-pro](https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/)
 - [deepseek/deepseek-chat](https://api-docs.deepseek.com/)
 - [moonshot/kimi-k2-0711-preview](https://platform.moonshot.ai/docs/pricing/chat#generation-model-kimi-k2)
--- a/docs/usage/llms/openhands-llms.mdx
+++ b/docs/usage/llms/openhands-llms.mdx
@@ -32,4 +32,4 @@ When running OpenHands, you'll need to set the following in the OpenHands UI thr

 Pricing follows official API provider rates. [You can view model prices here.](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)

-For `qwen3-coder-480b`, we charge the cheapest FP8 rate available on openrouter: $0.4 per million input tokens and $1.6 per million output tokens.
+For `qwen3-coder-480b`, we charge the cheapest FP8 rate available on openrouter: \$0.4 per million input tokens and \$1.6 per million output tokens.
--- a/docs/usage/local-setup.mdx
+++ b/docs/usage/local-setup.mdx
@@ -66,6 +66,30 @@ A system with a modern processor and a minimum of **4GB RAM** is recommended to

 ### Start the App

+#### Option 1: Using the CLI Launcher (Recommended)
+
+If you have Python 3.12+ installed, you can use the CLI launcher for a simpler experience:
+
+```bash
+# Install OpenHands
+pip install openhands-ai
+
+# Launch the GUI server
+openhands serve
+
+# Or with GPU support (requires nvidia-docker)
+openhands serve --gpu
+
+# Or with current directory mounted
+openhands serve --mount-cwd
+```
+
+Or using `uvx --python 3.12 --from openhands-ai openhands serve` if you have [uv](https://docs.astral.sh/uv/) installed.
+
+This will automatically handle Docker requirements checking, image pulling, and launching the GUI server. The `--gpu` flag enables GPU support via nvidia-docker, and `--mount-cwd` mounts your current directory into the container.
+
+#### Option 2: Using Docker Directly
+
 ```bash
 docker pull docker.all-hands.dev/all-hands-ai/runtime:0.51-nikolaik

--- a/evaluation/benchmarks/EDA/run_infer.py
+++ b/evaluation/benchmarks/EDA/run_infer.py
@@ -18,8 +18,8 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    OpenHandsConfig,
+    get_evaluation_parser,
    get_llm_config_arg,
-    get_parser,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -172,7 +172,7 @@ def process_instance(


 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '--answerer_model', '-a', default='gpt-3.5-turbo', help='answerer model'
    )
--- a/evaluation/benchmarks/commit0/run_infer.py
+++ b/evaluation/benchmarks/commit0/run_infer.py
@@ -26,8 +26,8 @@ from openhands.controller.state.state import State
 from openhands.core.config import (
    AgentConfig,
    OpenHandsConfig,
+    get_evaluation_parser,
    get_llm_config_arg,
-    get_parser,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -525,7 +525,7 @@ def commit0_setup(dataset: pd.DataFrame, repo_split: str) -> pd.DataFrame:


 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '--dataset',
        type=str,
--- a/evaluation/benchmarks/gaia/run_infer.py
+++ b/evaluation/benchmarks/gaia/run_infer.py
@@ -31,8 +31,8 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    OpenHandsConfig,
+    get_evaluation_parser,
    get_llm_config_arg,
-    get_parser,
    load_from_toml,
 )
 from openhands.core.config.utils import get_agent_config_arg
@@ -294,7 +294,7 @@ Here is the task:


 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '--level',
        type=str,
--- a/evaluation/benchmarks/gorilla/run_infer.py
+++ b/evaluation/benchmarks/gorilla/run_infer.py
@@ -20,8 +20,8 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    OpenHandsConfig,
+    get_evaluation_parser,
    get_llm_config_arg,
-    get_parser,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -134,7 +134,7 @@ def process_instance(


 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '--hubs',
        type=str,
--- a/evaluation/benchmarks/gpqa/run_infer.py
+++ b/evaluation/benchmarks/gpqa/run_infer.py
@@ -38,8 +38,8 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    OpenHandsConfig,
+    get_evaluation_parser,
    get_llm_config_arg,
-    get_parser,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -312,7 +312,7 @@ Ok now its time to start solving the question. Good luck!


 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    # data split must be one of 'gpqa_main', 'gqpa_diamond', 'gpqa_experts', 'gpqa_extended'
    parser.add_argument(
        '--data-split',
--- a/evaluation/benchmarks/lca_ci_build_repair/eval_infer.py
+++ b/evaluation/benchmarks/lca_ci_build_repair/eval_infer.py
@@ -21,7 +21,7 @@ from evaluation.utils.shared import (
 from openhands.core.config import (
    LLMConfig,
    OpenHandsConfig,
-    get_parser,
+    get_evaluation_parser,
    load_openhands_config,
 )
 from openhands.core.logger import openhands_logger as logger
@@ -167,7 +167,7 @@ def process_predictions(predictions_path: str):


 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '-s',
        '--eval-split',
--- a/evaluation/benchmarks/lca_ci_build_repair/run_infer.py
+++ b/evaluation/benchmarks/lca_ci_build_repair/run_infer.py
@@ -30,8 +30,8 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    OpenHandsConfig,
+    get_evaluation_parser,
    get_llm_config_arg,
-    get_parser,
    load_openhands_config,
 )
 from openhands.core.logger import openhands_logger as logger
@@ -358,7 +358,7 @@ Be thorough in your exploration, testing, and reasoning. It's fine if your think


 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '-s',
        '--eval-split',
--- a/evaluation/benchmarks/logic_reasoning/run_infer.py
+++ b/evaluation/benchmarks/logic_reasoning/run_infer.py
@@ -18,8 +18,8 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    OpenHandsConfig,
+    get_evaluation_parser,
    get_llm_config_arg,
-    get_parser,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -267,7 +267,7 @@ def process_instance(


 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '--dataset',
        type=str,
--- a/evaluation/benchmarks/mint/run_infer.py
+++ b/evaluation/benchmarks/mint/run_infer.py
@@ -23,8 +23,8 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    OpenHandsConfig,
+    get_evaluation_parser,
    get_llm_config_arg,
-    get_parser,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -229,7 +229,7 @@ def process_instance(


 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()

    SUBSETS = [
        # Eurus subset: https://arxiv.org/abs/2404.02078
--- a/evaluation/benchmarks/ml_bench/run_analysis.py
+++ b/evaluation/benchmarks/ml_bench/run_analysis.py
@@ -4,7 +4,11 @@ import pprint

 import tqdm

-from openhands.core.config import get_llm_config_arg, get_parser, load_openhands_config
+from openhands.core.config import (
+    get_evaluation_parser,
+    get_llm_config_arg,
+    load_openhands_config,
+)
 from openhands.core.logger import openhands_logger as logger
 from openhands.llm.llm import LLM

@@ -111,7 +115,7 @@ def classify_error(llm: LLM, failed_case: dict) -> str:


 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '--json_file_path',
        type=str,
--- a/evaluation/benchmarks/ml_bench/run_infer.py
+++ b/evaluation/benchmarks/ml_bench/run_infer.py
@@ -34,8 +34,8 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    OpenHandsConfig,
+    get_evaluation_parser,
    get_llm_config_arg,
-    get_parser,
    load_openhands_config,
 )
 from openhands.core.logger import openhands_logger as logger
@@ -273,7 +273,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =


 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '-s',
        '--eval-split',
--- a/evaluation/benchmarks/multi_swe_bench/eval_infer.py
+++ b/evaluation/benchmarks/multi_swe_bench/eval_infer.py
@@ -30,7 +30,7 @@ from evaluation.utils.shared import (
 from openhands.core.config import (
    LLMConfig,
    OpenHandsConfig,
-    get_parser,
+    get_evaluation_parser,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime
@@ -323,7 +323,7 @@ def process_instance(


 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '--input-file',
        type=str,
--- a/evaluation/benchmarks/multi_swe_bench/run_infer.py
+++ b/evaluation/benchmarks/multi_swe_bench/run_infer.py
@@ -32,8 +32,8 @@ from openhands.controller.state.state import State
 from openhands.core.config import (
    AgentConfig,
    OpenHandsConfig,
+    get_evaluation_parser,
    get_llm_config_arg,
-    get_parser,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -772,7 +772,7 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:

 if __name__ == '__main__':
    # pdb.set_trace()
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '--dataset',
        type=str,
--- a/evaluation/benchmarks/scienceagentbench/run_infer.py
+++ b/evaluation/benchmarks/scienceagentbench/run_infer.py
@@ -21,8 +21,8 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    OpenHandsConfig,
+    get_evaluation_parser,
    get_llm_config_arg,
-    get_parser,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -239,7 +239,7 @@ If the program uses some packages that are incompatible, please figure out alter


 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '--use-knowledge',
        type=str,
--- a/evaluation/benchmarks/swe_bench/README.md
+++ b/evaluation/benchmarks/swe_bench/README.md
@@ -183,24 +183,7 @@ The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_be
 - `report.json`: a JSON file that contains keys like `"resolved_ids"` pointing to instance IDs that are resolved by the agent.
 - `logs/`: a directory of test logs

-### Run evaluation with `RemoteRuntime`

-OpenHands Remote Runtime is currently in beta (read [here](https://runtime.all-hands.dev/) for more details), it allows you to run rollout in parallel in the cloud, so you don't need a powerful machine to run evaluation.
-Fill out [this form](https://docs.google.com/forms/d/e/1FAIpQLSckVz_JFwg2_mOxNZjCtr7aoBFI2Mwdan3f75J_TrdMS1JV2g/viewform) to apply if you want to try this out!
-
-```bash
-./evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers]
-
-# Example - This evaluates patches generated by CodeActAgent on Llama-3.1-70B-Instruct-Turbo on "princeton-nlp/SWE-bench_Lite"'s test set, with 16 number of workers running in parallel
-ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
-evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/swe-bench-lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_100_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
-```
-
-To clean-up all existing runtimes that you've already started, run:
-
-```bash
-ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/utils/scripts/cleanup_remote_runtime.sh
-```

 ## SWT-Bench Evaluation

--- a/evaluation/benchmarks/swe_bench/eval_infer.py
+++ b/evaluation/benchmarks/swe_bench/eval_infer.py
@@ -26,7 +26,7 @@ from evaluation.utils.shared import (
 from openhands.core.config import (
    LLMConfig,
    OpenHandsConfig,
-    get_parser,
+    get_evaluation_parser,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime
@@ -353,7 +353,7 @@ def process_instance(


 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '--input-file',
        type=str,
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -43,8 +43,8 @@ from openhands.controller.state.state import State
 from openhands.core.config import (
    AgentConfig,
    OpenHandsConfig,
+    get_evaluation_parser,
    get_llm_config_arg,
-    get_parser,
 )
 from openhands.core.config.condenser_config import NoOpCondenserConfig
 from openhands.core.config.utils import get_condenser_config_arg
@@ -732,7 +732,7 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:


 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '--dataset',
        type=str,
--- a/evaluation/benchmarks/swe_bench/run_infer_interact.py
+++ b/evaluation/benchmarks/swe_bench/run_infer_interact.py
@@ -28,8 +28,8 @@ from evaluation.utils.shared import (
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
+    get_evaluation_parser,
    get_llm_config_arg,
-    get_parser,
 )
 from openhands.core.config.condenser_config import NoOpCondenserConfig
 from openhands.core.config.utils import get_condenser_config_arg
@@ -201,7 +201,7 @@ def process_instance(


 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '--dataset',
        type=str,
--- a/evaluation/benchmarks/swe_bench/run_localize.py
+++ b/evaluation/benchmarks/swe_bench/run_localize.py
@@ -31,8 +31,8 @@ from openhands.controller.state.state import State
 from openhands.core.config import (
    AgentConfig,
    OpenHandsConfig,
+    get_evaluation_parser,
    get_llm_config_arg,
-    get_parser,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -644,7 +644,7 @@ SWEGYM_EXCLUDE_IDS = [
 ]

 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '--dataset',
        type=str,
--- a/evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh
@@ -1,46 +0,0 @@
-#!/usr/bin/env bash
-set -eo pipefail
-
-INPUT_FILE=$1
-NUM_WORKERS=$2
-DATASET=$3
-SPLIT=$4
-
-if [ -z "$INPUT_FILE" ]; then
-  echo "INPUT_FILE not specified (should be a path to a jsonl file)"
-  exit 1
-fi
-
-if [ -z "$DATASET" ]; then
-  echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
-  DATASET="princeton-nlp/SWE-bench_Lite"
-fi
-
-if [ -z "$SPLIT" ]; then
-  echo "SPLIT not specified, use default test"
-  SPLIT="test"
-fi
-
-if [ -z "$NUM_WORKERS" ]; then
-  echo "NUM_WORKERS not specified, use default 1"
-  NUM_WORKERS=1
-fi
-
-echo "... Evaluating on $INPUT_FILE ..."
-
-COMMAND="poetry run python evaluation/benchmarks/swe_bench/eval_infer.py \
-  --eval-num-workers $NUM_WORKERS \
-  --input-file $INPUT_FILE \
-  --dataset $DATASET \
-  --split $SPLIT"
-
-if [ -n "$EVAL_LIMIT" ]; then
-  echo "EVAL_LIMIT: $EVAL_LIMIT"
-  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
-fi
-
-# Run the command
-eval $COMMAND
-
-# update the output with evaluation results
-poetry run python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $INPUT_FILE
--- a/evaluation/benchmarks/testgeneval/NOTES.md
+++ b/evaluation/benchmarks/testgeneval/NOTES.md
@@ -5,8 +5,7 @@ pynguin_ids = ['pydata__xarray-6548-16541', 'pydata__xarray-7003-16557', 'pydata
 ids = ['pydata__xarray-3114-16452', 'pydata__xarray-3151-16453', 'pydata__xarray-3156-16454', 'pydata__xarray-3239-16456', 'pydata__xarray-3239-16457', 'pydata__xarray-3239-16458', 'pydata__xarray-3302-16459', 'pydata__xarray-3364-16461', 'pydata__xarray-3677-16471', 'pydata__xarray-3905-16478', 'pydata__xarray-4182-16484', 'pydata__xarray-4248-16486', 'pydata__xarray-4339-16487', 'pydata__xarray-4419-16488', 'pydata__xarray-4629-16492', 'pydata__xarray-4750-16496', 'pydata__xarray-4802-16505', 'pydata__xarray-4966-16515', 'pydata__xarray-4994-16516', 'pydata__xarray-5033-16517', 'pydata__xarray-5126-16518', 'pydata__xarray-5126-16519', 'pydata__xarray-5131-16520', 'pydata__xarray-5365-16529', 'pydata__xarray-5455-16530', 'pydata__xarray-5662-16532', 'pydata__xarray-5731-16534', 'pydata__xarray-6135-16535', 'pydata__xarray-6135-16536', 'pydata__xarray-6386-16537', 'pydata__xarray-6394-16538', 'pydata__xarray-6400-16539', 'pydata__xarray-6461-16540', 'pydata__xarray-6548-16541', 'pydata__xarray-6599-16543', 'pydata__xarray-6601-16544', 'pydata__xarray-6882-16548', 'pydata__xarray-6889-16549', 'pydata__xarray-7003-16557', 'pydata__xarray-7147-16571', 'pydata__xarray-7150-16572', 'pydata__xarray-7203-16577', 'pydata__xarray-7229-16578', 'pydata__xarray-7393-16581', 'pydata__xarray-7400-16582']


-Command eval (our approach):
-poetry run ./evaluation/benchmarks/testgeneval/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/kjain14__testgeneval-test/CodeActAgent/gpt-4o_maxiter_25_N_v0.20.0-no-hint-run_1/output.jsonl 10 kjain14/testgeneval test true
+

 Command run (our approach):
 ./evaluation/benchmarks/testgeneval/scripts/run_infer.sh llm.eval_gpt HEAD CodeActAgent -1 25 10 kjain14/testgeneval test 1 ../TestGenEval/results/testgeneval/preds/gpt-4o-2024-08-06__testgeneval__0.2__test.jsonl
--- a/evaluation/benchmarks/testgeneval/eval_infer.py
+++ b/evaluation/benchmarks/testgeneval/eval_infer.py
@@ -41,7 +41,7 @@ from evaluation.utils.shared import (
    reset_logger_for_multiprocessing,
    run_evaluation,
 )
-from openhands.core.config import OpenHandsConfig, SandboxConfig, get_parser
+from openhands.core.config import OpenHandsConfig, SandboxConfig, get_evaluation_parser
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime
 from openhands.events.action import CmdRunAction
@@ -484,7 +484,7 @@ def count_and_log_fields(evaluated_predictions, fields, key):


 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '--input-file', type=str, required=True, help='Path to input predictions file'
    )
--- a/evaluation/benchmarks/testgeneval/run_infer.py
+++ b/evaluation/benchmarks/testgeneval/run_infer.py
@@ -37,8 +37,8 @@ from openhands.core.config import (
    AgentConfig,
    OpenHandsConfig,
    SandboxConfig,
+    get_evaluation_parser,
    get_llm_config_arg,
-    get_parser,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -491,7 +491,7 @@ def prepare_dataset_pre(dataset: pd.DataFrame, filter_column: str) -> pd.DataFra


 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '--dataset',
        type=str,
--- a/evaluation/benchmarks/the_agent_company/run_infer.py
+++ b/evaluation/benchmarks/the_agent_company/run_infer.py
@@ -18,8 +18,8 @@ from openhands.core.config import (
    LLMConfig,
    OpenHandsConfig,
    get_agent_config_arg,
+    get_evaluation_parser,
    get_llm_config_arg,
-    get_parser,
 )
 from openhands.core.config.agent_config import AgentConfig
 from openhands.core.logger import openhands_logger as logger
@@ -197,7 +197,7 @@ def run_evaluator(


 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '--task-image-name',
        type=str,
--- a/evaluation/benchmarks/toolqa/run_infer.py
+++ b/evaluation/benchmarks/toolqa/run_infer.py
@@ -19,8 +19,8 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    OpenHandsConfig,
+    get_evaluation_parser,
    get_llm_config_arg,
-    get_parser,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -157,7 +157,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =


 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '--dataset',
        type=str,
--- a/evaluation/benchmarks/visual_swe_bench/run_infer.py
+++ b/evaluation/benchmarks/visual_swe_bench/run_infer.py
@@ -31,8 +31,8 @@ from openhands.controller.state.state import State
 from openhands.core.config import (
    AgentConfig,
    OpenHandsConfig,
+    get_evaluation_parser,
    get_llm_config_arg,
-    get_parser,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -565,7 +565,7 @@ SWEGYM_EXCLUDE_IDS = [
 ]

 if __name__ == '__main__':
-    parser = get_parser()
+    parser = get_evaluation_parser()
    parser.add_argument(
        '--dataset',
        type=str,
--- a/frontend/tests/api/file-service/file-service.api.test.ts
+++ b/frontend/tests/api/file-service/file-service.api.test.ts
@@ -1,5 +1,5 @@
 import { describe, expect, it } from "vitest";
-import { FileService } from "#/api/file-service/file-service.api";
+import OpenHands from "#/api/open-hands";
 import {
  FILE_VARIANTS_1,
  FILE_VARIANTS_2,
@@ -10,20 +10,20 @@ import {
 * You can find the mock handlers in `frontend/src/mocks/file-service-handlers.ts`.
 */

-describe("FileService", () => {
+describe("OpenHands File API", () => {
  it("should get a list of files", async () => {
-    await expect(FileService.getFiles("test-conversation-id")).resolves.toEqual(
+    await expect(OpenHands.getFiles("test-conversation-id")).resolves.toEqual(
      FILE_VARIANTS_1,
    );

    await expect(
-      FileService.getFiles("test-conversation-id-2"),
+      OpenHands.getFiles("test-conversation-id-2"),
    ).resolves.toEqual(FILE_VARIANTS_2);
  });

  it("should get content of a file", async () => {
    await expect(
-      FileService.getFile("test-conversation-id", "file1.txt"),
+      OpenHands.getFile("test-conversation-id", "file1.txt"),
    ).resolves.toEqual("Content of file1.txt");
  });
 });
--- a/frontend/tests/components/features/chat/launch-microagent-modal.test.tsx
+++ b/frontend/tests/components/features/chat/launch-microagent-modal.test.tsx
@@ -3,8 +3,6 @@ import { afterEach, describe, expect, it, vi } from "vitest";
 import userEvent from "@testing-library/user-event";
 import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
 import { LaunchMicroagentModal } from "#/components/features/chat/microagent/launch-microagent-modal";
-import { MemoryService } from "#/api/memory-service/memory-service.api";
-import { FileService } from "#/api/file-service/file-service.api";
 import { I18nKey } from "#/i18n/declaration";

 vi.mock("react-router", async () => ({
--- a/frontend/tests/utils/extract-model-and-provider.test.ts
+++ b/frontend/tests/utils/extract-model-and-provider.test.ts
@@ -82,5 +82,11 @@ describe("extractModelAndProvider", () => {
      model: "claude-opus-4-20250514",
      separator: "/",
    });
+
+    expect(extractModelAndProvider("claude-opus-4-1-20250805")).toEqual({
+      provider: "anthropic",
+      model: "claude-opus-4-1-20250805",
+      separator: "/",
+    });
  });
 });
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -1,44 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-  <head>
-    <meta charset="utf-8" />
-    <link rel="apple-touch-icon" sizes="180x180" href="/apple-touch-icon.png">
-    <link rel="icon" type="image/png" sizes="32x32" href="/favicon-32x32.png">
-    <link rel="icon" type="image/png" sizes="16x16" href="/favicon-16x16.png">
-    <link rel="manifest" href="/site.webmanifest">
-    <link rel="mask-icon" href="/safari-pinned-tab.svg" color="#5bbad5">
-    <meta name="msapplication-TileColor" content="#da532c">
-    <meta name="theme-color" content="#ffffff">
-    <meta name="viewport" content="width=device-width, initial-scale=1" />
-    <meta name="theme-color" content="#000000" />
-    <meta
-      name="description"
-      content="OpenHands: Code Less, Make More"
-    />
-    <!--
-      Notice the use of %PUBLIC_URL% in the tags above.
-      It will be replaced with the URL of the `public` folder during the build.
-      Only files inside the `public` folder can be referenced from the HTML.
-
-      Unlike "/favicon.ico" or "favicon.ico", "%PUBLIC_URL%/favicon.ico" will
-      work correctly both with client-side routing and a non-root public URL.
-      Learn how to configure a non-root public URL by running `npm run build`.
-    -->
-    <title>OpenHands</title>
-  </head>
-  <body>
-    <noscript>You need to enable JavaScript to run this app.</noscript>
-    <div id="root"></div>
-    <!--
-      This HTML file is a template.
-      If you open it directly in the browser, you will see an empty page.
-
-      You can add webfonts, meta tags, or analytics to this file.
-      The build step will place the bundled scripts into the <body> tag.
-
-      To begin the development, run `npm start` or `yarn start`.
-      To create a production bundle, use `npm run build` or `yarn build`.
-    -->
-    <script type="module" src="/src/index.tsx"></script>
-  </body>
-</html>
--- a/frontend/src/api/file-service/file-service.api.ts
+++ b/frontend/src/api/file-service/file-service.api.ts
@@ -1,66 +0,0 @@
-import { openHands } from "../open-hands-axios";
-import { GetFilesResponse, GetFileResponse } from "./file-service.types";
-import { getConversationUrl } from "../conversation.utils";
-import { FileUploadSuccessResponse } from "../open-hands.types";
-
-export class FileService {
-  /**
-   * Retrieve the list of files available in the workspace
-   * @param conversationId ID of the conversation
-   * @param path Path to list files from. If provided, it lists all the files in the given path
-   * @returns List of files available in the given path. If path is not provided, it lists all the files in the workspace
-   */
-  static async getFiles(
-    conversationId: string,
-    path?: string,
-  ): Promise<GetFilesResponse> {
-    const url = `${getConversationUrl(conversationId)}/list-files`;
-    const { data } = await openHands.get<GetFilesResponse>(url, {
-      params: { path },
-    });
-
-    return data;
-  }
-
-  /**
-   * Retrieve the content of a file
-   * @param conversationId ID of the conversation
-   * @param path Full path of the file to retrieve
-   * @returns Code content of the file
-   */
-  static async getFile(conversationId: string, path: string): Promise<string> {
-    const url = `${getConversationUrl(conversationId)}/select-file`;
-    const { data } = await openHands.get<GetFileResponse>(url, {
-      params: { file: path },
-    });
-
-    return data.code;
-  }
-
-  /**
-   * Upload multiple files to the workspace
-   * @param conversationId ID of the conversation
-   * @param files List of files.
-   * @returns list of uploaded files, list of skipped files
-   */
-  static async uploadFiles(
-    conversationId: string,
-    files: File[],
-  ): Promise<FileUploadSuccessResponse> {
-    const formData = new FormData();
-    for (const file of files) {
-      formData.append("files", file);
-    }
-    const url = `${getConversationUrl(conversationId)}/upload-files`;
-    const response = await openHands.post<FileUploadSuccessResponse>(
-      url,
-      formData,
-      {
-        headers: {
-          "Content-Type": "multipart/form-data",
-        },
-      },
-    );
-    return response.data;
-  }
-}
--- a/frontend/src/api/file-service/file-service.types.ts
+++ b/frontend/src/api/file-service/file-service.types.ts
@@ -1,5 +0,0 @@
-export type GetFilesResponse = string[];
-
-export interface GetFileResponse {
-  code: string;
-}
--- a/frontend/src/api/memory-service/memory-service.api.ts
+++ b/frontend/src/api/memory-service/memory-service.api.ts
@@ -1,21 +0,0 @@
-import { openHands } from "../open-hands-axios";
-
-interface GetPromptResponse {
-  status: string;
-  prompt: string;
-}
-
-export class MemoryService {
-  static async getPrompt(
-    conversationId: string,
-    eventId: number,
-  ): Promise<string> {
-    const { data } = await openHands.get<GetPromptResponse>(
-      `/api/conversations/${conversationId}/remember_prompt`,
-      {
-        params: { event_id: eventId },
-      },
-    );
-    return data.prompt;
-  }
-}
--- a/frontend/src/api/open-hands.ts
+++ b/frontend/src/api/open-hands.ts
@@ -15,6 +15,9 @@ import {
  GetMicroagentPromptResponse,
  CreateMicroagent,
  MicroagentContentResponse,
+  FileUploadSuccessResponse,
+  GetFilesResponse,
+  GetFileResponse,
 } from "./open-hands.types";
 import { openHands } from "./open-hands-axios";
 import { ApiSettings, PostApiSettings, Provider } from "#/types/settings";
@@ -618,12 +621,11 @@ class OpenHands {
    conversationId: string,
    eventId: number,
  ): Promise<string> {
-    const { data } = await openHands.get<GetMicroagentPromptResponse>(
-      `/api/conversations/${conversationId}/remember_prompt`,
-      {
-        params: { event_id: eventId },
-      },
-    );
+    const url = `${this.getConversationUrl(conversationId)}/remember-prompt`;
+    const { data } = await openHands.get<GetMicroagentPromptResponse>(url, {
+      params: { event_id: eventId },
+      headers: this.getConversationHeaders(),
+    });

    return data.prompt;
  }
@@ -640,6 +642,69 @@ class OpenHands {
    return data;
  }

+  /**
+   * List files in a conversation's workspace via the `list-files` endpoint.
+   * @param conversationId ID of the conversation
+   * @param path Optional path to list files from; forwarded as the `path` query parameter
+   * @returns The list of file names returned by the server for that path
+   */
+  static async getFiles(
+    conversationId: string,
+    path?: string,
+  ): Promise<GetFilesResponse> {
+    const response = await openHands.get<GetFilesResponse>(
+      `${this.getConversationUrl(conversationId)}/list-files`,
+      { params: { path }, headers: this.getConversationHeaders() },
+    );
+
+    return response.data;
+  }
+
+  /**
+   * Fetch the content of a single file via the `select-file` endpoint.
+   * @param conversationId ID of the conversation
+   * @param path Full path of the file to retrieve; sent as the `file` query parameter
+   * @returns The `code` field of the response, i.e. the file's content
+   */
+  static async getFile(conversationId: string, path: string): Promise<string> {
+    const response = await openHands.get<GetFileResponse>(
+      `${this.getConversationUrl(conversationId)}/select-file`,
+      { params: { file: path }, headers: this.getConversationHeaders() },
+    );
+
+    return response.data.code;
+  }
+
+  /**
+   * Upload one or more files to the workspace via the `upload-files` endpoint.
+   * @param conversationId ID of the conversation
+   * @param files Files to upload; each is appended to the form under the `files` field
+   * @returns The server's upload response (uploaded and skipped files)
+   */
+  static async uploadFiles(
+    conversationId: string,
+    files: File[],
+  ): Promise<FileUploadSuccessResponse> {
+    const payload = new FormData();
+    for (const upload of files) {
+      payload.append("files", upload);
+    }
+
+    // Conversation headers are spread last, so they win on any key clash.
+    const requestHeaders = {
+      "Content-Type": "multipart/form-data",
+      ...this.getConversationHeaders(),
+    };
+    const { data } = await openHands.post<FileUploadSuccessResponse>(
+      `${this.getConversationUrl(conversationId)}/upload-files`,
+      payload,
+      { headers: requestHeaders },
+    );
+    return data;
+  }
+
  /**
   * Get the user installation IDs
   * @param provider The provider to get installation IDs for (github, bitbucket, etc.)
--- a/frontend/src/api/open-hands.types.ts
+++ b/frontend/src/api/open-hands.types.ts
@@ -158,3 +158,9 @@ export interface MicroagentContentResponse {
  git_provider: Provider;
  triggers: string[];
 }
+
+/** Response body of the `list-files` endpoint: one string per listed file. */
+export type GetFilesResponse = string[];
+
+/** Response body of the `select-file` endpoint. */
+export interface GetFileResponse {
+  /** Content of the requested file; this is what `OpenHands.getFile` returns. */
+  code: string;
+}
--- a/frontend/src/hooks/mutation/use-upload-files.ts
+++ b/frontend/src/hooks/mutation/use-upload-files.ts
@@ -1,11 +1,11 @@
 import { useMutation } from "@tanstack/react-query";
-import { FileService } from "#/api/file-service/file-service.api";
+import OpenHands from "#/api/open-hands";

 export const useUploadFiles = () =>
  useMutation({
    mutationKey: ["upload-files"],
    mutationFn: (variables: { conversationId: string; files: File[] }) =>
-      FileService.uploadFiles(variables.conversationId!, variables.files),
+      OpenHands.uploadFiles(variables.conversationId!, variables.files),
    onSuccess: async () => {},
    meta: {
      disableToast: true,
--- a/frontend/src/hooks/query/use-get-microagents.ts
+++ b/frontend/src/hooks/query/use-get-microagents.ts
@@ -1,13 +1,13 @@
 import { useQuery } from "@tanstack/react-query";
 import { useConversationId } from "../use-conversation-id";
-import { FileService } from "#/api/file-service/file-service.api";
+import OpenHands from "#/api/open-hands";

 export const useGetMicroagents = (microagentDirectory: string) => {
  const { conversationId } = useConversationId();

  return useQuery({
    queryKey: ["files", "microagents", conversationId, microagentDirectory],
-    queryFn: () => FileService.getFiles(conversationId!, microagentDirectory),
+    queryFn: () => OpenHands.getFiles(conversationId!, microagentDirectory),
    enabled: !!conversationId,
    select: (data) =>
      data.map((fileName) => fileName.replace(microagentDirectory, "")),
--- a/frontend/src/hooks/query/use-microagent-prompt.ts
+++ b/frontend/src/hooks/query/use-microagent-prompt.ts
@@ -1,5 +1,5 @@
 import { useQuery } from "@tanstack/react-query";
-import { MemoryService } from "#/api/memory-service/memory-service.api";
+import OpenHands from "#/api/open-hands";
 import { useConversationId } from "../use-conversation-id";

 export const useMicroagentPrompt = (eventId: number) => {
@@ -7,7 +7,7 @@ export const useMicroagentPrompt = (eventId: number) => {

  return useQuery({
    queryKey: ["memory", "prompt", conversationId, eventId],
-    queryFn: () => MemoryService.getPrompt(conversationId!, eventId),
+    queryFn: () => OpenHands.getMicroagentPrompt(conversationId!, eventId),
    enabled: !!conversationId,
    staleTime: 1000 * 60 * 5, // 5 minutes
    gcTime: 1000 * 60 * 15, // 15 minutes
--- a/frontend/src/routes/app-settings.tsx
+++ b/frontend/src/routes/app-settings.tsx
@@ -222,7 +222,7 @@ function AppSettingsScreen() {
            className="w-full max-w-[680px]" // Match the width of the language field
          />

-          <div className="border-t border-t-tertiary pt-6 mt-2">
+          <div className="border-t border-t-tertiary pt-6 mt-2 hidden">
            <h3 className="text-lg font-medium mb-4">
              {t(I18nKey.SETTINGS$GIT_SETTINGS)}
            </h3>
--- a/frontend/src/utils/verified-models.ts
+++ b/frontend/src/utils/verified-models.ts
@@ -14,6 +14,7 @@ export const VERIFIED_MODELS = [
  "claude-3-7-sonnet-20250219",
  "claude-sonnet-4-20250514",
  "claude-opus-4-20250514",
+  "claude-opus-4-1-20250805",
  "gemini-2.5-pro",
  "o4-mini",
  "deepseek-chat",
@@ -22,11 +23,13 @@ export const VERIFIED_MODELS = [
  "devstral-medium-2507",
  "kimi-k2-0711-preview",
  "qwen3-coder-480b",
+  "gpt-5-2025-08-07",
 ];

 // LiteLLM does not return OpenAI models with the provider, so we list them here to set them ourselves for consistency
 // (e.g., they return `gpt-4o` instead of `openai/gpt-4o`)
 export const VERIFIED_OPENAI_MODELS = [
+  "gpt-5-2025-08-07",
  "gpt-4o",
  "gpt-4o-mini",
  "gpt-4.1",
@@ -47,6 +50,7 @@ export const VERIFIED_ANTHROPIC_MODELS = [
  "claude-3-7-sonnet-20250219",
  "claude-sonnet-4-20250514",
  "claude-opus-4-20250514",
+  "claude-opus-4-1-20250805",
 ];

 // LiteLLM does not return the compatible Mistral models with the provider, so we list them here to set them ourselves
@@ -61,7 +65,9 @@ export const VERIFIED_MISTRAL_MODELS = [
 // (e.g., they return `claude-sonnet-4-20250514` instead of `openhands/claude-sonnet-4-20250514`)
 export const VERIFIED_OPENHANDS_MODELS = [
  "claude-sonnet-4-20250514",
+  "gpt-5-2025-08-07",
  "claude-opus-4-20250514",
+  "claude-opus-4-1-20250805",
  "gemini-2.5-pro",
  "o3",
  "o4-mini",
--- a/openhands/agenthub/codeact_agent/tools/bash.py
+++ b/openhands/agenthub/codeact_agent/tools/bash.py
@@ -1,3 +1,4 @@
+import re
 import sys

 from litellm import ChatCompletionToolParam, ChatCompletionToolParamFunctionChunk
@@ -37,7 +38,16 @@ _SHORT_BASH_DESCRIPTION = """Execute a bash command in the terminal.

 def refine_prompt(prompt: str):
    if sys.platform == 'win32':
-        return prompt.replace('bash', 'powershell')
+        # On Windows the prompt should talk about PowerShell. Rewrite it in two
+        # case-insensitive passes: the tool name first, then the bare word.
+        # `_` counts as a word character, so `\bbash\b` can never match inside
+        # `execute_bash` (or any other `*_bash` name); only the dedicated
+        # first pass touches the tool name.
+        tool_renamed = re.sub(
+            r'\bexecute_bash\b', 'execute_powershell', prompt, flags=re.IGNORECASE
+        )
+        return re.sub(r'\bbash\b', 'powershell', tool_renamed, flags=re.IGNORECASE)
    return prompt


--- a/openhands/cli/__init__.py
+++ b/openhands/cli/__init__.py
@@ -0,0 +1 @@
+"""OpenHands CLI module."""
--- a/openhands/cli/entry.py
+++ b/openhands/cli/entry.py
@@ -0,0 +1,54 @@
+"""Main entry point for OpenHands CLI with subcommand support."""
+
+import sys
+
+import openhands
+import openhands.cli.suppress_warnings  # noqa: F401
+from openhands.cli.gui_launcher import launch_gui_server
+from openhands.cli.main import run_cli_command
+from openhands.core.config import get_cli_parser
+from openhands.core.config.arg_utils import get_subparser
+
+
+def main():
+    """Run the OpenHands command line, defaulting to the `cli` subcommand.
+
+    Reads and may modify `sys.argv`. Exits through `sys.exit` after printing
+    help or the version, and with status 1 when the parsed command is neither
+    `serve` nor `cli`.
+    """
+    parser = get_cli_parser()
+    argc = len(sys.argv)
+
+    # A lone --help/-h prints the top-level help followed by the `cli` help.
+    if argc == 2 and sys.argv[1] in ('--help', '-h'):
+        print(parser.format_help())
+
+        print('\n' + '=' * 80)
+        print('CLI command help:\n')
+
+        print(get_subparser(parser, 'cli').format_help())
+
+        sys.exit(0)
+
+    # Backward compatibility: when the first argument is not a known
+    # subcommand (or there is none), behave as if `cli` had been typed.
+    known_commands = ('cli', 'serve')
+    if argc == 1 or (argc > 1 and sys.argv[1] not in known_commands):
+        sys.argv.insert(1, 'cli')
+
+    args = parser.parse_args()
+
+    # Not every parsed namespace is guaranteed to carry a `version` attribute.
+    if getattr(args, 'version', False):
+        print(f'OpenHands CLI version: {openhands.get_version()}')
+        sys.exit(0)
+
+    command = args.command
+    if command == 'serve':
+        launch_gui_server(mount_cwd=args.mount_cwd, gpu=args.gpu)
+        return
+    if command == 'cli' or command is None:
+        run_cli_command(args)
+        return
+
+    parser.print_help()
+    sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
--- a/openhands/cli/gui_launcher.py
+++ b/openhands/cli/gui_launcher.py
@@ -0,0 +1,219 @@
+"""GUI launcher for OpenHands CLI."""
+
+import os
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+from prompt_toolkit import print_formatted_text
+from prompt_toolkit.formatted_text import HTML
+
+from openhands import __version__
+
+
+def _format_docker_command_for_logging(cmd: list[str]) -> str:
+    """Format a Docker command as grey prompt_toolkit HTML markup for logging.
+
+    The joined command is XML-escaped so that arguments containing ``&``, ``<``
+    or ``>`` (for example a mounted directory path) produce well-formed markup
+    for the ``HTML(...)`` wrapper that callers apply to the result.
+
+    Args:
+        cmd (list[str]): The Docker command as a list of strings
+
+    Returns:
+        str: The escaped, space-joined command wrapped in a ``<grey>`` tag
+    """
+    from html import escape  # local import keeps the block self-contained
+
+    # quote=False: quotes are legal in XML text nodes, only &, < and > need escaping.
+    cmd_str = escape(' '.join(cmd), quote=False)
+    return f'<grey>Running Docker command: {cmd_str}</grey>'
+
+
+def check_docker_requirements() -> bool:
+    """Verify that the ``docker`` executable is on PATH and ``docker info`` succeeds.
+
+    For each failure a red error line and a grey hint are printed.
+
+    Returns:
+        bool: True when both checks pass, False otherwise.
+    """
+    # Step 1: the executable must be discoverable on PATH.
+    if shutil.which('docker') is None:
+        print_formatted_text(
+            HTML('<ansired>❌ Docker is not installed or not in PATH.</ansired>')
+        )
+        print_formatted_text(
+            HTML(
+                '<grey>Please install Docker first: https://docs.docker.com/get-docker/</grey>'
+            )
+        )
+        return False
+
+    # Step 2: run `docker info` with a 10 second limit.
+    try:
+        probe = subprocess.run(
+            ['docker', 'info'], capture_output=True, text=True, timeout=10
+        )
+    except (subprocess.TimeoutExpired, subprocess.SubprocessError) as e:
+        print_formatted_text(
+            HTML('<ansired>❌ Failed to check Docker status.</ansired>')
+        )
+        print_formatted_text(HTML(f'<grey>Error: {e}</grey>'))
+        return False
+
+    # A non-zero exit status is reported as the daemon not running.
+    if probe.returncode != 0:
+        print_formatted_text(
+            HTML('<ansired>❌ Docker daemon is not running.</ansired>')
+        )
+        print_formatted_text(HTML('<grey>Please start Docker and try again.</grey>'))
+        return False
+
+    return True
+
+
+def ensure_config_dir_exists() -> Path:
+    """Create ``~/.openhands`` if it is missing and return its path."""
+    openhands_home = Path.home().joinpath('.openhands')
+    # exist_ok=True: an already existing directory is not an error.
+    openhands_home.mkdir(exist_ok=True)
+    return openhands_home
+
+
+def launch_gui_server(mount_cwd: bool = False, gpu: bool = False) -> None:
+    """Launch the OpenHands GUI server using Docker.
+
+    Checks the Docker requirements, pulls the runtime image matching the
+    installed package version, then runs the application image in the
+    foreground. The process exits with status 1 when a requirement check, the
+    pull or the run fails, and with status 0 when the run is interrupted with
+    Ctrl+C.
+
+    Args:
+        mount_cwd: If True, mount the current working directory into the container.
+        gpu: If True, enable GPU support by mounting all GPUs into the container via nvidia-docker.
+    """
+    # Values interpolated into HTML(...) markup must be XML-escaped: a path or
+    # error message containing '&', '<' or '>' is otherwise not well-formed.
+    from html import escape
+
+    print_formatted_text(
+        HTML('<ansiblue>🚀 Launching OpenHands GUI server...</ansiblue>')
+    )
+    print_formatted_text('')
+
+    # Check Docker requirements
+    if not check_docker_requirements():
+        sys.exit(1)
+
+    # Ensure config directory exists
+    config_dir = ensure_config_dir_exists()
+
+    # Get the current version for the Docker image
+    version = __version__
+    runtime_image = f'docker.all-hands.dev/all-hands-ai/runtime:{version}-nikolaik'
+    app_image = f'docker.all-hands.dev/all-hands-ai/openhands:{version}'
+
+    print_formatted_text(HTML('<grey>Pulling required Docker images...</grey>'))
+
+    # Pull the runtime image first
+    pull_cmd = ['docker', 'pull', runtime_image]
+    print_formatted_text(HTML(_format_docker_command_for_logging(pull_cmd)))
+    try:
+        subprocess.run(
+            pull_cmd,
+            check=True,
+            timeout=300,  # 5 minutes timeout
+        )
+    except subprocess.CalledProcessError:
+        print_formatted_text(
+            HTML('<ansired>❌ Failed to pull runtime image.</ansired>')
+        )
+        sys.exit(1)
+    except subprocess.TimeoutExpired:
+        print_formatted_text(
+            HTML('<ansired>❌ Timeout while pulling runtime image.</ansired>')
+        )
+        sys.exit(1)
+
+    print_formatted_text('')
+    print_formatted_text(
+        HTML('<ansigreen>✅ Starting OpenHands GUI server...</ansigreen>')
+    )
+    print_formatted_text(
+        HTML('<grey>The server will be available at: http://localhost:3000</grey>')
+    )
+    print_formatted_text(HTML('<grey>Press Ctrl+C to stop the server.</grey>'))
+    print_formatted_text('')
+
+    # Build the Docker command. Options come first; the port mapping, host
+    # alias, container name and image are appended last (see below).
+    docker_cmd = [
+        'docker',
+        'run',
+        '-it',
+        '--rm',
+        '--pull=always',
+        '-e',
+        f'SANDBOX_RUNTIME_CONTAINER_IMAGE={runtime_image}',
+        '-e',
+        'LOG_ALL_EVENTS=true',
+        '-v',
+        '/var/run/docker.sock:/var/run/docker.sock',
+        '-v',
+        f'{config_dir}:/.openhands',
+    ]
+
+    # Add GPU support if requested
+    if gpu:
+        print_formatted_text(
+            HTML('<ansigreen>🖥️ Enabling GPU support via nvidia-docker...</ansigreen>')
+        )
+        # Add the --gpus all flag to enable all GPUs (right after `docker run`)
+        docker_cmd.insert(2, '--gpus')
+        docker_cmd.insert(3, 'all')
+        # Add environment variable to pass GPU support to sandbox containers
+        docker_cmd.extend(
+            [
+                '-e',
+                'SANDBOX_ENABLE_GPU=true',
+            ]
+        )
+
+    # Add current working directory mount if requested
+    if mount_cwd:
+        cwd = Path.cwd()
+        # Following the documentation at https://docs.all-hands.dev/usage/runtimes/docker#connecting-to-your-filesystem
+        # The raw (unescaped) path goes into the command; escaping is for display only.
+        docker_cmd.extend(
+            [
+                '-e',
+                f'SANDBOX_VOLUMES={cwd}:/workspace:rw',
+            ]
+        )
+
+        # Set user ID for Unix-like systems only
+        if os.name != 'nt':  # Not Windows
+            try:
+                user_id = subprocess.check_output(['id', '-u'], text=True).strip()
+                docker_cmd.extend(['-e', f'SANDBOX_USER_ID={user_id}'])
+            except (subprocess.CalledProcessError, FileNotFoundError):
+                # If 'id' command fails or doesn't exist, skip setting user ID
+                pass
+        # Print the folder that will be mounted to inform the user
+        shown_cwd = escape(str(cwd), quote=False)
+        print_formatted_text(
+            HTML(
+                f'<ansigreen>📂 Mounting current directory:</ansigreen> <ansiyellow>{shown_cwd}</ansiyellow> <ansigreen>to</ansigreen> <ansiyellow>/workspace</ansiyellow>'
+            )
+        )
+
+    docker_cmd.extend(
+        [
+            '-p',
+            '3000:3000',
+            '--add-host',
+            'host.docker.internal:host-gateway',
+            '--name',
+            'openhands-app',
+            app_image,
+        ]
+    )
+
+    try:
+        # Log and run the Docker command
+        print_formatted_text(HTML(_format_docker_command_for_logging(docker_cmd)))
+        subprocess.run(docker_cmd, check=True)
+    except subprocess.CalledProcessError as e:
+        print_formatted_text('')
+        print_formatted_text(
+            HTML('<ansired>❌ Failed to start OpenHands GUI server.</ansired>')
+        )
+        # The exception text embeds the full command (including any mounted
+        # path), so it is escaped before being rendered as markup.
+        shown_error = escape(str(e), quote=False)
+        print_formatted_text(HTML(f'<grey>Error: {shown_error}</grey>'))
+        sys.exit(1)
+    except KeyboardInterrupt:
+        print_formatted_text('')
+        print_formatted_text(
+            HTML('<ansigreen>✓ OpenHands GUI server stopped successfully.</ansigreen>')
+        )
+        sys.exit(0)
--- a/openhands/cli/main.py
+++ b/openhands/cli/main.py
@@ -45,7 +45,6 @@ from openhands.controller import AgentController
 from openhands.controller.agent import Agent
 from openhands.core.config import (
    OpenHandsConfig,
-    parse_arguments,
    setup_config_from_args,
 )
 from openhands.core.config.condenser_config import NoOpCondenserConfig
@@ -129,12 +128,13 @@ async def run_session(
    conversation_instructions: str | None = None,
    session_name: str | None = None,
    skip_banner: bool = False,
+    conversation_id: str | None = None,
 ) -> bool:
    reload_microagents = False
    new_session_requested = False
    exit_reason = ExitReason.INTENTIONAL

-    sid = generate_sid(config, session_name)
+    sid = conversation_id or generate_sid(config, session_name)
    is_loaded = asyncio.Event()
    is_paused = asyncio.Event()  # Event to track agent pause requests
    always_confirm_mode = False  # Flag to enable always confirm mode
@@ -523,10 +523,8 @@ def run_alias_setup_flow(config: OpenHandsConfig) -> None:
    print_formatted_text('')


-async def main_with_loop(loop: asyncio.AbstractEventLoop) -> None:
+async def main_with_loop(loop: asyncio.AbstractEventLoop, args) -> None:
    """Runs the agent in CLI mode."""
-    args = parse_arguments()
-
    # Set log level from command line argument if provided
    if args.log_level and isinstance(args.log_level, str):
        log_level = getattr(logging, str(args.log_level).upper())
@@ -574,13 +572,9 @@ async def main_with_loop(loop: asyncio.AbstractEventLoop) -> None:

    # Use settings from settings store if available and override with command line arguments
    if settings:
-        # Handle agent configuration
-        if args.agent_cls:
-            config.default_agent = str(args.agent_cls)
-        else:
-            # settings.agent is not None because we check for it in setup_config_from_args
-            assert settings.agent is not None
-            config.default_agent = settings.agent
+        # settings.agent is not None because we check for it in setup_config_from_args
+        assert settings.agent is not None
+        config.default_agent = settings.agent

        # Handle LLM configuration with proper precedence:
        # 1. CLI parameters (-l) have highest precedence (already handled in setup_config_from_args)
@@ -705,6 +699,7 @@ After reviewing the file, please ask the user what they would like to do with it
        task_str,
        session_name=args.name,
        skip_banner=banner_shown,
+        conversation_id=args.conversation,
    )

    # If a new session was requested, run it
@@ -717,18 +712,19 @@ After reviewing the file, please ask the user what they would like to do with it
    get_runtime_cls(config.runtime).teardown(config)


-def main():
+def run_cli_command(args):
+    """Run the CLI command with proper error handling and cleanup."""
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
-        loop.run_until_complete(main_with_loop(loop))
+        loop.run_until_complete(main_with_loop(loop, args))
    except KeyboardInterrupt:
        print_formatted_text('⚠️ Session was interrupted: interrupted\n')
    except ConnectionRefusedError as e:
-        print(f'Connection refused: {e}')
+        print_formatted_text(f'Connection refused: {e}')
        sys.exit(1)
    except Exception as e:
-        print(f'An error occurred: {e}')
+        print_formatted_text(f'An error occurred: {e}')
        sys.exit(1)
    finally:
        try:
@@ -741,9 +737,5 @@ def main():
            loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
            loop.close()
        except Exception as e:
-            print(f'Error during cleanup: {e}')
+            print_formatted_text(f'Error during cleanup: {e}')
            sys.exit(1)
-
-
-if __name__ == '__main__':
-    main()
--- a/openhands/cli/settings.py
+++ b/openhands/cli/settings.py
@@ -27,7 +27,7 @@ from openhands.core.config.condenser_config import (
    CondenserPipelineConfig,
    ConversationWindowCondenserConfig,
 )
-from openhands.core.config.utils import OH_DEFAULT_AGENT
+from openhands.core.config.config_utils import OH_DEFAULT_AGENT
 from openhands.memory.condenser.impl.llm_summarizing_condenser import (
    LLMSummarizingCondenserConfig,
 )
--- a/openhands/cli/utils.py
+++ b/openhands/cli/utils.py
@@ -150,6 +150,7 @@ def organize_models_and_providers(
 VERIFIED_PROVIDERS = ['openhands', 'anthropic', 'openai', 'mistral']

 VERIFIED_OPENAI_MODELS = [
+    'gpt-5-2025-08-07',
    'o4-mini',
    'gpt-4o',
    'gpt-4o-mini',
@@ -164,6 +165,7 @@ VERIFIED_OPENAI_MODELS = [
 VERIFIED_ANTHROPIC_MODELS = [
    'claude-sonnet-4-20250514',
    'claude-opus-4-20250514',
+    'claude-opus-4-1-20250805',
    'claude-3-7-sonnet-20250219',
    'claude-3-sonnet-20240229',
    'claude-3-opus-20240229',
@@ -183,7 +185,9 @@ VERIFIED_MISTRAL_MODELS = [

 VERIFIED_OPENHANDS_MODELS = [
    'claude-sonnet-4-20250514',
+    'gpt-5-2025-08-07',
    'claude-opus-4-20250514',
+    'claude-opus-4-1-20250805',
    'devstral-small-2507',
    'devstral-medium-2507',
    'o3',
--- a/openhands/core/config/init.py
+++ b/openhands/core/config/init.py
@@ -1,4 +1,9 @@
 from openhands.core.config.agent_config import AgentConfig
+from openhands.core.config.arg_utils import (
+    get_cli_parser,
+    get_evaluation_parser,
+    get_headless_parser,
+)
 from openhands.core.config.cli_config import CLIConfig
 from openhands.core.config.config_utils import (
    OH_DEFAULT_AGENT,
@@ -15,7 +20,6 @@ from openhands.core.config.utils import (
    finalize_config,
    get_agent_config_arg,
    get_llm_config_arg,
-    get_parser,
    load_from_env,
    load_from_toml,
    load_openhands_config,
@@ -41,7 +45,9 @@ __all__ = [
    'get_agent_config_arg',
    'get_llm_config_arg',
    'get_field_info',
-    'get_parser',
+    'get_cli_parser',
+    'get_headless_parser',
+    'get_evaluation_parser',
    'parse_arguments',
    'setup_config_from_args',
 ]
--- a/openhands/core/config/arg_utils.py
+++ b/openhands/core/config/arg_utils.py
@@ -0,0 +1,224 @@
+"""Centralized command line argument configuration for OpenHands CLI and headless modes."""
+
+import argparse
+from argparse import ArgumentParser, _SubParsersAction
+
+
+def get_subparser(parser: ArgumentParser, name: str) -> ArgumentParser:
+    """Return the subcommand parser registered under ``name`` on ``parser``.
+
+    Raises:
+        ValueError: If none of ``parser``'s subparser actions has a choice ``name``.
+    """
+    # argparse has no public lookup for subparsers, hence the private attributes.
+    found = next(
+        (
+            action.choices[name]
+            for action in parser._actions
+            if isinstance(action, _SubParsersAction) and name in action.choices
+        ),
+        None,
+    )
+    if found is None:
+        raise ValueError(f"Subparser '{name}' not found")
+    return found
+
+
+def add_common_arguments(parser: argparse.ArgumentParser) -> None:
+    """Register the options that the CLI and headless parsers have in common.
+
+    Options are registered in a fixed order, which is also the order in which
+    they appear in the generated ``--help`` output.
+    """
+    add = parser.add_argument
+
+    # Configuration file and task input
+    add(
+        '--config-file',
+        default='config.toml',
+        type=str,
+        help='Path to the config file (default: config.toml in the current directory)',
+    )
+    add('-t', '--task', default='', type=str, help='The task for the agent to perform')
+    add(
+        '-f',
+        '--file',
+        type=str,
+        help='Path to a file containing the task. Overrides -t if both are provided.',
+    )
+
+    # Session name and logging
+    add('-n', '--name', default='', type=str, help='Session name')
+    add('--log-level', default=None, type=str, help='Set the log level')
+
+    # Named sections of config.toml to use instead of the defaults
+    add(
+        '-l',
+        '--llm-config',
+        type=str,
+        default=None,
+        help='Replace default LLM ([llm] section in config.toml) config with the specified LLM config, e.g. "llama3" for [llm.llama3] section in config.toml',
+    )
+    add(
+        '--agent-config',
+        type=str,
+        default=None,
+        help='Replace default Agent ([agent] section in config.toml) config with the specified Agent config, e.g. "CodeAct" for [agent.CodeAct] section in config.toml',
+    )
+
+    add('-v', '--version', action='store_true', help='Show version information')
+
+
+def add_evaluation_arguments(parser: argparse.ArgumentParser) -> None:
+    """Register the ``--eval-*`` options used by evaluation mode."""
+    # One row per option: (flag, value type, default, help text).
+    eval_options = (
+        (
+            '--eval-output-dir',
+            str,
+            'evaluation/evaluation_outputs/outputs',
+            'The directory to save evaluation output',
+        ),
+        ('--eval-n-limit', int, None, 'The number of instances to evaluate'),
+        ('--eval-num-workers', int, 4, 'The number of workers to use for evaluation'),
+        ('--eval-note', str, None, 'The note to add to the evaluation directory'),
+        (
+            '--eval-ids',
+            str,
+            None,
+            'The comma-separated list (in quotes) of IDs of the instances to evaluate',
+        ),
+    )
+    for flag, value_type, default, help_text in eval_options:
+        parser.add_argument(flag, default=default, type=value_type, help=help_text)
+
+
+def add_headless_specific_arguments(parser: argparse.ArgumentParser) -> None:
+    """Register the options that exist only on the headless and evaluation parsers."""
+    add = parser.add_argument
+
+    # Agent selection and run limits
+    add('-d', '--directory', type=str, help='The working directory for the agent')
+    add(
+        '-c',
+        '--agent-cls',
+        type=str,
+        default=None,
+        help='Name of the default agent to use',
+    )
+    add(
+        '-i',
+        '--max-iterations',
+        type=int,
+        default=None,
+        help='The maximum number of iterations to run the agent',
+    )
+    add(
+        '-b',
+        '--max-budget-per-task',
+        type=float,
+        help='The maximum budget allowed per task, beyond which the agent will stop.',
+    )
+
+    # Headless interaction and repository selection
+    add(
+        '--no-auto-continue',
+        action='store_true',
+        default=False,
+        help='Disable auto-continue responses in headless mode (i.e. headless will read from stdin instead of auto-continuing)',
+    )
+    add(
+        '--selected-repo',
+        type=str,
+        default=None,
+        help='GitHub repository to clone (format: owner/repo)',
+    )
+
+
+def _parse_bool_flag(value: str) -> bool:
+    """Convert a command-line string to a bool.
+
+    ``bool(value)`` is True for every non-empty string, including ``'false'``.
+    Here the empty string and the usual "off" spellings map to False; any other
+    value maps to True, as it did with ``type=bool``.
+    """
+    return value.strip().lower() not in ('', '0', 'false', 'no', 'off')
+
+
+def get_cli_parser() -> argparse.ArgumentParser:
+    """Create the top-level ``openhands`` parser with ``serve`` and ``cli`` subcommands.
+
+    The selected subcommand is stored in ``args.command``. ``serve`` accepts
+    ``--mount-cwd`` and ``--gpu``; ``cli`` accepts the common arguments plus
+    ``--override-cli-mode`` and ``--conversation``. ``--conversation`` is also
+    accepted before the subcommand, so ``args.conversation`` is always present
+    (None unless given).
+
+    Returns:
+        argparse.ArgumentParser: The configured top-level parser.
+    """
+    # Create a description with welcome message explaining available commands
+    description = (
+        'Welcome to OpenHands: Code Less, Make More\n\n'
+        'OpenHands supports two main commands:\n'
+        '  serve - Launch the OpenHands GUI server (web interface)\n'
+        '  cli   - Run OpenHands in CLI mode (terminal interface)\n\n'
+        'Running "openhands" without a command is the same as "openhands cli"'
+    )
+
+    parser = argparse.ArgumentParser(
+        description=description,
+        prog='openhands',
+        formatter_class=argparse.RawDescriptionHelpFormatter,  # Preserve formatting in description
+        epilog='For more information about a command, run: openhands COMMAND --help',
+    )
+
+    # Create subparsers
+    subparsers = parser.add_subparsers(
+        dest='command',
+        title='commands',
+        description='OpenHands supports two main commands:',
+        metavar='COMMAND',
+    )
+
+    # Add 'serve' subcommand
+    serve_parser = subparsers.add_parser(
+        'serve', help='Launch the OpenHands GUI server using Docker (web interface)'
+    )
+    serve_parser.add_argument(
+        '--mount-cwd',
+        help='Mount the current working directory into the GUI server container',
+        action='store_true',
+        default=False,
+    )
+    serve_parser.add_argument(
+        '--gpu',
+        help='Enable GPU support by mounting all GPUs into the Docker container via nvidia-docker',
+        action='store_true',
+        default=False,
+    )
+
+    # Add 'cli' subcommand - import all the existing CLI arguments
+    cli_parser = subparsers.add_parser(
+        'cli', help='Run OpenHands in CLI mode (terminal interface)'
+    )
+    add_common_arguments(cli_parser)
+
+    cli_parser.add_argument(
+        '--override-cli-mode',
+        help='Override the default settings for CLI mode',
+        # Not `type=bool`: bool('false') is True, so the flag could never be
+        # switched off from the command line.
+        type=_parse_bool_flag,
+        default=False,
+    )
+    # Top-level definition: guarantees `args.conversation` exists (None by
+    # default) whichever subcommand is selected.
+    parser.add_argument(
+        '--conversation',
+        help='The conversation id to continue',
+        type=str,
+        default=None,
+    )
+    # Also accept the option after the `cli` subcommand, where it was
+    # previously rejected as unrecognized. SUPPRESS stops the subparser from
+    # overwriting a value parsed at the top level when the option is not
+    # repeated after `cli`.
+    cli_parser.add_argument(
+        '--conversation',
+        help='The conversation id to continue',
+        type=str,
+        default=argparse.SUPPRESS,
+    )
+
+    return parser
+
+
+def get_headless_parser() -> argparse.ArgumentParser:
+    """Build the headless-mode parser: common options plus headless-specific ones."""
+    headless_parser = argparse.ArgumentParser(description='Run the agent via CLI')
+    for register in (add_common_arguments, add_headless_specific_arguments):
+        register(headless_parser)
+    return headless_parser
+
+
+def get_evaluation_parser() -> argparse.ArgumentParser:
+    """Build the evaluation-mode parser: common, headless-specific and ``--eval-*`` options."""
+    eval_parser = argparse.ArgumentParser(description='Run OpenHands in evaluation mode')
+    # Registration order determines the order of options in --help.
+    for register in (
+        add_common_arguments,
+        add_headless_specific_arguments,
+        add_evaluation_arguments,
+    ):
+        register(eval_parser)
+    return eval_parser
--- a/openhands/core/config/mcp_config.py
+++ b/openhands/core/config/mcp_config.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import os
 import re
 import shlex
@@ -302,6 +304,13 @@ class MCPConfig(BaseModel):
            raise ValueError(f'Invalid MCP configuration: {e}')
        return mcp_mapping

+    def merge(self, other: MCPConfig):
+        """Return a new MCPConfig with this config's servers followed by ``other``'s.
+
+        The ``sse_servers``, ``stdio_servers`` and ``shttp_servers`` collections
+        of the two configs are each combined with ``+``.
+        """
+        merged_fields = {
+            field: getattr(self, field) + getattr(other, field)
+            for field in ('sse_servers', 'stdio_servers', 'shttp_servers')
+        }
+        return MCPConfig(**merged_fields)
+

 class OpenHandsMCPConfig:
    @staticmethod
--- a/openhands/core/config/utils.py
+++ b/openhands/core/config/utils.py
@@ -15,15 +15,12 @@ from pydantic import BaseModel, SecretStr, ValidationError
 from openhands import __version__
 from openhands.core import logger
 from openhands.core.config.agent_config import AgentConfig
+from openhands.core.config.arg_utils import get_headless_parser
 from openhands.core.config.condenser_config import (
    CondenserConfig,
    condenser_config_from_toml_section,
    create_condenser_config,
 )
-from openhands.core.config.config_utils import (
-    OH_DEFAULT_AGENT,
-    OH_MAX_ITERATIONS,
-)
 from openhands.core.config.extended_config import ExtendedConfig
 from openhands.core.config.kubernetes_config import KubernetesConfig
 from openhands.core.config.llm_config import LLMConfig
@@ -80,6 +77,17 @@ def load_from_env(
                set_attr_from_env(field_value, prefix=field_name + '_')

            elif env_var_name in env_or_toml_dict:
+                # Special case: avoid overriding workspace_mount_path_in_sandbox from env
+                # when SANDBOX_VOLUMES is set without an explicit /workspace mount.
+                if (
+                    isinstance(sub_config, OpenHandsConfig)
+                    and field_name == 'workspace_mount_path_in_sandbox'
+                ):
+                    vols = env_or_toml_dict.get('SANDBOX_VOLUMES')
+                    if vols and '/workspace' not in str(vols):
+                        # Skip overriding; keep the default '/workspace'
+                        continue
+
                # convert the env var to the correct type and set it
                value = env_or_toml_dict[env_var_name]

@@ -674,142 +682,9 @@ def get_condenser_config_arg(
        return None


-# Command line arguments
-def get_parser() -> argparse.ArgumentParser:
-    """Get the argument parser."""
-    parser = argparse.ArgumentParser(description='Run the agent via CLI')
-
-    # Add version argument
-    parser.add_argument(
-        '-v', '--version', action='store_true', help='Show version information'
-    )
-
-    parser.add_argument(
-        '--config-file',
-        type=str,
-        default='config.toml',
-        help='Path to the config file (default: config.toml in the current directory)',
-    )
-    parser.add_argument(
-        '-d',
-        '--directory',
-        type=str,
-        help='The working directory for the agent',
-    )
-    parser.add_argument(
-        '-t',
-        '--task',
-        type=str,
-        default='',
-        help='The task for the agent to perform',
-    )
-    parser.add_argument(
-        '-f',
-        '--file',
-        type=str,
-        help='Path to a file containing the task. Overrides -t if both are provided.',
-    )
-    parser.add_argument(
-        '-c',
-        '--agent-cls',
-        default=OH_DEFAULT_AGENT,
-        type=str,
-        help='Name of the default agent to use',
-    )
-    parser.add_argument(
-        '-i',
-        '--max-iterations',
-        default=OH_MAX_ITERATIONS,
-        type=int,
-        help='The maximum number of iterations to run the agent',
-    )
-    parser.add_argument(
-        '-b',
-        '--max-budget-per-task',
-        type=float,
-        help='The maximum budget allowed per task, beyond which the agent will stop.',
-    )
-    # --eval configs are for evaluations only
-    parser.add_argument(
-        '--eval-output-dir',
-        default='evaluation/evaluation_outputs/outputs',
-        type=str,
-        help='The directory to save evaluation output',
-    )
-    parser.add_argument(
-        '--eval-n-limit',
-        default=None,
-        type=int,
-        help='The number of instances to evaluate',
-    )
-    parser.add_argument(
-        '--eval-num-workers',
-        default=4,
-        type=int,
-        help='The number of workers to use for evaluation',
-    )
-    parser.add_argument(
-        '--eval-note',
-        default=None,
-        type=str,
-        help='The note to add to the evaluation directory',
-    )
-    parser.add_argument(
-        '-l',
-        '--llm-config',
-        default=None,
-        type=str,
-        help='Replace default LLM ([llm] section in config.toml) config with the specified LLM config, e.g. "llama3" for [llm.llama3] section in config.toml',
-    )
-    parser.add_argument(
-        '--agent-config',
-        default=None,
-        type=str,
-        help='Replace default Agent ([agent] section in config.toml) config with the specified Agent config, e.g. "CodeAct" for [agent.CodeAct] section in config.toml',
-    )
-    parser.add_argument(
-        '-n',
-        '--name',
-        help='Session name',
-        type=str,
-        default='',
-    )
-    parser.add_argument(
-        '--eval-ids',
-        default=None,
-        type=str,
-        help='The comma-separated list (in quotes) of IDs of the instances to evaluate',
-    )
-    parser.add_argument(
-        '--no-auto-continue',
-        help='Disable auto-continue responses in headless mode (i.e. headless will read from stdin instead of auto-continuing)',
-        action='store_true',
-        default=False,
-    )
-    parser.add_argument(
-        '--selected-repo',
-        help='GitHub repository to clone (format: owner/repo)',
-        type=str,
-        default=None,
-    )
-    parser.add_argument(
-        '--override-cli-mode',
-        help='Override the default settings for CLI mode',
-        type=bool,
-        default=False,
-    )
-    parser.add_argument(
-        '--log-level',
-        help='Set the log level',
-        type=str,
-        default=None,
-    )
-    return parser
-
-
 def parse_arguments() -> argparse.Namespace:
    """Parse command line arguments."""
-    parser = get_parser()
+    parser = get_headless_parser()
    args = parser.parse_args()

    if args.version:
@@ -914,17 +789,17 @@ def setup_config_from_args(args: argparse.Namespace) -> OpenHandsConfig:
        )

    # Override default agent if provided
-    if args.agent_cls:
+    if hasattr(args, 'agent_cls') and args.agent_cls:
        config.default_agent = args.agent_cls

    # Set max iterations and max budget per task if provided, otherwise fall back to config values
-    if args.max_iterations is not None:
+    if hasattr(args, 'max_iterations') and args.max_iterations is not None:
        config.max_iterations = args.max_iterations
-    if args.max_budget_per_task is not None:
+    if hasattr(args, 'max_budget_per_task') and args.max_budget_per_task is not None:
        config.max_budget_per_task = args.max_budget_per_task

    # Read selected repository in config for use by CLI and main.py
-    if args.selected_repo is not None:
+    if hasattr(args, 'selected_repo') and args.selected_repo is not None:
        config.sandbox.selected_repo = args.selected_repo

    return config
--- a/openhands/llm/fn_call_converter.py
+++ b/openhands/llm/fn_call_converter.py
@@ -383,7 +383,7 @@ Do NOT assume the environment is the same as in the example above.
 """
    example = example.lstrip()

-    return example
+    return refine_prompt(example)


 IN_CONTEXT_LEARNING_EXAMPLE_PREFIX = get_example_for_tools
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -63,6 +63,7 @@ CACHE_PROMPT_SUPPORTED_MODELS = [
    'claude-sonnet-4-20250514',
    'claude-sonnet-4',
    'claude-opus-4-20250514',
+    'claude-opus-4-1-20250805',
 ]

 # function calling supporting models
@@ -77,6 +78,7 @@ FUNCTION_CALLING_SUPPORTED_MODELS = [
    'claude-sonnet-4-20250514',
    'claude-sonnet-4',
    'claude-opus-4-20250514',
+    'claude-opus-4-1-20250805',
    'gpt-4o-mini',
    'gpt-4o',
    'o1-2024-12-17',
@@ -92,6 +94,7 @@ FUNCTION_CALLING_SUPPORTED_MODELS = [
    'kimi-k2-instruct',
    'Qwen3-Coder-480B-A35B-Instruct',
    'qwen3-coder',  # this will match both qwen3-coder-480b (openhands provider) and qwen3-coder (for openrouter)
+    'gpt-5-2025-08-07',
 ]

 REASONING_EFFORT_SUPPORTED_MODELS = [
@@ -105,6 +108,7 @@ REASONING_EFFORT_SUPPORTED_MODELS = [
    'o4-mini-2025-04-16',
    'gemini-2.5-flash',
    'gemini-2.5-pro',
+    'gpt-5-2025-08-07',
 ]

 MODELS_WITHOUT_STOP_WORDS = [
--- a/openhands/server/routes/manage_conversations.py
+++ b/openhands/server/routes/manage_conversations.py
@@ -10,17 +10,18 @@ from jinja2 import Environment, FileSystemLoader
 from pydantic import BaseModel, ConfigDict, Field

 from openhands.core.config.llm_config import LLMConfig
+from openhands.core.config.mcp_config import MCPConfig
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import (
    ChangeAgentStateAction,
    NullAction,
 )
 from openhands.events.event_filter import EventFilter
+from openhands.events.event_store import EventStore
 from openhands.events.observation import (
    AgentStateChangedObservation,
    NullObservation,
 )
-from openhands.events.stream import EventStream
 from openhands.integrations.provider import (
    PROVIDER_TOKEN_TYPE,
    ProviderHandler,
@@ -44,11 +45,11 @@ from openhands.server.services.conversation_service import (
    create_new_conversation,
    setup_init_convo_settings,
 )
-from openhands.server.session.conversation import ServerConversation
 from openhands.server.shared import (
    ConversationStoreImpl,
    config,
    conversation_manager,
+    file_store,
 )
 from openhands.server.types import LLMAuthenticationError, MissingSettingsError
 from openhands.server.user_auth import (
@@ -60,7 +61,7 @@ from openhands.server.user_auth import (
    get_user_settings_store,
 )
 from openhands.server.user_auth.user_auth import AuthType
-from openhands.server.utils import get_conversation as get_conversation_object
+from openhands.server.utils import get_conversation as get_conversation_metadata
 from openhands.server.utils import get_conversation_store
 from openhands.storage.conversation.conversation_store import ConversationStore
 from openhands.storage.data_models.conversation_metadata import (
@@ -87,6 +88,7 @@ class InitSessionRequest(BaseModel):
    suggested_task: SuggestedTask | None = None
    create_microagent: CreateMicroagent | None = None
    conversation_instructions: str | None = None
+    mcp_config: MCPConfig | None = None
    # Only nested runtimes require the ability to specify a conversation id, and it could be a security risk
    if os.getenv('ALLOW_SET_CONVERSATION_ID', '0') == '1':
        conversation_id: str = Field(default_factory=lambda: uuid.uuid4().hex)
@@ -178,6 +180,7 @@ async def new_conversation(
            conversation_instructions=conversation_instructions,
            git_provider=git_provider,
            conversation_id=conversation_id,
+            mcp_config=data.mcp_config,
        )

        return ConversationResponse(
@@ -331,23 +334,20 @@ async def delete_conversation(
    return True


-@app.get('/conversations/{conversation_id}/remember_prompt')
+@app.get('/conversations/{conversation_id}/remember-prompt')
 async def get_prompt(
+    conversation_id: str,
    event_id: int,
    user_settings: SettingsStore = Depends(get_user_settings_store),
-    conversation: ServerConversation | None = Depends(get_conversation_object),
+    metadata: ConversationMetadata = Depends(get_conversation_metadata),
 ):
-    if conversation is None:
-        return JSONResponse(
-            status_code=404,
-            content={'error': 'Conversation not found.'},
-        )
-
-    # get event stream for the conversation
-    event_stream = conversation.event_stream
+    # get event store for the conversation
+    event_store = EventStore(
+        sid=conversation_id, file_store=file_store, user_id=metadata.user_id
+    )

    # retrieve the relevant events
-    stringified_events = _get_contextual_events(event_stream, event_id)
+    stringified_events = _get_contextual_events(event_store, event_id)

    # generate a prompt
    settings = await user_settings.load()
@@ -551,7 +551,7 @@ async def stop_conversation(
        )


-def _get_contextual_events(event_stream: EventStream, event_id: int) -> str:
+def _get_contextual_events(event_store: EventStore, event_id: int) -> str:
    # find the specified events to learn from
    # Get X events around the target event
    context_size = 4
@@ -567,7 +567,7 @@ def _get_contextual_events(event_stream: EventStream, event_id: int) -> str:
    )  # the types of events that can be in an agent's history

    # from event_id - context_size to event_id..
-    context_before = event_stream.search_events(
+    context_before = event_store.search_events(
        start_id=event_id,
        filter=agent_event_filter,
        reverse=True,
@@ -575,7 +575,7 @@ def _get_contextual_events(event_stream: EventStream, event_id: int) -> str:
    )

    # from event_id to event_id + context_size + 1
-    context_after = event_stream.search_events(
+    context_after = event_store.search_events(
        start_id=event_id + 1,
        filter=agent_event_filter,
        limit=context_size + 1,
--- a/openhands/server/services/conversation_service.py
+++ b/openhands/server/services/conversation_service.py
@@ -2,6 +2,7 @@ import uuid
 from types import MappingProxyType
 from typing import Any

+from openhands.core.config.mcp_config import MCPConfig
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action.message import MessageAction
 from openhands.experiments.experiment_manager import ExperimentManagerImpl
@@ -44,6 +45,7 @@ async def create_new_conversation(
    attach_convo_id: bool = False,
    git_provider: ProviderType | None = None,
    conversation_id: str | None = None,
+    mcp_config: MCPConfig | None = None,
 ) -> AgentLoopInfo:
    logger.info(
        'Creating conversation',
@@ -82,6 +84,9 @@ async def create_new_conversation(
    session_init_args['selected_branch'] = selected_branch
    session_init_args['git_provider'] = git_provider
    session_init_args['conversation_instructions'] = conversation_instructions
+    if mcp_config:
+        session_init_args['mcp_config'] = mcp_config
+
    conversation_init_data = ConversationInitData(**session_init_args)

    logger.info('Loading conversation store')
--- a/openhands/server/session/session.py
+++ b/openhands/server/session/session.py
@@ -124,10 +124,12 @@ class Session:
        )

        # Set Git user configuration if provided in settings
-        if hasattr(settings, 'git_user_name') and settings.git_user_name:
-            self.config.git_user_name = settings.git_user_name
-        if hasattr(settings, 'git_user_email') and settings.git_user_email:
-            self.config.git_user_email = settings.git_user_email
+        git_user_name = getattr(settings, 'git_user_name', None)
+        if git_user_name is not None:
+            self.config.git_user_name = git_user_name
+        git_user_email = getattr(settings, 'git_user_email', None)
+        if git_user_email is not None:
+            self.config.git_user_email = git_user_email
        max_iterations = settings.max_iterations or self.config.max_iterations

        # Prioritize settings over config for max_budget_per_task
@@ -152,6 +154,14 @@ class Session:
        self.logger.debug(
            f'MCP configuration before setup - self.config.mcp_config: {self.config.mcp}'
        )
+
+        # Check if settings has custom mcp_config
+        mcp_config = getattr(settings, 'mcp_config', None)
+        if mcp_config is not None:
+            # Merge the provided MCP config into the existing one rather than replacing it
+            self.config.mcp = self.config.mcp.merge(mcp_config)
+            self.logger.debug(f'Merged custom MCP Config: {mcp_config}')
+
        # Add OpenHands' MCP server by default
        openhands_mcp_server, openhands_mcp_stdio_servers = (
            OpenHandsMCPConfigImpl.create_default_mcp_server_config(
@@ -163,7 +173,7 @@ class Session:
            self.config.mcp.shttp_servers.append(openhands_mcp_server)
            self.logger.debug('Added default MCP HTTP server to config')

-        self.config.mcp.stdio_servers.extend(openhands_mcp_stdio_servers)
+            self.config.mcp.stdio_servers.extend(openhands_mcp_stdio_servers)

        self.logger.debug(
            f'MCP configuration after setup - self.config.mcp: {self.config.mcp}'
--- a/openhands/utils/llm.py
+++ b/openhands/utils/llm.py
@@ -56,6 +56,7 @@ def get_supported_llm_models(config: OpenHandsConfig) -> list[str]:
    # Add OpenHands provider models
    openhands_models = [
        'openhands/claude-sonnet-4-20250514',
+        'openhands/gpt-5-2025-08-07',
        'openhands/claude-opus-4-20250514',
        'openhands/gemini-2.5-pro',
        'openhands/o3',
--- a/openhands/utils/prompt.py
+++ b/openhands/utils/prompt.py
@@ -4,6 +4,7 @@ from itertools import islice

 from jinja2 import Template

+from openhands.agenthub.codeact_agent.tools.bash import refine_prompt
 from openhands.controller.state.state import State
 from openhands.core.message import Message, TextContent
 from openhands.events.observation.agent import MicroagentKnowledge
@@ -91,7 +92,8 @@ class PromptManager:
            return Template(file.read())

    def get_system_message(self) -> str:
-        return self.system_template.render().strip()
+        system_message = self.system_template.render().strip()
+        return refine_prompt(system_message)

    def get_example_user_message(self) -> str:
        """This is an initial user message that can be provided to the agent
--- a/openhands/utils/term_color.py
+++ b/openhands/utils/term_color.py
@@ -10,6 +10,7 @@ class TermColor(Enum):
    SUCCESS = 'green'
    ERROR = 'red'
    INFO = 'blue'
+    GREY = 'dark_grey'


 def colorize(text: str, color: TermColor = TermColor.WARNING) -> str:
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,6 +0,0 @@
-{
-  "name": "OpenHands",
-  "lockfileVersion": 3,
-  "requires": true,
-  "packages": {}
-}
--- a/poetry.lock
+++ b/poetry.lock
@@ -5152,8 +5152,11 @@ files = [
    {file = "lxml-5.4.0-cp36-cp36m-win_amd64.whl", hash = "sha256:7ce1a171ec325192c6a636b64c94418e71a1964f56d002cc28122fceff0b6121"},
    {file = "lxml-5.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:795f61bcaf8770e1b37eec24edf9771b307df3af74d1d6f27d812e15a9ff3872"},
    {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:29f451a4b614a7b5b6c2e043d7b64a15bd8304d7e767055e8ab68387a8cacf4e"},
+    {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:891f7f991a68d20c75cb13c5c9142b2a3f9eb161f1f12a9489c82172d1f133c0"},
    {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4aa412a82e460571fad592d0f93ce9935a20090029ba08eca05c614f99b0cc92"},
+    {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:ac7ba71f9561cd7d7b55e1ea5511543c0282e2b6450f122672a2694621d63b7e"},
    {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:c5d32f5284012deaccd37da1e2cd42f081feaa76981f0eaa474351b68df813c5"},
+    {file = "lxml-5.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:ce31158630a6ac85bddd6b830cffd46085ff90498b397bd0a259f59d27a12188"},
    {file = "lxml-5.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:31e63621e073e04697c1b2d23fcb89991790eef370ec37ce4d5d469f40924ed6"},
    {file = "lxml-5.4.0-cp37-cp37m-win32.whl", hash = "sha256:be2ba4c3c5b7900246a8f866580700ef0d538f2ca32535e991027bdaba944063"},
    {file = "lxml-5.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:09846782b1ef650b321484ad429217f5154da4d6e786636c38e434fa32e94e49"},
@@ -5227,6 +5230,22 @@ files = [
 [package.dependencies]
 cobble = ">=0.1.3,<0.2"

+[[package]]
+name = "markdown"
+version = "3.8.2"
+description = "Python implementation of John Gruber's Markdown."
+optional = false
+python-versions = ">=3.9"
+groups = ["main"]
+files = [
+    {file = "markdown-3.8.2-py3-none-any.whl", hash = "sha256:5c83764dbd4e00bdd94d85a19b8d55ccca20fe35b2e678a1422b380324dd5f24"},
+    {file = "markdown-3.8.2.tar.gz", hash = "sha256:247b9a70dd12e27f67431ce62523e675b866d254f900c4fe75ce3dda62237c45"},
+]
+
+[package.extras]
+docs = ["mdx_gh_links (>=0.2)", "mkdocs (>=1.6)", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-nature (>=0.6)", "mkdocs-section-index", "mkdocstrings[python]"]
+testing = ["coverage", "pyyaml"]
+
 [[package]]
 name = "markdown-it-py"
 version = "3.0.0"
@@ -11766,4 +11785,4 @@ third-party-runtimes = ["daytona", "e2b", "modal", "runloop-api-client"]
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.12,<3.14"
-content-hash = "8568c6ec2e11d4fcb23e206a24896b4d2d50e694c04011b668148f484e95b406"
+content-hash = "d83111cc28bf935f1c759d3ce07a21c69a85f6df035db26042326bd8fba4969f"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -58,6 +58,7 @@ whatthepatch = "^1.0.6"
 protobuf = "^5.0.0,<6.0.0"                         # Updated to support newer opentelemetry
 opentelemetry-api = "^1.33.1"
 opentelemetry-exporter-otlp-proto-grpc = "^1.33.1"
+markdown = "^3.6"                                  # Required for CLI TUI rendering

 libtmux = ">=0.37,<0.40"
 pygithub = "^2.5.0"
@@ -166,7 +167,7 @@ joblib = "*"
 swebench = { git = "https://github.com/ryanhoangt/SWE-bench.git", rev = "fix-modal-patch-eval" }

 [tool.poetry.scripts]
-openhands = "openhands.cli.main:main"
+openhands = "openhands.cli.entry:main"

 [tool.poetry.group.testgeneval.dependencies]
 fuzzywuzzy = "^0.18.0"
--- a/tests/unit/test_arg_parser.py
+++ b/tests/unit/test_arg_parser.py
@@ -1,17 +1,36 @@
 import pytest

-from openhands.core.config import OH_DEFAULT_AGENT, OH_MAX_ITERATIONS, get_parser
+from openhands.core.config import (
+    get_evaluation_parser,
+    get_headless_parser,
+)


-def test_parser_default_values():
-    parser = get_parser()
+def test_headless_parser_default_values():
+    parser = get_headless_parser()
    args = parser.parse_args([])

    assert args.directory is None
    assert args.task == ''
    assert args.file is None
-    assert args.agent_cls == OH_DEFAULT_AGENT
-    assert args.max_iterations == OH_MAX_ITERATIONS
+    assert args.agent_cls is None
+    assert args.max_iterations is None
+    assert args.max_budget_per_task is None
+    assert args.llm_config is None
+    assert args.name == ''
+    assert not args.no_auto_continue
+    assert args.selected_repo is None
+
+
+def test_evaluation_parser_default_values():
+    parser = get_evaluation_parser()
+    args = parser.parse_args([])
+
+    assert args.directory is None
+    assert args.task == ''
+    assert args.file is None
+    assert args.agent_cls is None
+    assert args.max_iterations is None
    assert args.max_budget_per_task is None
    assert args.eval_output_dir == 'evaluation/evaluation_outputs/outputs'
    assert args.eval_n_limit is None
@@ -23,8 +42,8 @@ def test_parser_default_values():
    assert args.selected_repo is None


-def test_parser_custom_values():
-    parser = get_parser()
+def test_evaluation_parser_custom_values():
+    parser = get_evaluation_parser()
    args = parser.parse_args(
        [
            '-v',
@@ -76,7 +95,7 @@ def test_parser_custom_values():


 def test_parser_file_overrides_task():
-    parser = get_parser()
+    parser = get_headless_parser()
    args = parser.parse_args(['-t', 'task from command', '-f', 'task_file.txt'])

    assert args.task == 'task from command'
@@ -84,31 +103,31 @@ def test_parser_file_overrides_task():


 def test_parser_invalid_max_iterations():
-    parser = get_parser()
+    parser = get_headless_parser()
    with pytest.raises(SystemExit):
        parser.parse_args(['-i', 'not_a_number'])


 def test_parser_invalid_max_budget():
-    parser = get_parser()
+    parser = get_headless_parser()
    with pytest.raises(SystemExit):
        parser.parse_args(['-b', 'not_a_number'])


-def test_parser_invalid_eval_n_limit():
-    parser = get_parser()
+def test_evaluation_parser_invalid_eval_n_limit():
+    parser = get_evaluation_parser()
    with pytest.raises(SystemExit):
        parser.parse_args(['--eval-n-limit', 'not_a_number'])


-def test_parser_invalid_eval_num_workers():
-    parser = get_parser()
+def test_evaluation_parser_invalid_eval_num_workers():
+    parser = get_evaluation_parser()
    with pytest.raises(SystemExit):
        parser.parse_args(['--eval-num-workers', 'not_a_number'])


-def test_help_message(capsys):
-    parser = get_parser()
+def test_headless_parser_help_message(capsys):
+    parser = get_headless_parser()
    with pytest.raises(SystemExit):
        parser.parse_args(['--help'])
    captured = capsys.readouterr()
@@ -126,6 +145,41 @@ def test_help_message(capsys):
        '-c AGENT_CLS, --agent-cls AGENT_CLS',
        '-i MAX_ITERATIONS, --max-iterations MAX_ITERATIONS',
        '-b MAX_BUDGET_PER_TASK, --max-budget-per-task MAX_BUDGET_PER_TASK',
+        '-l LLM_CONFIG, --llm-config LLM_CONFIG',
+        '--agent-config AGENT_CONFIG',
+        '-n NAME, --name NAME',
+        '--config-file CONFIG_FILE',
+        '--no-auto-continue',
+        '--selected-repo SELECTED_REPO',
+        '--log-level LOG_LEVEL',
+    ]
+
+    for element in expected_elements:
+        assert element in help_output, f"Expected '{element}' to be in the help message"
+
+    option_count = help_output.count('  -')
+    assert option_count == 15, f'Expected 15 options, found {option_count}'
+
+
+def test_evaluation_parser_help_message(capsys):
+    parser = get_evaluation_parser()
+    with pytest.raises(SystemExit):
+        parser.parse_args(['--help'])
+    captured = capsys.readouterr()
+    help_output = captured.out
+    print(help_output)
+    expected_elements = [
+        'usage:',
+        'Run OpenHands in evaluation mode',
+        'options:',
+        '-v, --version',
+        '-h, --help',
+        '-d DIRECTORY, --directory DIRECTORY',
+        '-t TASK, --task TASK',
+        '-f FILE, --file FILE',
+        '-c AGENT_CLS, --agent-cls AGENT_CLS',
+        '-i MAX_ITERATIONS, --max-iterations MAX_ITERATIONS',
+        '-b MAX_BUDGET_PER_TASK, --max-budget-per-task MAX_BUDGET_PER_TASK',
        '--eval-output-dir EVAL_OUTPUT_DIR',
        '--eval-n-limit EVAL_N_LIMIT',
        '--eval-num-workers EVAL_NUM_WORKERS',
@@ -137,7 +191,6 @@ def test_help_message(capsys):
        '--config-file CONFIG_FILE',
        '--no-auto-continue',
        '--selected-repo SELECTED_REPO',
-        '--override-cli-mode OVERRIDE_CLI_MODE',
        '--log-level LOG_LEVEL',
    ]

@@ -145,11 +198,11 @@ def test_help_message(capsys):
        assert element in help_output, f"Expected '{element}' to be in the help message"

    option_count = help_output.count('  -')
-    assert option_count == 21, f'Expected 21 options, found {option_count}'
+    assert option_count == 20, f'Expected 20 options, found {option_count}'


 def test_selected_repo_format():
    """Test that the selected-repo argument accepts owner/repo format."""
-    parser = get_parser()
+    parser = get_headless_parser()
    args = parser.parse_args(['--selected-repo', 'owner/repo'])
    assert args.selected_repo == 'owner/repo'
--- a/tests/unit/test_cli.py
+++ b/tests/unit/test_cli.py
@@ -325,7 +325,6 @@ async def test_run_session_with_initial_action(


@pytest.mark.asyncio
-@patch('openhands.cli.main.parse_arguments')
@patch('openhands.cli.main.setup_config_from_args')
@patch('openhands.cli.main.FileSettingsStore.get_instance')
@patch('openhands.cli.main.check_folder_security_agreement')
@@ -345,7 +344,6 @@ async def test_main_without_task(
    mock_check_security,
    mock_get_settings_store,
    mock_setup_config,
-    mock_parse_args,
 ):
    """Test main function without a task."""
    loop = asyncio.get_running_loop()
@@ -359,7 +357,10 @@ async def test_main_without_task(
    mock_args.llm_config = None
    mock_args.name = None
    mock_args.file = None
-    mock_parse_args.return_value = mock_args
+    mock_args.conversation = None
+    mock_args.log_level = None
+    mock_args.config_file = 'config.toml'
+    mock_args.override_cli_mode = None

    # Mock config
    mock_config = MagicMock()
@@ -393,10 +394,9 @@ async def test_main_without_task(
    mock_run_session.return_value = False

    # Run the function
-    await cli.main_with_loop(loop)
+    await cli.main_with_loop(loop, mock_args)

    # Assertions
-    mock_parse_args.assert_called_once()
    mock_setup_config.assert_called_once_with(mock_args)
    mock_get_settings_store.assert_called_once()
    mock_settings_store.load.assert_called_once()
@@ -412,11 +412,11 @@ async def test_main_without_task(
        None,
        session_name=None,
        skip_banner=False,
+        conversation_id=None,
    )


@pytest.mark.asyncio
-@patch('openhands.cli.main.parse_arguments')
@patch('openhands.cli.main.setup_config_from_args')
@patch('openhands.cli.main.FileSettingsStore.get_instance')
@patch('openhands.cli.main.check_folder_security_agreement')
@@ -436,7 +436,6 @@ async def test_main_with_task(
    mock_check_security,
    mock_get_settings_store,
    mock_setup_config,
-    mock_parse_args,
 ):
    """Test main function with a task."""
    loop = asyncio.get_running_loop()
@@ -449,7 +448,11 @@ async def test_main_with_task(
    mock_args.agent_cls = 'custom-agent'
    mock_args.llm_config = 'custom-config'
    mock_args.file = None
-    mock_parse_args.return_value = mock_args
+    mock_args.name = None
+    mock_args.conversation = None
+    mock_args.log_level = None
+    mock_args.config_file = 'config.toml'
+    mock_args.override_cli_mode = None

    # Mock config
    mock_config = MagicMock()
@@ -484,10 +487,9 @@ async def test_main_with_task(
    mock_run_session.side_effect = [True, False]

    # Run the function
-    await cli.main_with_loop(loop)
+    await cli.main_with_loop(loop, mock_args)

    # Assertions
-    mock_parse_args.assert_called_once()
    mock_setup_config.assert_called_once_with(mock_args)
    mock_get_settings_store.assert_called_once()
    mock_settings_store.load.assert_called_once()
@@ -518,7 +520,6 @@ async def test_main_with_task(


@pytest.mark.asyncio
-@patch('openhands.cli.main.parse_arguments')
@patch('openhands.cli.main.setup_config_from_args')
@patch('openhands.cli.main.FileSettingsStore.get_instance')
@patch('openhands.cli.main.check_folder_security_agreement')
@@ -538,7 +539,6 @@ async def test_main_with_session_name_passes_name_to_run_session(
    mock_check_security,
    mock_get_settings_store,
    mock_setup_config,
-    mock_parse_args,
 ):
    """Test main function with a session name passes it to run_session."""
    loop = asyncio.get_running_loop()
@@ -553,7 +553,10 @@ async def test_main_with_session_name_passes_name_to_run_session(
    mock_args.llm_config = None
    mock_args.name = test_session_name  # Set the session name
    mock_args.file = None
-    mock_parse_args.return_value = mock_args
+    mock_args.conversation = None
+    mock_args.log_level = None
+    mock_args.config_file = 'config.toml'
+    mock_args.override_cli_mode = None

    # Mock config
    mock_config = MagicMock()
@@ -587,10 +590,9 @@ async def test_main_with_session_name_passes_name_to_run_session(
    mock_run_session.return_value = False

    # Run the function
-    await cli.main_with_loop(loop)
+    await cli.main_with_loop(loop, mock_args)

    # Assertions
-    mock_parse_args.assert_called_once()
    mock_setup_config.assert_called_once_with(mock_args)
    mock_get_settings_store.assert_called_once()
    mock_settings_store.load.assert_called_once()
@@ -606,6 +608,7 @@ async def test_main_with_session_name_passes_name_to_run_session(
        None,
        session_name=test_session_name,
        skip_banner=False,
+        conversation_id=None,
    )


@@ -709,7 +712,6 @@ async def test_run_session_with_name_attempts_state_restore(


@pytest.mark.asyncio
-@patch('openhands.cli.main.parse_arguments')
@patch('openhands.cli.main.setup_config_from_args')
@patch('openhands.cli.main.FileSettingsStore.get_instance')
@patch('openhands.cli.main.check_folder_security_agreement')
@@ -729,7 +731,6 @@ async def test_main_security_check_fails(
    mock_check_security,
    mock_get_settings_store,
    mock_setup_config,
-    mock_parse_args,
 ):
    """Test main function when security check fails."""
    loop = asyncio.get_running_loop()
@@ -739,7 +740,14 @@ async def test_main_security_check_fails(

    # Mock arguments
    mock_args = MagicMock()
-    mock_parse_args.return_value = mock_args
+    mock_args.agent_cls = None
+    mock_args.llm_config = None
+    mock_args.name = None
+    mock_args.file = None
+    mock_args.conversation = None
+    mock_args.log_level = None
+    mock_args.config_file = 'config.toml'
+    mock_args.override_cli_mode = None

    # Mock config
    mock_config = MagicMock()
@@ -761,10 +769,9 @@ async def test_main_security_check_fails(
    mock_check_security.return_value = False

    # Run the function
-    await cli.main_with_loop(loop)
+    await cli.main_with_loop(loop, mock_args)

    # Assertions
-    mock_parse_args.assert_called_once()
    mock_setup_config.assert_called_once_with(mock_args)
    mock_get_settings_store.assert_called_once()
    mock_settings_store.load.assert_called_once()
@@ -775,7 +782,6 @@ async def test_main_security_check_fails(


@pytest.mark.asyncio
-@patch('openhands.cli.main.parse_arguments')
@patch('openhands.cli.main.setup_config_from_args')
@patch('openhands.cli.main.FileSettingsStore.get_instance')
@patch('openhands.cli.main.check_folder_security_agreement')
@@ -795,7 +801,6 @@ async def test_config_loading_order(
    mock_check_security,
    mock_get_settings_store,
    mock_setup_config,
-    mock_parse_args,
 ):
    """Test the order of configuration loading in the main function.

@@ -816,7 +821,10 @@ async def test_config_loading_order(
    # Add a file property to avoid file I/O errors
    mock_args.file = None
    mock_args.log_level = 'INFO'
-    mock_parse_args.return_value = mock_args
+    mock_args.name = None
+    mock_args.conversation = None
+    mock_args.config_file = 'config.toml'
+    mock_args.override_cli_mode = None

    # Mock read_task to return a dummy task
    mock_read_task.return_value = 'Test task'
@@ -859,10 +867,9 @@ async def test_config_loading_order(
    mock_run_session.return_value = False  # No new session requested

    # Run the function
-    await cli.main_with_loop(loop)
+    await cli.main_with_loop(loop, mock_args)

    # Assertions for argument parsing and config setup
-    mock_parse_args.assert_called_once()
    mock_setup_config.assert_called_once_with(mock_args)
    mock_get_settings_store.assert_called_once()
    mock_settings_store.load.assert_called_once()
@@ -892,7 +899,6 @@ async def test_config_loading_order(


@pytest.mark.asyncio
-@patch('openhands.cli.main.parse_arguments')
@patch('openhands.cli.main.setup_config_from_args')
@patch('openhands.cli.main.FileSettingsStore.get_instance')
@patch('openhands.cli.main.check_folder_security_agreement')
@@ -914,7 +920,6 @@ async def test_main_with_file_option(
    mock_check_security,
    mock_get_settings_store,
    mock_setup_config,
-    mock_parse_args,
 ):
    """Test main function with a file option."""
    loop = asyncio.get_running_loop()
@@ -929,7 +934,10 @@ async def test_main_with_file_option(
    mock_args.name = None
    mock_args.file = '/path/to/test/file.txt'
    mock_args.task = None
-    mock_parse_args.return_value = mock_args
+    mock_args.conversation = None
+    mock_args.log_level = None
+    mock_args.config_file = 'config.toml'
+    mock_args.override_cli_mode = None

    # Mock config
    mock_config = MagicMock()
@@ -965,10 +973,9 @@ async def test_main_with_file_option(
    mock_run_session.return_value = False

    # Run the function
-    await cli.main_with_loop(loop)
+    await cli.main_with_loop(loop, mock_args)

    # Assertions
-    mock_parse_args.assert_called_once()
    mock_setup_config.assert_called_once_with(mock_args)
    mock_get_settings_store.assert_called_once()
    mock_settings_store.load.assert_called_once()
--- a/tests/unit/test_config_precedence.py
+++ b/tests/unit/test_config_precedence.py
@@ -3,6 +3,8 @@ from unittest.mock import MagicMock, patch
 import pytest

 from openhands.core.config import (
+    OH_DEFAULT_AGENT,
+    OH_MAX_ITERATIONS,
    OpenHandsConfig,
    get_llm_config_arg,
    setup_config_from_args,
@@ -308,3 +310,74 @@ def test_cli_settings_json_not_override_config_toml(
    # Verify that settings.json did not override config.toml
    assert test_llm_config.model == 'config-toml-model'
    assert test_llm_config.api_key == 'config-toml-api-key'
+
+
+def test_default_values_applied_when_none():
+    """Test that default values are applied when the CLI args are None."""
+
+    # Create mock args with None values for agent_cls and max_iterations
+    mock_args = MagicMock()
+    mock_args.config_file = None
+    mock_args.llm_config = None
+    mock_args.agent_cls = None
+    mock_args.max_iterations = None
+
+    # Load config
+    with patch(
+        'openhands.core.config.utils.load_openhands_config',
+        return_value=OpenHandsConfig(),
+    ):
+        config = setup_config_from_args(mock_args)
+
+    # Verify they match the expected defaults
+    assert config.default_agent == OH_DEFAULT_AGENT
+    assert config.max_iterations == OH_MAX_ITERATIONS
+
+
+def test_cli_args_override_defaults():
+    """Test that CLI arguments override default values."""
+
+    # Create mock args with custom values
+    mock_args = MagicMock()
+    mock_args.config_file = None
+    mock_args.llm_config = None
+    mock_args.agent_cls = 'CustomAgent'
+    mock_args.max_iterations = 50
+
+    # Load config
+    with patch(
+        'openhands.core.config.utils.load_openhands_config',
+        return_value=OpenHandsConfig(),
+    ):
+        config = setup_config_from_args(mock_args)
+
+    # Verify custom values are used instead of defaults
+    assert config.default_agent == 'CustomAgent'
+    assert config.max_iterations == 50
+
+
+def test_cli_args_none_uses_config_toml_values():
+    """Test that when CLI args agent_cls and max_iterations are None, config.toml values are used."""
+
+    # Create mock args with None values for agent_cls and max_iterations
+    mock_args = MagicMock()
+    mock_args.config_file = None
+    mock_args.llm_config = None
+    mock_args.agent_cls = None
+    mock_args.max_iterations = None
+
+    # Create a config with specific values from config.toml
+    config_from_toml = OpenHandsConfig()
+    config_from_toml.default_agent = 'ConfigTomlAgent'
+    config_from_toml.max_iterations = 100
+
+    # Load config
+    with patch(
+        'openhands.core.config.utils.load_openhands_config',
+        return_value=config_from_toml,
+    ):
+        config = setup_config_from_args(mock_args)
+
+    # Verify config.toml values are preserved when CLI args are None
+    assert config.default_agent == 'ConfigTomlAgent'
+    assert config.max_iterations == 100
--- a/tests/unit/test_get_repository_microagents.py
+++ b/tests/unit/test_get_repository_microagents.py
@@ -13,6 +13,7 @@ from openhands.integrations.service_types import (
    Repository,
 )
 from openhands.microagent.types import MicroagentContentResponse
+from openhands.server.dependencies import check_session_api_key
 from openhands.server.routes.git import app as git_app
 from openhands.server.user_auth import (
    get_access_token,
@@ -49,10 +50,15 @@ def test_client():
    def mock_get_user_id():
        return 'test_user'

+    def mock_check_session_api_key():
+        # Mock session API key check to always pass for tests
+        return None
+
    # Override the dependencies in the app
    app.dependency_overrides[get_provider_tokens] = mock_get_provider_tokens
    app.dependency_overrides[get_access_token] = mock_get_access_token
    app.dependency_overrides[get_user_id] = mock_get_user_id
+    app.dependency_overrides[check_session_api_key] = mock_check_session_api_key

    yield TestClient(app)

--- a/tests/unit/test_middleware.py
+++ b/tests/unit/test_middleware.py
@@ -46,24 +46,32 @@ def test_localhost_cors_middleware_init_without_env_var():


 def test_localhost_cors_middleware_is_allowed_origin_localhost(app):
-    """Test that localhost origins are allowed regardless of port."""
-    app.add_middleware(LocalhostCORSMiddleware)
-    client = TestClient(app)
+    """Test that localhost origins are allowed regardless of port when no specific origins are configured."""
+    # Test without setting PERMITTED_CORS_ORIGINS to trigger localhost behavior
+    with patch.dict(os.environ, {}, clear=True):
+        app.add_middleware(LocalhostCORSMiddleware)
+        client = TestClient(app)

-    # Test with localhost
-    response = client.get('/test', headers={'Origin': 'http://localhost:8000'})
-    assert response.status_code == 200
-    assert response.headers['access-control-allow-origin'] == 'http://localhost:8000'
+        # Test with localhost
+        response = client.get('/test', headers={'Origin': 'http://localhost:8000'})
+        assert response.status_code == 200
+        assert (
+            response.headers['access-control-allow-origin'] == 'http://localhost:8000'
+        )

-    # Test with different port
-    response = client.get('/test', headers={'Origin': 'http://localhost:3000'})
-    assert response.status_code == 200
-    assert response.headers['access-control-allow-origin'] == 'http://localhost:3000'
+        # Test with different port
+        response = client.get('/test', headers={'Origin': 'http://localhost:3000'})
+        assert response.status_code == 200
+        assert (
+            response.headers['access-control-allow-origin'] == 'http://localhost:3000'
+        )

-    # Test with 127.0.0.1
-    response = client.get('/test', headers={'Origin': 'http://127.0.0.1:8000'})
-    assert response.status_code == 200
-    assert response.headers['access-control-allow-origin'] == 'http://127.0.0.1:8000'
+        # Test with 127.0.0.1
+        response = client.get('/test', headers={'Origin': 'http://127.0.0.1:8000'})
+        assert response.status_code == 200
+        assert (
+            response.headers['access-control-allow-origin'] == 'http://127.0.0.1:8000'
+        )


 def test_localhost_cors_middleware_is_allowed_origin_non_localhost(app):
@@ -87,14 +95,15 @@ def test_localhost_cors_middleware_is_allowed_origin_non_localhost(app):

 def test_localhost_cors_middleware_missing_origin(app):
    """Test behavior when Origin header is missing."""
-    app.add_middleware(LocalhostCORSMiddleware)
-    client = TestClient(app)
+    with patch.dict(os.environ, {}, clear=True):
+        app.add_middleware(LocalhostCORSMiddleware)
+        client = TestClient(app)

-    # Test without Origin header
-    response = client.get('/test')
-    assert response.status_code == 200
-    # There should be no access-control-allow-origin header
-    assert 'access-control-allow-origin' not in response.headers
+        # Test without Origin header
+        response = client.get('/test')
+        assert response.status_code == 200
+        # There should be no access-control-allow-origin header
+        assert 'access-control-allow-origin' not in response.headers


 def test_localhost_cors_middleware_inheritance():
--- a/tests/unit/test_windows_prompt_refinement.py
+++ b/tests/unit/test_windows_prompt_refinement.py
@@ -0,0 +1,179 @@
+import sys
+from unittest.mock import patch
+
+import pytest
+
+from openhands.agenthub.codeact_agent.codeact_agent import CodeActAgent
+from openhands.core.config import AgentConfig
+from openhands.llm.llm import LLM
+
+# Skip all tests in this module if not running on Windows
+pytestmark = pytest.mark.skipif(
+    sys.platform != 'win32', reason='Windows prompt refinement tests require Windows'
+)
+
+
+@pytest.fixture
+def mock_llm():
+    """Create a mock LLM for testing."""
+    llm = LLM(config={'model': 'gpt-4', 'api_key': 'test'})
+    return llm
+
+
+@pytest.fixture
+def agent_config():
+    """Create a basic agent config for testing."""
+    return AgentConfig()
+
+
+def test_codeact_agent_system_prompt_no_bash_on_windows(mock_llm, agent_config):
+    """Test that CodeActAgent's system prompt doesn't contain 'bash' on Windows."""
+    # Create a CodeActAgent instance
+    agent = CodeActAgent(llm=mock_llm, config=agent_config)
+
+    # Get the system prompt
+    system_prompt = agent.prompt_manager.get_system_message()
+
+    # Assert that 'bash' doesn't exist in the system prompt (case-insensitive)
+    assert 'bash' not in system_prompt.lower(), (
+        f"System prompt contains 'bash' on Windows platform. "
+        f"It should be replaced with 'powershell'. "
+        f'System prompt: {system_prompt}'
+    )
+
+    # Verify that 'powershell' exists instead (case-insensitive)
+    assert 'powershell' in system_prompt.lower(), (
+        f"System prompt should contain 'powershell' on Windows platform. "
+        f'System prompt: {system_prompt}'
+    )
+
+
+def test_codeact_agent_tool_descriptions_no_bash_on_windows(mock_llm, agent_config):
+    """Test that CodeActAgent's tool descriptions don't contain 'bash' on Windows."""
+    # Create a CodeActAgent instance
+    agent = CodeActAgent(llm=mock_llm, config=agent_config)
+
+    # Get the tools
+    tools = agent.tools
+
+    # Check each tool's description and parameters
+    for tool in tools:
+        if tool['type'] == 'function':
+            function_info = tool['function']
+
+            # Check function description
+            description = function_info.get('description', '')
+            assert 'bash' not in description.lower(), (
+                f"Tool '{function_info['name']}' description contains 'bash' on Windows. "
+                f'Description: {description}'
+            )
+
+            # Check parameter descriptions
+            parameters = function_info.get('parameters', {})
+            properties = parameters.get('properties', {})
+
+            for param_name, param_info in properties.items():
+                param_description = param_info.get('description', '')
+                assert 'bash' not in param_description.lower(), (
+                    f"Tool '{function_info['name']}' parameter '{param_name}' "
+                    f"description contains 'bash' on Windows. "
+                    f'Parameter description: {param_description}'
+                )
+
+
+def test_in_context_learning_example_no_bash_on_windows():
+    """Test that in-context learning examples don't contain 'bash' on Windows."""
+    from openhands.agenthub.codeact_agent.tools.bash import create_cmd_run_tool
+    from openhands.agenthub.codeact_agent.tools.finish import FinishTool
+    from openhands.agenthub.codeact_agent.tools.str_replace_editor import (
+        create_str_replace_editor_tool,
+    )
+    from openhands.llm.fn_call_converter import get_example_for_tools
+
+    # Create a sample set of tools
+    tools = [
+        create_cmd_run_tool(),
+        create_str_replace_editor_tool(),
+        FinishTool,
+    ]
+
+    # Get the in-context learning example
+    example = get_example_for_tools(tools)
+
+    # Assert that 'bash' doesn't exist in the example (case-insensitive)
+    assert 'bash' not in example.lower(), (
+        f"In-context learning example contains 'bash' on Windows platform. "
+        f"It should be replaced with 'powershell'. "
+        f'Example: {example}'
+    )
+
+    # Verify that 'powershell' exists instead (case-insensitive)
+    if example:  # Only check if example is not empty
+        assert 'powershell' in example.lower(), (
+            f"In-context learning example should contain 'powershell' on Windows platform. "
+            f'Example: {example}'
+        )
+
+
+def test_refine_prompt_function_works():
+    """Test that the refine_prompt function correctly replaces 'bash' with 'powershell'."""
+    from openhands.agenthub.codeact_agent.tools.bash import refine_prompt
+
+    # Test basic replacement
+    test_prompt = 'Execute a bash command to list files'
+    refined_prompt = refine_prompt(test_prompt)
+
+    assert 'bash' not in refined_prompt.lower()
+    assert 'powershell' in refined_prompt.lower()
+    assert refined_prompt == 'Execute a powershell command to list files'
+
+    # Test multiple occurrences
+    test_prompt = 'Use bash to run bash commands in the bash shell'
+    refined_prompt = refine_prompt(test_prompt)
+
+    assert 'bash' not in refined_prompt.lower()
+    assert (
+        refined_prompt
+        == 'Use powershell to run powershell commands in the powershell shell'
+    )
+
+    # Test case sensitivity
+    test_prompt = 'BASH and Bash and bash should all be replaced'
+    refined_prompt = refine_prompt(test_prompt)
+
+    assert 'bash' not in refined_prompt.lower()
+    assert (
+        refined_prompt
+        == 'powershell and powershell and powershell should all be replaced'
+    )
+
+    # Test execute_bash tool name replacement
+    test_prompt = 'Use the execute_bash tool to run commands'
+    refined_prompt = refine_prompt(test_prompt)
+
+    assert 'execute_bash' not in refined_prompt.lower()
+    assert 'execute_powershell' in refined_prompt.lower()
+    assert refined_prompt == 'Use the execute_powershell tool to run commands'
+
+    # Test that only the standalone word 'bash' is replaced; longer words that merely start with it (e.g. 'bashful') are preserved
+    test_prompt = 'The bashful person likes bash-like syntax'
+    refined_prompt = refine_prompt(test_prompt)
+
+    # 'bashful' should be preserved, 'bash-like' should become 'powershell-like'
+    assert 'bashful' in refined_prompt
+    assert 'powershell-like' in refined_prompt
+    assert refined_prompt == 'The bashful person likes powershell-like syntax'
+
+
+def test_refine_prompt_function_on_non_windows():
+    """Test that the refine_prompt function doesn't change anything on non-Windows platforms."""
+    from openhands.agenthub.codeact_agent.tools.bash import refine_prompt
+
+    # Mock sys.platform to simulate non-Windows
+    with patch('openhands.agenthub.codeact_agent.tools.bash.sys.platform', 'linux'):
+        test_prompt = 'Execute a bash command to list files'
+        refined_prompt = refine_prompt(test_prompt)
+
+        # On non-Windows, the prompt should remain unchanged
+        assert refined_prompt == test_prompt
+        assert 'bash' in refined_prompt.lower()
Author	SHA1	Message	Date
openhands	f10360f416	test: fix unit tests - Add missing dependency 'markdown' for CLI TUI rendering - Prevent env var WORKSPACE_MOUNT_PATH_IN_SANDBOX from overriding default when SANDBOX_VOLUMES lacks /workspace Co-authored-by: openhands <openhands@all-hands.dev>	2025-08-28 18:41:13 +00:00
openhands	bf13354bbd	Make setup process more friendly and welcoming - Add emojis and encouraging language to setup messages - Replace technical jargon with conversational tone - Add visual indicators for different setup steps - Include completion messages that celebrate user progress - Update setup script, Makefile, and VS Code build script	2025-08-11 18:15:52 +00:00
Robert Brennan	385acded2c	Update SECURITY.md	2025-08-11 09:01:17 -04:00
Robert Brennan	ab079488c6	Create SECURITY.md	2025-08-09 14:37:18 -04:00
Boxuan Li	803bdced9c	Fix Windows prompt refinement: ensure 'bash' is replaced with 'powershell' in all prompts (#10179) Co-authored-by: openhands <openhands@all-hands.dev>	2025-08-08 20:28:36 -07:00
Xingyao Wang	3eecac2003	docs: Add GPT-5 model recommendation and fix pricing display issue (#10177) Co-authored-by: openhands <openhands@all-hands.dev>	2025-08-08 19:19:59 +00:00
mamoodi	c02e09fc2d	Hide Git Settings section from Application settings (#10176) Co-authored-by: openhands <openhands@all-hands.dev>	2025-08-08 19:06:40 +00:00
Tim O'Farrell	18f8661770	feat: add mcp_shttp_servers override to conversation initialization (#10171) Co-authored-by: openhands <openhands@all-hands.dev>	2025-08-08 18:05:44 +00:00
Xingyao Wang	04ff4a025b	feat(cli): Use CLI to launch OpenHands UI server via Docker (#9783) Co-authored-by: openhands <openhands@all-hands.dev>	2025-08-09 02:04:07 +08:00
mamoodi	81ef363658	Increase stale bot inactivity time and better messaging (#10167) Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>	2025-08-08 16:41:15 +00:00
Xingyao Wang	1474c5bc1c	Support gpt-5-2025-08-07 and add it to OpenHands provider (#10172) Co-authored-by: openhands <openhands@all-hands.dev>	2025-08-08 16:05:51 +00:00
sp.wack	9b0a5da839	Use EventStore directly in remember prompt; merge client services (#10143) Co-authored-by: openhands <openhands@all-hands.dev>	2025-08-08 18:03:03 +04:00
Graham Neubig	7ab2ad2c1b	Fix authentication setup issues in unit tests (#10118) Co-authored-by: openhands <openhands@all-hands.dev>	2025-08-07 22:12:21 -04:00
Graham Neubig	8416a019cb	Fix unit test failures by prioritizing current directory in PYTHONPATH (#10105) Co-authored-by: openhands <openhands@all-hands.dev>	2025-08-07 22:12:02 -04:00
Engel Nyst	73a7c7786d	Load previous conversation by id (CLI) (#10156)	2025-08-07 23:09:20 +02:00
aeft	11d12c5a01	fix: prevent CLI argument parser defaults from overriding config file values (#10140)	2025-08-08 04:48:04 +08:00
Xingyao Wang	c4f303a07b	chore(eval): Remove eval_infer_remote.sh script and related references (#10157) Co-authored-by: openhands <openhands@all-hands.dev>	2025-08-07 20:46:59 +00:00
Kenny Dizi	3a629cdf08	Add support model `claude-opus-4-1-20250805` (#10120)	2025-08-07 18:48:34 +00:00
sp.wack	6ea33b657d	chore(frontend): Remove some dead code (#10121)	2025-08-08 02:40:35 +08:00