AutoGPT/autogpt_platform/backend/backend/copilot/sdk/service.py
Zamil Majdy f115607779 fix(copilot): recognize Agent tool name and route CLI state into workspace (#12635)
### Why / What / How

**Why:** The Claude Agent SDK CLI renamed the sub-agent tool from
`"Task"` to `"Agent"` in v2.x. Our security hooks only checked for
`"Task"`, so all sub-agent security controls were silently bypassed in
production: concurrency limiting didn't apply, and slot tracking was
broken. This was discovered via Langfuse trace analysis of session
`62b1b2b9` where background sub-agents ran unchecked.

Additionally, the CLI writes sub-agent output to `/tmp/claude-<uid>/`
and project state to `$HOME/.claude/` — both outside the per-session
workspace (`/tmp/copilot-<session>/`). This caused `PermissionError` in
E2B sandboxes and silently lost sub-agent results.
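The env-var routing can be sketched as follows (a minimal sketch assuming a hypothetical `build_cli_env` helper; the actual change sets these variables directly on the SDK subprocess environment in `service.py`):

```python
import os
import tempfile


def build_cli_env(sdk_cwd: str) -> dict[str, str]:
    """Hypothetical helper: build a subprocess env that confines CLI state.

    CLAUDE_CODE_TMPDIR redirects sub-agent scratch output (normally under
    /tmp/claude-<uid>/) and HOME redirects project state (normally under
    $HOME/.claude/) into the per-session workspace directory.
    """
    env = dict(os.environ)
    env["CLAUDE_CODE_TMPDIR"] = sdk_cwd
    env["HOME"] = sdk_cwd
    return env


# Any subprocess launched with this env writes its state inside the session
# workspace, so sandboxed runs see no out-of-workspace writes.
workspace = tempfile.mkdtemp(prefix="copilot-demo-")
env = build_cli_env(workspace)
```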

The frontend also had no rendering for the `Agent` / `TaskOutput` SDK
built-in tools — they fell through to the generic "other" category with
no context-aware display.

**What:**
1. Fix the sub-agent tool name recognition (`"Task"` → `{"Task",
"Agent"}`)
2. Allow `run_in_background` — the SDK handles async lifecycle cleanly
(returns `isAsync:true`, model polls via `TaskOutput`)
3. Route CLI state into the workspace via `CLAUDE_CODE_TMPDIR` and
`HOME` env vars
4. Add lifecycle hooks (`SubagentStart`/`SubagentStop`) for
observability
5. Add frontend `"agent"` tool category with proper UI rendering

**How:**
- Security hooks check `tool_name in _SUBAGENT_TOOLS` (frozenset of
`"Task"` and `"Agent"`)
- Background agents are allowed but still count against `max_subtasks`
concurrency limit
- Frontend detects `isAsync: true` output → shows "Agent started
(background)" not "Agent completed"
- `TaskOutput` tool shows retrieval status and collected results
- Robot icon and agent-specific accordion rendering for both foreground
and background agents
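The tool-name fix and concurrency rule above can be sketched as follows (a simplified stand-in for the real hooks in `security_hooks.py`; the function name and return shape are illustrative, not the SDK's hook API):

```python
# Recognize both the pre-2.x ("Task") and 2.x ("Agent") sub-agent tool names.
_SUBAGENT_TOOLS = frozenset({"Task", "Agent"})


def check_subagent_spawn(
    tool_name: str, active_subagents: int, max_subtasks: int
) -> dict:
    """Allow or deny a sub-agent spawn; background agents count too."""
    if tool_name not in _SUBAGENT_TOOLS:
        # Not a sub-agent tool — no concurrency accounting needed.
        return {"allow": True}
    if active_subagents >= max_subtasks:
        return {"allow": False, "reason": "max_subtasks limit reached"}
    return {"allow": True}
```

Because background (`run_in_background`) spawns go through the same check, they still consume a concurrency slot even though the SDK manages their async lifecycle.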

### Changes 🏗️

**Backend:**
- **`security_hooks.py`**: Replace `tool_name == "Task"` with `tool_name
in _SUBAGENT_TOOLS`. Remove `run_in_background` deny block (SDK handles
async lifecycle). Add `SubagentStart`/`SubagentStop` hooks.
- **`tool_adapter.py`**: Add `"Agent"` to `_SDK_BUILTIN_ALWAYS` list
alongside `"Task"`.
- **`service.py`**: Set `CLAUDE_CODE_TMPDIR=sdk_cwd` and `HOME=sdk_cwd`
in SDK subprocess env.
- **`security_hooks_test.py`**: Update background tests (allowed, not
blocked). Add test for background agents counting against concurrency
limit.

**Frontend:**
- **`GenericTool/helpers.ts`**: Add `"agent"` tool category for `Agent`,
`Task`, `TaskOutput`. Agent-specific animation text detecting `isAsync`
output. Input summaries from description/prompt fields.
- **`GenericTool/GenericTool.tsx`**: Add `RobotIcon` for agent category.
Add `getAgentAccordionData()` with async-aware title/content.
`TaskOutput` shows retrieval status.
- **`useChatSession.ts`**: Fix pre-existing TS error (void mutation
body).

### Checklist 📋

#### For code changes:
- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan:
  - [x] All security hooks tests pass (background allowed + limit enforced)
  - [x] Pre-commit hooks (ruff, black, isort, pyright, tsc) all pass
  - [x] E2E test: copilot agent create+run scenario PASS
- [ ] Deploy to dev and test copilot sub-agent spawning with background
mode

#### For configuration changes:
- [x] `.env.default` is updated or already compatible
- [x] `docker-compose.yml` is updated or already compatible


"""Claude Agent SDK service layer for CoPilot chat completions."""

import asyncio
import base64
import json
import logging
import os
import re
import shutil
import sys
import time
import uuid
from collections.abc import AsyncGenerator, AsyncIterator
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, NamedTuple, cast

if TYPE_CHECKING:
    from backend.copilot.permissions import CopilotPermissions

from claude_agent_sdk import (
    AssistantMessage,
    ClaudeAgentOptions,
    ClaudeSDKClient,
    ResultMessage,
    TextBlock,
    ThinkingBlock,
    ToolResultBlock,
    ToolUseBlock,
)
from langfuse import propagate_attributes
from langsmith.integrations.claude_agent_sdk import configure_claude_agent_sdk
from pydantic import BaseModel

from backend.copilot.context import get_workspace_manager
from backend.copilot.permissions import apply_tool_permissions
from backend.data.redis_client import get_redis_async
from backend.executor.cluster_lock import AsyncClusterLock
from backend.util.exceptions import NotFoundError
from backend.util.settings import Settings

from ..config import ChatConfig
from ..constants import (
    COPILOT_ERROR_PREFIX,
    COPILOT_RETRYABLE_ERROR_PREFIX,
    COPILOT_SYSTEM_PREFIX,
    FRIENDLY_TRANSIENT_MSG,
    is_transient_api_error,
)
from ..context import encode_cwd_for_cli
from ..model import (
    ChatMessage,
    ChatSession,
    get_chat_session,
    update_session_title,
    upsert_chat_session,
)
from ..prompting import get_sdk_supplement
from ..response_model import (
    StreamBaseResponse,
    StreamError,
    StreamFinish,
    StreamFinishStep,
    StreamHeartbeat,
    StreamStart,
    StreamStartStep,
    StreamStatus,
    StreamTextDelta,
    StreamToolInputAvailable,
    StreamToolInputStart,
    StreamToolOutputAvailable,
    StreamUsage,
)
from ..service import (
    _build_system_prompt,
    _generate_session_title,
    _is_langfuse_configured,
)
from ..token_tracking import persist_and_record_usage
from ..tools.e2b_sandbox import get_or_create_sandbox, pause_sandbox_direct
from ..tools.sandbox import WORKSPACE_PREFIX, make_session_path
from ..tracking import track_user_message
from .compaction import CompactionTracker, filter_compaction_messages
from .env import build_sdk_env  # noqa: F401 — re-export for backward compat
from .response_adapter import SDKResponseAdapter
from .security_hooks import create_security_hooks
from .tool_adapter import (
    create_copilot_mcp_server,
    get_copilot_tool_names,
    get_sdk_disallowed_tools,
    reset_stash_event,
    reset_tool_failure_counters,
    set_execution_context,
    wait_for_stash,
)
from .transcript import (
    _run_compression,
    cleanup_stale_project_dirs,
    compact_transcript,
    download_transcript,
    read_compacted_entries,
    upload_transcript,
    validate_transcript,
    write_transcript_to_tempfile,
)
from .transcript_builder import TranscriptBuilder

logger = logging.getLogger(__name__)
config = ChatConfig()

# On context-size errors the SDK query is retried with progressively
# less context: (1) original transcript → (2) compacted transcript →
# (3) no transcript (DB messages only).
# Non-context errors (network, auth, rate-limit) are NOT retried.
_MAX_STREAM_ATTEMPTS = 3

# Hard circuit breaker: abort the stream if the model sends this many
# consecutive tool calls with empty parameters (a sign of context
# saturation or serialization failure). The MCP wrapper now returns
# guidance on the first empty call, giving the model a chance to
# self-correct. The limit is generous to allow recovery attempts.
_EMPTY_TOOL_CALL_LIMIT = 5

# User-facing error shown when the empty-tool-call circuit breaker trips.
_CIRCUIT_BREAKER_ERROR_MSG = (
    "AutoPilot was unable to complete the tool call "
    "— this usually happens when the response is "
    "too large to fit in a single tool call. "
    "Try breaking your request into smaller parts."
)

# Patterns that indicate the prompt/request exceeds the model's context limit.
# Matched case-insensitively against the full exception chain.
_PROMPT_TOO_LONG_PATTERNS: tuple[str, ...] = (
    "prompt is too long",
    "request too large",
    "maximum context length",
    "context_length_exceeded",
    "input tokens exceed",
    "input is too long",
    "content length exceeds",
)

# Map raw SDK error patterns to user-friendly messages.
# Matched case-insensitively; first match wins.
_FRIENDLY_ERROR_MAP: tuple[tuple[str, str], ...] = (
    ("authentication", "Authentication failed. Please check your API key."),
    ("invalid api key", "Authentication failed. Please check your API key."),
    ("unauthorized", "Authentication failed. Please check your API key."),
    ("rate limit", "Rate limit exceeded. Please wait a moment and try again."),
    ("overloaded", "The AI service is currently overloaded. Please try again shortly."),
    ("server error", "The AI service encountered an internal error. Please retry."),
    ("timeout", "The request timed out. Please try again."),
    ("connection", "Connection error. Please check your network and retry."),
)


def _friendly_error_text(raw: str) -> str:
    """Map a raw SDK error string to a user-friendly message.

    Returns the mapped message if a known pattern is found, otherwise
    returns a generic sanitized version of the raw error.
    """
    lower = raw.lower()
    for pattern, friendly in _FRIENDLY_ERROR_MAP:
        if pattern in lower:
            return friendly
    # Fallback: sanitize but keep the original text for debugging
    return f"SDK stream error: {raw}"


def _is_prompt_too_long(err: BaseException) -> bool:
    """Return True if *err* indicates the prompt exceeds the model's limit.

    Walks the exception chain (`__cause__` / `__context__`) so that
    wrapped errors are detected too.
    """
    seen: set[int] = set()
    current: BaseException | None = err
    while current is not None and id(current) not in seen:
        seen.add(id(current))
        msg = str(current).lower()
        if any(p in msg for p in _PROMPT_TOO_LONG_PATTERNS):
            return True
        current = current.__cause__ or current.__context__
    return False


def _is_sdk_disconnect_error(exc: BaseException) -> bool:
    """Return True if *exc* is an expected SDK cleanup error from client disconnect.

    Two known patterns occur when ``GeneratorExit`` tears down the async
    generator and the SDK's ``__aexit__`` runs in a different context/task:

    * ``RuntimeError``: cancel scope exited in wrong task (anyio)
    * ``ValueError``: ContextVar token created in a different Context (OTEL)

    These are suppressed to avoid polluting Sentry with non-actionable noise.
    """
    if isinstance(exc, RuntimeError) and "cancel scope" in str(exc):
        return True
    if isinstance(exc, ValueError) and "was created in a different Context" in str(exc):
        return True
    return False


def _is_tool_only_message(sdk_msg: object) -> bool:
    """Return True if *sdk_msg* is an AssistantMessage containing only ToolUseBlocks.

    Such a message represents a parallel tool-call batch (no text output yet).
    The ``bool(…content)`` guard prevents vacuous-truth evaluation on an empty list.
    """
    return (
        isinstance(sdk_msg, AssistantMessage)
        and bool(sdk_msg.content)
        and all(isinstance(b, ToolUseBlock) for b in sdk_msg.content)
    )


class ReducedContext(NamedTuple):
    builder: TranscriptBuilder
    use_resume: bool
    resume_file: str | None
    transcript_lost: bool
    tried_compaction: bool


@dataclass
class _TokenUsage:
    """Token usage accumulators for a single turn.

    Separated from `_RetryState` because usage is reset between retry
    attempts independently of the retry-control fields, and is read by
    the outer `stream_chat_completion_sdk` scope after the retry loop
    completes.
    """

    prompt_tokens: int = 0
    completion_tokens: int = 0
    cache_read_tokens: int = 0
    cache_creation_tokens: int = 0
    cost_usd: float | None = None

    def reset(self) -> None:
        """Reset all accumulators for a new attempt."""
        self.prompt_tokens = 0
        self.completion_tokens = 0
        self.cache_read_tokens = 0
        self.cache_creation_tokens = 0
        self.cost_usd = None


@dataclass
class _RetryState:
    """Mutable state passed to `_run_stream_attempt` instead of closures.

    Holds values that the retry loop modifies between attempts so the inner
    generator does not rely on reassigning closure variables.
    """

    options: ClaudeAgentOptions
    query_message: str
    was_compacted: bool
    use_resume: bool
    resume_file: str | None
    transcript_msg_count: int
    adapter: SDKResponseAdapter
    transcript_builder: TranscriptBuilder
    usage: _TokenUsage


@dataclass
class _StreamContext:
    """Per-request variables shared across all retry attempts.

    Extracted so that `_run_stream_attempt` can be a module-level function
    rather than a closure, making it independently testable and reducing the
    cognitive load of the 970-line `stream_chat_completion_sdk` function.

    Scalar fields (IDs, paths, the message string) are set once before the
    retry loop and never reassigned. `session`, `compaction`, and
    `lock` are **shared mutable references** whose internal state is
    modified by both the retry loop and `_run_stream_attempt` (e.g.
    `session.messages` is rolled back on retry, `compaction` tracks
    mid-stream compaction events). Their *references* are constant even
    though the objects they point to are mutated.
    """

    session: ChatSession
    session_id: str
    log_prefix: str
    sdk_cwd: str
    current_message: str
    file_ids: list[str] | None
    message_id: str
    attachments: "PreparedAttachments"
    compaction: CompactionTracker
    lock: AsyncClusterLock


async def _reduce_context(
    transcript_content: str,
    tried_compaction: bool,
    session_id: str,
    sdk_cwd: str,
    log_prefix: str,
) -> ReducedContext:
    """Prepare reduced context for a retry attempt.

    On the first retry, compacts the transcript via LLM summarization.
    On subsequent retries (or if compaction fails), drops the transcript
    entirely so the query is rebuilt from DB messages only.

    `transcript_lost` is True when the transcript was dropped (caller
    should set `skip_transcript_upload`).
    """
    # First retry: try compacting
    if transcript_content and not tried_compaction:
        compacted = await compact_transcript(
            transcript_content, model=config.model, log_prefix=log_prefix
        )
        if (
            compacted
            and compacted != transcript_content
            and validate_transcript(compacted)
        ):
            logger.info("%s Using compacted transcript for retry", log_prefix)
            tb = TranscriptBuilder()
            tb.load_previous(compacted, log_prefix=log_prefix)
            resume_file = await asyncio.to_thread(
                write_transcript_to_tempfile, compacted, session_id, sdk_cwd
            )
            if resume_file:
                return ReducedContext(tb, True, resume_file, False, True)
            logger.warning("%s Failed to write compacted transcript", log_prefix)
        logger.warning("%s Compaction failed, dropping transcript", log_prefix)
    # Subsequent retry or compaction failed: drop transcript entirely
    logger.warning("%s Dropping transcript, rebuilding from DB messages", log_prefix)
    return ReducedContext(TranscriptBuilder(), False, None, True, True)


def _append_error_marker(
    session: ChatSession | None,
    display_msg: str,
    *,
    retryable: bool = False,
) -> None:
    """Append a copilot error marker to *session* so it persists across refresh.

    Args:
        session: The chat session to append to (no-op if `None`).
        display_msg: User-visible error text.
        retryable: If `True`, use the retryable prefix so the frontend
            shows a "Try Again" button.
    """
    if session is None:
        return
    prefix = COPILOT_RETRYABLE_ERROR_PREFIX if retryable else COPILOT_ERROR_PREFIX
    session.messages.append(
        ChatMessage(role="assistant", content=f"{prefix} {display_msg}")
    )


def _setup_langfuse_otel() -> None:
    """Configure OTEL tracing for the Claude Agent SDK → Langfuse.

    This uses LangSmith's built-in Claude Agent SDK integration to monkey-patch
    `ClaudeSDKClient`, capturing every tool call and model turn as OTEL spans.
    Spans are exported via OTLP to Langfuse (or any OTEL-compatible backend).

    To route traces elsewhere, override `OTEL_EXPORTER_OTLP_ENDPOINT` and
    `OTEL_EXPORTER_OTLP_HEADERS` environment variables — no code changes needed.
    """
    if not _is_langfuse_configured():
        return
    try:
        settings = Settings()
        pk = settings.secrets.langfuse_public_key
        sk = settings.secrets.langfuse_secret_key
        host = settings.secrets.langfuse_host
        # OTEL exporter config — these are only set if not already present,
        # so explicit env-var overrides always win.
        creds = base64.b64encode(f"{pk}:{sk}".encode()).decode()
        os.environ.setdefault("LANGSMITH_OTEL_ENABLED", "true")
        os.environ.setdefault("LANGSMITH_OTEL_ONLY", "true")
        os.environ.setdefault("LANGSMITH_TRACING", "true")
        os.environ.setdefault("OTEL_EXPORTER_OTLP_ENDPOINT", f"{host}/api/public/otel")
        os.environ.setdefault(
            "OTEL_EXPORTER_OTLP_HEADERS", f"Authorization=Basic {creds}"
        )
        # Set the Langfuse environment via OTEL resource attributes so the
        # Langfuse server maps it to the first-class environment field.
        tracing_env = settings.secrets.langfuse_tracing_environment
        os.environ.setdefault(
            "OTEL_RESOURCE_ATTRIBUTES",
            f"langfuse.environment={tracing_env}",
        )
        configure_claude_agent_sdk(tags=["sdk"])
        logger.info(
            "OTEL tracing configured for Claude Agent SDK → %s [%s]", host, tracing_env
        )
    except Exception:
        logger.warning("OTEL setup skipped — failed to configure", exc_info=True)


_setup_langfuse_otel()

# Set to hold background tasks to prevent garbage collection
_background_tasks: set[asyncio.Task[Any]] = set()

_SDK_CWD_PREFIX = WORKSPACE_PREFIX

_last_sweep_time: float = 0.0
_SWEEP_INTERVAL_SECONDS = 300  # 5 minutes

# Heartbeat interval — keep SSE alive through proxies/LBs during tool execution.
# IMPORTANT: Must be less than frontend timeout (12s in useCopilotPage.ts)
_HEARTBEAT_INTERVAL = 10.0  # seconds

STREAM_LOCK_PREFIX = "copilot:stream:lock:"


async def _safe_close_sdk_client(
    sdk_client: ClaudeSDKClient,
    log_prefix: str,
) -> None:
    """Close a ClaudeSDKClient, suppressing errors from client disconnect.

    When the SSE client disconnects mid-stream, ``GeneratorExit`` propagates
    through the async generator stack and causes ``ClaudeSDKClient.__aexit__``
    to run in a different async context or task than where the client was
    opened. This triggers two known error classes:

    * ``ValueError``: ``<Token var=<ContextVar name='current_context'>>
      was created in a different Context`` — OpenTelemetry's
      ``context.detach()`` fails because the OTEL context token was
      created in the original generator coroutine but detach runs in
      the GC / cleanup coroutine (Sentry: AUTOGPT-SERVER-8BT).
    * ``RuntimeError``: ``Attempted to exit cancel scope in a different
      task than it was entered in`` — anyio's ``TaskGroup.__aexit__``
      detects that the cancel scope was entered in one task but is
      being exited in another (Sentry: AUTOGPT-SERVER-8BW).

    Both are harmless — the TCP connection is already dead and no
    resources leak. Logging them at ``debug`` level keeps observability
    without polluting Sentry.
    """
    try:
        await sdk_client.__aexit__(None, None, None)
    except (ValueError, RuntimeError) as exc:
        if _is_sdk_disconnect_error(exc):
            # Expected during client disconnect — suppress to avoid Sentry noise.
            logger.debug(
                "%s SDK client cleanup error suppressed (client disconnect): %s: %s",
                log_prefix,
                type(exc).__name__,
                exc,
            )
        else:
            raise
    except GeneratorExit:
        # GeneratorExit can propagate through __aexit__ — suppress it here
        # since the generator is already being torn down.
        logger.debug(
            "%s SDK client cleanup GeneratorExit suppressed (client disconnect)",
            log_prefix,
        )
    except Exception:
        # Unexpected cleanup error — log at error level so Sentry captures it
        # (via its logging integration), but don't propagate since we're in
        # teardown and the caller cannot meaningfully handle this.
        logger.error(
            "%s Unexpected SDK client cleanup error",
            log_prefix,
            exc_info=True,
        )


async def _iter_sdk_messages(
    client: ClaudeSDKClient,
) -> AsyncGenerator[Any, None]:
    """Yield SDK messages with heartbeat-based timeouts.

    Uses an explicit async iterator with non-cancelling heartbeats.

    CRITICAL: we must NOT cancel `__anext__()` mid-flight — doing so
    (via `asyncio.timeout` or `wait_for`) corrupts the SDK's internal
    anyio memory stream, causing `StopAsyncIteration` on the next call
    and silently dropping all in-flight tool results. Instead, wrap
    `__anext__()` in a `Task` and use `asyncio.wait()` with a
    timeout. On timeout we yield a heartbeat sentinel but keep the Task
    alive so it can deliver the next message.

    Yields `None` on heartbeat timeout (caller should refresh locks and
    emit heartbeat events). Yields the raw SDK message otherwise.

    On stream end (`StopAsyncIteration`), the generator returns normally.
    Any other exception from the SDK propagates to the caller.
    """
    msg_iter = client.receive_response().__aiter__()
    pending_task: asyncio.Task[Any] | None = None

    async def _next_msg() -> Any:
        """Await the next SDK message, wrapped for use with `asyncio.Task`."""
        return await msg_iter.__anext__()

    try:
        while True:
            if pending_task is None:
                pending_task = asyncio.create_task(_next_msg())
            done, _ = await asyncio.wait({pending_task}, timeout=_HEARTBEAT_INTERVAL)
            if not done:
                yield None  # heartbeat sentinel
                continue
            pending_task = None
            try:
                yield done.pop().result()
            except StopAsyncIteration:
                return
    finally:
        if pending_task is not None and not pending_task.done():
            pending_task.cancel()
            try:
                await pending_task
            except (asyncio.CancelledError, StopAsyncIteration):
                pass


def _resolve_sdk_model() -> str | None:
    """Resolve the model name for the Claude Agent SDK CLI.

    Uses `config.claude_agent_model` if set, otherwise derives from
    `config.model` by stripping the OpenRouter provider prefix (e.g.,
    `"anthropic/claude-opus-4.6"` → `"claude-opus-4-6"`).

    OpenRouter uses dot-separated versions (`claude-opus-4.6`) while the
    direct Anthropic API uses hyphen-separated versions (`claude-opus-4-6`).
    Normalisation is only applied when the SDK will actually talk to
    Anthropic directly (not through OpenRouter).

    When `use_claude_code_subscription` is enabled and no explicit
    `claude_agent_model` is set, returns `None` so the CLI uses the
    default model for the user's subscription plan.
    """
    if config.claude_agent_model:
        return config.claude_agent_model
    if config.use_claude_code_subscription:
        return None
    model = config.model
    if "/" in model:
        model = model.split("/", 1)[1]
    # OpenRouter uses dots in versions (claude-opus-4.6) but the direct
    # Anthropic API requires hyphens (claude-opus-4-6). Only normalise
    # when NOT routing through OpenRouter.
    if not config.openrouter_active:
        model = model.replace(".", "-")
    return model


def _make_sdk_cwd(session_id: str) -> str:
    """Create a safe, session-specific working directory path.

    Delegates to :func:`~backend.copilot.tools.sandbox.make_session_path`
    (single source of truth for path sanitization) and adds a defence-in-depth
    assertion.
    """
    cwd = make_session_path(session_id)
    # Defence-in-depth: normpath + startswith is a CodeQL-recognised sanitizer
    cwd = os.path.normpath(cwd)
    if not cwd.startswith(_SDK_CWD_PREFIX):
        raise ValueError(f"SDK cwd escaped prefix: {cwd}")
    return cwd


async def _cleanup_sdk_tool_results(cwd: str) -> None:
    """Remove SDK session artifacts for a specific working directory.

    Cleans up the ephemeral working directory `/tmp/copilot-<session>/`.
    Also sweeps stale CLI project directories (older than 12 h) to prevent
    unbounded disk growth. The sweep is best-effort, rate-limited to once
    every 5 minutes, and capped at 50 directories per sweep.

    Security: *cwd* MUST be created by `_make_sdk_cwd()` which sanitizes
    the session_id.
    """
    normalized = os.path.normpath(cwd)
    if not normalized.startswith(_SDK_CWD_PREFIX):
        logger.warning("[SDK] Rejecting cleanup for path outside workspace: %s", cwd)
        return
    await asyncio.to_thread(shutil.rmtree, normalized, True)
    # Best-effort sweep of old project dirs to prevent disk leak.
    # Pass the encoded cwd so only this session's project directory is swept,
    # which is safe in multi-tenant environments.
    global _last_sweep_time
    now = time.time()
    if now - _last_sweep_time >= _SWEEP_INTERVAL_SECONDS:
        _last_sweep_time = now
        encoded = encode_cwd_for_cli(normalized)
        await asyncio.to_thread(cleanup_stale_project_dirs, encoded)


def _format_sdk_content_blocks(blocks: list) -> list[dict[str, Any]]:
    """Convert SDK content blocks to transcript format.

    Handles TextBlock, ToolUseBlock, ToolResultBlock, and ThinkingBlock.
    Raw dicts (e.g. ``redacted_thinking`` blocks that the SDK may not have
    a typed class for) are passed through verbatim to preserve them in the
    transcript. Unknown typed block objects are logged and skipped.
    """
    result: list[dict[str, Any]] = []
    for block in blocks or []:
        if isinstance(block, TextBlock):
            result.append({"type": "text", "text": block.text})
        elif isinstance(block, ToolUseBlock):
            result.append(
                {
                    "type": "tool_use",
                    "id": block.id,
                    "name": block.name,
                    "input": block.input,
                }
            )
        elif isinstance(block, ToolResultBlock):
            tool_result_entry: dict[str, Any] = {
                "type": "tool_result",
                "tool_use_id": block.tool_use_id,
                "content": block.content,
            }
            if block.is_error:
                tool_result_entry["is_error"] = True
            result.append(tool_result_entry)
        elif isinstance(block, ThinkingBlock):
            result.append(
                {
                    "type": "thinking",
                    "thinking": block.thinking,
                    "signature": block.signature,
                }
            )
        elif isinstance(block, dict) and "type" in block:
            # Preserve raw dict blocks (e.g. redacted_thinking) verbatim.
            result.append(block)
        else:
            logger.warning(
                f"[SDK] Unknown content block type: {type(block).__name__}. "
                f"This may indicate a new SDK version with additional block types."
            )
    return result


async def _compress_messages(
    messages: list[ChatMessage],
) -> tuple[list[ChatMessage], bool]:
    """Compress a list of messages if they exceed the token threshold.

    Delegates to `_run_compression` (`transcript.py`) which centralizes
    the "try LLM, fallback to truncation" pattern with timeouts. Both
    `_compress_messages` and `compact_transcript` share this helper so
    client acquisition and error handling are consistent.

    See also:
        `_run_compression` — shared compression with timeout guards.
        `compact_transcript` — compresses JSONL transcript entries.
        `CompactionTracker` — emits UI events for mid-stream compaction.
    """
    messages = filter_compaction_messages(messages)
    if len(messages) < 2:
        return messages, False
    # Convert ChatMessages to dicts for compress_context
    messages_dict = []
    for msg in messages:
        msg_dict: dict[str, Any] = {"role": msg.role}
        if msg.content:
            msg_dict["content"] = msg.content
        if msg.tool_calls:
            msg_dict["tool_calls"] = msg.tool_calls
        if msg.tool_call_id:
            msg_dict["tool_call_id"] = msg.tool_call_id
        messages_dict.append(msg_dict)
    try:
        result = await _run_compression(messages_dict, config.model, "[SDK]")
    except Exception as exc:
        # Guard against timeouts or unexpected errors in compression —
        # return the original messages so the caller can proceed without
        # compaction rather than propagating the error to the retry loop.
        logger.warning("[SDK] _compress_messages failed, returning originals: %s", exc)
        return messages, False
    if result.was_compacted:
        logger.info(
            f"[SDK] Context compacted: {result.original_token_count} -> "
            f"{result.token_count} tokens "
            f"({result.messages_summarized} summarized, "
            f"{result.messages_dropped} dropped)"
        )
        # Convert compressed dicts back to ChatMessages
        return [
            ChatMessage(
                role=m["role"],
                content=m.get("content"),
                tool_calls=m.get("tool_calls"),
                tool_call_id=m.get("tool_call_id"),
            )
            for m in result.messages
        ], True
    return messages, False


def _format_conversation_context(messages: list[ChatMessage]) -> str | None:
    """Format conversation messages into a context prefix for the user message.

    Includes user messages, assistant text, tool call summaries, and
    tool result summaries so the agent retains full context about what
    tools were invoked and their outcomes.

    Returns None if there are no messages to format.
    """
    if not messages:
        return None
    # Filter out compaction messages first, then format
    messages = filter_compaction_messages(messages)
    lines: list[str] = []
    for msg in messages:
        if msg.role == "user":
            if msg.content:
                lines.append(f"User: {msg.content}")
        elif msg.role == "assistant":
            if msg.content:
                lines.append(f"You responded: {msg.content}")
            # Omit tool_calls — any text representation gets mimicked
            # by the model. Tool results below provide the context.
        elif msg.role == "tool":
            content = msg.content or ""
            lines.append(f"Tool output: {content[:500]}")
    if not lines:
        return None
    return "<conversation_history>\n" + "\n".join(lines) + "\n</conversation_history>"


async def _build_query_message(
    current_message: str,
    session: ChatSession,
    use_resume: bool,
    transcript_msg_count: int,
    session_id: str,
) -> tuple[str, bool]:
    """Build the query message with appropriate context.

    Returns:
        Tuple of (query_message, was_compacted).
    """
    msg_count = len(session.messages)
    if use_resume and transcript_msg_count > 0:
        if transcript_msg_count < msg_count - 1:
            gap = session.messages[transcript_msg_count:-1]
            compressed, was_compressed = await _compress_messages(gap)
            gap_context = _format_conversation_context(compressed)
            if gap_context:
                logger.info(
                    "[SDK] Transcript stale: covers %d of %d messages, "
                    "gap=%d (compressed=%s)",
                    transcript_msg_count,
                    msg_count,
                    len(gap),
                    was_compressed,
                )
                return (
                    f"{gap_context}\n\nNow, the user says:\n{current_message}",
                    was_compressed,
                )
    elif not use_resume and msg_count > 1:
        logger.warning(
            f"[SDK] Using compression fallback for session "
            f"{session_id} ({msg_count} messages) — no transcript for --resume"
        )
        compressed, was_compressed = await _compress_messages(session.messages[:-1])
        history_context = _format_conversation_context(compressed)
        if history_context:
            return (
                f"{history_context}\n\nNow, the user says:\n{current_message}",
                was_compressed,
            )
    return current_message, False


# Claude API vision-supported image types.
_VISION_MIME_TYPES = frozenset({"image/png", "image/jpeg", "image/gif", "image/webp"})

# Max size for embedding images directly in the user message (20 MiB raw).
_MAX_INLINE_IMAGE_BYTES = 20 * 1024 * 1024

# Matches characters unsafe for filenames.
_UNSAFE_FILENAME = re.compile(r"[^\w.\-]")


def _save_to_sdk_cwd(sdk_cwd: str, filename: str, content: bytes) -> str:
    """Write file content to the SDK ephemeral directory.

    Returns the absolute path. Adds a numeric suffix on name collisions.
    """
    safe = _UNSAFE_FILENAME.sub("_", filename) or "file"
    candidate = os.path.join(sdk_cwd, safe)
    if os.path.exists(candidate):
        stem, ext = os.path.splitext(safe)
        idx = 1
        while os.path.exists(candidate):
            candidate = os.path.join(sdk_cwd, f"{stem}_{idx}{ext}")
            idx += 1
    with open(candidate, "wb") as f:
        f.write(content)
    return candidate


class PreparedAttachments(BaseModel):
    """Result of preparing file attachments for a query."""

    hint: str = ""
    """Text hint describing the files (appended to the user message)."""

    image_blocks: list[dict[str, Any]] = []
    """Claude API image content blocks to embed in the user message."""


async def _prepare_file_attachments(
    file_ids: list[str],
    user_id: str,
    session_id: str,
    sdk_cwd: str,
) -> PreparedAttachments:
    """Download workspace files and prepare them for Claude.

    Images (PNG/JPEG/GIF/WebP) are embedded directly as vision content blocks
    in the user message so Claude can see them without tool calls.
    Non-image files (PDFs, text, etc.) are saved to *sdk_cwd* so the CLI's
    built-in Read tool can access them.

    Returns a :class:`PreparedAttachments` with a text hint and any image
    content blocks.
    """
    empty = PreparedAttachments(hint="", image_blocks=[])
    if not file_ids or not user_id:
        return empty
    try:
        manager = await get_workspace_manager(user_id, session_id)
    except Exception:
        logger.warning(
            "Failed to create workspace manager for file attachments",
            exc_info=True,
        )
        return empty
    image_blocks: list[dict[str, Any]] = []
    file_descriptions: list[str] = []
    for fid in file_ids:
        try:
            file_info = await manager.get_file_info(fid)
            if file_info is None:
                continue
            content = await manager.read_file_by_id(fid)
            mime = (file_info.mime_type or "").split(";")[0].strip().lower()
            # Images: embed directly in the user message as vision blocks
            if mime in _VISION_MIME_TYPES and len(content) <= _MAX_INLINE_IMAGE_BYTES:
                b64 = base64.b64encode(content).decode("ascii")
                image_blocks.append(
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": mime,
                            "data": b64,
                        },
                    }
                )
                file_descriptions.append(
                    f"- {file_info.name} ({mime}, "
                    f"{file_info.size_bytes:,} bytes) [embedded as image]"
                )
            else:
                # Non-image files: save to sdk_cwd for Read tool access
                local_path = _save_to_sdk_cwd(sdk_cwd, file_info.name, content)
                file_descriptions.append(
                    f"- {file_info.name} ({mime}, "
                    f"{file_info.size_bytes:,} bytes) saved to {local_path}"
                )
        except Exception:
            logger.warning("Failed to prepare file %s", fid[:12], exc_info=True)
    if not file_descriptions:
        return empty
    noun = "file" if len(file_descriptions) == 1 else "files"
    has_non_images = len(file_descriptions) > len(image_blocks)
    read_hint = " Use the Read tool to view non-image files." if has_non_images else ""
    hint = (
        f"[The user attached {len(file_descriptions)} {noun}.{read_hint}\n"
        + "\n".join(file_descriptions)
        + "]"
    )
    return PreparedAttachments(hint=hint, image_blocks=image_blocks)


@dataclass
class _StreamAccumulator:
"""Mutable state accumulated during a single streaming attempt.
Tracks the assistant message being built, tool calls, and flags that
control session-message bookkeeping in the dispatch loop.
"""
assistant_response: ChatMessage
accumulated_tool_calls: list[dict[str, Any]]
has_appended_assistant: bool = False
has_tool_results: bool = False
stream_completed: bool = False


def _dispatch_response(
response: StreamBaseResponse,
acc: _StreamAccumulator,
ctx: "_StreamContext",
state: "_RetryState",
entries_replaced: bool,
log_prefix: str,
) -> StreamBaseResponse | None:
"""Process a single adapter response and update session/accumulator state.
Returns the response to yield to the client, or `None` if the response
should be suppressed (e.g. `StreamStart` duplicates).
Handles:
- Logging tool events and errors
- Persisting error markers
- Accumulating text deltas into `assistant_response`
- Appending tool input/output to session messages and transcript
- Detecting `StreamFinish`
"""
if isinstance(response, StreamStart):
return None
if isinstance(
response,
(StreamToolInputAvailable, StreamToolOutputAvailable),
):
extra = ""
if isinstance(response, StreamToolOutputAvailable):
out_len = len(str(response.output))
extra = f", output_len={out_len}"
logger.info(
"%s Tool event: %s, tool=%s%s",
log_prefix,
type(response).__name__,
getattr(response, "toolName", "N/A"),
extra,
)
# Persist error markers so they survive page refresh
if isinstance(response, StreamError):
logger.error(
"%s Sending error to frontend: %s (code=%s)",
log_prefix,
response.errorText,
response.code,
)
_append_error_marker(
ctx.session,
response.errorText,
retryable=(response.code == "transient_api_error"),
)
if isinstance(response, StreamTextDelta):
delta = response.delta or ""
if acc.has_tool_results and acc.has_appended_assistant:
acc.assistant_response = ChatMessage(role="assistant", content=delta)
acc.accumulated_tool_calls = []
acc.has_appended_assistant = False
acc.has_tool_results = False
ctx.session.messages.append(acc.assistant_response)
acc.has_appended_assistant = True
else:
acc.assistant_response.content = (
acc.assistant_response.content or ""
) + delta
if not acc.has_appended_assistant:
ctx.session.messages.append(acc.assistant_response)
acc.has_appended_assistant = True
elif isinstance(response, StreamToolInputAvailable):
acc.accumulated_tool_calls.append(
{
"id": response.toolCallId,
"type": "function",
"function": {
"name": response.toolName,
"arguments": json.dumps(response.input or {}),
},
}
)
acc.assistant_response.tool_calls = acc.accumulated_tool_calls
if not acc.has_appended_assistant:
ctx.session.messages.append(acc.assistant_response)
acc.has_appended_assistant = True
elif isinstance(response, StreamToolOutputAvailable):
content = (
response.output
if isinstance(response.output, str)
else json.dumps(response.output, ensure_ascii=False)
)
ctx.session.messages.append(
ChatMessage(
role="tool",
content=content,
tool_call_id=response.toolCallId,
)
)
if not entries_replaced:
state.transcript_builder.append_tool_result(
tool_use_id=response.toolCallId,
content=content,
)
acc.has_tool_results = True
elif isinstance(response, StreamFinish):
acc.stream_completed = True
return response


class _HandledStreamError(Exception):
"""Raised by `_run_stream_attempt` after it has already yielded a
`StreamError` to the client (e.g. transient API error, circuit breaker).
This signals the outer retry loop that the attempt failed so it can
perform session-message rollback and set the `ended_with_stream_error`
flag, **without** yielding a duplicate `StreamError` to the client.
Attributes:
error_msg: The user-facing error message to persist.
code: Machine-readable error code (e.g. ``circuit_breaker_empty_tool_calls``).
retryable: Whether the frontend should offer a retry button.
"""
def __init__(
self,
message: str,
error_msg: str | None = None,
code: str | None = None,
retryable: bool = True,
):
super().__init__(message)
self.error_msg = error_msg
self.code = code
self.retryable = retryable


@dataclass
class _EmptyToolBreakResult:
"""Result of checking for empty tool calls in a single AssistantMessage."""
count: int # Updated consecutive counter
tripped: bool # Whether the circuit breaker fired
error: StreamError | None # StreamError to yield (if tripped)
error_msg: str | None # Error message (if tripped)
error_code: str | None # Error code (if tripped)


def _check_empty_tool_breaker(
sdk_msg: object,
consecutive: int,
ctx: _StreamContext,
state: _RetryState,
) -> _EmptyToolBreakResult:
"""Detect consecutive empty tool calls and trip the circuit breaker.
Returns an ``_EmptyToolBreakResult`` with the updated counter and, if the
breaker tripped, the ``StreamError`` to yield plus the error metadata.
"""
if not isinstance(sdk_msg, AssistantMessage):
return _EmptyToolBreakResult(consecutive, False, None, None, None)
empty_tools = [
b.name for b in sdk_msg.content if isinstance(b, ToolUseBlock) and not b.input
]
if not empty_tools:
        # Reset on any AssistantMessage without empty tool calls. Text-only
        # messages produce an empty list here, so they reset the counter too.
return _EmptyToolBreakResult(0, False, None, None, None)
consecutive += 1
# Log full diagnostics on first occurrence only; subsequent hits just
# log the counter to reduce noise.
if consecutive == 1:
logger.warning(
"%s Empty tool call detected (%d/%d): "
"tools=%s, model=%s, error=%s, "
"block_types=%s, cumulative_usage=%s",
ctx.log_prefix,
consecutive,
_EMPTY_TOOL_CALL_LIMIT,
empty_tools,
sdk_msg.model,
sdk_msg.error,
[type(b).__name__ for b in sdk_msg.content],
{
"prompt": state.usage.prompt_tokens,
"completion": state.usage.completion_tokens,
"cache_read": state.usage.cache_read_tokens,
},
)
else:
logger.warning(
"%s Empty tool call detected (%d/%d): tools=%s",
ctx.log_prefix,
consecutive,
_EMPTY_TOOL_CALL_LIMIT,
empty_tools,
)
if consecutive < _EMPTY_TOOL_CALL_LIMIT:
return _EmptyToolBreakResult(consecutive, False, None, None, None)
logger.error(
"%s Circuit breaker: aborting stream after %d "
"consecutive empty tool calls. "
"This is likely caused by the model attempting "
"to write content too large for a single tool "
"call's output token limit. The model should "
"write large files in chunks using bash_exec "
"with cat >> (append).",
ctx.log_prefix,
consecutive,
)
error_msg = _CIRCUIT_BREAKER_ERROR_MSG
error_code = "circuit_breaker_empty_tool_calls"
_append_error_marker(ctx.session, error_msg, retryable=True)
return _EmptyToolBreakResult(
count=consecutive,
tripped=True,
error=StreamError(errorText=error_msg, code=error_code),
error_msg=error_msg,
error_code=error_code,
)


async def _run_stream_attempt(
ctx: _StreamContext,
state: _RetryState,
) -> AsyncIterator[StreamBaseResponse]:
"""Run one SDK streaming attempt.
Opens a `ClaudeSDKClient`, sends the query, iterates SDK messages with
heartbeat timeouts, dispatches adapter responses, and performs post-stream
cleanup (safety-net flush, stopped-by-user handling).
Yields stream events. On stream error the exception propagates to the
caller so the retry loop can rollback and retry.
Args:
ctx: Per-request context shared across retry attempts. Scalar
fields (IDs, paths, message string) are set once and never
reassigned. `session`, `compaction`, and `lock` are
shared mutable references: `session.messages` is rolled back
on retry, `compaction` tracks mid-stream compaction events,
and `lock` is refreshed during heartbeats. Their references
are constant even though the objects they point to are mutated.
state: Mutable retry state — holds values that the retry loop
modifies between attempts (options, query, adapter, etc.).
See also:
`stream_chat_completion_sdk` — owns the retry loop that calls this
function up to `_MAX_STREAM_ATTEMPTS` times with reduced context.
"""
acc = _StreamAccumulator(
assistant_response=ChatMessage(role="assistant", content=""),
accumulated_tool_calls=[],
)
ended_with_stream_error = False
# Stores the error message used by _append_error_marker so the outer
# retry loop can re-append the correct message after session rollback.
stream_error_msg: str | None = None
stream_error_code: str | None = None
consecutive_empty_tool_calls = 0
# --- Intermediate persistence tracking ---
# Flush session messages to DB periodically so page reloads show progress
# during long-running turns (see incident d2f7cba3: 82-min turn lost on refresh).
_last_flush_time = time.monotonic()
_msgs_since_flush = 0
_FLUSH_INTERVAL_SECONDS = 30.0
_FLUSH_MESSAGE_THRESHOLD = 10
# Use manual __aenter__/__aexit__ instead of ``async with`` so we can
# suppress SDK cleanup errors that occur when the SSE client disconnects
# mid-stream. GeneratorExit causes the SDK's ``__aexit__`` to run in a
# different async context/task than where the client was opened, which
# triggers:
# - ValueError: ContextVar token mismatch (AUTOGPT-SERVER-8BT)
# - RuntimeError: cancel scope in wrong task (AUTOGPT-SERVER-8BW)
# Both are harmless — the TCP connection is already dead.
sdk_client = ClaudeSDKClient(options=state.options)
client = await sdk_client.__aenter__()
try:
logger.info(
"%s Sending query — resume=%s, total_msgs=%d, "
"query_len=%d, attached_files=%d, image_blocks=%d",
ctx.log_prefix,
state.use_resume,
len(ctx.session.messages),
len(state.query_message),
len(ctx.file_ids) if ctx.file_ids else 0,
len(ctx.attachments.image_blocks),
)
ctx.compaction.reset_for_query()
if state.was_compacted:
for ev in ctx.compaction.emit_pre_query(ctx.session):
yield ev
if ctx.attachments.image_blocks:
content_blocks: list[dict[str, Any]] = [
*ctx.attachments.image_blocks,
{"type": "text", "text": state.query_message},
]
user_msg = {
"type": "user",
"message": {"role": "user", "content": content_blocks},
"parent_tool_use_id": None,
"session_id": ctx.session_id,
}
if client._transport is None: # noqa: SLF001
raise RuntimeError("ClaudeSDKClient transport is not initialized")
await client._transport.write(json.dumps(user_msg) + "\n") # noqa: SLF001
state.transcript_builder.append_user(
content=[
*ctx.attachments.image_blocks,
{"type": "text", "text": ctx.current_message},
]
)
else:
await client.query(state.query_message, session_id=ctx.session_id)
state.transcript_builder.append_user(content=ctx.current_message)
async for sdk_msg in _iter_sdk_messages(client):
# Heartbeat sentinel — refresh lock and keep SSE alive
if sdk_msg is None:
await ctx.lock.refresh()
for ev in ctx.compaction.emit_start_if_ready():
yield ev
yield StreamHeartbeat()
continue
logger.info(
"%s Received: %s %s (unresolved=%d, current=%d, resolved=%d)",
ctx.log_prefix,
type(sdk_msg).__name__,
getattr(sdk_msg, "subtype", ""),
len(state.adapter.current_tool_calls)
- len(state.adapter.resolved_tool_calls),
len(state.adapter.current_tool_calls),
len(state.adapter.resolved_tool_calls),
)
# Log AssistantMessage API errors (e.g. invalid_request)
# so we can debug Anthropic API 400s surfaced by the CLI.
sdk_error = getattr(sdk_msg, "error", None)
if isinstance(sdk_msg, AssistantMessage) and sdk_error:
error_text = str(sdk_error)
error_preview = str(sdk_msg.content)[:500]
logger.error(
"[SDK] [%s] AssistantMessage has error=%s, "
"content_blocks=%d, content_preview=%s",
ctx.session_id[:12],
sdk_error,
len(sdk_msg.content),
error_preview,
)
# Intercept prompt-too-long errors surfaced as
# AssistantMessage.error (not as a Python exception).
# Re-raise so the outer retry loop can compact the
# transcript and retry with reduced context.
# Check both error_text and error_preview: sdk_error
# being set confirms this is an error message (not user
# content), so checking content is safe. The actual
# error description (e.g. "Prompt is too long") may be
# in the content, not the error type field
# (e.g. error="invalid_request", content="Prompt is
# too long").
if _is_prompt_too_long(Exception(error_text)) or _is_prompt_too_long(
Exception(error_preview)
):
logger.warning(
"%s Prompt-too-long detected via AssistantMessage "
"error — raising for retry",
ctx.log_prefix,
)
raise RuntimeError("Prompt is too long")
# Intercept transient API errors (socket closed,
# ECONNRESET) — replace the raw message with a
# user-friendly error text and use the retryable
# error prefix so the frontend shows a retry button.
# Check both the error field and content for patterns.
if is_transient_api_error(error_text) or is_transient_api_error(
error_preview
):
logger.warning(
"%s Transient Anthropic API error detected, "
"suppressing raw error text",
ctx.log_prefix,
)
stream_error_msg = FRIENDLY_TRANSIENT_MSG
stream_error_code = "transient_api_error"
_append_error_marker(
ctx.session,
stream_error_msg,
retryable=True,
)
yield StreamError(
errorText=stream_error_msg,
code=stream_error_code,
)
ended_with_stream_error = True
break
# Determine if the message is a tool-only batch (all content
# items are ToolUseBlocks) — such messages have no text output yet,
# so we skip the wait_for_stash flush below.
#
# Note: parallel execution of tools is handled natively by the
# SDK CLI via readOnlyHint annotations on tool definitions.
is_tool_only = False
if isinstance(sdk_msg, AssistantMessage) and sdk_msg.content:
is_tool_only = all(
isinstance(item, ToolUseBlock) for item in sdk_msg.content
)
# Race-condition fix: SDK hooks (PostToolUse) are
# executed asynchronously via start_soon() — the next
# message can arrive before the hook stashes output.
# wait_for_stash() awaits an asyncio.Event signaled by
# stash_pending_tool_output(), completing as soon as
# the hook finishes (typically <1ms). The sleep(0)
# after lets any remaining concurrent hooks complete.
#
# Skip for parallel tool continuations: when the SDK
# sends parallel tool calls as separate
# AssistantMessages (each containing only
# ToolUseBlocks), we must NOT wait/flush — the prior
# tools are still executing concurrently.
if (
state.adapter.has_unresolved_tool_calls
and isinstance(sdk_msg, (AssistantMessage, ResultMessage))
and not is_tool_only
):
if await wait_for_stash():
await asyncio.sleep(0)
else:
logger.warning(
"%s Timed out waiting for PostToolUse "
"hook stash (%d unresolved tool calls)",
ctx.log_prefix,
len(state.adapter.current_tool_calls)
- len(state.adapter.resolved_tool_calls),
)
# Log ResultMessage details and capture token usage
if isinstance(sdk_msg, ResultMessage):
logger.info(
"%s Received: ResultMessage %s "
"(unresolved=%d, current=%d, resolved=%d, "
"num_turns=%d, cost_usd=%s, result=%s)",
ctx.log_prefix,
sdk_msg.subtype,
len(state.adapter.current_tool_calls)
- len(state.adapter.resolved_tool_calls),
len(state.adapter.current_tool_calls),
len(state.adapter.resolved_tool_calls),
sdk_msg.num_turns,
sdk_msg.total_cost_usd,
(sdk_msg.result or "")[:200],
)
if sdk_msg.subtype in (
"error",
"error_during_execution",
):
logger.error(
"%s SDK execution failed with error: %s",
ctx.log_prefix,
sdk_msg.result or "(no error message provided)",
)
# Check for prompt-too-long regardless of subtype — the
# SDK may return subtype="success" with result="Prompt is
# too long" when the CLI rejects the prompt before calling
# the API (cost_usd=0, no tokens consumed). If we only
# check the "error" subtype path, the stream appears to
# complete normally, the synthetic error text is stored
# in the transcript, and the session grows without bound.
if _is_prompt_too_long(RuntimeError(sdk_msg.result or "")):
raise RuntimeError("Prompt is too long")
# Capture token usage from ResultMessage.
# Anthropic reports cached tokens separately:
# input_tokens = uncached only
# cache_read_input_tokens = served from cache
# cache_creation_input_tokens = written to cache
if sdk_msg.usage:
state.usage.prompt_tokens += sdk_msg.usage.get("input_tokens", 0)
state.usage.cache_read_tokens += sdk_msg.usage.get(
"cache_read_input_tokens", 0
)
state.usage.cache_creation_tokens += sdk_msg.usage.get(
"cache_creation_input_tokens", 0
)
state.usage.completion_tokens += sdk_msg.usage.get(
"output_tokens", 0
)
logger.info(
"%s Token usage: uncached=%d, cache_read=%d, "
"cache_create=%d, output=%d",
ctx.log_prefix,
state.usage.prompt_tokens,
state.usage.cache_read_tokens,
state.usage.cache_creation_tokens,
state.usage.completion_tokens,
)
if sdk_msg.total_cost_usd is not None:
state.usage.cost_usd = sdk_msg.total_cost_usd
# Emit compaction end if SDK finished compacting.
# Sync TranscriptBuilder with the CLI's active context.
compact_result = await ctx.compaction.emit_end_if_ready(ctx.session)
if compact_result.events:
# Compaction events end with StreamFinishStep, which maps to
# Vercel AI SDK's "finish-step" — that clears activeTextParts.
# Close any open text block BEFORE the compaction events so
# the text-end arrives before finish-step, preventing
# "text-end for missing text part" errors on the frontend.
pre_close: list[StreamBaseResponse] = []
state.adapter._end_text_if_open(pre_close)
# Compaction events bypass the adapter, so sync step state
# when a StreamFinishStep is present — otherwise the adapter
# will skip StreamStartStep on the next AssistantMessage.
if any(
isinstance(ev, StreamFinishStep) for ev in compact_result.events
):
state.adapter.step_open = False
for r in pre_close:
yield r
for ev in compact_result.events:
yield ev
entries_replaced = False
if compact_result.just_ended:
compacted = await asyncio.to_thread(
read_compacted_entries,
compact_result.transcript_path,
)
if compacted is not None:
state.transcript_builder.replace_entries(
compacted, log_prefix=ctx.log_prefix
)
entries_replaced = True
# --- Hard circuit breaker for empty tool calls ---
breaker = _check_empty_tool_breaker(
sdk_msg, consecutive_empty_tool_calls, ctx, state
)
consecutive_empty_tool_calls = breaker.count
if breaker.tripped and breaker.error is not None:
stream_error_msg = breaker.error_msg
stream_error_code = breaker.error_code
yield breaker.error
ended_with_stream_error = True
break
# --- Dispatch adapter responses ---
for response in state.adapter.convert_message(sdk_msg):
dispatched = _dispatch_response(
response, acc, ctx, state, entries_replaced, ctx.log_prefix
)
if dispatched is not None:
yield dispatched
# Append assistant entry AFTER convert_message so that
# any stashed tool results from the previous turn are
# recorded first, preserving the required API order:
# assistant(tool_use) → tool_result → assistant(text).
# Skip if replace_entries just ran — the CLI session
# file already contains this message.
if isinstance(sdk_msg, AssistantMessage) and not entries_replaced:
state.transcript_builder.append_assistant(
content_blocks=_format_sdk_content_blocks(sdk_msg.content),
model=sdk_msg.model,
)
# --- Intermediate persistence ---
# Flush session messages to DB periodically so page reloads
# show progress during long-running turns.
_msgs_since_flush += 1
now = time.monotonic()
if (
_msgs_since_flush >= _FLUSH_MESSAGE_THRESHOLD
or (now - _last_flush_time) >= _FLUSH_INTERVAL_SECONDS
):
try:
await asyncio.shield(upsert_chat_session(ctx.session))
logger.debug(
"%s Intermediate flush: %d messages "
"(msgs_since=%d, elapsed=%.1fs)",
ctx.log_prefix,
len(ctx.session.messages),
_msgs_since_flush,
now - _last_flush_time,
)
except Exception as flush_err:
logger.warning(
"%s Intermediate flush failed: %s",
ctx.log_prefix,
flush_err,
)
_last_flush_time = now
_msgs_since_flush = 0
if acc.stream_completed:
break
finally:
await _safe_close_sdk_client(sdk_client, ctx.log_prefix)
# --- Post-stream processing (only on success) ---
if state.adapter.has_unresolved_tool_calls:
logger.warning(
"%s %d unresolved tool(s) after stream — flushing",
ctx.log_prefix,
len(state.adapter.current_tool_calls)
- len(state.adapter.resolved_tool_calls),
)
safety_responses: list[StreamBaseResponse] = []
state.adapter._flush_unresolved_tool_calls(safety_responses)
for response in safety_responses:
if isinstance(
response,
(StreamToolInputAvailable, StreamToolOutputAvailable),
):
logger.info(
"%s Safety flush: %s, tool=%s",
ctx.log_prefix,
type(response).__name__,
getattr(response, "toolName", "N/A"),
)
if isinstance(response, StreamToolOutputAvailable):
state.transcript_builder.append_tool_result(
tool_use_id=response.toolCallId,
content=(
response.output
if isinstance(response.output, str)
else json.dumps(response.output, ensure_ascii=False)
),
)
yield response
if not acc.stream_completed and not ended_with_stream_error:
logger.info(
"%s Stream ended without ResultMessage (stopped by user)",
ctx.log_prefix,
)
closing_responses: list[StreamBaseResponse] = []
state.adapter._end_text_if_open(closing_responses)
for r in closing_responses:
yield r
ctx.session.messages.append(
ChatMessage(
role="assistant",
content=f"{COPILOT_SYSTEM_PREFIX} Execution stopped by user",
)
)
if (
acc.assistant_response.content or acc.assistant_response.tool_calls
) and not acc.has_appended_assistant:
ctx.session.messages.append(acc.assistant_response)
    # If the attempt ended with a transient error that was already surfaced
    # to the client (StreamError yielded above), raise so the outer retry
    # loop can roll back session messages and set its error flags properly.
if ended_with_stream_error:
raise _HandledStreamError(
"Stream error handled — StreamError already yielded",
error_msg=stream_error_msg,
code=stream_error_code,
)


async def stream_chat_completion_sdk(
session_id: str,
message: str | None = None,
is_user_message: bool = True,
user_id: str | None = None,
session: ChatSession | None = None,
file_ids: list[str] | None = None,
permissions: "CopilotPermissions | None" = None,
**_kwargs: Any,
) -> AsyncIterator[StreamBaseResponse]:
"""Stream chat completion using Claude Agent SDK.
Args:
file_ids: Optional workspace file IDs attached to the user's message.
Images are embedded as vision content blocks; other files are
saved to the SDK working directory for the Read tool.
"""
if session is None:
session = await get_chat_session(session_id, user_id)
if not session:
raise NotFoundError(
f"Session {session_id} not found. Please create a new session first."
)
# Type narrowing: session is guaranteed ChatSession after the check above
session = cast(ChatSession, session)
# Clean up ALL trailing error markers from previous turn before starting
# a new turn. Multiple markers can accumulate when a mid-stream error is
# followed by a cleanup error in __aexit__ (both append a marker).
while (
len(session.messages) > 0
and session.messages[-1].role == "assistant"
and session.messages[-1].content
and (
COPILOT_ERROR_PREFIX in session.messages[-1].content
or COPILOT_RETRYABLE_ERROR_PREFIX in session.messages[-1].content
)
):
logger.info(
"[SDK] [%s] Removing stale error marker from previous turn",
session_id[:12],
)
session.messages.pop()
# Append the new message to the session if it's not already there
new_message_role = "user" if is_user_message else "assistant"
if message and (
len(session.messages) == 0
or not (
session.messages[-1].role == new_message_role
and session.messages[-1].content == message
)
):
session.messages.append(ChatMessage(role=new_message_role, content=message))
if is_user_message:
track_user_message(
user_id=user_id, session_id=session_id, message_length=len(message)
)
# Structured log prefix: [SDK][<session>][T<turn>]
# Turn = number of user messages (1-based), computed AFTER appending the new message.
turn = sum(1 for m in session.messages if m.role == "user")
log_prefix = f"[SDK][{session_id[:12]}][T{turn}]"
session = await upsert_chat_session(session)
# Generate title for new sessions (first user message)
if is_user_message and not session.title:
user_messages = [m for m in session.messages if m.role == "user"]
if len(user_messages) == 1:
first_message = user_messages[0].content or message or ""
if first_message:
task = asyncio.create_task(
_update_title_async(session_id, first_message, user_id)
)
_background_tasks.add(task)
task.add_done_callback(_background_tasks.discard)
message_id = str(uuid.uuid4())
stream_id = str(uuid.uuid4())
ended_with_stream_error = False
e2b_sandbox = None
use_resume = False
resume_file: str | None = None
transcript_builder = TranscriptBuilder()
sdk_cwd = ""
# True when transcript_builder represents a full prefix of session history.
# First turn (<=1 prior message) is fully covered even without a download.
# Set to True when load_previous succeeds; stays False when download fails
# on a session with prior messages, preventing a partial upload that would
# mislead _build_query_message into skipping gap reconstruction next turn.
transcript_covers_prefix = True
# Acquire stream lock to prevent concurrent streams to the same session
lock = AsyncClusterLock(
redis=await get_redis_async(),
key=f"{STREAM_LOCK_PREFIX}{session_id}",
owner_id=stream_id,
timeout=config.stream_lock_ttl,
)
lock_owner = await lock.try_acquire()
if lock_owner != stream_id:
# Another stream is active
logger.warning(
"%s Session already has an active stream: %s", log_prefix, lock_owner
)
yield StreamError(
errorText="Another stream is already active for this session. "
"Please wait or stop it.",
code="stream_already_active",
)
return
# OTEL context manager — initialized inside the try and cleaned up in finally.
_otel_ctx: Any = None
skip_transcript_upload = False
transcript_content: str = ""
state: _RetryState | None = None
# Token usage accumulators — populated from ResultMessage at end of turn
turn_prompt_tokens = 0 # uncached input tokens only
turn_completion_tokens = 0
turn_cache_read_tokens = 0
turn_cache_creation_tokens = 0
turn_cost_usd: float | None = None
    # Do not add code between the lock acquisition and this try-block.
try:
# Build system prompt (reuses non-SDK path with Langfuse support).
# Pre-compute the cwd here so the exact working directory path can be
# injected into the supplement instead of the generic placeholder.
# Catch ValueError early so the failure yields a clean StreamError rather
# than propagating outside the stream error-handling path.
has_history = len(session.messages) > 1
try:
sdk_cwd = _make_sdk_cwd(session_id)
os.makedirs(sdk_cwd, exist_ok=True)
except (ValueError, OSError) as e:
logger.error("%s Invalid SDK cwd: %s", log_prefix, e)
yield StreamError(
errorText="Unable to initialize working directory.",
code="sdk_cwd_error",
)
return
# --- Run independent async I/O operations in parallel ---
# E2B sandbox setup, system prompt build (Langfuse + DB), and transcript
# download are independent network calls. Running them concurrently
# saves ~200-500ms compared to sequential execution.
async def _setup_e2b():
"""Set up E2B sandbox if configured, return sandbox or None."""
if not (e2b_api_key := config.active_e2b_api_key):
if config.use_e2b_sandbox:
logger.warning(
"[E2B] [%s] E2B sandbox enabled but no API key configured "
"(CHAT_E2B_API_KEY / E2B_API_KEY) — falling back to bubblewrap",
session_id[:12],
)
return None
try:
sandbox = await get_or_create_sandbox(
session_id,
api_key=e2b_api_key,
template=config.e2b_sandbox_template,
timeout=config.e2b_sandbox_timeout,
on_timeout=config.e2b_sandbox_on_timeout,
)
except Exception as e2b_err:
logger.error(
"[E2B] [%s] Setup failed: %s",
session_id[:12],
e2b_err,
exc_info=True,
)
return None
return sandbox

        async def _fetch_transcript():
"""Download transcript for --resume if applicable."""
if not (
config.claude_agent_use_resume and user_id and len(session.messages) > 1
):
return None
try:
return await download_transcript(
user_id, session_id, log_prefix=log_prefix
)
except Exception as transcript_err:
logger.warning(
"%s Transcript download failed, continuing without --resume: %s",
log_prefix,
transcript_err,
)
return None

        e2b_sandbox, (base_system_prompt, _), dl = await asyncio.gather(
_setup_e2b(),
_build_system_prompt(user_id, has_conversation_history=has_history),
_fetch_transcript(),
)
use_e2b = e2b_sandbox is not None
# Append appropriate supplement (Claude gets tool schemas automatically)
system_prompt = base_system_prompt + get_sdk_supplement(
use_e2b=use_e2b, cwd=sdk_cwd
)
# Process transcript download result
transcript_msg_count = 0
if dl:
is_valid = validate_transcript(dl.content)
dl_lines = dl.content.strip().split("\n") if dl.content else []
logger.info(
"%s Downloaded transcript: %dB, %d lines, msg_count=%d, valid=%s",
log_prefix,
len(dl.content),
len(dl_lines),
dl.message_count,
is_valid,
)
if is_valid:
# Load previous FULL context into builder
transcript_content = dl.content
transcript_builder.load_previous(dl.content, log_prefix=log_prefix)
resume_file = await asyncio.to_thread(
write_transcript_to_tempfile, dl.content, session_id, sdk_cwd
)
if resume_file:
use_resume = True
transcript_msg_count = dl.message_count
logger.debug(
"%s Using --resume (%dB, msg_count=%d)",
log_prefix,
len(dl.content),
transcript_msg_count,
)
else:
logger.warning("%s Transcript downloaded but invalid", log_prefix)
transcript_covers_prefix = False
elif config.claude_agent_use_resume and user_id and len(session.messages) > 1:
logger.warning(
"%s No transcript available (%d messages in session)",
log_prefix,
len(session.messages),
)
transcript_covers_prefix = False
yield StreamStart(messageId=message_id, sessionId=session_id)
set_execution_context(
user_id,
session,
sandbox=e2b_sandbox,
sdk_cwd=sdk_cwd,
permissions=permissions,
)
# Fail fast when no API credentials are available at all.
# sdk_cwd routes the CLI's temp dir into the per-session workspace
# so sub-agent output files land inside sdk_cwd (see build_sdk_env).
sdk_env = build_sdk_env(session_id=session_id, user_id=user_id, sdk_cwd=sdk_cwd)
if not config.api_key and not config.use_claude_code_subscription:
raise RuntimeError(
"No API key configured. Set OPEN_ROUTER_API_KEY, "
"CHAT_API_KEY, or ANTHROPIC_API_KEY for API access, "
"or CHAT_USE_CLAUDE_CODE_SUBSCRIPTION=true to use "
"Claude Code CLI subscription (requires `claude login`)."
)
mcp_server = create_copilot_mcp_server(use_e2b=use_e2b)
sdk_model = _resolve_sdk_model()
# Track SDK-internal compaction (PreCompact hook → start, next msg → end)
compaction = CompactionTracker()
security_hooks = create_security_hooks(
user_id,
sdk_cwd=sdk_cwd,
max_subtasks=config.claude_agent_max_subtasks,
on_compact=compaction.on_compact,
)
if permissions is not None:
allowed, disallowed = apply_tool_permissions(permissions, use_e2b=use_e2b)
else:
allowed = get_copilot_tool_names(use_e2b=use_e2b)
disallowed = get_sdk_disallowed_tools(use_e2b=use_e2b)
def _on_stderr(line: str) -> None:
"""Log a stderr line emitted by the Claude CLI subprocess."""
sid = session_id[:12] if session_id else "?"
logger.info("[SDK] [%s] CLI stderr: %s", sid, line.rstrip())
sdk_options_kwargs: dict[str, Any] = {
"system_prompt": system_prompt,
"mcp_servers": {"copilot": mcp_server},
"allowed_tools": allowed,
"disallowed_tools": disallowed,
"hooks": security_hooks,
"cwd": sdk_cwd,
"max_buffer_size": config.claude_agent_max_buffer_size,
"stderr": _on_stderr,
}
if sdk_model:
sdk_options_kwargs["model"] = sdk_model
if sdk_env:
sdk_options_kwargs["env"] = sdk_env
if use_resume and resume_file:
sdk_options_kwargs["resume"] = resume_file
options = ClaudeAgentOptions(**sdk_options_kwargs) # type: ignore[arg-type] # dynamic kwargs
adapter = SDKResponseAdapter(message_id=message_id, session_id=session_id)
# Propagate user_id/session_id as OTEL context attributes so the
# langsmith tracing integration attaches them to every span. This
# is what Langfuse (or any OTEL backend) maps to its native
# user/session fields.
_otel_ctx = propagate_attributes(
user_id=user_id,
session_id=session_id,
trace_name="copilot-sdk",
tags=["sdk"],
metadata={
"resume": str(use_resume),
"conversation_turn": str(turn),
},
)
_otel_ctx.__enter__()
current_message = message or ""
if not current_message and session.messages:
last_user = [m for m in session.messages if m.role == "user"]
if last_user:
current_message = last_user[-1].content or ""
if not current_message.strip():
yield StreamError(
errorText="Message cannot be empty.",
code="empty_prompt",
)
return
query_message, was_compacted = await _build_query_message(
current_message,
session,
use_resume,
transcript_msg_count,
session_id,
)
# If files are attached, prepare them: images become vision
# content blocks in the user message, other files go to sdk_cwd.
attachments = await _prepare_file_attachments(
file_ids or [], user_id or "", session_id, sdk_cwd
)
if attachments.hint:
query_message = f"{query_message}\n\n{attachments.hint}"
tried_compaction = False
# Build the per-request context carrier (shared across attempts).
# Scalar fields are immutable; session/compaction/lock are shared
# mutable references (see `_StreamContext` docstring for details).
stream_ctx = _StreamContext(
session=session,
session_id=session_id,
log_prefix=log_prefix,
sdk_cwd=sdk_cwd,
current_message=current_message,
file_ids=file_ids,
message_id=message_id,
attachments=attachments,
compaction=compaction,
lock=lock,
)
# ---------------------------------------------------------------
# Retry loop: original → compacted → no transcript
# ---------------------------------------------------------------
ended_with_stream_error = False
attempts_exhausted = False
stream_err: Exception | None = None
state = _RetryState(
options=options,
query_message=query_message,
was_compacted=was_compacted,
use_resume=use_resume,
resume_file=resume_file,
transcript_msg_count=transcript_msg_count,
adapter=adapter,
transcript_builder=transcript_builder,
usage=_TokenUsage(),
)
for attempt in range(_MAX_STREAM_ATTEMPTS):
# Clear any stale stash signal from the previous attempt so
# wait_for_stash() doesn't fire prematurely on a leftover event.
reset_stash_event()
# Reset tool-level circuit breaker so failures from a previous
# (rolled-back) attempt don't carry over to the fresh attempt.
reset_tool_failure_counters()
if attempt > 0:
logger.info(
"%s Retrying with reduced context (%d/%d)",
log_prefix,
attempt + 1,
_MAX_STREAM_ATTEMPTS,
)
yield StreamStatus(message="Optimizing conversation context\u2026")
ctx = await _reduce_context(
transcript_content,
tried_compaction,
session_id,
sdk_cwd,
log_prefix,
)
state.transcript_builder = ctx.builder
state.use_resume = ctx.use_resume
state.resume_file = ctx.resume_file
tried_compaction = ctx.tried_compaction
state.transcript_msg_count = 0
if ctx.transcript_lost:
skip_transcript_upload = True
# Rebuild SDK options and query for the reduced context
sdk_options_kwargs_retry = dict(sdk_options_kwargs)
if ctx.use_resume and ctx.resume_file:
sdk_options_kwargs_retry["resume"] = ctx.resume_file
elif "resume" in sdk_options_kwargs_retry:
del sdk_options_kwargs_retry["resume"]
state.options = ClaudeAgentOptions(**sdk_options_kwargs_retry) # type: ignore[arg-type] # dynamic kwargs
state.query_message, state.was_compacted = await _build_query_message(
current_message,
session,
state.use_resume,
state.transcript_msg_count,
session_id,
)
if attachments.hint:
state.query_message = f"{state.query_message}\n\n{attachments.hint}"
state.adapter = SDKResponseAdapter(
message_id=message_id, session_id=session_id
)
# Reset token accumulators so a failed attempt's partial
# usage is not double-counted in the successful attempt.
state.usage.reset()
pre_attempt_msg_count = len(session.messages)
events_yielded = 0
try:
async for event in _run_stream_attempt(stream_ctx, state):
if not isinstance(
event,
(
StreamHeartbeat,
# Compaction UI events are cosmetic and must not
# block retry — they're emitted before the SDK
# query on compacted attempts.
StreamStartStep,
StreamFinishStep,
StreamToolInputStart,
StreamToolInputAvailable,
StreamToolOutputAvailable,
),
):
events_yielded += 1
yield event
break # Stream completed — exit retry loop
except asyncio.CancelledError:
logger.warning(
"%s Streaming cancelled (attempt %d/%d)",
log_prefix,
attempt + 1,
_MAX_STREAM_ATTEMPTS,
)
raise
except _HandledStreamError as exc:
# _run_stream_attempt already yielded a StreamError and
# appended an error marker. We only need to rollback
# session messages and set the error flag — do NOT set
# stream_err so the post-loop code won't emit a
# duplicate StreamError.
logger.warning(
"%s Stream error handled mid-attempt "
"(attempt %d/%d, code=%s, events_yielded=%d)",
log_prefix,
attempt + 1,
_MAX_STREAM_ATTEMPTS,
exc.code or "transient",
events_yielded,
)
session.messages = session.messages[:pre_attempt_msg_count]
# transcript_builder still contains entries from the aborted
# attempt that no longer match session.messages. Skip upload
# so a future --resume doesn't replay rolled-back content.
skip_transcript_upload = True
# Re-append the error marker so it survives the rollback
# and is persisted by the finally block (see #2947655365).
# Use the specific error message from the attempt (e.g.
# circuit breaker msg) rather than always the generic one.
_append_error_marker(
session,
exc.error_msg or FRIENDLY_TRANSIENT_MSG,
retryable=True,
)
ended_with_stream_error = True
break
except Exception as e:
stream_err = e
is_context_error = _is_prompt_too_long(e)
logger.warning(
"%s Stream error (attempt %d/%d, context_error=%s, "
"events_yielded=%d): %s",
log_prefix,
attempt + 1,
_MAX_STREAM_ATTEMPTS,
is_context_error,
events_yielded,
stream_err,
exc_info=True,
)
session.messages = session.messages[:pre_attempt_msg_count]
if events_yielded > 0:
# Events were already sent to the frontend and cannot be
# unsent. Retrying would produce duplicate/inconsistent
# output, so treat this as a final error.
logger.warning(
"%s Not retrying — %d events already yielded",
log_prefix,
events_yielded,
)
skip_transcript_upload = True
ended_with_stream_error = True
break
if not is_context_error:
# Non-context errors (network, auth, rate-limit) should
# not trigger compaction — surface the error immediately.
skip_transcript_upload = True
ended_with_stream_error = True
break
continue
else:
# All retry attempts exhausted (loop ended without break).
# skip_transcript_upload was already set by _reduce_context
# when the transcript was dropped (transcript_lost=True).
ended_with_stream_error = True
attempts_exhausted = True
logger.error(
"%s All %d query attempts exhausted: %s",
log_prefix,
_MAX_STREAM_ATTEMPTS,
stream_err,
)
if ended_with_stream_error and state is not None:
# Flush any unresolved tool calls so the frontend can close
# stale UI elements (e.g. spinners) that were started before
# the exception interrupted the stream.
error_flush: list[StreamBaseResponse] = []
state.adapter._end_text_if_open(error_flush)
if state.adapter.has_unresolved_tool_calls:
logger.warning(
"%s Flushing %d unresolved tool(s) after stream error",
log_prefix,
len(state.adapter.current_tool_calls)
- len(state.adapter.resolved_tool_calls),
)
state.adapter._flush_unresolved_tool_calls(error_flush)
for response in error_flush:
yield response
if ended_with_stream_error and stream_err is not None:
# Use distinct error codes: "all_attempts_exhausted" when all
# retries were consumed vs "sdk_stream_error" for non-context
# errors that broke the loop immediately (network, auth, etc.).
safe_err = str(stream_err).replace("\n", " ").replace("\r", "")[:500]
if attempts_exhausted:
error_text = (
"Your conversation is too long. "
"Please start a new chat or clear some history."
)
else:
error_text = _friendly_error_text(safe_err)
yield StreamError(
errorText=error_text,
code=(
"all_attempts_exhausted"
if attempts_exhausted
else "sdk_stream_error"
),
)
# Copy token usage from retry state to outer-scope accumulators
# so the finally block can persist them.
if state is not None:
turn_prompt_tokens = state.usage.prompt_tokens
turn_completion_tokens = state.usage.completion_tokens
turn_cache_read_tokens = state.usage.cache_read_tokens
turn_cache_creation_tokens = state.usage.cache_creation_tokens
turn_cost_usd = state.usage.cost_usd
# Emit token usage to the client (must be in try to reach SSE stream).
# Session persistence of usage is in finally to stay consistent with
# rate-limit recording even if an exception interrupts between here
# and the finally block.
if turn_prompt_tokens > 0 or turn_completion_tokens > 0:
# total_tokens = prompt (uncached input) + completion (output).
# Cache tokens are tracked separately and excluded from total
# so that the semantics match the baseline path (OpenRouter)
# which folds cache into prompt_tokens. Keeping total_tokens
# = prompt + completion everywhere makes cross-path comparisons
# and session-level aggregation consistent.
total_tokens = turn_prompt_tokens + turn_completion_tokens
yield StreamUsage(
prompt_tokens=turn_prompt_tokens,
completion_tokens=turn_completion_tokens,
total_tokens=total_tokens,
cache_read_tokens=turn_cache_read_tokens,
cache_creation_tokens=turn_cache_creation_tokens,
)
if ended_with_stream_error:
logger.warning(
"%s Stream ended with SDK error after %d messages",
log_prefix,
len(session.messages),
)
else:
logger.info(
"%s Stream completed successfully with %d messages",
log_prefix,
len(session.messages),
)
except GeneratorExit:
# GeneratorExit is raised when the async generator is closed by the
# caller (e.g. client disconnect, page refresh). We MUST release the
# stream lock here because the ``finally`` block at the end of this
# function may not execute when GeneratorExit propagates through nested
# async generators. Without this, the lock stays held for its full TTL
# and the user sees "Another stream is already active" on every retry.
logger.warning("%s GeneratorExit — releasing stream lock", log_prefix)
await lock.release()
raise
except BaseException as e:
# Catch BaseException to handle both Exception and CancelledError
# (CancelledError inherits from BaseException in Python 3.8+)
if isinstance(e, asyncio.CancelledError):
logger.warning("%s Session cancelled", log_prefix)
error_msg = "Operation cancelled"
else:
error_msg = str(e) or type(e).__name__
# SDK cleanup errors are expected during client disconnect —
# log as warning rather than error to reduce Sentry noise.
# These are normally caught by _safe_close_sdk_client but
# can escape in edge cases (e.g. GeneratorExit timing).
if _is_sdk_disconnect_error(e):
logger.warning(
"%s SDK cleanup error (client disconnect): %s",
log_prefix,
error_msg,
)
else:
logger.error("%s Error: %s", log_prefix, error_msg, exc_info=True)
is_transient = is_transient_api_error(error_msg)
if is_transient:
display_msg, code = FRIENDLY_TRANSIENT_MSG, "transient_api_error"
else:
display_msg, code = error_msg, "sdk_error"
# Append error marker to session (non-invasive text parsing approach).
# The finally block will persist the session with this error marker.
# Skip if a marker was already appended inside the stream loop
# (ended_with_stream_error) to avoid duplicate stale markers.
if not ended_with_stream_error:
_append_error_marker(session, display_msg, retryable=is_transient)
logger.debug(
"%s Appended error marker, will be persisted in finally",
log_prefix,
)
# Yield StreamError for immediate feedback (only for non-cancellation errors)
# Skip for CancelledError and SDK disconnect cleanup errors — these
# are not actionable by the user and the SSE connection is already dead.
is_cancellation = isinstance(
e, asyncio.CancelledError
) or _is_sdk_disconnect_error(e)
if not is_cancellation:
yield StreamError(errorText=display_msg, code=code)
raise
finally:
# --- Close OTEL context ---
if _otel_ctx is not None:
try:
_otel_ctx.__exit__(*sys.exc_info())
except Exception:
logger.warning("OTEL context teardown failed", exc_info=True)
# --- Persist token usage to session + rate-limit counters ---
# Both must live in finally so they stay consistent even when an
# exception interrupts the try block after StreamUsage was yielded.
await persist_and_record_usage(
session=session,
user_id=user_id,
prompt_tokens=turn_prompt_tokens,
completion_tokens=turn_completion_tokens,
cache_read_tokens=turn_cache_read_tokens,
cache_creation_tokens=turn_cache_creation_tokens,
log_prefix=log_prefix,
cost_usd=turn_cost_usd,
)
# --- Persist session messages ---
# This MUST run in finally to persist messages even when the generator
# is stopped early (e.g., user clicks stop, processor breaks stream loop).
# Without this, messages disappear after refresh because they were never
# saved to the database.
if session is not None:
try:
await asyncio.shield(upsert_chat_session(session))
logger.info(
"%s Session persisted in finally with %d messages",
log_prefix,
len(session.messages),
)
except Exception as persist_err:
logger.error(
"%s Failed to persist session in finally: %s",
log_prefix,
persist_err,
exc_info=True,
)
# --- Pause E2B sandbox to stop billing between turns ---
# Fire-and-forget: pausing is best-effort and must not block the
# response or the transcript upload. The task is anchored to
# _background_tasks to prevent garbage collection.
# Use pause_sandbox_direct to skip the Redis lookup and reconnect
# round-trip — e2b_sandbox is the live object from this turn.
if e2b_sandbox is not None:
task = asyncio.create_task(pause_sandbox_direct(e2b_sandbox, session_id))
_background_tasks.add(task)
task.add_done_callback(_background_tasks.discard)
# --- Upload transcript for next-turn --resume ---
# TranscriptBuilder is the single source of truth. It mirrors the
# CLI's active context: on compaction, replace_entries() syncs it
# with the compacted session file. No CLI file read needed here.
if skip_transcript_upload:
logger.warning(
"%s Skipping transcript upload — transcript was dropped "
"during prompt-too-long recovery",
log_prefix,
)
elif (
config.claude_agent_use_resume
and user_id
and session is not None
and state is not None
):
try:
transcript_upload_content = state.transcript_builder.to_jsonl()
entry_count = state.transcript_builder.entry_count
if not transcript_upload_content:
logger.warning(
"%s No transcript to upload (builder empty)", log_prefix
)
elif not validate_transcript(transcript_upload_content):
logger.warning(
"%s Transcript invalid, skipping upload (entries=%d)",
log_prefix,
entry_count,
)
elif not transcript_covers_prefix:
logger.warning(
"%s Skipping transcript upload — builder does not "
"cover full session prefix (entries=%d, session=%d)",
log_prefix,
entry_count,
len(session.messages),
)
else:
logger.info(
"%s Uploading transcript (entries=%d, bytes=%d)",
log_prefix,
entry_count,
len(transcript_upload_content),
)
await asyncio.shield(
upload_transcript(
user_id=user_id,
session_id=session_id,
content=transcript_upload_content,
message_count=len(session.messages),
log_prefix=log_prefix,
)
)
except Exception as upload_err:
logger.error(
"%s Transcript upload failed in finally: %s",
log_prefix,
upload_err,
exc_info=True,
)
try:
if sdk_cwd:
await _cleanup_sdk_tool_results(sdk_cwd)
except Exception:
logger.warning("%s SDK cleanup failed", log_prefix, exc_info=True)
finally:
# Release stream lock to allow new streams for this session
await lock.release()
async def _update_title_async(
session_id: str, message: str, user_id: str | None = None
) -> None:
"""Background task to update session title."""
try:
title = await _generate_session_title(
message, user_id=user_id, session_id=session_id
)
if title and user_id:
await update_session_title(session_id, user_id, title, only_if_empty=True)
logger.debug("[SDK] Generated title for %s: %s", session_id, title)
except Exception as e:
logger.warning("[SDK] Failed to update session title: %s", e)