fix(chat): resolve both graph_id and library_agent_id in edit_agent

Fixes SECRT-1863. The edit_agent tool's schema claims it accepts either a graph ID or a library agent ID, but the lookup treated every ID as a graph ID. CoPilot often passes a library_agent_id (which it naturally grabs from context after creating or saving an agent), so the lookup failed. This adds explicit fallback logic in edit_agent: (1) first try get_agent_as_json with the ID as-is (treating it as a graph_id); (2) if nothing is found, try to resolve the ID as a library_agent_id; (3) if a library agent is found, use its graph_id to fetch the agent. This makes the schema truthful and improves robustness for CoPilot.
2026-02-03 11:24:57 -05:00 · 2026-02-02 15:17:33 +00:00
17 changed files with 909 additions and 1820 deletions
--- a/autogpt_platform/backend/backend/api/features/chat/routes.py
+++ b/autogpt_platform/backend/backend/api/features/chat/routes.py
@@ -17,14 +17,6 @@ from .model import ChatSession, create_chat_session, get_chat_session, get_user_

 config = ChatConfig()

-# SSE response headers for streaming
-SSE_RESPONSE_HEADERS = {
-    "Cache-Control": "no-cache",
-    "Connection": "keep-alive",
-    "X-Accel-Buffering": "no",
-    "x-vercel-ai-ui-message-stream": "v1",
-}
-

 logger = logging.getLogger(__name__)

@@ -40,60 +32,6 @@ async def _validate_and_get_session(
    return session


-async def _create_stream_generator(
-    session_id: str,
-    message: str,
-    user_id: str | None,
-    session: ChatSession,
-    is_user_message: bool = True,
-    context: dict[str, str] | None = None,
-) -> AsyncGenerator[str, None]:
-    """Create SSE event generator for chat streaming.
-
-    Args:
-        session_id: Chat session ID
-        message: User message to process
-        user_id: Optional authenticated user ID
-        session: Pre-fetched chat session
-        is_user_message: Whether the message is from a user
-        context: Optional context dict with url and content
-
-    Yields:
-        SSE-formatted chunks from the chat completion stream
-    """
-    chunk_count = 0
-    first_chunk_type: str | None = None
-    async for chunk in chat_service.stream_chat_completion(
-        session_id,
-        message,
-        is_user_message=is_user_message,
-        user_id=user_id,
-        session=session,
-        context=context,
-    ):
-        if chunk_count < 3:
-            logger.info(
-                "Chat stream chunk",
-                extra={
-                    "session_id": session_id,
-                    "chunk_type": str(chunk.type),
-                },
-            )
-        if not first_chunk_type:
-            first_chunk_type = str(chunk.type)
-        chunk_count += 1
-        yield chunk.to_sse()
-    logger.info(
-        "Chat stream completed",
-        extra={
-            "session_id": session_id,
-            "chunk_count": chunk_count,
-            "first_chunk_type": first_chunk_type,
-        },
-    )
-    yield "data: [DONE]\n\n"
-
-
 router = APIRouter(
    tags=["chat"],
 )
@@ -283,17 +221,49 @@ async def stream_chat_post(
    """
    session = await _validate_and_get_session(session_id, user_id)

-    return StreamingResponse(
-        _create_stream_generator(
-            session_id=session_id,
-            message=request.message,
-            user_id=user_id,
-            session=session,
+    async def event_generator() -> AsyncGenerator[str, None]:
+        chunk_count = 0
+        first_chunk_type: str | None = None
+        async for chunk in chat_service.stream_chat_completion(
+            session_id,
+            request.message,
            is_user_message=request.is_user_message,
+            user_id=user_id,
+            session=session,  # Pass pre-fetched session to avoid double-fetch
            context=request.context,
-        ),
+        ):
+            if chunk_count < 3:
+                logger.info(
+                    "Chat stream chunk",
+                    extra={
+                        "session_id": session_id,
+                        "chunk_type": str(chunk.type),
+                    },
+                )
+            if not first_chunk_type:
+                first_chunk_type = str(chunk.type)
+            chunk_count += 1
+            yield chunk.to_sse()
+        logger.info(
+            "Chat stream completed",
+            extra={
+                "session_id": session_id,
+                "chunk_count": chunk_count,
+                "first_chunk_type": first_chunk_type,
+            },
+        )
+        # AI SDK protocol termination
+        yield "data: [DONE]\n\n"
+
+    return StreamingResponse(
+        event_generator(),
        media_type="text/event-stream",
-        headers=SSE_RESPONSE_HEADERS,
+        headers={
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "X-Accel-Buffering": "no",  # Disable nginx buffering
+            "x-vercel-ai-ui-message-stream": "v1",  # AI SDK protocol header
+        },
    )


@@ -325,16 +295,48 @@ async def stream_chat_get(
    """
    session = await _validate_and_get_session(session_id, user_id)

-    return StreamingResponse(
-        _create_stream_generator(
-            session_id=session_id,
-            message=message,
-            user_id=user_id,
-            session=session,
+    async def event_generator() -> AsyncGenerator[str, None]:
+        chunk_count = 0
+        first_chunk_type: str | None = None
+        async for chunk in chat_service.stream_chat_completion(
+            session_id,
+            message,
            is_user_message=is_user_message,
-        ),
+            user_id=user_id,
+            session=session,  # Pass pre-fetched session to avoid double-fetch
+        ):
+            if chunk_count < 3:
+                logger.info(
+                    "Chat stream chunk",
+                    extra={
+                        "session_id": session_id,
+                        "chunk_type": str(chunk.type),
+                    },
+                )
+            if not first_chunk_type:
+                first_chunk_type = str(chunk.type)
+            chunk_count += 1
+            yield chunk.to_sse()
+        logger.info(
+            "Chat stream completed",
+            extra={
+                "session_id": session_id,
+                "chunk_count": chunk_count,
+                "first_chunk_type": first_chunk_type,
+            },
+        )
+        # AI SDK protocol termination
+        yield "data: [DONE]\n\n"
+
+    return StreamingResponse(
+        event_generator(),
        media_type="text/event-stream",
-        headers=SSE_RESPONSE_HEADERS,
+        headers={
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "X-Accel-Buffering": "no",  # Disable nginx buffering
+            "x-vercel-ai-ui-message-stream": "v1",  # AI SDK protocol header
+        },
    )


--- a/autogpt_platform/backend/backend/api/features/chat/service.py
+++ b/autogpt_platform/backend/backend/api/features/chat/service.py
@@ -3,13 +3,9 @@ import logging
 import time
 from asyncio import CancelledError
 from collections.abc import AsyncGenerator
-from typing import TYPE_CHECKING, Any, cast
+from typing import Any

 import openai
-
-if TYPE_CHECKING:
-    from backend.util.prompt import CompressResult
-
 import orjson
 from langfuse import get_client
 from openai import (
@@ -19,13 +15,7 @@ from openai import (
    PermissionDeniedError,
    RateLimitError,
 )
-from openai.types.chat import (
-    ChatCompletionChunk,
-    ChatCompletionMessageParam,
-    ChatCompletionStreamOptionsParam,
-    ChatCompletionSystemMessageParam,
-    ChatCompletionToolParam,
-)
+from openai.types.chat import ChatCompletionChunk, ChatCompletionToolParam

 from backend.data.redis_client import get_redis_async
 from backend.data.understanding import (
@@ -804,58 +794,207 @@ def _is_region_blocked_error(error: Exception) -> bool:
    return "not available in your region" in str(error).lower()


-async def _manage_context_window(
+async def _summarize_messages(
    messages: list,
    model: str,
    api_key: str | None = None,
    base_url: str | None = None,
-) -> "CompressResult":
-    """
-    Manage context window using the unified compress_context function.
+    timeout: float = 30.0,
+) -> str:
+    """Summarize a list of messages into concise context.

-    This is a thin wrapper that creates an OpenAI client for summarization
-    and delegates to the shared compression logic in prompt.py.
+    Uses the same model as the chat for higher quality summaries.

    Args:
-        messages: List of messages in OpenAI format
-        model: Model name for token counting and summarization
-        api_key: API key for summarization calls
-        base_url: Base URL for summarization calls
+        messages: List of message dicts to summarize
+        model: Model to use for summarization (same as chat model)
+        api_key: API key for OpenAI client
+        base_url: Base URL for OpenAI client
+        timeout: Request timeout in seconds (default: 30.0)

    Returns:
-        CompressResult with compacted messages and metadata
+        Summarized text
    """
+    # Format messages for summarization
+    conversation = []
+    for msg in messages:
+        role = msg.get("role", "")
+        content = msg.get("content", "")
+        # Include user, assistant, and tool messages (tool outputs are important context)
+        if content and role in ("user", "assistant", "tool"):
+            conversation.append(f"{role.upper()}: {content}")
+
+    conversation_text = "\n\n".join(conversation)
+
+    # Handle empty conversation
+    if not conversation_text:
+        return "No conversation history available."
+
+    # Truncate conversation to fit within summarization model's context
+    # gpt-4o-mini has 128k context, but we limit to ~25k tokens (~100k chars) for safety
+    MAX_CHARS = 100_000
+    if len(conversation_text) > MAX_CHARS:
+        conversation_text = conversation_text[:MAX_CHARS] + "\n\n[truncated]"
+
+    # Call LLM to summarize
    import openai

-    from backend.util.prompt import compress_context
+    summarization_client = openai.AsyncOpenAI(
+        api_key=api_key, base_url=base_url, timeout=timeout
+    )

-    # Convert messages to dict format
-    messages_dict = []
-    for msg in messages:
-        if isinstance(msg, dict):
-            msg_dict = {k: v for k, v in msg.items() if v is not None}
-        else:
-            msg_dict = dict(msg)
-        messages_dict.append(msg_dict)
+    response = await summarization_client.chat.completions.create(
+        model=model,
+        messages=[
+            {
+                "role": "system",
+                "content": (
+                    "Create a detailed summary of the conversation so far. "
+                    "This summary will be used as context when continuing the conversation.\n\n"
+                    "Before writing the summary, analyze each message chronologically to identify:\n"
+                    "- User requests and their explicit goals\n"
+                    "- Your approach and key decisions made\n"
+                    "- Technical specifics (file names, tool outputs, function signatures)\n"
+                    "- Errors encountered and resolutions applied\n\n"
+                    "You MUST include ALL of the following sections:\n\n"
+                    "## 1. Primary Request and Intent\n"
+                    "The user's explicit goals and what they are trying to accomplish.\n\n"
+                    "## 2. Key Technical Concepts\n"
+                    "Technologies, frameworks, tools, and patterns being used or discussed.\n\n"
+                    "## 3. Files and Resources Involved\n"
+                    "Specific files examined or modified, with relevant snippets and identifiers.\n\n"
+                    "## 4. Errors and Fixes\n"
+                    "Problems encountered, error messages, and their resolutions. "
+                    "Include any user feedback on fixes.\n\n"
+                    "## 5. Problem Solving\n"
+                    "Issues that have been resolved and how they were addressed.\n\n"
+                    "## 6. All User Messages\n"
+                    "A complete list of all user inputs (excluding tool outputs) to preserve their exact requests.\n\n"
+                    "## 7. Pending Tasks\n"
+                    "Work items the user explicitly requested that have not yet been completed.\n\n"
+                    "## 8. Current Work\n"
+                    "Precise description of what was being worked on most recently, including relevant context.\n\n"
+                    "## 9. Next Steps\n"
+                    "What should happen next, aligned with the user's most recent requests. "
+                    "Include verbatim quotes of recent instructions if relevant."
+                ),
+            },
+            {"role": "user", "content": f"Summarize:\n\n{conversation_text}"},
+        ],
+        max_tokens=1500,
+        temperature=0.3,
+    )

-    # Only create client if api_key is provided (enables summarization)
-    # Use context manager to avoid socket leaks
-    if api_key:
-        async with openai.AsyncOpenAI(
-            api_key=api_key, base_url=base_url, timeout=30.0
-        ) as client:
-            return await compress_context(
-                messages=messages_dict,
-                model=model,
-                client=client,
-            )
-    else:
-        # No API key - use truncation-only mode
-        return await compress_context(
-            messages=messages_dict,
-            model=model,
-            client=None,
+    summary = response.choices[0].message.content
+    return summary or "No summary available."
+
+
+def _ensure_tool_pairs_intact(
+    recent_messages: list[dict],
+    all_messages: list[dict],
+    start_index: int,
+) -> list[dict]:
+    """
+    Ensure tool_call/tool_response pairs stay together after slicing.
+
+    When slicing messages for context compaction, a naive slice can separate
+    an assistant message containing tool_calls from its corresponding tool
+    response messages. This causes API validation errors (e.g., Anthropic's
+    "unexpected tool_use_id found in tool_result blocks").
+
+    This function checks for orphan tool responses in the slice and extends
+    backwards to include their corresponding assistant messages.
+
+    Args:
+        recent_messages: The sliced messages to validate
+        all_messages: The complete message list (for looking up missing assistants)
+        start_index: The index in all_messages where recent_messages begins
+
+    Returns:
+        A potentially extended list of messages with tool pairs intact
+    """
+    if not recent_messages:
+        return recent_messages
+
+    # Collect all tool_call_ids from assistant messages in the slice
+    available_tool_call_ids: set[str] = set()
+    for msg in recent_messages:
+        if msg.get("role") == "assistant" and msg.get("tool_calls"):
+            for tc in msg["tool_calls"]:
+                tc_id = tc.get("id")
+                if tc_id:
+                    available_tool_call_ids.add(tc_id)
+
+    # Find orphan tool responses (tool messages whose tool_call_id is missing)
+    orphan_tool_call_ids: set[str] = set()
+    for msg in recent_messages:
+        if msg.get("role") == "tool":
+            tc_id = msg.get("tool_call_id")
+            if tc_id and tc_id not in available_tool_call_ids:
+                orphan_tool_call_ids.add(tc_id)
+
+    if not orphan_tool_call_ids:
+        # No orphans, slice is valid
+        return recent_messages
+
+    # Find the assistant messages that contain the orphan tool_call_ids
+    # Search backwards from start_index in all_messages
+    messages_to_prepend: list[dict] = []
+    for i in range(start_index - 1, -1, -1):
+        msg = all_messages[i]
+        if msg.get("role") == "assistant" and msg.get("tool_calls"):
+            msg_tool_ids = {tc.get("id") for tc in msg["tool_calls"] if tc.get("id")}
+            if msg_tool_ids & orphan_tool_call_ids:
+                # This assistant message has tool_calls we need
+                # Also collect its contiguous tool responses that follow it
+                assistant_and_responses: list[dict] = [msg]
+
+                # Scan forward from this assistant to collect tool responses
+                for j in range(i + 1, start_index):
+                    following_msg = all_messages[j]
+                    if following_msg.get("role") == "tool":
+                        tool_id = following_msg.get("tool_call_id")
+                        if tool_id and tool_id in msg_tool_ids:
+                            assistant_and_responses.append(following_msg)
+                    else:
+                        # Stop at first non-tool message
+                        break
+
+                # Prepend the assistant and its tool responses (maintain order)
+                messages_to_prepend = assistant_and_responses + messages_to_prepend
+                # Mark these as found
+                orphan_tool_call_ids -= msg_tool_ids
+                # Also add this assistant's tool_call_ids to available set
+                available_tool_call_ids |= msg_tool_ids
+
+        if not orphan_tool_call_ids:
+            # Found all missing assistants
+            break
+
+    if orphan_tool_call_ids:
+        # Some tool_call_ids couldn't be resolved - remove those tool responses
+        # This shouldn't happen in normal operation but handles edge cases
+        logger.warning(
+            f"Could not find assistant messages for tool_call_ids: {orphan_tool_call_ids}. "
+            "Removing orphan tool responses."
        )
+        recent_messages = [
+            msg
+            for msg in recent_messages
+            if not (
+                msg.get("role") == "tool"
+                and msg.get("tool_call_id") in orphan_tool_call_ids
+            )
+        ]
+
+    if messages_to_prepend:
+        logger.info(
+            f"Extended recent messages by {len(messages_to_prepend)} to preserve "
+            f"tool_call/tool_response pairs"
+        )
+        return messages_to_prepend + recent_messages
+
+    return recent_messages


 async def _stream_chat_chunks(
@@ -883,8 +1022,11 @@ async def _stream_chat_chunks(

    logger.info("Starting pure chat stream")

+    # Build messages with system prompt prepended
    messages = session.to_openai_messages()
    if system_prompt:
+        from openai.types.chat import ChatCompletionSystemMessageParam
+
        system_message = ChatCompletionSystemMessageParam(
            role="system",
            content=system_prompt,
@@ -892,38 +1034,314 @@ async def _stream_chat_chunks(
        messages = [system_message] + messages

    # Apply context window management
-    context_result = await _manage_context_window(
-        messages=messages,
-        model=model,
-        api_key=config.api_key,
-        base_url=config.base_url,
-    )
+    token_count = 0  # Initialize for exception handler
+    try:
+        from backend.util.prompt import estimate_token_count

-    if context_result.error:
-        if "System prompt dropped" in context_result.error:
-            # Warning only - continue with reduced context
-            yield StreamError(
-                errorText=(
-                    "Warning: System prompt dropped due to size constraints. "
-                    "Assistant behavior may be affected."
-                )
+        # Convert to dict for token counting
+        # OpenAI message types are TypedDicts, so they're already dict-like
+        messages_dict = []
+        for msg in messages:
+            # TypedDict objects are already dicts, just filter None values
+            if isinstance(msg, dict):
+                msg_dict = {k: v for k, v in msg.items() if v is not None}
+            else:
+                # Fallback for unexpected types
+                msg_dict = dict(msg)
+            messages_dict.append(msg_dict)
+
+        # Estimate tokens using appropriate tokenizer
+        # Normalize model name for token counting (tiktoken only supports OpenAI models)
+        token_count_model = model
+        if "/" in model:
+            # Strip provider prefix (e.g., "anthropic/claude-opus-4.5" -> "claude-opus-4.5")
+            token_count_model = model.split("/")[-1]
+
+        # For Claude and other non-OpenAI models, approximate with gpt-4o tokenizer
+        # Most modern LLMs have similar tokenization (~1 token per 4 chars)
+        if "claude" in token_count_model.lower() or not any(
+            known in token_count_model.lower()
+            for known in ["gpt", "o1", "chatgpt", "text-"]
+        ):
+            token_count_model = "gpt-4o"
+
+        # Attempt token counting with error handling
+        try:
+            token_count = estimate_token_count(messages_dict, model=token_count_model)
+        except Exception as token_error:
+            # If token counting fails, use gpt-4o as fallback approximation
+            logger.warning(
+                f"Token counting failed for model {token_count_model}: {token_error}. "
+                "Using gpt-4o approximation."
            )
-        else:
-            # Any other error - abort to prevent failed LLM calls
+            token_count = estimate_token_count(messages_dict, model="gpt-4o")
+
+        # If over threshold, summarize old messages
+        if token_count > 120_000:
+            KEEP_RECENT = 15
+
+            # Check if we have a system prompt at the start
+            has_system_prompt = (
+                len(messages) > 0 and messages[0].get("role") == "system"
+            )
+
+            # Always attempt mitigation when over limit, even with few messages
+            if messages:
+                # Split messages based on whether system prompt exists
+                # Calculate start index for the slice
+                slice_start = max(0, len(messages_dict) - KEEP_RECENT)
+                recent_messages = messages_dict[-KEEP_RECENT:]
+
+                # Ensure tool_call/tool_response pairs stay together
+                # This prevents API errors from orphan tool responses
+                recent_messages = _ensure_tool_pairs_intact(
+                    recent_messages, messages_dict, slice_start
+                )
+
+                if has_system_prompt:
+                    # Keep system prompt separate, summarize everything between system and recent
+                    system_msg = messages[0]
+                    old_messages_dict = messages_dict[1:-KEEP_RECENT]
+                else:
+                    # No system prompt, summarize everything except recent
+                    system_msg = None
+                    old_messages_dict = messages_dict[:-KEEP_RECENT]
+
+                # Summarize any non-empty old messages (no minimum threshold)
+                # If we're over the token limit, we need to compress whatever we can
+                if old_messages_dict:
+                    # Summarize old messages using the same model as chat
+                    summary_text = await _summarize_messages(
+                        old_messages_dict,
+                        model=model,
+                        api_key=config.api_key,
+                        base_url=config.base_url,
+                    )
+
+                    # Build new message list
+                    # Use assistant role (not system) to prevent privilege escalation
+                    # of user-influenced content to instruction-level authority
+                    from openai.types.chat import ChatCompletionAssistantMessageParam
+
+                    summary_msg = ChatCompletionAssistantMessageParam(
+                        role="assistant",
+                        content=(
+                            "[Previous conversation summary — for context only]: "
+                            f"{summary_text}"
+                        ),
+                    )
+
+                    # Rebuild messages based on whether we have a system prompt
+                    if has_system_prompt:
+                        # system_prompt + summary + recent_messages
+                        messages = [system_msg, summary_msg] + recent_messages
+                    else:
+                        # summary + recent_messages (no original system prompt)
+                        messages = [summary_msg] + recent_messages
+
+                    logger.info(
+                        f"Context summarized: {token_count} tokens, "
+                        f"summarized {len(old_messages_dict)} old messages, "
+                        f"kept last {KEEP_RECENT} messages"
+                    )
+
+                    # Fallback: If still over limit after summarization, progressively drop recent messages
+                    # This handles edge cases where recent messages are extremely large
+                    new_messages_dict = []
+                    for msg in messages:
+                        if isinstance(msg, dict):
+                            msg_dict = {k: v for k, v in msg.items() if v is not None}
+                        else:
+                            msg_dict = dict(msg)
+                        new_messages_dict.append(msg_dict)
+
+                    new_token_count = estimate_token_count(
+                        new_messages_dict, model=token_count_model
+                    )
+
+                    if new_token_count > 120_000:
+                        # Still over limit - progressively reduce KEEP_RECENT
+                        logger.warning(
+                            f"Still over limit after summarization: {new_token_count} tokens. "
+                            "Reducing number of recent messages kept."
+                        )
+
+                        for keep_count in [12, 10, 8, 5, 3, 2, 1, 0]:
+                            if keep_count == 0:
+                                # Try with just system prompt + summary (no recent messages)
+                                if has_system_prompt:
+                                    messages = [system_msg, summary_msg]
+                                else:
+                                    messages = [summary_msg]
+                                logger.info(
+                                    "Trying with 0 recent messages (system + summary only)"
+                                )
+                            else:
+                                # Slice from ORIGINAL recent_messages to avoid duplicating summary
+                                reduced_recent = (
+                                    recent_messages[-keep_count:]
+                                    if len(recent_messages) >= keep_count
+                                    else recent_messages
+                                )
+                                # Ensure tool pairs stay intact in the reduced slice
+                                reduced_slice_start = max(
+                                    0, len(recent_messages) - keep_count
+                                )
+                                reduced_recent = _ensure_tool_pairs_intact(
+                                    reduced_recent, recent_messages, reduced_slice_start
+                                )
+                                if has_system_prompt:
+                                    messages = [
+                                        system_msg,
+                                        summary_msg,
+                                    ] + reduced_recent
+                                else:
+                                    messages = [summary_msg] + reduced_recent
+
+                            new_messages_dict = []
+                            for msg in messages:
+                                if isinstance(msg, dict):
+                                    msg_dict = {
+                                        k: v for k, v in msg.items() if v is not None
+                                    }
+                                else:
+                                    msg_dict = dict(msg)
+                                new_messages_dict.append(msg_dict)
+
+                            new_token_count = estimate_token_count(
+                                new_messages_dict, model=token_count_model
+                            )
+
+                            if new_token_count <= 120_000:
+                                logger.info(
+                                    f"Reduced to {keep_count} recent messages, "
+                                    f"now {new_token_count} tokens"
+                                )
+                                break
+                        else:
+                            logger.error(
+                                f"Unable to reduce token count below threshold even with 0 messages. "
+                                f"Final count: {new_token_count} tokens"
+                            )
+                            # ABSOLUTE LAST RESORT: Drop system prompt
+                            # This should only happen if summary itself is massive
+                            if has_system_prompt and len(messages) > 1:
+                                messages = messages[1:]  # Drop system prompt
+                                logger.critical(
+                                    "CRITICAL: Dropped system prompt as absolute last resort. "
+                                    "Behavioral consistency may be affected."
+                                )
+                                # Yield error to user
+                                yield StreamError(
+                                    errorText=(
+                                        "Warning: System prompt dropped due to size constraints. "
+                                        "Assistant behavior may be affected."
+                                    )
+                                )
+                else:
+                    # No old messages to summarize - all messages are "recent"
+                    # Apply progressive truncation to reduce token count
+                    logger.warning(
+                        f"Token count {token_count} exceeds threshold but no old messages to summarize. "
+                        f"Applying progressive truncation to recent messages."
+                    )
+
+                    # Create a base list excluding system prompt to avoid duplication
+                    # This is the pool of messages we'll slice from in the loop
+                    # Use messages_dict for type consistency with _ensure_tool_pairs_intact
+                    base_msgs = (
+                        messages_dict[1:] if has_system_prompt else messages_dict
+                    )
+
+                    # Try progressively smaller keep counts
+                    # Each pass rebuilds `messages` from the last `keep_count` entries of
+                    # base_msgs (plus the system prompt when present), re-estimates the
+                    # token count, and stops at the first candidate at or below 120_000.
+                    new_token_count = token_count  # Initialize with current count
+                    for keep_count in [12, 10, 8, 5, 3, 2, 1, 0]:
+                        if keep_count == 0:
+                            # Try with just system prompt (no recent messages)
+                            if has_system_prompt:
+                                messages = [system_msg]
+                                logger.info(
+                                    "Trying with 0 recent messages (system prompt only)"
+                                )
+                            else:
+                                # No system prompt and no recent messages = empty messages list
+                                # This is invalid, skip this iteration
+                                continue
+                        else:
+                            if len(base_msgs) < keep_count:
+                                continue  # Skip if we don't have enough messages
+
+                            # Slice from base_msgs to get recent messages (without system prompt)
+                            recent_messages = base_msgs[-keep_count:]
+
+                            # Ensure tool pairs stay intact in the reduced slice
+                            # reduced_slice_start is the index in base_msgs where the kept
+                            # slice begins; it is passed along with the full pool.
+                            reduced_slice_start = max(0, len(base_msgs) - keep_count)
+                            recent_messages = _ensure_tool_pairs_intact(
+                                recent_messages, base_msgs, reduced_slice_start
+                            )
+
+                            if has_system_prompt:
+                                messages = [system_msg] + recent_messages
+                            else:
+                                messages = recent_messages
+
+                        # Normalize each candidate message to a plain dict (dropping
+                        # None-valued keys of dict messages) for estimate_token_count.
+                        # This list is used only for counting; `messages` itself keeps
+                        # the original objects.
+                        new_messages_dict = []
+                        for msg in messages:
+                            if msg is None:
+                                continue  # Skip None messages (type safety)
+                            if isinstance(msg, dict):
+                                msg_dict = {
+                                    k: v for k, v in msg.items() if v is not None
+                                }
+                            else:
+                                msg_dict = dict(msg)
+                            new_messages_dict.append(msg_dict)
+
+                        new_token_count = estimate_token_count(
+                            new_messages_dict, model=token_count_model
+                        )
+
+                        if new_token_count <= 120_000:
+                            logger.info(
+                                f"Reduced to {keep_count} recent messages, "
+                                f"now {new_token_count} tokens"
+                            )
+                            break
+                    else:
+                        # for/else: reached only when no pass above hit `break`.
+                        # Even with 0 messages still over limit
+                        # NOTE(review): this path only logs; `messages` keeps whatever
+                        # the last executed pass assigned, which was still over the
+                        # threshold. Confirm that continuing with it is intended.
+                        logger.error(
+                            f"Unable to reduce token count below threshold even with 0 messages. "
+                            f"Final count: {new_token_count} tokens. Messages may be extremely large."
+                        )
+                        # ABSOLUTE LAST RESORT: Drop system prompt
+                        # NOTE(review): when has_system_prompt is true, the final pass
+                        # (keep_count == 0) leaves messages == [system_msg], so
+                        # len(messages) > 1 looks unsatisfiable here and this branch
+                        # appears unreachable. Confirm the intended condition.
+                        if has_system_prompt and len(messages) > 1:
+                            messages = messages[1:]  # Drop system prompt
+                            logger.critical(
+                                "CRITICAL: Dropped system prompt as absolute last resort. "
+                                "Behavioral consistency may be affected."
+                            )
+                            # Yield error to user
+                            yield StreamError(
+                                errorText=(
+                                    "Warning: System prompt dropped due to size constraints. "
+                                    "Assistant behavior may be affected."
+                                )
+                            )
+
+    except Exception as e:
+        logger.error(f"Context summarization failed: {e}", exc_info=True)
+        # If we were over the token limit, yield error to user
+        # Don't silently continue with oversized messages that will fail
+        if token_count > 120_000:
            yield StreamError(
                errorText=(
-                    f"Context window management failed: {context_result.error}. "
-                    "Please start a new conversation."
+                    f"Unable to manage context window (token limit exceeded: {token_count} tokens). "
+                    "Context summarization failed. Please start a new conversation."
                )
            )
            yield StreamFinish()
            return
-
-    messages = context_result.messages
-    if context_result.was_compacted:
-        logger.info(
-            f"Context compacted for streaming: {context_result.token_count} tokens"
-        )
+        # Otherwise, continue with original messages (under limit)

    # Loop to handle tool calls and continue conversation
    while True:
@@ -951,6 +1369,14 @@ async def _stream_chat_chunks(
                        :128
                    ]  # OpenRouter limit

+                # Create the stream with proper types
+                from typing import cast
+
+                from openai.types.chat import (
+                    ChatCompletionMessageParam,
+                    ChatCompletionStreamOptionsParam,
+                )
+
                stream = await client.chat.completions.create(
                    model=model,
                    messages=cast(list[ChatCompletionMessageParam], messages),
@@ -1474,36 +1900,17 @@ async def _generate_llm_continuation(
        # Build system prompt
        system_prompt, _ = await _build_system_prompt(user_id)

+        # Build messages in OpenAI format
        messages = session.to_openai_messages()
        if system_prompt:
+            from openai.types.chat import ChatCompletionSystemMessageParam
+
            system_message = ChatCompletionSystemMessageParam(
                role="system",
                content=system_prompt,
            )
            messages = [system_message] + messages

-        # Apply context window management to prevent oversized requests
-        context_result = await _manage_context_window(
-            messages=messages,
-            model=config.model,
-            api_key=config.api_key,
-            base_url=config.base_url,
-        )
-
-        if context_result.error and "System prompt dropped" not in context_result.error:
-            logger.error(
-                f"Context window management failed for session {session_id}: "
-                f"{context_result.error} (tokens={context_result.token_count})"
-            )
-            return
-
-        messages = context_result.messages
-        if context_result.was_compacted:
-            logger.info(
-                f"Context compacted for LLM continuation: "
-                f"{context_result.token_count} tokens"
-            )
-
        # Build extra_body for tracing
        extra_body: dict[str, Any] = {
            "posthogProperties": {
@@ -1516,54 +1923,19 @@ async def _generate_llm_continuation(
        if session_id:
            extra_body["session_id"] = session_id[:128]

-        retry_count = 0
-        last_error: Exception | None = None
-        response = None
+        # Make non-streaming LLM call (no tools - just text response)
+        from typing import cast

-        while retry_count <= MAX_RETRIES:
-            try:
-                logger.info(
-                    f"Generating LLM continuation for session {session_id}"
-                    f"{f' (retry {retry_count}/{MAX_RETRIES})' if retry_count > 0 else ''}"
-                )
+        from openai.types.chat import ChatCompletionMessageParam

-                response = await client.chat.completions.create(
-                    model=config.model,
-                    messages=cast(list[ChatCompletionMessageParam], messages),
-                    extra_body=extra_body,
-                )
-                last_error = None  # Clear any previous error on success
-                break  # Success, exit retry loop
-            except Exception as e:
-                last_error = e
-                if _is_retryable_error(e) and retry_count < MAX_RETRIES:
-                    retry_count += 1
-                    delay = min(
-                        BASE_DELAY_SECONDS * (2 ** (retry_count - 1)),
-                        MAX_DELAY_SECONDS,
-                    )
-                    logger.warning(
-                        f"Retryable error in LLM continuation: {e!s}. "
-                        f"Retrying in {delay:.1f}s (attempt {retry_count}/{MAX_RETRIES})"
-                    )
-                    await asyncio.sleep(delay)
-                    continue
-                else:
-                    # Non-retryable error - log and exit gracefully
-                    logger.error(
-                        f"Non-retryable error in LLM continuation: {e!s}",
-                        exc_info=True,
-                    )
-                    return
+        # No tools parameter = text-only response (no tool calls)
+        response = await client.chat.completions.create(
+            model=config.model,
+            messages=cast(list[ChatCompletionMessageParam], messages),
+            extra_body=extra_body,
+        )

-        if last_error:
-            logger.error(
-                f"Max retries ({MAX_RETRIES}) exceeded for LLM continuation. "
-                f"Last error: {last_error!s}"
-            )
-            return
-
-        if response and response.choices and response.choices[0].message.content:
+        if response.choices and response.choices[0].message.content:
            assistant_content = response.choices[0].message.content

            # Reload session from DB to avoid race condition with user messages
--- a/autogpt_platform/backend/backend/api/features/chat/tools/agent_generator/service.py
+++ b/autogpt_platform/backend/backend/api/features/chat/tools/agent_generator/service.py
@@ -139,10 +139,11 @@ async def decompose_goal_external(
    """
    client = _get_client()

-    if context:
-        description = f"{description}\n\nAdditional context from user:\n{context}"
-
+    # Build the request payload
    payload: dict[str, Any] = {"description": description}
+    if context:
+        # The external service uses user_instruction for additional context
+        payload["user_instruction"] = context
    if library_agents:
        payload["library_agents"] = library_agents

--- a/autogpt_platform/backend/backend/api/features/chat/tools/edit_agent.py
+++ b/autogpt_platform/backend/backend/api/features/chat/tools/edit_agent.py
@@ -4,6 +4,7 @@ import logging
 from typing import Any

 from backend.api.features.chat.model import ChatSession
+from backend.api.features.library import db as library_db

 from .agent_generator import (
    AgentGeneratorNotConfiguredError,
@@ -120,6 +121,18 @@ class EditAgentTool(BaseTool):

        current_agent = await get_agent_as_json(agent_id, user_id)

+        # If not found by graph_id, try resolving as library_agent_id
+        # (the fallback is skipped when user_id is falsy).
+        if current_agent is None and user_id:
+            try:
+                library_agent = await library_db.get_library_agent(agent_id, user_id)
+                logger.debug(
+                    f"Resolved library_agent_id '{agent_id}' to graph_id "
+                    f"'{library_agent.graph_id}'"
+                )
+                # Retry the original lookup with the graph ID the library entry holds.
+                current_agent = await get_agent_as_json(library_agent.graph_id, user_id)
+            except Exception as e:
+                # Best-effort fallback: any failure leaves current_agent as None, so
+                # the "Could not find agent" ErrorResponse below is returned.
+                # NOTE(review): this also catches errors raised by the second
+                # get_agent_as_json call and any unrelated failure, and records them
+                # only at debug level. Consider narrowing to the lookup's not-found
+                # exception once its type is confirmed.
+                logger.debug(f"Could not resolve '{agent_id}' as library_agent_id: {e}")
+
        if current_agent is None:
            return ErrorResponse(
                message=f"Could not find agent with ID '{agent_id}' in your library.",
--- a/autogpt_platform/backend/backend/api/features/chat/tools/helpers.py
+++ b/autogpt_platform/backend/backend/api/features/chat/tools/helpers.py
@@ -1,77 +0,0 @@
-"""Shared helpers for chat tools."""
-
-from typing import Any
-
-from .models import ErrorResponse
-
-
-def error_response(
-    message: str, session_id: str | None, **kwargs: Any
-) -> ErrorResponse:
-    """Create standardized error response.
-
-    Args:
-        message: Error message to display
-        session_id: Current session ID
-        **kwargs: Additional fields to pass to ErrorResponse
-
-    Returns:
-        ErrorResponse with the given message and session_id
-    """
-    return ErrorResponse(message=message, session_id=session_id, **kwargs)
-
-
-def get_inputs_from_schema(
-    input_schema: dict[str, Any],
-    exclude_fields: set[str] | None = None,
-) -> list[dict[str, Any]]:
-    """Extract input field info from JSON schema.
-
-    Args:
-        input_schema: JSON schema dict with 'properties' and 'required'
-        exclude_fields: Set of field names to exclude (e.g., credential fields)
-
-    Returns:
-        List of dicts with field info (name, title, type, description, required, default)
-    """
-    exclude = exclude_fields or set()
-    properties = input_schema.get("properties", {})
-    required = set(input_schema.get("required", []))
-
-    return [
-        {
-            "name": name,
-            "title": schema.get("title", name),
-            "type": schema.get("type", "string"),
-            "description": schema.get("description", ""),
-            "required": name in required,
-            "default": schema.get("default"),
-        }
-        for name, schema in properties.items()
-        if name not in exclude
-    ]
-
-
-def format_inputs_as_markdown(inputs: list[dict[str, Any]]) -> str:
-    """Format input fields as a readable markdown list.
-
-    Args:
-        inputs: List of input dicts from get_inputs_from_schema
-
-    Returns:
-        Markdown-formatted string listing the inputs
-    """
-    if not inputs:
-        return "No inputs required."
-
-    lines = []
-    for inp in inputs:
-        required_marker = " (required)" if inp.get("required") else ""
-        default = inp.get("default")
-        default_info = f" [default: {default}]" if default is not None else ""
-        description = inp.get("description", "")
-        desc_info = f" - {description}" if description else ""
-
-        lines.append(f"- **{inp['name']}**{required_marker}{default_info}{desc_info}")
-
-    return "\n".join(lines)
--- a/autogpt_platform/backend/backend/api/features/chat/tools/run_agent.py
+++ b/autogpt_platform/backend/backend/api/features/chat/tools/run_agent.py
@@ -24,7 +24,6 @@ from backend.util.timezone_utils import (
 )

 from .base import BaseTool
-from .helpers import get_inputs_from_schema
 from .models import (
    AgentDetails,
    AgentDetailsResponse,
@@ -355,7 +354,19 @@ class RunAgentTool(BaseTool):

    def _get_inputs_list(self, input_schema: dict[str, Any]) -> list[dict[str, Any]]:
        """Extract inputs list from schema."""
-        return get_inputs_from_schema(input_schema)
+        # A schema that is not a dict, or has no "properties" mapping, declares
+        # no inputs.
+        if not isinstance(input_schema, dict) or "properties" not in input_schema:
+            return []
+
+        # Build the required-name set once so the membership test below does not
+        # rescan the "required" list for every property. A missing or None
+        # "required" entry means no field is required.
+        required_fields = set(input_schema.get("required") or ())
+
+        # One descriptor per property, in schema order; title/type/description
+        # fall back to the field name, "string" and "" respectively.
+        return [
+            {
+                "name": field_name,
+                "title": field_schema.get("title", field_name),
+                "type": field_schema.get("type", "string"),
+                "description": field_schema.get("description", ""),
+                "required": field_name in required_fields,
+            }
+            for field_name, field_schema in input_schema["properties"].items()
+        ]

    def _get_execution_modes(self, graph: GraphModel) -> list[str]:
        """Get available execution modes for the graph."""
--- a/autogpt_platform/backend/backend/api/features/chat/tools/run_block.py
+++ b/autogpt_platform/backend/backend/api/features/chat/tools/run_block.py
@@ -8,13 +8,12 @@ from typing import Any
 from backend.api.features.chat.model import ChatSession
 from backend.data.block import get_block
 from backend.data.execution import ExecutionContext
-from backend.data.model import CredentialsFieldInfo, CredentialsMetaInput
+from backend.data.model import CredentialsMetaInput
 from backend.data.workspace import get_or_create_workspace
 from backend.integrations.creds_manager import IntegrationCredentialsManager
 from backend.util.exceptions import BlockError

 from .base import BaseTool
-from .helpers import get_inputs_from_schema
 from .models import (
    BlockOutputResponse,
    ErrorResponse,
@@ -23,10 +22,7 @@ from .models import (
    ToolResponseBase,
    UserReadiness,
 )
-from .utils import (
-    build_missing_credentials_from_field_info,
-    match_credentials_to_requirements,
-)
+from .utils import build_missing_credentials_from_field_info

 logger = logging.getLogger(__name__)

@@ -75,22 +71,6 @@ class RunBlockTool(BaseTool):
    def requires_auth(self) -> bool:
        return True

-    def _get_credentials_requirements(
-        self,
-        block: Any,
-    ) -> dict[str, CredentialsFieldInfo]:
-        """
-        Get credential requirements from block's input schema.
-
-        Args:
-            block: Block to get credentials for
-
-        Returns:
-            Dict mapping field names to CredentialsFieldInfo
-        """
-        credentials_fields_info = block.input_schema.get_credentials_fields_info()
-        return credentials_fields_info if credentials_fields_info else {}
-
    async def _check_block_credentials(
        self,
        user_id: str,
@@ -102,12 +82,53 @@ class RunBlockTool(BaseTool):
        Returns:
            tuple[matched_credentials, missing_credentials]
        """
-        requirements = self._get_credentials_requirements(block)
+        matched_credentials: dict[str, CredentialsMetaInput] = {}
+        missing_credentials: list[CredentialsMetaInput] = []

-        if not requirements:
-            return {}, []
+        # Get credential field info from block's input schema
+        credentials_fields_info = block.input_schema.get_credentials_fields_info()

-        return await match_credentials_to_requirements(user_id, requirements)
+        if not credentials_fields_info:
+            return matched_credentials, missing_credentials
+
+        # Get user's available credentials
+        creds_manager = IntegrationCredentialsManager()
+        available_creds = await creds_manager.store.get_all_creds(user_id)
+
+        # Imported at function scope because this module's top-level import from
+        # .utils only brings in build_missing_credentials_from_field_info.
+        from .utils import _credential_has_required_scopes
+
+        for field_name, field_info in credentials_fields_info.items():
+            # field_info.provider is a frozenset of acceptable providers
+            # field_info.supported_types is a frozenset of acceptable types
+            # A candidate must also satisfy the field's scope requirements, the
+            # same check match_user_credentials_to_graph applies, so a credential
+            # with the right provider/type but insufficient scopes is reported as
+            # missing instead of being handed to the block.
+            matching_cred = next(
+                (
+                    cred
+                    for cred in available_creds
+                    if cred.provider in field_info.provider
+                    and cred.type in field_info.supported_types
+                    and _credential_has_required_scopes(cred, field_info)
+                ),
+                None,
+            )
+
+            if matching_cred:
+                matched_credentials[field_name] = CredentialsMetaInput(
+                    id=matching_cred.id,
+                    provider=matching_cred.provider,  # type: ignore
+                    type=matching_cred.type,
+                    title=matching_cred.title,
+                )
+            else:
+                # Create a placeholder for the missing credential, using the first
+                # acceptable provider/type (or a generic fallback when the set is
+                # empty) and a title derived from the field name.
+                provider = next(iter(field_info.provider), "unknown")
+                cred_type = next(iter(field_info.supported_types), "api_key")
+                missing_credentials.append(
+                    CredentialsMetaInput(
+                        id=field_name,
+                        provider=provider,  # type: ignore
+                        type=cred_type,  # type: ignore
+                        title=field_name.replace("_", " ").title(),
+                    )
+                )
+
+        return matched_credentials, missing_credentials

    async def _execute(
        self,
@@ -299,7 +320,27 @@ class RunBlockTool(BaseTool):

    def _get_inputs_list(self, block: Any) -> list[dict[str, Any]]:
        """Extract non-credential inputs from block schema."""
        schema = block.input_schema.jsonschema()
+        props = schema.get("properties", {})
+        mandatory = set(schema.get("required", []))
+
        # Get credential field names to exclude
        credentials_fields = set(block.input_schema.get_credentials_fields().keys())
-        return get_inputs_from_schema(schema, exclude_fields=credentials_fields)
+
+        # One descriptor per schema property, in schema order, leaving out any
+        # property that is a credential field.
+        return [
+            {
+                "name": name,
+                "title": spec.get("title", name),
+                "type": spec.get("type", "string"),
+                "description": spec.get("description", ""),
+                "required": name in mandatory,
+            }
+            for name, spec in props.items()
+            if name not in credentials_fields
+        ]
--- a/autogpt_platform/backend/backend/api/features/chat/tools/utils.py
+++ b/autogpt_platform/backend/backend/api/features/chat/tools/utils.py
@@ -225,127 +225,6 @@ async def get_or_create_library_agent(
    return library_agents[0]


-async def get_user_credentials(user_id: str) -> list:
-    """
-    Get all available credentials for a user.
-
-    Args:
-        user_id: The user's ID
-
-    Returns:
-        List of user's credentials
-    """
-    creds_manager = IntegrationCredentialsManager()
-    return await creds_manager.store.get_all_creds(user_id)
-
-
-def find_matching_credential(
-    available_creds: list,
-    field_info: CredentialsFieldInfo,
-):
-    """
-    Find a credential that matches the required provider, type, and scopes.
-
-    Args:
-        available_creds: List of user's available credentials
-        field_info: CredentialsFieldInfo with provider, type, and scope requirements
-
-    Returns:
-        Matching credential or None
-    """
-    for cred in available_creds:
-        if cred.provider not in field_info.provider:
-            continue
-        if cred.type not in field_info.supported_types:
-            continue
-        if not _credential_has_required_scopes(cred, field_info):
-            continue
-        return cred
-    return None
-
-
-def create_credential_meta_from_match(
-    matching_cred,
-) -> CredentialsMetaInput:
-    """
-    Create a CredentialsMetaInput from a matched credential.
-
-    Args:
-        matching_cred: The matched credential object
-
-    Returns:
-        CredentialsMetaInput instance
-    """
-    return CredentialsMetaInput(
-        id=matching_cred.id,
-        provider=matching_cred.provider,  # type: ignore
-        type=matching_cred.type,
-        title=matching_cred.title,
-    )
-
-
-async def match_credentials_to_requirements(
-    user_id: str,
-    requirements: dict[str, CredentialsFieldInfo],
-) -> tuple[dict[str, CredentialsMetaInput], list[CredentialsMetaInput]]:
-    """
-    Match user's credentials against a dictionary of credential requirements.
-
-    This is the core matching logic shared by both graph and block credential matching.
-
-    Args:
-        user_id: The user's ID
-        requirements: Dict mapping field names to CredentialsFieldInfo
-
-    Returns:
-        tuple[matched_credentials dict, missing_credentials list]
-    """
-    matched: dict[str, CredentialsMetaInput] = {}
-    missing: list[CredentialsMetaInput] = []
-
-    if not requirements:
-        return matched, missing
-
-    available_creds = await get_user_credentials(user_id)
-
-    for field_name, field_info in requirements.items():
-        matching_cred = find_matching_credential(available_creds, field_info)
-
-        if matching_cred:
-            try:
-                matched[field_name] = create_credential_meta_from_match(matching_cred)
-            except Exception as e:
-                logger.error(
-                    f"Failed to create CredentialsMetaInput for field '{field_name}': "
-                    f"provider={matching_cred.provider}, type={matching_cred.type}, "
-                    f"credential_id={matching_cred.id}",
-                    exc_info=True,
-                )
-                provider = next(iter(field_info.provider), "unknown")
-                cred_type = next(iter(field_info.supported_types), "api_key")
-                missing.append(
-                    CredentialsMetaInput(
-                        id=field_name,
-                        provider=provider,  # type: ignore
-                        type=cred_type,  # type: ignore
-                        title=f"{field_name} (validation failed: {e})",
-                    )
-                )
-        else:
-            provider = next(iter(field_info.provider), "unknown")
-            cred_type = next(iter(field_info.supported_types), "api_key")
-            missing.append(
-                CredentialsMetaInput(
-                    id=field_name,
-                    provider=provider,  # type: ignore
-                    type=cred_type,  # type: ignore
-                    title=field_name.replace("_", " ").title(),
-                )
-            )
-
-    return matched, missing
-
-
 async def match_user_credentials_to_graph(
    user_id: str,
    graph: GraphModel,
@@ -363,6 +242,9 @@ async def match_user_credentials_to_graph(
    Returns:
        tuple[matched_credentials dict, missing_credential_descriptions list]
    """
+    graph_credentials_inputs: dict[str, CredentialsMetaInput] = {}
+    missing_creds: list[str] = []
+
    # Get aggregated credentials requirements from the graph
    aggregated_creds = graph.aggregate_credentials_inputs()
    logger.debug(
@@ -370,30 +252,69 @@ async def match_user_credentials_to_graph(
    )

    if not aggregated_creds:
-        return {}, []
+        return graph_credentials_inputs, missing_creds

-    # Convert aggregated format to simple requirements dict
-    requirements = {
-        field_name: field_info
-        for field_name, (field_info, _node_fields) in aggregated_creds.items()
-    }
+    # Get all available credentials for the user
+    creds_manager = IntegrationCredentialsManager()
+    available_creds = await creds_manager.store.get_all_creds(user_id)

-    # Use shared matching logic
-    matched, missing_list = await match_credentials_to_requirements(
-        user_id, requirements
-    )
+    # For each required credential field, find a matching user credential
+    # field_info.provider is a frozenset because aggregate_credentials_inputs()
+    # combines requirements from multiple nodes. A credential matches if its
+    # provider is in the set of acceptable providers.
+    for credential_field_name, (
+        credential_requirements,
+        _node_fields,
+    ) in aggregated_creds.items():
+        # Find first matching credential by provider, type, and scopes
+        matching_cred = next(
+            (
+                cred
+                for cred in available_creds
+                if cred.provider in credential_requirements.provider
+                and cred.type in credential_requirements.supported_types
+                and _credential_has_required_scopes(cred, credential_requirements)
+            ),
+            None,
+        )

-    # Convert missing list to string descriptions for backward compatibility
-    missing_descriptions = [
-        f"{cred.id} (requires provider={cred.provider}, type={cred.type})"
-        for cred in missing_list
-    ]
+        if matching_cred:
+            try:
+                graph_credentials_inputs[credential_field_name] = CredentialsMetaInput(
+                    id=matching_cred.id,
+                    provider=matching_cred.provider,  # type: ignore
+                    type=matching_cred.type,
+                    title=matching_cred.title,
+                )
+            except Exception as e:
+                logger.error(
+                    f"Failed to create CredentialsMetaInput for field '{credential_field_name}': "
+                    f"provider={matching_cred.provider}, type={matching_cred.type}, "
+                    f"credential_id={matching_cred.id}",
+                    exc_info=True,
+                )
+                missing_creds.append(
+                    f"{credential_field_name} (validation failed: {e})"
+                )
+        else:
+            # Build a helpful error message including scope requirements
+            error_parts = [
+                f"provider in {list(credential_requirements.provider)}",
+                f"type in {list(credential_requirements.supported_types)}",
+            ]
+            if credential_requirements.required_scopes:
+                error_parts.append(
+                    f"scopes including {list(credential_requirements.required_scopes)}"
+                )
+            missing_creds.append(
+                f"{credential_field_name} (requires {', '.join(error_parts)})"
+            )

    logger.info(
-        f"Credential matching complete: {len(matched)}/{len(aggregated_creds)} matched"
+        f"Credential matching complete: {len(graph_credentials_inputs)}/{len(aggregated_creds)} matched"
    )

-    return matched, missing_descriptions
+    return graph_credentials_inputs, missing_creds


 def _credential_has_required_scopes(
--- a/autogpt_platform/backend/backend/api/ws_api.py
+++ b/autogpt_platform/backend/backend/api/ws_api.py
@@ -66,24 +66,18 @@ async def event_broadcaster(manager: ConnectionManager):
    execution_bus = AsyncRedisExecutionEventBus()
    notification_bus = AsyncRedisNotificationEventBus()

-    try:
+    async def execution_worker():
+        async for event in execution_bus.listen("*"):
+            await manager.send_execution_update(event)

-        async def execution_worker():
-            async for event in execution_bus.listen("*"):
-                await manager.send_execution_update(event)
+    async def notification_worker():
+        async for notification in notification_bus.listen("*"):
+            await manager.send_notification(
+                user_id=notification.user_id,
+                payload=notification.payload,
+            )

-        async def notification_worker():
-            async for notification in notification_bus.listen("*"):
-                await manager.send_notification(
-                    user_id=notification.user_id,
-                    payload=notification.payload,
-                )
-
-        await asyncio.gather(execution_worker(), notification_worker())
-    finally:
-        # Ensure PubSub connections are closed on any exit to prevent leaks
-        await execution_bus.close()
-        await notification_bus.close()
+    await asyncio.gather(execution_worker(), notification_worker())


 async def authenticate_websocket(websocket: WebSocket) -> str:
--- a/autogpt_platform/backend/backend/blocks/llm.py
+++ b/autogpt_platform/backend/backend/blocks/llm.py
@@ -32,7 +32,7 @@ from backend.data.model import (
 from backend.integrations.providers import ProviderName
 from backend.util import json
 from backend.util.logging import TruncatedLogger
-from backend.util.prompt import compress_context, estimate_token_count
+from backend.util.prompt import compress_prompt, estimate_token_count
 from backend.util.text import TextFormatter

 logger = TruncatedLogger(logging.getLogger(__name__), "[LLM-Block]")
@@ -634,18 +634,11 @@ async def llm_call(
    context_window = llm_model.context_window

    if compress_prompt_to_fit:
-        result = await compress_context(
+        prompt = compress_prompt(
            messages=prompt,
            target_tokens=llm_model.context_window // 2,
-            client=None,  # Truncation-only, no LLM summarization
-            reserve=0,  # Caller handles response token budget separately
+            lossy_ok=True,
        )
-        if result.error:
-            logger.warning(
-                f"Prompt compression did not meet target: {result.error}. "
-                f"Proceeding with {result.token_count} tokens."
-            )
-        prompt = result.messages

    # Calculate available tokens based on context window and input length
    estimated_input_tokens = estimate_token_count(prompt)
--- a/autogpt_platform/backend/backend/data/event_bus.py
+++ b/autogpt_platform/backend/backend/data/event_bus.py
@@ -133,23 +133,10 @@ class RedisEventBus(BaseRedisEventBus[M], ABC):


 class AsyncRedisEventBus(BaseRedisEventBus[M], ABC):
-    def __init__(self):
-        self._pubsub: AsyncPubSub | None = None
-
    @property
    async def connection(self) -> redis.AsyncRedis:
        return await redis.get_redis_async()

-    async def close(self) -> None:
-        """Close the PubSub connection if it exists."""
-        if self._pubsub is not None:
-            try:
-                await self._pubsub.close()
-            except Exception:
-                logger.warning("Failed to close PubSub connection", exc_info=True)
-            finally:
-                self._pubsub = None
-
    async def publish_event(self, event: M, channel_key: str):
        """
        Publish an event to Redis. Gracefully handles connection failures
@@ -170,7 +157,6 @@ class AsyncRedisEventBus(BaseRedisEventBus[M], ABC):
            await self.connection, channel_key
        )
        assert isinstance(pubsub, AsyncPubSub)
-        self._pubsub = pubsub

        if "*" in channel_key:
            await pubsub.psubscribe(full_channel_name)
--- a/autogpt_platform/backend/backend/executor/database.py
+++ b/autogpt_platform/backend/backend/executor/database.py
@@ -17,7 +17,6 @@ from backend.data.analytics import (
    get_accuracy_trends_and_alerts,
    get_marketplace_graphs_for_monitoring,
 )
-from backend.data.auth.oauth import cleanup_expired_oauth_tokens
 from backend.data.credit import UsageTransactionMetadata, get_user_credit_model
 from backend.data.execution import (
    create_graph_execution,
@@ -220,9 +219,6 @@ class DatabaseManager(AppService):
    # Onboarding
    increment_onboarding_runs = _(increment_onboarding_runs)

-    # OAuth
-    cleanup_expired_oauth_tokens = _(cleanup_expired_oauth_tokens)
-
    # Store
    get_store_agents = _(get_store_agents)
    get_store_agent_details = _(get_store_agent_details)
@@ -353,9 +349,6 @@ class DatabaseManagerAsyncClient(AppServiceClient):
    # Onboarding
    increment_onboarding_runs = d.increment_onboarding_runs

-    # OAuth
-    cleanup_expired_oauth_tokens = d.cleanup_expired_oauth_tokens
-
    # Store
    get_store_agents = d.get_store_agents
    get_store_agent_details = d.get_store_agent_details
--- a/autogpt_platform/backend/backend/executor/scheduler.py
+++ b/autogpt_platform/backend/backend/executor/scheduler.py
@@ -24,9 +24,11 @@ from dotenv import load_dotenv
 from pydantic import BaseModel, Field, ValidationError
 from sqlalchemy import MetaData, create_engine

+from backend.data.auth.oauth import cleanup_expired_oauth_tokens
 from backend.data.block import BlockInput
 from backend.data.execution import GraphExecutionWithNodes
 from backend.data.model import CredentialsMetaInput
+from backend.data.onboarding import increment_onboarding_runs
 from backend.executor import utils as execution_utils
 from backend.monitoring import (
    NotificationJobArgs,
@@ -36,11 +38,7 @@ from backend.monitoring import (
    report_execution_accuracy_alerts,
    report_late_executions,
 )
-from backend.util.clients import (
-    get_database_manager_async_client,
-    get_database_manager_client,
-    get_scheduler_client,
-)
+from backend.util.clients import get_database_manager_client, get_scheduler_client
 from backend.util.cloud_storage import cleanup_expired_files_async
 from backend.util.exceptions import (
    GraphNotFoundError,
@@ -150,7 +148,6 @@ def execute_graph(**kwargs):
 async def _execute_graph(**kwargs):
    args = GraphExecutionJobArgs(**kwargs)
    start_time = asyncio.get_event_loop().time()
-    db = get_database_manager_async_client()
    try:
        logger.info(f"Executing recurring job for graph #{args.graph_id}")
        graph_exec: GraphExecutionWithNodes = await execution_utils.add_graph_execution(
@@ -160,7 +157,7 @@ async def _execute_graph(**kwargs):
            inputs=args.input_data,
            graph_credentials_inputs=args.input_credentials,
        )
-        await db.increment_onboarding_runs(args.user_id)
+        await increment_onboarding_runs(args.user_id)
        elapsed = asyncio.get_event_loop().time() - start_time
        logger.info(
            f"Graph execution started with ID {graph_exec.id} for graph {args.graph_id} "
@@ -249,13 +246,8 @@ def cleanup_expired_files():

 def cleanup_oauth_tokens():
    """Clean up expired OAuth tokens from the database."""
-
    # Wait for completion
-    async def _cleanup():
-        db = get_database_manager_async_client()
-        return await db.cleanup_expired_oauth_tokens()
-
-    run_async(_cleanup())
+    run_async(cleanup_expired_oauth_tokens())


 def execution_accuracy_alerts():
--- a/autogpt_platform/backend/backend/util/prompt.py
+++ b/autogpt_platform/backend/backend/util/prompt.py
@@ -1,19 +1,10 @@
-from __future__ import annotations
-
-import logging
 from copy import deepcopy
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any
+from typing import Any

 from tiktoken import encoding_for_model

 from backend.util import json

-if TYPE_CHECKING:
-    from openai import AsyncOpenAI
-
-logger = logging.getLogger(__name__)
-
 # ---------------------------------------------------------------------------#
 #  CONSTANTS                                                                 #
 # ---------------------------------------------------------------------------#
@@ -109,17 +100,9 @@ def _is_objective_message(msg: dict) -> bool:
 def _truncate_tool_message_content(msg: dict, enc, max_tokens: int) -> None:
    """
    Carefully truncate tool message content while preserving tool structure.
-    Handles both Anthropic-style (list content) and OpenAI-style (string content) tool messages.
+    Only truncates tool_result content, leaves tool_use intact.
    """
    content = msg.get("content")
-
-    # OpenAI-style tool message: role="tool" with string content
-    if msg.get("role") == "tool" and isinstance(content, str):
-        if _tok_len(content, enc) > max_tokens:
-            msg["content"] = _truncate_middle_tokens(content, enc, max_tokens)
-        return
-
-    # Anthropic-style: list content with tool_result items
    if not isinstance(content, list):
        return

@@ -157,6 +140,141 @@ def _truncate_middle_tokens(text: str, enc, max_tok: int) -> str:
 # ---------------------------------------------------------------------------#


+def compress_prompt(
+    messages: list[dict],
+    target_tokens: int,
+    *,
+    model: str = "gpt-4o",
+    reserve: int = 2_048,
+    start_cap: int = 8_192,
+    floor_cap: int = 128,
+    lossy_ok: bool = True,
+) -> list[dict]:
+    """
+    Shrink *messages* so that::
+
+        token_count(prompt) + reserve  ≤  target_tokens
+
+    Strategy
+    --------
+    1. **Token-aware truncation** – progressively halve a per-message cap
+       (`start_cap`, `start_cap/2`, … `floor_cap`) and apply it to the
+       *content* of every message except the first and last.  Tool shells
+       are included: we keep the envelope but shorten huge payloads.
+    2. **Middle-out deletion** – if still over the limit, delete whole
+       messages working outward from the centre, **skipping** objective
+       messages and any with ``tool_calls`` or ``role == "tool"``.
+    3. **Last-chance trim** – if still too big, truncate the *first* and
+       *last* message bodies down to `floor_cap` tokens.
+    4. If the prompt is *still* too large:
+         • raise ``ValueError``      when ``lossy_ok == False``
+         • return the partially-trimmed prompt when ``lossy_ok == True`` (default)
+
+    Parameters
+    ----------
+    messages        Complete chat history (will be deep-copied).
+    model           Model name; passed to tiktoken to pick the right
+                    tokenizer (gpt-4o → 'o200k_base', others fallback).
+    target_tokens   Hard ceiling for prompt size **plus** ``reserve`` (the
+                    space kept free for the model's forthcoming answer).
+    reserve         How many tokens you want to leave available for that
+                    answer (`max_tokens` in your subsequent completion call).
+    start_cap       Initial per-message truncation ceiling (tokens).
+    floor_cap       Lowest cap we'll accept before moving to deletions.
+    lossy_ok        If *True* return best-effort prompt instead of raising
+                    after all trim passes have been exhausted.
+
+    Returns
+    -------
+    list[dict]  – A *new* messages list that abides by the rules above.
+    """
+    enc = encoding_for_model(model)  # best-match tokenizer
+    msgs = deepcopy(messages)  # never mutate caller
+
+    def total_tokens() -> int:
+        """Current size of *msgs* in tokens."""
+        return sum(_msg_tokens(m, enc) for m in msgs)
+
+    original_token_count = total_tokens()
+
+    if original_token_count + reserve <= target_tokens:
+        return msgs
+
+    # ---- STEP 0 : normalise content --------------------------------------
+    # Convert non-string payloads to strings so token counting is coherent.
+    for i, m in enumerate(msgs):
+        if not isinstance(m.get("content"), str) and m.get("content") is not None:
+            if _is_tool_message(m):
+                continue
+
+            # Keep first and last messages intact (unless they're tool messages)
+            if i == 0 or i == len(msgs) - 1:
+                continue
+
+            # Payloads over 20k chars are middle-truncated to at most 20k tokens
+            content_str = json.dumps(m["content"], separators=(",", ":"))
+            if len(content_str) > 20_000:
+                content_str = _truncate_middle_tokens(content_str, enc, 20_000)
+            m["content"] = content_str
+
+    # ---- STEP 1 : token-aware truncation ---------------------------------
+    cap = start_cap
+    while total_tokens() + reserve > target_tokens and cap >= floor_cap:
+        for m in msgs[1:-1]:  # keep first & last intact
+            if _is_tool_message(m):
+                # For tool messages, only truncate tool result content, preserve structure
+                _truncate_tool_message_content(m, enc, cap)
+                continue
+
+            if _is_objective_message(m):
+                # Never truncate objective messages - they contain the core task
+                continue
+
+            content = m.get("content") or ""
+            if _tok_len(content, enc) > cap:
+                m["content"] = _truncate_middle_tokens(content, enc, cap)
+        cap //= 2  # tighten the screw
+
+    # ---- STEP 2 : middle-out deletion -----------------------------------
+    while total_tokens() + reserve > target_tokens and len(msgs) > 2:
+        # Identify all deletable messages (not first/last, not tool messages, not objective messages)
+        deletable_indices = []
+        for i in range(1, len(msgs) - 1):  # Skip first and last
+            if not _is_tool_message(msgs[i]) and not _is_objective_message(msgs[i]):
+                deletable_indices.append(i)
+
+        if not deletable_indices:
+            break  # nothing more we can drop
+
+        # Delete from center outward - find the index closest to center
+        centre = len(msgs) // 2
+        to_delete = min(deletable_indices, key=lambda i: abs(i - centre))
+        del msgs[to_delete]
+
+    # ---- STEP 3 : final safety-net trim on first & last ------------------
+    cap = start_cap
+    while total_tokens() + reserve > target_tokens and cap >= floor_cap:
+        for idx in (0, -1):  # first and last
+            if _is_tool_message(msgs[idx]):
+                # For tool messages at first/last position, truncate tool result content only
+                _truncate_tool_message_content(msgs[idx], enc, cap)
+                continue
+
+            text = msgs[idx].get("content") or ""
+            if _tok_len(text, enc) > cap:
+                msgs[idx]["content"] = _truncate_middle_tokens(text, enc, cap)
+        cap //= 2  # tighten the screw
+
+    # ---- STEP 4 : success or fail-gracefully -----------------------------
+    if total_tokens() + reserve > target_tokens and not lossy_ok:
+        raise ValueError(
+            "compress_prompt: prompt still exceeds budget "
+            f"({total_tokens() + reserve} > {target_tokens})."
+        )
+
+    return msgs
+
+
 def estimate_token_count(
    messages: list[dict],
    *,
@@ -175,8 +293,7 @@ def estimate_token_count(
    -------
    int  – Token count.
    """
-    token_model = _normalize_model_for_tokenizer(model)
-    enc = encoding_for_model(token_model)
+    enc = encoding_for_model(model)  # best-match tokenizer
    return sum(_msg_tokens(m, enc) for m in messages)


@@ -198,543 +315,6 @@ def estimate_token_count_str(
    -------
    int  – Token count.
    """
-    token_model = _normalize_model_for_tokenizer(model)
-    enc = encoding_for_model(token_model)
+    enc = encoding_for_model(model)  # best-match tokenizer
    text = json.dumps(text) if not isinstance(text, str) else text
    return _tok_len(text, enc)
-
-
-# ---------------------------------------------------------------------------#
-#  UNIFIED CONTEXT COMPRESSION                                               #
-# ---------------------------------------------------------------------------#
-
-# Default thresholds
-DEFAULT_TOKEN_THRESHOLD = 120_000
-DEFAULT_KEEP_RECENT = 15
-
-
-@dataclass
-class CompressResult:
-    """Result of context compression."""
-
-    messages: list[dict]
-    token_count: int
-    was_compacted: bool
-    error: str | None = None
-    original_token_count: int = 0
-    messages_summarized: int = 0
-    messages_dropped: int = 0
-
-
-def _normalize_model_for_tokenizer(model: str) -> str:
-    """Normalize model name for tiktoken tokenizer selection."""
-    if "/" in model:
-        model = model.split("/")[-1]
-    if "claude" in model.lower() or not any(
-        known in model.lower() for known in ["gpt", "o1", "chatgpt", "text-"]
-    ):
-        return "gpt-4o"
-    return model
-
-
-def _extract_tool_call_ids_from_message(msg: dict) -> set[str]:
-    """
-    Extract tool_call IDs from an assistant message.
-
-    Supports both formats:
-    - OpenAI: {"role": "assistant", "tool_calls": [{"id": "..."}]}
-    - Anthropic: {"role": "assistant", "content": [{"type": "tool_use", "id": "..."}]}
-
-    Returns:
-        Set of tool_call IDs found in the message.
-    """
-    ids: set[str] = set()
-    if msg.get("role") != "assistant":
-        return ids
-
-    # OpenAI format: tool_calls array
-    if msg.get("tool_calls"):
-        for tc in msg["tool_calls"]:
-            tc_id = tc.get("id")
-            if tc_id:
-                ids.add(tc_id)
-
-    # Anthropic format: content list with tool_use blocks
-    content = msg.get("content")
-    if isinstance(content, list):
-        for block in content:
-            if isinstance(block, dict) and block.get("type") == "tool_use":
-                tc_id = block.get("id")
-                if tc_id:
-                    ids.add(tc_id)
-
-    return ids
-
-
-def _extract_tool_response_ids_from_message(msg: dict) -> set[str]:
-    """
-    Extract tool_call IDs that this message is responding to.
-
-    Supports both formats:
-    - OpenAI: {"role": "tool", "tool_call_id": "..."}
-    - Anthropic: {"role": "user", "content": [{"type": "tool_result", "tool_use_id": "..."}]}
-
-    Returns:
-        Set of tool_call IDs this message responds to.
-    """
-    ids: set[str] = set()
-
-    # OpenAI format: role=tool with tool_call_id
-    if msg.get("role") == "tool":
-        tc_id = msg.get("tool_call_id")
-        if tc_id:
-            ids.add(tc_id)
-
-    # Anthropic format: content list with tool_result blocks
-    content = msg.get("content")
-    if isinstance(content, list):
-        for block in content:
-            if isinstance(block, dict) and block.get("type") == "tool_result":
-                tc_id = block.get("tool_use_id")
-                if tc_id:
-                    ids.add(tc_id)
-
-    return ids
-
-
-def _is_tool_response_message(msg: dict) -> bool:
-    """Check if message is a tool response (OpenAI or Anthropic format)."""
-    # OpenAI format
-    if msg.get("role") == "tool":
-        return True
-    # Anthropic format
-    content = msg.get("content")
-    if isinstance(content, list):
-        for block in content:
-            if isinstance(block, dict) and block.get("type") == "tool_result":
-                return True
-    return False
-
-
-def _remove_orphan_tool_responses(
-    messages: list[dict], orphan_ids: set[str]
-) -> list[dict]:
-    """
-    Remove tool response messages/blocks that reference orphan tool_call IDs.
-
-    Supports both OpenAI and Anthropic formats.
-    For Anthropic messages with mixed valid/orphan tool_result blocks,
-    filters out only the orphan blocks instead of dropping the entire message.
-    """
-    result = []
-    for msg in messages:
-        # OpenAI format: role=tool - drop entire message if orphan
-        if msg.get("role") == "tool":
-            tc_id = msg.get("tool_call_id")
-            if tc_id and tc_id in orphan_ids:
-                continue
-            result.append(msg)
-            continue
-
-        # Anthropic format: content list may have mixed tool_result blocks
-        content = msg.get("content")
-        if isinstance(content, list):
-            has_tool_results = any(
-                isinstance(b, dict) and b.get("type") == "tool_result" for b in content
-            )
-            if has_tool_results:
-                # Filter out orphan tool_result blocks, keep valid ones
-                filtered_content = [
-                    block
-                    for block in content
-                    if not (
-                        isinstance(block, dict)
-                        and block.get("type") == "tool_result"
-                        and block.get("tool_use_id") in orphan_ids
-                    )
-                ]
-                # Only keep message if it has remaining content
-                if filtered_content:
-                    msg = msg.copy()
-                    msg["content"] = filtered_content
-                    result.append(msg)
-                continue
-
-        result.append(msg)
-    return result
-
-
-def _ensure_tool_pairs_intact(
-    recent_messages: list[dict],
-    all_messages: list[dict],
-    start_index: int,
-) -> list[dict]:
-    """
-    Ensure tool_call/tool_response pairs stay together after slicing.
-
-    When slicing messages for context compaction, a naive slice can separate
-    an assistant message containing tool_calls from its corresponding tool
-    response messages. This causes API validation errors (e.g., Anthropic's
-    "unexpected tool_use_id found in tool_result blocks").
-
-    This function checks for orphan tool responses in the slice and extends
-    backwards to include their corresponding assistant messages.
-
-    Supports both formats:
-    - OpenAI: tool_calls array + role="tool" responses
-    - Anthropic: tool_use blocks + tool_result blocks
-
-    Args:
-        recent_messages: The sliced messages to validate
-        all_messages: The complete message list (for looking up missing assistants)
-        start_index: The index in all_messages where recent_messages begins
-
-    Returns:
-        A potentially extended list of messages with tool pairs intact
-    """
-    if not recent_messages:
-        return recent_messages
-
-    # Collect all tool_call_ids from assistant messages in the slice
-    available_tool_call_ids: set[str] = set()
-    for msg in recent_messages:
-        available_tool_call_ids |= _extract_tool_call_ids_from_message(msg)
-
-    # Find orphan tool responses (responses whose tool_call_id is missing)
-    orphan_tool_call_ids: set[str] = set()
-    for msg in recent_messages:
-        response_ids = _extract_tool_response_ids_from_message(msg)
-        for tc_id in response_ids:
-            if tc_id not in available_tool_call_ids:
-                orphan_tool_call_ids.add(tc_id)
-
-    if not orphan_tool_call_ids:
-        # No orphans, slice is valid
-        return recent_messages
-
-    # Find the assistant messages that contain the orphan tool_call_ids
-    # Search backwards from start_index in all_messages
-    messages_to_prepend: list[dict] = []
-    for i in range(start_index - 1, -1, -1):
-        msg = all_messages[i]
-        msg_tool_ids = _extract_tool_call_ids_from_message(msg)
-        if msg_tool_ids & orphan_tool_call_ids:
-            # This assistant message has tool_calls we need
-            # Also collect its contiguous tool responses that follow it
-            assistant_and_responses: list[dict] = [msg]
-
-            # Scan forward from this assistant to collect tool responses
-            for j in range(i + 1, start_index):
-                following_msg = all_messages[j]
-                following_response_ids = _extract_tool_response_ids_from_message(
-                    following_msg
-                )
-                if following_response_ids and following_response_ids & msg_tool_ids:
-                    assistant_and_responses.append(following_msg)
-                elif not _is_tool_response_message(following_msg):
-                    # Stop at first non-tool-response message
-                    break
-
-            # Prepend the assistant and its tool responses (maintain order)
-            messages_to_prepend = assistant_and_responses + messages_to_prepend
-            # Mark these as found
-            orphan_tool_call_ids -= msg_tool_ids
-            # Also add this assistant's tool_call_ids to available set
-            available_tool_call_ids |= msg_tool_ids
-
-        if not orphan_tool_call_ids:
-            # Found all missing assistants
-            break
-
-    if orphan_tool_call_ids:
-        # Some tool_call_ids couldn't be resolved - remove those tool responses
-        # This shouldn't happen in normal operation but handles edge cases
-        logger.warning(
-            f"Could not find assistant messages for tool_call_ids: {orphan_tool_call_ids}. "
-            "Removing orphan tool responses."
-        )
-        recent_messages = _remove_orphan_tool_responses(
-            recent_messages, orphan_tool_call_ids
-        )
-
-    if messages_to_prepend:
-        logger.info(
-            f"Extended recent messages by {len(messages_to_prepend)} to preserve "
-            f"tool_call/tool_response pairs"
-        )
-        return messages_to_prepend + recent_messages
-
-    return recent_messages
-
-
-async def _summarize_messages_llm(
-    messages: list[dict],
-    client: AsyncOpenAI,
-    model: str,
-    timeout: float = 30.0,
-) -> str:
-    """Summarize messages using an LLM."""
-    conversation = []
-    for msg in messages:
-        role = msg.get("role", "")
-        content = msg.get("content", "")
-        if content and role in ("user", "assistant", "tool"):
-            conversation.append(f"{role.upper()}: {content}")
-
-    conversation_text = "\n\n".join(conversation)
-
-    if not conversation_text:
-        return "No conversation history available."
-
-    # Limit to ~100k chars for safety
-    MAX_CHARS = 100_000
-    if len(conversation_text) > MAX_CHARS:
-        conversation_text = conversation_text[:MAX_CHARS] + "\n\n[truncated]"
-
-    response = await client.with_options(timeout=timeout).chat.completions.create(
-        model=model,
-        messages=[
-            {
-                "role": "system",
-                "content": (
-                    "Create a detailed summary of the conversation so far. "
-                    "This summary will be used as context when continuing the conversation.\n\n"
-                    "Before writing the summary, analyze each message chronologically to identify:\n"
-                    "- User requests and their explicit goals\n"
-                    "- Your approach and key decisions made\n"
-                    "- Technical specifics (file names, tool outputs, function signatures)\n"
-                    "- Errors encountered and resolutions applied\n\n"
-                    "You MUST include ALL of the following sections:\n\n"
-                    "## 1. Primary Request and Intent\n"
-                    "The user's explicit goals and what they are trying to accomplish.\n\n"
-                    "## 2. Key Technical Concepts\n"
-                    "Technologies, frameworks, tools, and patterns being used or discussed.\n\n"
-                    "## 3. Files and Resources Involved\n"
-                    "Specific files examined or modified, with relevant snippets and identifiers.\n\n"
-                    "## 4. Errors and Fixes\n"
-                    "Problems encountered, error messages, and their resolutions. "
-                    "Include any user feedback on fixes.\n\n"
-                    "## 5. Problem Solving\n"
-                    "Issues that have been resolved and how they were addressed.\n\n"
-                    "## 6. All User Messages\n"
-                    "A complete list of all user inputs (excluding tool outputs) to preserve their exact requests.\n\n"
-                    "## 7. Pending Tasks\n"
-                    "Work items the user explicitly requested that have not yet been completed.\n\n"
-                    "## 8. Current Work\n"
-                    "Precise description of what was being worked on most recently, including relevant context.\n\n"
-                    "## 9. Next Steps\n"
-                    "What should happen next, aligned with the user's most recent requests. "
-                    "Include verbatim quotes of recent instructions if relevant."
-                ),
-            },
-            {"role": "user", "content": f"Summarize:\n\n{conversation_text}"},
-        ],
-        max_tokens=1500,
-        temperature=0.3,
-    )
-
-    return response.choices[0].message.content or "No summary available."
-
-
-async def compress_context(
-    messages: list[dict],
-    target_tokens: int = DEFAULT_TOKEN_THRESHOLD,
-    *,
-    model: str = "gpt-4o",
-    client: AsyncOpenAI | None = None,
-    keep_recent: int = DEFAULT_KEEP_RECENT,
-    reserve: int = 2_048,
-    start_cap: int = 8_192,
-    floor_cap: int = 128,
-) -> CompressResult:
-    """
-    Unified context compression that combines summarization and truncation strategies.
-
-    Strategy (in order):
-    1. **LLM summarization** – If client provided, summarize old messages into a
-       single context message while keeping recent messages intact. This is the
-       primary strategy for chat service.
-    2. **Content truncation** – Progressively halve a per-message cap and truncate
-       bloated message content (tool outputs, large pastes). Preserves all messages
-       but shortens their content. Primary strategy when client=None (LLM blocks).
-    3. **Middle-out deletion** – Delete whole messages one at a time from the center
-       outward, skipping tool messages and objective messages.
-    4. **First/last trim** – Truncate first and last message content as last resort.
-
-    Parameters
-    ----------
-    messages        Complete chat history (will be deep-copied).
-    target_tokens   Hard ceiling for prompt size.
-    model           Model name for tokenization and summarization.
-    client          AsyncOpenAI client. If provided, enables LLM summarization
-                    as the first strategy. If None, skips to truncation strategies.
-    keep_recent     Number of recent messages to preserve during summarization.
-    reserve         Tokens to reserve for model response.
-    start_cap       Initial per-message truncation ceiling (tokens).
-    floor_cap       Lowest cap before moving to deletions.
-
-    Returns
-    -------
-    CompressResult with compressed messages and metadata.
-    """
-    # Guard clause for empty messages
-    if not messages:
-        return CompressResult(
-            messages=[],
-            token_count=0,
-            was_compacted=False,
-            original_token_count=0,
-        )
-
-    token_model = _normalize_model_for_tokenizer(model)
-    enc = encoding_for_model(token_model)
-    msgs = deepcopy(messages)
-
-    def total_tokens() -> int:
-        return sum(_msg_tokens(m, enc) for m in msgs)
-
-    original_count = total_tokens()
-
-    # Already under limit
-    if original_count + reserve <= target_tokens:
-        return CompressResult(
-            messages=msgs,
-            token_count=original_count,
-            was_compacted=False,
-            original_token_count=original_count,
-        )
-
-    messages_summarized = 0
-    messages_dropped = 0
-
-    # ---- STEP 1: LLM summarization (if client provided) -------------------
-    # This is the primary compression strategy for chat service.
-    # Summarize old messages while keeping recent ones intact.
-    if client is not None:
-        has_system = len(msgs) > 0 and msgs[0].get("role") == "system"
-        system_msg = msgs[0] if has_system else None
-
-        # Calculate old vs recent messages
-        if has_system:
-            if len(msgs) > keep_recent + 1:
-                old_msgs = msgs[1:-keep_recent]
-                recent_msgs = msgs[-keep_recent:]
-            else:
-                old_msgs = []
-                recent_msgs = msgs[1:] if len(msgs) > 1 else []
-        else:
-            if len(msgs) > keep_recent:
-                old_msgs = msgs[:-keep_recent]
-                recent_msgs = msgs[-keep_recent:]
-            else:
-                old_msgs = []
-                recent_msgs = msgs
-
-        # Ensure tool pairs stay intact
-        slice_start = max(0, len(msgs) - keep_recent)
-        recent_msgs = _ensure_tool_pairs_intact(recent_msgs, msgs, slice_start)
-
-        if old_msgs:
-            try:
-                summary_text = await _summarize_messages_llm(old_msgs, client, model)
-                summary_msg = {
-                    "role": "assistant",
-                    "content": f"[Previous conversation summary — for context only]: {summary_text}",
-                }
-                messages_summarized = len(old_msgs)
-
-                if has_system:
-                    msgs = [system_msg, summary_msg] + recent_msgs
-                else:
-                    msgs = [summary_msg] + recent_msgs
-
-                logger.info(
-                    f"Context summarized: {original_count} -> {total_tokens()} tokens, "
-                    f"summarized {messages_summarized} messages"
-                )
-            except Exception as e:
-                logger.warning(f"Summarization failed, continuing with truncation: {e}")
-                # Fall through to content truncation
-
-    # ---- STEP 2: Normalize content ----------------------------------------
-    # Convert non-string payloads to strings so token counting is coherent.
-    # Always run this before truncation to ensure consistent token counting.
-    for i, m in enumerate(msgs):
-        if not isinstance(m.get("content"), str) and m.get("content") is not None:
-            if _is_tool_message(m):
-                continue
-            if i == 0 or i == len(msgs) - 1:
-                continue
-            content_str = json.dumps(m["content"], separators=(",", ":"))
-            if len(content_str) > 20_000:
-                content_str = _truncate_middle_tokens(content_str, enc, 20_000)
-            m["content"] = content_str
-
-    # ---- STEP 3: Token-aware content truncation ---------------------------
-    # Progressively halve per-message cap and truncate bloated content.
-    # This preserves all messages but shortens their content.
-    cap = start_cap
-    while total_tokens() + reserve > target_tokens and cap >= floor_cap:
-        for m in msgs[1:-1]:
-            if _is_tool_message(m):
-                _truncate_tool_message_content(m, enc, cap)
-                continue
-            if _is_objective_message(m):
-                continue
-            content = m.get("content") or ""
-            if _tok_len(content, enc) > cap:
-                m["content"] = _truncate_middle_tokens(content, enc, cap)
-        cap //= 2
-
-    # ---- STEP 4: Middle-out deletion --------------------------------------
-    # Delete messages one at a time from the center outward.
-    # This is more granular than dropping all old messages at once.
-    while total_tokens() + reserve > target_tokens and len(msgs) > 2:
-        deletable: list[int] = []
-        for i in range(1, len(msgs) - 1):
-            msg = msgs[i]
-            if (
-                msg is not None
-                and not _is_tool_message(msg)
-                and not _is_objective_message(msg)
-            ):
-                deletable.append(i)
-        if not deletable:
-            break
-        centre = len(msgs) // 2
-        to_delete = min(deletable, key=lambda i: abs(i - centre))
-        del msgs[to_delete]
-        messages_dropped += 1
-
-    # ---- STEP 5: Final trim on first/last ---------------------------------
-    cap = start_cap
-    while total_tokens() + reserve > target_tokens and cap >= floor_cap:
-        for idx in (0, -1):
-            msg = msgs[idx]
-            if msg is None:
-                continue
-            if _is_tool_message(msg):
-                _truncate_tool_message_content(msg, enc, cap)
-                continue
-            text = msg.get("content") or ""
-            if _tok_len(text, enc) > cap:
-                msg["content"] = _truncate_middle_tokens(text, enc, cap)
-        cap //= 2
-
-    # Filter out any None values that may have been introduced
-    final_msgs: list[dict] = [m for m in msgs if m is not None]
-    final_count = sum(_msg_tokens(m, enc) for m in final_msgs)
-    error = None
-    if final_count + reserve > target_tokens:
-        error = f"Could not compress below target ({final_count + reserve} > {target_tokens})"
-        logger.warning(error)
-
-    return CompressResult(
-        messages=final_msgs,
-        token_count=final_count,
-        was_compacted=True,
-        error=error,
-        original_token_count=original_count,
-        messages_summarized=messages_summarized,
-        messages_dropped=messages_dropped,
-    )
--- a/autogpt_platform/backend/backend/util/prompt_test.py
+++ b/autogpt_platform/backend/backend/util/prompt_test.py
@@ -1,21 +1,10 @@
 """Tests for prompt utility functions, especially tool call token counting."""

-from unittest.mock import AsyncMock, MagicMock
-
 import pytest
 from tiktoken import encoding_for_model

 from backend.util import json
-from backend.util.prompt import (
-    CompressResult,
-    _ensure_tool_pairs_intact,
-    _msg_tokens,
-    _normalize_model_for_tokenizer,
-    _truncate_middle_tokens,
-    _truncate_tool_message_content,
-    compress_context,
-    estimate_token_count,
-)
+from backend.util.prompt import _msg_tokens, estimate_token_count


 class TestMsgTokens:
@@ -287,690 +276,3 @@ class TestEstimateTokenCount:

        assert total_tokens == expected_total
        assert total_tokens > 20  # Should be substantial
-
-
-class TestNormalizeModelForTokenizer:
-    """Test model name normalization for tiktoken."""
-
-    def test_openai_models_unchanged(self):
-        """Test that OpenAI models are returned as-is."""
-        assert _normalize_model_for_tokenizer("gpt-4o") == "gpt-4o"
-        assert _normalize_model_for_tokenizer("gpt-4") == "gpt-4"
-        assert _normalize_model_for_tokenizer("gpt-3.5-turbo") == "gpt-3.5-turbo"
-
-    def test_claude_models_normalized(self):
-        """Test that Claude models are normalized to gpt-4o."""
-        assert _normalize_model_for_tokenizer("claude-3-opus") == "gpt-4o"
-        assert _normalize_model_for_tokenizer("claude-3-sonnet") == "gpt-4o"
-        assert _normalize_model_for_tokenizer("anthropic/claude-3-haiku") == "gpt-4o"
-
-    def test_openrouter_paths_extracted(self):
-        """Test that OpenRouter model paths are handled."""
-        assert _normalize_model_for_tokenizer("openai/gpt-4o") == "gpt-4o"
-        assert _normalize_model_for_tokenizer("anthropic/claude-3-opus") == "gpt-4o"
-
-    def test_unknown_models_default_to_gpt4o(self):
-        """Test that unknown models default to gpt-4o."""
-        assert _normalize_model_for_tokenizer("some-random-model") == "gpt-4o"
-        assert _normalize_model_for_tokenizer("llama-3-70b") == "gpt-4o"
-
-
-class TestTruncateToolMessageContent:
-    """Test tool message content truncation."""
-
-    @pytest.fixture
-    def enc(self):
-        return encoding_for_model("gpt-4o")
-
-    def test_truncate_openai_tool_message(self, enc):
-        """Test truncation of OpenAI-style tool message with string content."""
-        long_content = "x" * 10000
-        msg = {"role": "tool", "tool_call_id": "call_123", "content": long_content}
-
-        _truncate_tool_message_content(msg, enc, max_tokens=100)
-
-        # Content should be truncated
-        assert len(msg["content"]) < len(long_content)
-        assert "…" in msg["content"]  # Has ellipsis marker
-
-    def test_truncate_anthropic_tool_result(self, enc):
-        """Test truncation of Anthropic-style tool_result."""
-        long_content = "y" * 10000
-        msg = {
-            "role": "user",
-            "content": [
-                {
-                    "type": "tool_result",
-                    "tool_use_id": "toolu_123",
-                    "content": long_content,
-                }
-            ],
-        }
-
-        _truncate_tool_message_content(msg, enc, max_tokens=100)
-
-        # Content should be truncated
-        result_content = msg["content"][0]["content"]
-        assert len(result_content) < len(long_content)
-        assert "…" in result_content
-
-    def test_preserve_tool_use_blocks(self, enc):
-        """Test that tool_use blocks are not truncated."""
-        msg = {
-            "role": "assistant",
-            "content": [
-                {
-                    "type": "tool_use",
-                    "id": "toolu_123",
-                    "name": "some_function",
-                    "input": {"key": "value" * 1000},  # Large input
-                }
-            ],
-        }
-
-        original = json.dumps(msg["content"][0]["input"])
-        _truncate_tool_message_content(msg, enc, max_tokens=10)
-
-        # tool_use should be unchanged
-        assert json.dumps(msg["content"][0]["input"]) == original
-
-    def test_no_truncation_when_under_limit(self, enc):
-        """Test that short content is not modified."""
-        msg = {"role": "tool", "tool_call_id": "call_123", "content": "Short content"}
-
-        original = msg["content"]
-        _truncate_tool_message_content(msg, enc, max_tokens=1000)
-
-        assert msg["content"] == original
-
-
-class TestTruncateMiddleTokens:
-    """Test middle truncation of text."""
-
-    @pytest.fixture
-    def enc(self):
-        return encoding_for_model("gpt-4o")
-
-    def test_truncates_long_text(self, enc):
-        """Test that long text is truncated with ellipsis in middle."""
-        long_text = "word " * 1000
-        result = _truncate_middle_tokens(long_text, enc, max_tok=50)
-
-        assert len(enc.encode(result)) <= 52  # Allow some slack for ellipsis
-        assert "…" in result
-        assert result.startswith("word")  # Head preserved
-        assert result.endswith("word ")  # Tail preserved
-
-    def test_preserves_short_text(self, enc):
-        """Test that short text is not modified."""
-        short_text = "Hello world"
-        result = _truncate_middle_tokens(short_text, enc, max_tok=100)
-
-        assert result == short_text
-
-
-class TestEnsureToolPairsIntact:
-    """Test tool call/response pair preservation for both OpenAI and Anthropic formats."""
-
-    # ---- OpenAI Format Tests ----
-
-    def test_openai_adds_missing_tool_call(self):
-        """Test that orphaned OpenAI tool_response gets its tool_call prepended."""
-        all_msgs = [
-            {"role": "system", "content": "You are helpful."},
-            {
-                "role": "assistant",
-                "tool_calls": [
-                    {"id": "call_1", "type": "function", "function": {"name": "f1"}}
-                ],
-            },
-            {"role": "tool", "tool_call_id": "call_1", "content": "result"},
-            {"role": "user", "content": "Thanks!"},
-        ]
-        # Recent messages start at index 2 (the tool response)
-        recent = [all_msgs[2], all_msgs[3]]
-        start_index = 2
-
-        result = _ensure_tool_pairs_intact(recent, all_msgs, start_index)
-
-        # Should prepend the tool_call message
-        assert len(result) == 3
-        assert result[0]["role"] == "assistant"
-        assert "tool_calls" in result[0]
-
-    def test_openai_keeps_complete_pairs(self):
-        """Test that complete OpenAI pairs are unchanged."""
-        all_msgs = [
-            {"role": "system", "content": "System"},
-            {
-                "role": "assistant",
-                "tool_calls": [
-                    {"id": "call_1", "type": "function", "function": {"name": "f1"}}
-                ],
-            },
-            {"role": "tool", "tool_call_id": "call_1", "content": "result"},
-        ]
-        recent = all_msgs[1:]  # Include both tool_call and response
-        start_index = 1
-
-        result = _ensure_tool_pairs_intact(recent, all_msgs, start_index)
-
-        assert len(result) == 2  # No messages added
-
-    def test_openai_multiple_tool_calls(self):
-        """Test multiple OpenAI tool calls in one assistant message."""
-        all_msgs = [
-            {"role": "system", "content": "System"},
-            {
-                "role": "assistant",
-                "tool_calls": [
-                    {"id": "call_1", "type": "function", "function": {"name": "f1"}},
-                    {"id": "call_2", "type": "function", "function": {"name": "f2"}},
-                ],
-            },
-            {"role": "tool", "tool_call_id": "call_1", "content": "result1"},
-            {"role": "tool", "tool_call_id": "call_2", "content": "result2"},
-            {"role": "user", "content": "Thanks!"},
-        ]
-        # Recent messages start at index 2 (first tool response)
-        recent = [all_msgs[2], all_msgs[3], all_msgs[4]]
-        start_index = 2
-
-        result = _ensure_tool_pairs_intact(recent, all_msgs, start_index)
-
-        # Should prepend the assistant message with both tool_calls
-        assert len(result) == 4
-        assert result[0]["role"] == "assistant"
-        assert len(result[0]["tool_calls"]) == 2
-
-    # ---- Anthropic Format Tests ----
-
-    def test_anthropic_adds_missing_tool_use(self):
-        """Test that orphaned Anthropic tool_result gets its tool_use prepended."""
-        all_msgs = [
-            {"role": "system", "content": "You are helpful."},
-            {
-                "role": "assistant",
-                "content": [
-                    {
-                        "type": "tool_use",
-                        "id": "toolu_123",
-                        "name": "get_weather",
-                        "input": {"location": "SF"},
-                    }
-                ],
-            },
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "tool_result",
-                        "tool_use_id": "toolu_123",
-                        "content": "22°C and sunny",
-                    }
-                ],
-            },
-            {"role": "user", "content": "Thanks!"},
-        ]
-        # Recent messages start at index 2 (the tool_result)
-        recent = [all_msgs[2], all_msgs[3]]
-        start_index = 2
-
-        result = _ensure_tool_pairs_intact(recent, all_msgs, start_index)
-
-        # Should prepend the tool_use message
-        assert len(result) == 3
-        assert result[0]["role"] == "assistant"
-        assert result[0]["content"][0]["type"] == "tool_use"
-
-    def test_anthropic_keeps_complete_pairs(self):
-        """Test that complete Anthropic pairs are unchanged."""
-        all_msgs = [
-            {"role": "system", "content": "System"},
-            {
-                "role": "assistant",
-                "content": [
-                    {
-                        "type": "tool_use",
-                        "id": "toolu_456",
-                        "name": "calculator",
-                        "input": {"expr": "2+2"},
-                    }
-                ],
-            },
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "tool_result",
-                        "tool_use_id": "toolu_456",
-                        "content": "4",
-                    }
-                ],
-            },
-        ]
-        recent = all_msgs[1:]  # Include both tool_use and result
-        start_index = 1
-
-        result = _ensure_tool_pairs_intact(recent, all_msgs, start_index)
-
-        assert len(result) == 2  # No messages added
-
-    def test_anthropic_multiple_tool_uses(self):
-        """Test multiple Anthropic tool_use blocks in one message."""
-        all_msgs = [
-            {"role": "system", "content": "System"},
-            {
-                "role": "assistant",
-                "content": [
-                    {"type": "text", "text": "Let me check both..."},
-                    {
-                        "type": "tool_use",
-                        "id": "toolu_1",
-                        "name": "get_weather",
-                        "input": {"city": "NYC"},
-                    },
-                    {
-                        "type": "tool_use",
-                        "id": "toolu_2",
-                        "name": "get_weather",
-                        "input": {"city": "LA"},
-                    },
-                ],
-            },
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "tool_result",
-                        "tool_use_id": "toolu_1",
-                        "content": "Cold",
-                    },
-                    {
-                        "type": "tool_result",
-                        "tool_use_id": "toolu_2",
-                        "content": "Warm",
-                    },
-                ],
-            },
-            {"role": "user", "content": "Thanks!"},
-        ]
-        # Recent messages start at index 2 (tool_result)
-        recent = [all_msgs[2], all_msgs[3]]
-        start_index = 2
-
-        result = _ensure_tool_pairs_intact(recent, all_msgs, start_index)
-
-        # Should prepend the assistant message with both tool_uses
-        assert len(result) == 3
-        assert result[0]["role"] == "assistant"
-        tool_use_count = sum(
-            1 for b in result[0]["content"] if b.get("type") == "tool_use"
-        )
-        assert tool_use_count == 2
-
-    # ---- Mixed/Edge Case Tests ----
-
-    def test_anthropic_with_type_message_field(self):
-        """Test Anthropic format with 'type': 'message' field (smart_decision_maker style)."""
-        all_msgs = [
-            {"role": "system", "content": "You are helpful."},
-            {
-                "role": "assistant",
-                "content": [
-                    {
-                        "type": "tool_use",
-                        "id": "toolu_abc",
-                        "name": "search",
-                        "input": {"q": "test"},
-                    }
-                ],
-            },
-            {
-                "role": "user",
-                "type": "message",  # Extra field from smart_decision_maker
-                "content": [
-                    {
-                        "type": "tool_result",
-                        "tool_use_id": "toolu_abc",
-                        "content": "Found results",
-                    }
-                ],
-            },
-            {"role": "user", "content": "Thanks!"},
-        ]
-        # Recent messages start at index 2 (the tool_result with 'type': 'message')
-        recent = [all_msgs[2], all_msgs[3]]
-        start_index = 2
-
-        result = _ensure_tool_pairs_intact(recent, all_msgs, start_index)
-
-        # Should prepend the tool_use message
-        assert len(result) == 3
-        assert result[0]["role"] == "assistant"
-        assert result[0]["content"][0]["type"] == "tool_use"
-
-    def test_handles_no_tool_messages(self):
-        """Test messages without tool calls."""
-        all_msgs = [
-            {"role": "user", "content": "Hello"},
-            {"role": "assistant", "content": "Hi there!"},
-        ]
-        recent = all_msgs
-        start_index = 0
-
-        result = _ensure_tool_pairs_intact(recent, all_msgs, start_index)
-
-        assert result == all_msgs
-
-    def test_handles_empty_messages(self):
-        """Test empty message list."""
-        result = _ensure_tool_pairs_intact([], [], 0)
-        assert result == []
-
-    def test_mixed_text_and_tool_content(self):
-        """Test Anthropic message with mixed text and tool_use content."""
-        all_msgs = [
-            {
-                "role": "assistant",
-                "content": [
-                    {"type": "text", "text": "I'll help you with that."},
-                    {
-                        "type": "tool_use",
-                        "id": "toolu_mixed",
-                        "name": "search",
-                        "input": {"q": "test"},
-                    },
-                ],
-            },
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "tool_result",
-                        "tool_use_id": "toolu_mixed",
-                        "content": "Found results",
-                    }
-                ],
-            },
-            {"role": "assistant", "content": "Here are the results..."},
-        ]
-        # Start from tool_result
-        recent = [all_msgs[1], all_msgs[2]]
-        start_index = 1
-
-        result = _ensure_tool_pairs_intact(recent, all_msgs, start_index)
-
-        # Should prepend the assistant message with tool_use
-        assert len(result) == 3
-        assert result[0]["content"][0]["type"] == "text"
-        assert result[0]["content"][1]["type"] == "tool_use"
-
-
-class TestCompressContext:
-    """Test the async compress_context function."""
-
-    @pytest.mark.asyncio
-    async def test_no_compression_needed(self):
-        """Test messages under limit return without compression."""
-        messages = [
-            {"role": "system", "content": "You are helpful."},
-            {"role": "user", "content": "Hello!"},
-        ]
-
-        result = await compress_context(messages, target_tokens=100000)
-
-        assert isinstance(result, CompressResult)
-        assert result.was_compacted is False
-        assert len(result.messages) == 2
-        assert result.error is None
-
-    @pytest.mark.asyncio
-    async def test_truncation_without_client(self):
-        """Test that truncation works without LLM client."""
-        long_content = "x" * 50000
-        messages = [
-            {"role": "system", "content": "System"},
-            {"role": "user", "content": long_content},
-            {"role": "assistant", "content": "Response"},
-        ]
-
-        result = await compress_context(
-            messages, target_tokens=1000, client=None, reserve=100
-        )
-
-        assert result.was_compacted is True
-        # Should have truncated without summarization
-        assert result.messages_summarized == 0
-
-    @pytest.mark.asyncio
-    async def test_with_mocked_llm_client(self):
-        """Test summarization with mocked LLM client."""
-        # Create many messages to trigger summarization
-        messages = [{"role": "system", "content": "System prompt"}]
-        for i in range(30):
-            messages.append({"role": "user", "content": f"User message {i} " * 100})
-            messages.append(
-                {"role": "assistant", "content": f"Assistant response {i} " * 100}
-            )
-
-        # Mock the AsyncOpenAI client
-        mock_client = AsyncMock()
-        mock_response = MagicMock()
-        mock_response.choices = [MagicMock()]
-        mock_response.choices[0].message.content = "Summary of conversation"
-        mock_client.with_options.return_value.chat.completions.create = AsyncMock(
-            return_value=mock_response
-        )
-
-        result = await compress_context(
-            messages,
-            target_tokens=5000,
-            client=mock_client,
-            keep_recent=5,
-            reserve=500,
-        )
-
-        assert result.was_compacted is True
-        # Should have attempted summarization
-        assert mock_client.with_options.called or result.messages_summarized > 0
-
-    @pytest.mark.asyncio
-    async def test_preserves_tool_pairs(self):
-        """Test that tool call/response pairs stay together."""
-        messages = [
-            {"role": "system", "content": "System"},
-            {"role": "user", "content": "Do something"},
-            {
-                "role": "assistant",
-                "tool_calls": [
-                    {"id": "call_1", "type": "function", "function": {"name": "func"}}
-                ],
-            },
-            {"role": "tool", "tool_call_id": "call_1", "content": "Result " * 1000},
-            {"role": "assistant", "content": "Done!"},
-        ]
-
-        result = await compress_context(
-            messages, target_tokens=500, client=None, reserve=50
-        )
-
-        # Check that if tool response exists, its call exists too
-        tool_call_ids = set()
-        tool_response_ids = set()
-        for msg in result.messages:
-            if "tool_calls" in msg:
-                for tc in msg["tool_calls"]:
-                    tool_call_ids.add(tc["id"])
-            if msg.get("role") == "tool":
-                tool_response_ids.add(msg.get("tool_call_id"))
-
-        # All tool responses should have their calls
-        assert tool_response_ids <= tool_call_ids
-
-    @pytest.mark.asyncio
-    async def test_returns_error_when_cannot_compress(self):
-        """Test that error is returned when compression fails."""
-        # Single huge message that can't be compressed enough
-        messages = [
-            {"role": "user", "content": "x" * 100000},
-        ]
-
-        result = await compress_context(
-            messages, target_tokens=100, client=None, reserve=50
-        )
-
-        # Should have an error since we can't get below 100 tokens
-        assert result.error is not None
-        assert result.was_compacted is True
-
-    @pytest.mark.asyncio
-    async def test_empty_messages(self):
-        """Test that empty messages list returns early without error."""
-        result = await compress_context([], target_tokens=1000)
-
-        assert result.messages == []
-        assert result.token_count == 0
-        assert result.was_compacted is False
-        assert result.error is None
-
-
-class TestRemoveOrphanToolResponses:
-    """Test _remove_orphan_tool_responses helper function."""
-
-    def test_removes_openai_orphan(self):
-        """Test removal of orphan OpenAI tool response."""
-        from backend.util.prompt import _remove_orphan_tool_responses
-
-        messages = [
-            {"role": "tool", "tool_call_id": "call_orphan", "content": "result"},
-            {"role": "user", "content": "Hello"},
-        ]
-        orphan_ids = {"call_orphan"}
-
-        result = _remove_orphan_tool_responses(messages, orphan_ids)
-
-        assert len(result) == 1
-        assert result[0]["role"] == "user"
-
-    def test_keeps_valid_openai_tool(self):
-        """Test that valid OpenAI tool responses are kept."""
-        from backend.util.prompt import _remove_orphan_tool_responses
-
-        messages = [
-            {"role": "tool", "tool_call_id": "call_valid", "content": "result"},
-        ]
-        orphan_ids = {"call_other"}
-
-        result = _remove_orphan_tool_responses(messages, orphan_ids)
-
-        assert len(result) == 1
-        assert result[0]["tool_call_id"] == "call_valid"
-
-    def test_filters_anthropic_mixed_blocks(self):
-        """Test filtering individual orphan blocks from Anthropic message with mixed valid/orphan."""
-        from backend.util.prompt import _remove_orphan_tool_responses
-
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "tool_result",
-                        "tool_use_id": "toolu_valid",
-                        "content": "valid result",
-                    },
-                    {
-                        "type": "tool_result",
-                        "tool_use_id": "toolu_orphan",
-                        "content": "orphan result",
-                    },
-                ],
-            },
-        ]
-        orphan_ids = {"toolu_orphan"}
-
-        result = _remove_orphan_tool_responses(messages, orphan_ids)
-
-        assert len(result) == 1
-        # Should only have the valid tool_result, orphan filtered out
-        assert len(result[0]["content"]) == 1
-        assert result[0]["content"][0]["tool_use_id"] == "toolu_valid"
-
-    def test_removes_anthropic_all_orphan(self):
-        """Test removal of Anthropic message when all tool_results are orphans."""
-        from backend.util.prompt import _remove_orphan_tool_responses
-
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "tool_result",
-                        "tool_use_id": "toolu_orphan1",
-                        "content": "result1",
-                    },
-                    {
-                        "type": "tool_result",
-                        "tool_use_id": "toolu_orphan2",
-                        "content": "result2",
-                    },
-                ],
-            },
-        ]
-        orphan_ids = {"toolu_orphan1", "toolu_orphan2"}
-
-        result = _remove_orphan_tool_responses(messages, orphan_ids)
-
-        # Message should be completely removed since no content left
-        assert len(result) == 0
-
-    def test_preserves_non_tool_messages(self):
-        """Test that non-tool messages are preserved."""
-        from backend.util.prompt import _remove_orphan_tool_responses
-
-        messages = [
-            {"role": "user", "content": "Hello"},
-            {"role": "assistant", "content": "Hi there!"},
-        ]
-        orphan_ids = {"some_id"}
-
-        result = _remove_orphan_tool_responses(messages, orphan_ids)
-
-        assert result == messages
-
-
-class TestCompressResultDataclass:
-    """Test CompressResult dataclass."""
-
-    def test_default_values(self):
-        """Test default values are set correctly."""
-        result = CompressResult(
-            messages=[{"role": "user", "content": "test"}],
-            token_count=10,
-            was_compacted=False,
-        )
-
-        assert result.error is None
-        assert result.original_token_count == 0  # Defaults to 0, not None
-        assert result.messages_summarized == 0
-        assert result.messages_dropped == 0
-
-    def test_all_fields(self):
-        """Test all fields can be set."""
-        result = CompressResult(
-            messages=[{"role": "user", "content": "test"}],
-            token_count=100,
-            was_compacted=True,
-            error="Some error",
-            original_token_count=500,
-            messages_summarized=10,
-            messages_dropped=5,
-        )
-
-        assert result.token_count == 100
-        assert result.was_compacted is True
-        assert result.error == "Some error"
-        assert result.original_token_count == 500
-        assert result.messages_summarized == 10
-        assert result.messages_dropped == 5
--- a/autogpt_platform/backend/backend/util/validation.py
+++ b/autogpt_platform/backend/backend/util/validation.py
@@ -1,32 +0,0 @@
-"""Validation utilities."""
-
-import re
-
-_UUID_V4_PATTERN = re.compile(
-    r"[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-[a-f0-9]{12}",
-    re.IGNORECASE,
-)
-
-
-def is_uuid_v4(text: str) -> bool:
-    """Check if text is a valid UUID v4.
-
-    Args:
-        text: String to validate
-
-    Returns:
-        True if the text is a valid UUID v4, False otherwise
-    """
-    return bool(_UUID_V4_PATTERN.fullmatch(text.strip()))
-
-
-def extract_uuids(text: str) -> list[str]:
-    """Extract all UUID v4 strings from text.
-
-    Args:
-        text: String to search for UUIDs
-
-    Returns:
-        List of unique UUIDs found (lowercase)
-    """
-    return list({m.lower() for m in _UUID_V4_PATTERN.findall(text)})
--- a/autogpt_platform/backend/test/agent_generator/test_service.py
+++ b/autogpt_platform/backend/test/agent_generator/test_service.py
@@ -102,7 +102,7 @@ class TestDecomposeGoalExternal:

    @pytest.mark.asyncio
    async def test_decompose_goal_with_context(self):
-        """Test decomposition with additional context enriched into description."""
+        """Test decomposition with additional context."""
        mock_response = MagicMock()
        mock_response.json.return_value = {
            "success": True,
@@ -119,12 +119,9 @@ class TestDecomposeGoalExternal:
                "Build a chatbot", context="Use Python"
            )

-        expected_description = (
-            "Build a chatbot\n\nAdditional context from user:\nUse Python"
-        )
        mock_client.post.assert_called_once_with(
            "/api/decompose-description",
-            json={"description": expected_description},
+            json={"description": "Build a chatbot", "user_instruction": "Use Python"},
        )

    @pytest.mark.asyncio