Compare commits

..

2 Commits

Author SHA1 Message Date
Bentlybro
90dfed68af Improve chat context summarization logic
Added timeout parameter to summarization client and limited conversation text length for safety. Enhanced message summarization to handle system prompts, avoid summarizing when too few old messages, and improved logging for summarization actions.
2026-01-26 20:05:42 +00:00
Bentlybro
2c84ab1d55 Add context window management with message summarization
Introduces logic to summarize older chat messages when the token count exceeds 120,000, preserving recent messages and inserting a summary to maintain context. Adds an async helper function to perform summarization using an OpenAI model, improving efficiency and preventing context overflow in long conversations.
2026-01-26 19:55:27 +00:00

View File

@@ -673,6 +673,69 @@ def _is_region_blocked_error(error: Exception) -> bool:
return "not available in your region" in str(error).lower()
async def _summarize_messages(
messages: list,
model: str = "openai/gpt-4o-mini",
api_key: str | None = None,
base_url: str | None = None,
timeout: float = 30.0,
) -> str:
"""Summarize a list of messages into concise context.
Args:
messages: List of message dicts to summarize
model: Model to use for summarization (default: gpt-4o-mini)
api_key: API key for OpenAI client
base_url: Base URL for OpenAI client
timeout: Request timeout in seconds (default: 30.0)
Returns:
Summarized text
"""
# Format messages for summarization
conversation = []
for msg in messages:
role = msg.get("role", "")
content = msg.get("content", "")
if content and role in ("user", "assistant"):
conversation.append(f"{role.upper()}: {content}")
conversation_text = "\n\n".join(conversation)
# Truncate conversation to fit within summarization model's context
# gpt-4o-mini has 128k context, but we limit to ~25k tokens (~100k chars) for safety
MAX_CHARS = 100_000
if len(conversation_text) > MAX_CHARS:
conversation_text = conversation_text[:MAX_CHARS] + "\n\n[truncated]"
# Call LLM to summarize
import openai
summarization_client = openai.AsyncOpenAI(
api_key=api_key, base_url=base_url, timeout=timeout
)
response = await summarization_client.chat.completions.create(
model=model,
messages=[
{
"role": "system",
"content": (
"Summarize this conversation history concisely. "
"Preserve key facts, decisions, and context. "
"Format as 2-3 short paragraphs."
),
},
{"role": "user", "content": f"Summarize:\n\n{conversation_text}"},
],
max_tokens=500,
temperature=0.3,
)
summary = response.choices[0].message.content
return summary or "No summary available."
async def _stream_chat_chunks(
session: ChatSession,
tools: list[ChatCompletionToolParam],
@@ -709,6 +772,89 @@ async def _stream_chat_chunks(
)
messages = [system_message] + messages
# Apply context window management
# Best-effort: when the estimated token count exceeds a threshold, replace
# older messages with an LLM-generated summary, preserving the (optional)
# leading system prompt and the most recent turns. Any failure is logged
# and the original, unsummarized message list is used as a fallback.
try:
from backend.util.prompt import estimate_token_count
# Convert to dict for token counting
# OpenAI message types are TypedDicts, so they're already dict-like
messages_dict = []
for msg in messages:
# TypedDict objects are already dicts, just filter None values
# (None-valued keys would pollute the token estimate).
if isinstance(msg, dict):
msg_dict = {k: v for k, v in msg.items() if v is not None}
else:
# Fallback for unexpected types
msg_dict = dict(msg)
messages_dict.append(msg_dict)
# Estimate tokens
# NOTE(review): tokens are always estimated with the "gpt-4o" tokenizer,
# regardless of the session's actual model — presumably a close enough
# approximation; confirm for non-OpenAI models.
token_count = estimate_token_count(messages_dict, model="gpt-4o")
# If over threshold, summarize old messages
if token_count > 120_000:
KEEP_RECENT = 15
MIN_MESSAGES_TO_SUMMARIZE = 5  # Don't summarize if too few old messages
# Check if we have a system prompt at the start
has_system_prompt = (
len(messages) > 0 and messages[0].get("role") == "system"
)
if len(messages) > KEEP_RECENT:
# Split messages based on whether system prompt exists
# The last KEEP_RECENT messages are always preserved verbatim.
recent_messages = messages[-KEEP_RECENT:]
if has_system_prompt:
# Keep system prompt separate, summarize everything between system and recent
system_msg = messages[0]
old_messages_dict = messages_dict[1:-KEEP_RECENT]
else:
# No system prompt, summarize everything except recent
system_msg = None
old_messages_dict = messages_dict[:-KEEP_RECENT]
# Only summarize if we have enough old messages
if len(old_messages_dict) >= MIN_MESSAGES_TO_SUMMARIZE:
# Summarize old messages
# NOTE(review): `config.api_key` / `config.base_url` come from the
# enclosing module's scope — defined outside this view; verify they
# are valid for the hard-coded gpt-4o-mini summarization model.
summary_text = await _summarize_messages(
old_messages_dict,
model="openai/gpt-4o-mini",
api_key=config.api_key,
base_url=config.base_url,
)
# Build new message list
from openai.types.chat import ChatCompletionSystemMessageParam
# Inject the summary as a system message so the model treats it as
# authoritative prior context rather than a user/assistant turn.
summary_msg = ChatCompletionSystemMessageParam(
role="system",
content=f"[Previous conversation summary]: {summary_text}",
)
# Rebuild messages based on whether we have a system prompt
if has_system_prompt:
# system_prompt + summary + recent_messages
messages = [system_msg, summary_msg] + recent_messages
else:
# summary + recent_messages (no original system prompt)
messages = [summary_msg] + recent_messages
logger.info(
f"Context summarized: {token_count} tokens, "
f"summarized {len(old_messages_dict)} old messages, "
f"kept last {KEEP_RECENT} messages"
)
else:
logger.info(
f"Skipping summarization: only {len(old_messages_dict)} old messages "
f"(minimum {MIN_MESSAGES_TO_SUMMARIZE} required)"
)
except Exception as e:
logger.error(f"Context summarization failed: {e}", exc_info=True)
# Continue with original messages (fallback)
# Loop to handle tool calls and continue conversation
while True:
retry_count = 0