Compare commits

..

2 Commits

Author SHA1 Message Date
Bentlybro
90dfed68af Improve chat context summarization logic
Added timeout parameter to summarization client and limited conversation text length for safety. Enhanced message summarization to handle system prompts, avoid summarizing when too few old messages, and improved logging for summarization actions.
2026-01-26 20:05:42 +00:00
Bentlybro
2c84ab1d55 Add context window management with message summarization
Introduces logic to summarize older chat messages when the token count exceeds 120,000, preserving recent messages and inserting a summary to maintain context. Adds an async helper function to perform summarization using an OpenAI model, improving efficiency and preventing context overflow in long conversations.
2026-01-26 19:55:27 +00:00

View File

@@ -673,6 +673,69 @@ def _is_region_blocked_error(error: Exception) -> bool:
return "not available in your region" in str(error).lower()
async def _summarize_messages(
messages: list,
model: str = "openai/gpt-4o-mini",
api_key: str | None = None,
base_url: str | None = None,
timeout: float = 30.0,
) -> str:
"""Summarize a list of messages into concise context.
Args:
messages: List of message dicts to summarize
model: Model to use for summarization (default: gpt-4o-mini)
api_key: API key for OpenAI client
base_url: Base URL for OpenAI client
timeout: Request timeout in seconds (default: 30.0)
Returns:
Summarized text
"""
# Format messages for summarization
conversation = []
for msg in messages:
role = msg.get("role", "")
content = msg.get("content", "")
if content and role in ("user", "assistant"):
conversation.append(f"{role.upper()}: {content}")
conversation_text = "\n\n".join(conversation)
# Truncate conversation to fit within summarization model's context
# gpt-4o-mini has 128k context, but we limit to ~25k tokens (~100k chars) for safety
MAX_CHARS = 100_000
if len(conversation_text) > MAX_CHARS:
conversation_text = conversation_text[:MAX_CHARS] + "\n\n[truncated]"
# Call LLM to summarize
import openai
summarization_client = openai.AsyncOpenAI(
api_key=api_key, base_url=base_url, timeout=timeout
)
response = await summarization_client.chat.completions.create(
model=model,
messages=[
{
"role": "system",
"content": (
"Summarize this conversation history concisely. "
"Preserve key facts, decisions, and context. "
"Format as 2-3 short paragraphs."
),
},
{"role": "user", "content": f"Summarize:\n\n{conversation_text}"},
],
max_tokens=500,
temperature=0.3,
)
summary = response.choices[0].message.content
return summary or "No summary available."
async def _stream_chat_chunks(
session: ChatSession,
tools: list[ChatCompletionToolParam],
@@ -709,6 +772,89 @@ async def _stream_chat_chunks(
)
messages = [system_message] + messages
# Apply context window management
# Best-effort: when the estimated token count exceeds a threshold, replace
# older messages with an LLM-generated summary, preserving the (optional)
# leading system prompt and the most recent turns. Any failure is logged
# and the original, unsummarized message list is used as a fallback.
try:
from backend.util.prompt import estimate_token_count
# Convert to dict for token counting
# OpenAI message types are TypedDicts, so they're already dict-like
messages_dict = []
for msg in messages:
# TypedDict objects are already dicts, just filter None values
# (None-valued keys would pollute the token estimate).
if isinstance(msg, dict):
msg_dict = {k: v for k, v in msg.items() if v is not None}
else:
# Fallback for unexpected types
msg_dict = dict(msg)
messages_dict.append(msg_dict)
# Estimate tokens
# NOTE(review): tokens are always estimated with the "gpt-4o" tokenizer,
# regardless of the session's actual model — presumably a close enough
# approximation; confirm for non-OpenAI models.
token_count = estimate_token_count(messages_dict, model="gpt-4o")
# If over threshold, summarize old messages
if token_count > 120_000:
KEEP_RECENT = 15
MIN_MESSAGES_TO_SUMMARIZE = 5  # Don't summarize if too few old messages
# Check if we have a system prompt at the start
has_system_prompt = (
len(messages) > 0 and messages[0].get("role") == "system"
)
if len(messages) > KEEP_RECENT:
# Split messages based on whether system prompt exists
# The last KEEP_RECENT messages are always preserved verbatim.
recent_messages = messages[-KEEP_RECENT:]
if has_system_prompt:
# Keep system prompt separate, summarize everything between system and recent
system_msg = messages[0]
old_messages_dict = messages_dict[1:-KEEP_RECENT]
else:
# No system prompt, summarize everything except recent
system_msg = None
old_messages_dict = messages_dict[:-KEEP_RECENT]
# Only summarize if we have enough old messages
if len(old_messages_dict) >= MIN_MESSAGES_TO_SUMMARIZE:
# Summarize old messages
# NOTE(review): `config.api_key` / `config.base_url` come from the
# enclosing module's scope — defined outside this view; verify they
# are valid for the hard-coded gpt-4o-mini summarization model.
summary_text = await _summarize_messages(
old_messages_dict,
model="openai/gpt-4o-mini",
api_key=config.api_key,
base_url=config.base_url,
)
# Build new message list
from openai.types.chat import ChatCompletionSystemMessageParam
# Inject the summary as a system message so the model treats it as
# authoritative prior context rather than a user/assistant turn.
summary_msg = ChatCompletionSystemMessageParam(
role="system",
content=f"[Previous conversation summary]: {summary_text}",
)
# Rebuild messages based on whether we have a system prompt
if has_system_prompt:
# system_prompt + summary + recent_messages
messages = [system_msg, summary_msg] + recent_messages
else:
# summary + recent_messages (no original system prompt)
messages = [summary_msg] + recent_messages
logger.info(
f"Context summarized: {token_count} tokens, "
f"summarized {len(old_messages_dict)} old messages, "
f"kept last {KEEP_RECENT} messages"
)
else:
logger.info(
f"Skipping summarization: only {len(old_messages_dict)} old messages "
f"(minimum {MIN_MESSAGES_TO_SUMMARIZE} required)"
)
except Exception as e:
logger.error(f"Context summarization failed: {e}", exc_info=True)
# Continue with original messages (fallback)
# Loop to handle tool calls and continue conversation
while True:
retry_count = 0