mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
## Summary

- Fixes SmartDecisionMakerBlock conversation management to work with OpenAI's Responses API, which was introduced in #12099 (commit 1240f38)
- The migration to `responses.create` updated the outbound LLM call but missed the conversation history serialization — the `raw_response` is now the entire `Response` object (not a `ChatCompletionMessage`), and tool calls/results use `function_call` / `function_call_output` types instead of role-based messages
- This caused a 400 error on the second LLM call in agent mode: `"Invalid value: ''. Supported values are: 'assistant', 'system', 'developer', and 'user'."`

### Changes

**`smart_decision_maker.py`** — 6 functions updated:

| Function | Fix |
|---|---|
| `_convert_raw_response_to_dict` | Detects Responses API `Response` objects, extracts output items as a list |
| `_get_tool_requests` | Recognizes `type: "function_call"` items |
| `_get_tool_responses` | Recognizes `type: "function_call_output"` items |
| `_create_tool_response` | New `responses_api` kwarg produces `function_call_output` format |
| `_update_conversation` | Handles list return from `_convert_raw_response_to_dict` |
| Non-agent mode path | Same list handling for traditional execution |

**`test_smart_decision_maker_responses_api.py`** — 61 tests covering:

- Every branch of all 6 affected helper functions
- Chat Completions, Anthropic, and Responses API formats
- End-to-end agent mode and traditional mode conversation validity

## Test plan

- [x] 61 new unit tests all pass
- [x] 11 existing SmartDecisionMakerBlock tests still pass (no regressions)
- [x] All pre-commit hooks pass (ruff, black, isort, pyright)
- [ ] CI integration tests

🤖 Generated with [Claude Code](https://claude.com/claude-code)

<!-- CURSOR_SUMMARY -->
---

> [!NOTE]
> **Medium Risk**
> Updates core LLM invocation and agent conversation/tool-call bookkeeping to match OpenAI’s Responses API, which can affect tool execution loops and prompt serialization across providers. Risk is mitigated by extensive new unit tests, but regressions could surface in production agent-mode flows or token/usage accounting.
>
> **Overview**
> **Migrates OpenAI calls from Chat Completions to the Responses API end-to-end**, including tool schema conversion, output parsing, reasoning/text extraction, and updated token usage fields in `LLMResponse`.
>
> **Fixes SmartDecisionMakerBlock conversation/tool handling for Responses API** by treating `raw_response` as a Response object (splitting it into `output` items for replay), recognizing `function_call`/`function_call_output` entries, and emitting tool outputs in the correct Responses format to prevent invalid follow-up prompts.
>
> Also adjusts prompt compaction/token estimation to understand Responses API tool items, changes `get_execution_outputs_by_node_exec_id` to return list-valued `CompletedBlockOutput`, removes `gpt-3.5-turbo` from model/cost/docs lists, and adds focused unit tests plus a lightweight `conftest.py` to run these tests without the full server stack.
>
> <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit ff292efd3d. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-authored-by: Otto <otto@agpt.co>
Co-authored-by: Krzysztof Czerwinski <kpczerwinski@gmail.com>
874 lines
33 KiB
Python
874 lines
33 KiB
Python
from __future__ import annotations
|
||
|
||
import logging
|
||
from copy import deepcopy
|
||
from dataclasses import dataclass
|
||
from typing import TYPE_CHECKING, Any
|
||
|
||
from tiktoken import encoding_for_model
|
||
|
||
from backend.util import json
|
||
|
||
if TYPE_CHECKING:
|
||
from openai import AsyncOpenAI
|
||
|
||
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||
|
||
# ---------------------------------------------------------------------------#
|
||
# CONSTANTS #
|
||
# ---------------------------------------------------------------------------#
|
||
|
||
# Message prefixes for important system messages that should be protected during compression.
# _is_objective_message() treats any message whose string content starts with
# this prefix as an objective message; compress_context() then skips it in the
# middle-message truncation and middle-out deletion passes.
MAIN_OBJECTIVE_PREFIX = "[Main Objective Prompt]: "
|
||
|
||
# ---------------------------------------------------------------------------#
|
||
# INTERNAL UTILITIES #
|
||
# ---------------------------------------------------------------------------#
|
||
|
||
|
||
def _tok_len(text: str, enc) -> int:
|
||
"""True token length of *text* in tokenizer *enc* (no wrapper cost)."""
|
||
return len(enc.encode(str(text)))
|
||
|
||
|
||
def _msg_tokens(msg: dict, enc) -> int:
    """
    Estimate the token cost of a single conversation item.

    The estimate is a fixed wrapper of 3 tokens per message (plus 1 when a
    "name" key is present) plus the tokenised payload. Supported shapes:

    - Responses API ``function_call``: name + arguments + call_id.
    - Responses API ``function_call_output``: output + call_id.
    - Chat Completions: string content plus, for each entry of a
      ``tool_calls`` list, its id, type, function name and arguments.
    - Anthropic-style list content: ``tool_use`` (id, name, JSON-dumped
      input), ``tool_result`` (tool_use_id, content), ``text`` blocks, and
      any other dict block that carries a "content" key.

    Parameters
    ----------
    msg     The message / item dict to measure.
    enc     Tokenizer object exposing ``encode``.

    Returns
    -------
    int – Estimated token count for *msg*.
    """
    wrapper = 3 + (1 if "name" in msg else 0)

    msg_type = msg.get("type")

    # Responses API: function_call items have arguments + name
    if msg_type == "function_call":
        return (
            wrapper
            + _tok_len(msg.get("name", ""), enc)
            + _tok_len(msg.get("arguments", ""), enc)
            + _tok_len(msg.get("call_id", ""), enc)
        )

    # Responses API: function_call_output items have output
    if msg_type == "function_call_output":
        return (
            wrapper
            + _tok_len(msg.get("output", ""), enc)
            + _tok_len(msg.get("call_id", ""), enc)
        )

    tool_call_tokens = 0

    # OpenAI Chat Completions format: tool_calls array at message level
    if "tool_calls" in msg and isinstance(msg["tool_calls"], list):
        for tool_call in msg["tool_calls"]:
            tool_call_tokens += _tok_len(tool_call.get("id", ""), enc)
            tool_call_tokens += _tok_len(tool_call.get("type", ""), enc)
            if "function" in tool_call:
                function = tool_call["function"]
                tool_call_tokens += _tok_len(function.get("name", ""), enc)
                tool_call_tokens += _tok_len(function.get("arguments", ""), enc)

    content = msg.get("content")
    if isinstance(content, list):
        # List content is counted block by block below, so the list as a
        # whole is never tokenised (doing so would be discarded work).
        content_tokens = 0
        for item in content:
            if not isinstance(item, dict):
                continue
            item_type = item.get("type")
            if item_type == "tool_use":
                tool_call_tokens += _tok_len(item.get("id", ""), enc)
                tool_call_tokens += _tok_len(item.get("name", ""), enc)
                tool_call_tokens += _tok_len(json.dumps(item.get("input", {})), enc)
            elif item_type == "tool_result":
                tool_call_tokens += _tok_len(item.get("tool_use_id", ""), enc)
                tool_call_tokens += _tok_len(item.get("content", ""), enc)
            elif item_type == "text":
                # Standard key is "text"; fall back to "content"
                text_val = item.get("text") or item.get("content", "")
                tool_call_tokens += _tok_len(text_val, enc)
            elif "content" in item:
                # Other block types that carry a content field
                tool_call_tokens += _tok_len(item.get("content", ""), enc)
    else:
        # Plain (string / missing) content
        content_tokens = _tok_len(content or "", enc)

    return wrapper + content_tokens + tool_call_tokens
|
||
|
||
|
||
def _is_tool_message(msg: dict) -> bool:
|
||
"""Check if a message contains tool calls or results that should be protected."""
|
||
# Responses API: standalone function_call / function_call_output items
|
||
if msg.get("type") in ("function_call", "function_call_output"):
|
||
return True
|
||
|
||
content = msg.get("content")
|
||
|
||
# Check for Anthropic-style tool messages
|
||
if isinstance(content, list) and any(
|
||
isinstance(item, dict) and item.get("type") in ("tool_use", "tool_result")
|
||
for item in content
|
||
):
|
||
return True
|
||
|
||
# Check for OpenAI Chat Completions-style tool calls in the message
|
||
if "tool_calls" in msg or msg.get("role") == "tool":
|
||
return True
|
||
|
||
return False
|
||
|
||
|
||
def _is_objective_message(msg: dict) -> bool:
    """Tell whether *msg* is an objective prompt that must be protected.

    True only when the message content is a string beginning with
    MAIN_OBJECTIVE_PREFIX; non-string content is never an objective message.
    """
    body = msg.get("content", "")
    return isinstance(body, str) and body.startswith(MAIN_OBJECTIVE_PREFIX)
|
||
|
||
|
||
def _truncate_tool_message_content(msg: dict, enc, max_tokens: int) -> None:
    """
    Shorten the result payload of a tool message in place, keeping its structure.

    Only string payloads longer than *max_tokens* are touched:
    - Responses API ``function_call_output``: the "output" field
    - Chat Completions ``role="tool"``: the string "content"
    - Anthropic list content: the "content" of each ``tool_result`` block
      (``tool_use`` blocks are left untouched)
    """

    def _too_long(value) -> bool:
        return isinstance(value, str) and _tok_len(value, enc) > max_tokens

    # Responses API tool output
    if msg.get("type") == "function_call_output":
        output = msg.get("output", "")
        if _too_long(output):
            msg["output"] = _truncate_middle_tokens(output, enc, max_tokens)
        return

    content = msg.get("content")

    # String content: only Chat Completions tool messages are truncated here
    if isinstance(content, str):
        if msg.get("role") == "tool" and _too_long(content):
            msg["content"] = _truncate_middle_tokens(content, enc, max_tokens)
        return

    if not isinstance(content, list):
        return

    # Anthropic-style blocks: shorten tool_result payloads only
    for block in content:
        if isinstance(block, dict) and block.get("type") == "tool_result":
            payload = block.get("content", "")
            if _too_long(payload):
                block["content"] = _truncate_middle_tokens(payload, enc, max_tokens)
|
||
|
||
|
||
def _truncate_middle_tokens(text: str, enc, max_tok: int) -> str:
|
||
"""
|
||
Return *text* shortened to ≈max_tok tokens by keeping the head & tail
|
||
and inserting an ellipsis token in the middle.
|
||
"""
|
||
ids = enc.encode(str(text))
|
||
if len(ids) <= max_tok:
|
||
return text # nothing to do
|
||
|
||
# Need at least 3 tokens (head + ellipsis + tail) for meaningful truncation
|
||
if max_tok < 1:
|
||
return ""
|
||
mid = enc.encode(" … ")
|
||
if max_tok < 3:
|
||
return enc.decode(ids[:max_tok])
|
||
|
||
# Split the allowance between the two ends:
|
||
head = max_tok // 2 - 1 # -1 for the ellipsis
|
||
tail = max_tok - head - 1
|
||
return enc.decode(ids[:head] + mid + ids[-tail:])
|
||
|
||
|
||
# ---------------------------------------------------------------------------#
|
||
# PUBLIC API #
|
||
# ---------------------------------------------------------------------------#
|
||
|
||
|
||
def estimate_token_count(
    messages: list[dict],
    *,
    model: str = "gpt-4o",
) -> int:
    """
    Sum the estimated token cost of every message in *messages*.

    Parameters
    ----------
    messages    Complete chat history.
    model       Model name; normalised by _normalize_model_for_tokenizer and
                then handed to tiktoken's ``encoding_for_model`` to pick the
                tokenizer.

    Returns
    -------
    int – Token count.
    """
    enc = encoding_for_model(_normalize_model_for_tokenizer(model))
    total = 0
    for message in messages:
        total += _msg_tokens(message, enc)
    return total
|
||
|
||
|
||
def estimate_token_count_str(
    text: Any,
    *,
    model: str = "gpt-4o",
) -> int:
    """
    Count the tokens of *text* under the tokenizer selected for *model*.

    Parameters
    ----------
    text    Input text. Non-string values are serialised with ``json.dumps``
            before counting.
    model   Model name; normalised by _normalize_model_for_tokenizer and
            then handed to tiktoken's ``encoding_for_model`` to pick the
            tokenizer.

    Returns
    -------
    int – Token count.
    """
    enc = encoding_for_model(_normalize_model_for_tokenizer(model))
    if not isinstance(text, str):
        text = json.dumps(text)
    return _tok_len(text, enc)
|
||
|
||
|
||
# ---------------------------------------------------------------------------#
|
||
# UNIFIED CONTEXT COMPRESSION #
|
||
# ---------------------------------------------------------------------------#
|
||
|
||
# Default thresholds
# Default for compress_context()'s ``target_tokens`` parameter.
DEFAULT_TOKEN_THRESHOLD = 120_000
# Default for compress_context()'s ``keep_recent`` parameter.
DEFAULT_KEEP_RECENT = 15
|
||
|
||
|
||
@dataclass
class CompressResult:
    """Result of context compression, as returned by compress_context()."""

    # Message list after compression (a deep copy of the input when no
    # compression was needed).
    messages: list[dict]
    # Estimated token count of `messages` (does not include the reserve).
    token_count: int
    # False when the input was empty or already fit the budget; True otherwise.
    was_compacted: bool
    # Set when the result still exceeds the target after every strategy ran.
    error: str | None = None
    # Estimated token count of the input before any compression.
    original_token_count: int = 0
    # Number of old messages replaced by an LLM-generated summary.
    messages_summarized: int = 0
    # Number of messages removed by the middle-out deletion pass.
    messages_dropped: int = 0
|
||
|
||
|
||
def _normalize_model_for_tokenizer(model: str) -> str:
|
||
"""Normalize model name for tiktoken tokenizer selection."""
|
||
if "/" in model:
|
||
model = model.split("/")[-1]
|
||
if "claude" in model.lower() or not any(
|
||
known in model.lower() for known in ["gpt", "o1", "chatgpt", "text-"]
|
||
):
|
||
return "gpt-4o"
|
||
return model
|
||
|
||
|
||
def _extract_tool_call_ids_from_message(msg: dict) -> set[str]:
|
||
"""
|
||
Extract tool_call IDs from an assistant message.
|
||
|
||
Supports all formats:
|
||
- OpenAI Chat Completions: {"role": "assistant", "tool_calls": [{"id": "..."}]}
|
||
- Anthropic: {"role": "assistant", "content": [{"type": "tool_use", "id": "..."}]}
|
||
- OpenAI Responses API: {"type": "function_call", "call_id": "..."}
|
||
|
||
Returns:
|
||
Set of tool_call IDs found in the message.
|
||
"""
|
||
ids: set[str] = set()
|
||
|
||
# Responses API: standalone function_call item
|
||
if msg.get("type") == "function_call":
|
||
if call_id := msg.get("call_id"):
|
||
ids.add(call_id)
|
||
return ids
|
||
|
||
if msg.get("role") != "assistant":
|
||
return ids
|
||
|
||
# OpenAI Chat Completions format: tool_calls array
|
||
if msg.get("tool_calls"):
|
||
for tc in msg["tool_calls"]:
|
||
tc_id = tc.get("id")
|
||
if tc_id:
|
||
ids.add(tc_id)
|
||
|
||
# Anthropic format: content list with tool_use blocks
|
||
content = msg.get("content")
|
||
if isinstance(content, list):
|
||
for block in content:
|
||
if isinstance(block, dict) and block.get("type") == "tool_use":
|
||
tc_id = block.get("id")
|
||
if tc_id:
|
||
ids.add(tc_id)
|
||
|
||
return ids
|
||
|
||
|
||
def _extract_tool_response_ids_from_message(msg: dict) -> set[str]:
|
||
"""
|
||
Extract tool_call IDs that this message is responding to.
|
||
|
||
Supports all formats:
|
||
- OpenAI Chat Completions: {"role": "tool", "tool_call_id": "..."}
|
||
- Anthropic: {"role": "user", "content": [{"type": "tool_result", "tool_use_id": "..."}]}
|
||
- OpenAI Responses API: {"type": "function_call_output", "call_id": "..."}
|
||
|
||
Returns:
|
||
Set of tool_call IDs this message responds to.
|
||
"""
|
||
ids: set[str] = set()
|
||
|
||
# Responses API: standalone function_call_output item
|
||
if msg.get("type") == "function_call_output":
|
||
if call_id := msg.get("call_id"):
|
||
ids.add(call_id)
|
||
return ids
|
||
|
||
# OpenAI Chat Completions format: role=tool with tool_call_id
|
||
if msg.get("role") == "tool":
|
||
tc_id = msg.get("tool_call_id")
|
||
if tc_id:
|
||
ids.add(tc_id)
|
||
|
||
# Anthropic format: content list with tool_result blocks
|
||
content = msg.get("content")
|
||
if isinstance(content, list):
|
||
for block in content:
|
||
if isinstance(block, dict) and block.get("type") == "tool_result":
|
||
tc_id = block.get("tool_use_id")
|
||
if tc_id:
|
||
ids.add(tc_id)
|
||
|
||
return ids
|
||
|
||
|
||
def _is_tool_response_message(msg: dict) -> bool:
|
||
"""Check if message is a tool response (Chat Completions, Anthropic, or Responses API)."""
|
||
# Responses API format
|
||
if msg.get("type") == "function_call_output":
|
||
return True
|
||
# OpenAI Chat Completions format
|
||
if msg.get("role") == "tool":
|
||
return True
|
||
# Anthropic format
|
||
content = msg.get("content")
|
||
if isinstance(content, list):
|
||
for block in content:
|
||
if isinstance(block, dict) and block.get("type") == "tool_result":
|
||
return True
|
||
return False
|
||
|
||
|
||
def _remove_orphan_tool_responses(
|
||
messages: list[dict], orphan_ids: set[str]
|
||
) -> list[dict]:
|
||
"""
|
||
Remove tool response messages/blocks that reference orphan tool_call IDs.
|
||
|
||
Supports OpenAI Chat Completions, Anthropic, and Responses API formats.
|
||
For Anthropic messages with mixed valid/orphan tool_result blocks,
|
||
filters out only the orphan blocks instead of dropping the entire message.
|
||
"""
|
||
result = []
|
||
for msg in messages:
|
||
# Responses API: function_call_output - drop if orphan
|
||
if msg.get("type") == "function_call_output":
|
||
if msg.get("call_id") in orphan_ids:
|
||
continue
|
||
result.append(msg)
|
||
continue
|
||
|
||
# OpenAI Chat Completions: role=tool - drop entire message if orphan
|
||
if msg.get("role") == "tool":
|
||
tc_id = msg.get("tool_call_id")
|
||
if tc_id and tc_id in orphan_ids:
|
||
continue
|
||
result.append(msg)
|
||
continue
|
||
|
||
# Anthropic format: content list may have mixed tool_result blocks
|
||
content = msg.get("content")
|
||
if isinstance(content, list):
|
||
has_tool_results = any(
|
||
isinstance(b, dict) and b.get("type") == "tool_result" for b in content
|
||
)
|
||
if has_tool_results:
|
||
# Filter out orphan tool_result blocks, keep valid ones
|
||
filtered_content = [
|
||
block
|
||
for block in content
|
||
if not (
|
||
isinstance(block, dict)
|
||
and block.get("type") == "tool_result"
|
||
and block.get("tool_use_id") in orphan_ids
|
||
)
|
||
]
|
||
# Only keep message if it has remaining content
|
||
if filtered_content:
|
||
msg = msg.copy()
|
||
msg["content"] = filtered_content
|
||
result.append(msg)
|
||
continue
|
||
|
||
result.append(msg)
|
||
return result
|
||
|
||
|
||
def validate_and_remove_orphan_tool_responses(
    messages: list[dict],
    log_warning: bool = True,
) -> list[dict]:
    """
    Validate tool_call/tool_response pairs and remove orphaned responses.

    Walks the messages in order while accumulating every tool_call ID seen
    so far. A tool response whose ID has not been seen by then (the current
    message's own calls included) is an orphan and is removed. This prevents
    API errors like Anthropic's "unexpected tool_use_id".

    Args:
        messages: List of messages to validate (Chat Completions, Anthropic
            or Responses API format)
        log_warning: Whether to log a warning when orphans are found

    Returns:
        The input list itself when no orphan is found; otherwise a new list
        with the orphaned tool responses removed.
    """
    seen_call_ids: set[str] = set()
    orphan_ids: set[str] = set()

    for msg in messages:
        seen_call_ids.update(_extract_tool_call_ids_from_message(msg))
        orphan_ids.update(
            _extract_tool_response_ids_from_message(msg) - seen_call_ids
        )

    if orphan_ids:
        if log_warning:
            logger.warning(
                f"Removing {len(orphan_ids)} orphan tool response(s): {orphan_ids}"
            )
        return _remove_orphan_tool_responses(messages, orphan_ids)

    return messages
|
||
|
||
|
||
def _ensure_tool_pairs_intact(
    recent_messages: list[dict],
    all_messages: list[dict],
    start_index: int,
) -> list[dict]:
    """
    Keep tool_call/tool_response pairs together after a slice.

    Slicing a conversation for compaction can cut a tool response away from
    the message that issued the call, which providers reject (e.g.
    Anthropic's "unexpected tool_use_id found in tool_result blocks").

    The slice is checked for tool responses whose call is missing. For each
    such call, the messages before *start_index* are searched backwards for
    the issuing message; it is prepended to the slice together with the tool
    responses that follow it and answer its calls. Responses whose call
    cannot be found at all are removed from the slice.

    ID extraction is delegated to _extract_tool_call_ids_from_message and
    _extract_tool_response_ids_from_message, so Chat Completions, Anthropic
    and Responses API shapes are all handled.

    Args:
        recent_messages: The sliced messages to validate
        all_messages: The complete message list (for looking up missing calls)
        start_index: The index in all_messages where recent_messages begins

    Returns:
        A potentially extended list of messages with tool pairs intact
    """
    if not recent_messages:
        return recent_messages

    # Every tool call issued inside the slice
    call_ids_in_slice: set[str] = set()
    for msg in recent_messages:
        call_ids_in_slice.update(_extract_tool_call_ids_from_message(msg))

    # Responses in the slice whose call is not in the slice
    unresolved: set[str] = set()
    for msg in recent_messages:
        unresolved.update(
            _extract_tool_response_ids_from_message(msg) - call_ids_in_slice
        )

    if not unresolved:
        # Slice is already self-consistent
        return recent_messages

    # Walk backwards through the messages preceding the slice
    prefix: list[dict] = []
    for idx in reversed(range(start_index)):
        candidate = all_messages[idx]
        candidate_ids = _extract_tool_call_ids_from_message(candidate)
        if not (candidate_ids & unresolved):
            continue

        # Caller found: gather it plus the tool responses that follow it
        group: list[dict] = [candidate]
        for j in range(idx + 1, start_index):
            later = all_messages[j]
            if _extract_tool_response_ids_from_message(later) & candidate_ids:
                group.append(later)
            elif not _is_tool_response_message(later):
                # Stop at the first message that is not a tool response
                break

        # Earlier groups go in front so chronological order is preserved
        prefix = group + prefix
        unresolved -= candidate_ids
        call_ids_in_slice |= candidate_ids

        if not unresolved:
            break

    if unresolved:
        # Not expected in normal operation: drop responses that have no call
        logger.warning(
            f"Could not find assistant messages for tool_call_ids: {unresolved}. "
            "Removing orphan tool responses."
        )
        recent_messages = _remove_orphan_tool_responses(recent_messages, unresolved)

    if not prefix:
        return recent_messages

    logger.info(
        f"Extended recent messages by {len(prefix)} to preserve "
        f"tool_call/tool_response pairs"
    )
    return prefix + recent_messages
|
||
|
||
|
||
async def _summarize_messages_llm(
    messages: list[dict],
    client: AsyncOpenAI,
    model: str,
    timeout: float = 30.0,
) -> str:
    """Ask an LLM for a factual summary of *messages*.

    The messages are flattened into a plain-text transcript (Responses API
    tool calls / outputs, plus user, assistant and tool messages that have
    content), capped at 100,000 characters, and sent through
    ``client.chat.completions.create`` with the given *timeout*.

    Returns the summary text, "No conversation history available." when the
    transcript is empty (no request is made), or "No summary available."
    when the model returns empty content.
    """
    transcript_lines = []
    for msg in messages:
        msg_type = msg.get("type")
        if msg_type == "function_call":
            # Responses API tool call
            tool_name = msg.get("name", "unknown_tool")
            tool_args = msg.get("arguments", "")
            transcript_lines.append(f"TOOL CALL ({tool_name}): {tool_args}")
        elif msg_type == "function_call_output":
            # Responses API tool output
            tool_output = msg.get("output", "")
            transcript_lines.append(f"TOOL OUTPUT: {tool_output}")
        else:
            role = msg.get("role", "")
            content = msg.get("content", "")
            if content and role in ("user", "assistant", "tool"):
                transcript_lines.append(f"{role.upper()}: {content}")

    conversation_text = "\n\n".join(transcript_lines)
    if not conversation_text:
        return "No conversation history available."

    # Limit to ~100k chars for safety
    max_chars = 100_000
    if len(conversation_text) > max_chars:
        conversation_text = conversation_text[:max_chars] + "\n\n[truncated]"

    system_prompt = (
        "Create a factual summary of the conversation so far. "
        "This summary will be used as context when continuing the conversation.\n\n"
        "CRITICAL: Only include information that is EXPLICITLY present in the "
        "conversation. Do NOT fabricate, infer, or invent any details. "
        "If a section has no relevant content in the conversation, skip it entirely.\n\n"
        "Before writing the summary, analyze each message chronologically to identify:\n"
        "- User requests and their explicit goals\n"
        "- Actions taken and key decisions made\n"
        "- Technical specifics (file names, tool outputs, function signatures)\n"
        "- Errors encountered and resolutions applied\n\n"
        "IMPORTANT: Preserve all concrete references verbatim — these are small but "
        "critical for continuing the conversation:\n"
        "- File paths and directory paths (e.g. /src/app/page.tsx, ./output/result.csv)\n"
        "- Image/media file paths from tool outputs\n"
        "- URLs, API endpoints, and webhook addresses\n"
        "- Resource IDs, session IDs, and identifiers\n"
        "- Tool names that were called and their key parameters\n"
        "- Environment variables, config keys, and credentials names (not values)\n\n"
        "Include ONLY the sections below that have relevant content "
        "(skip sections with nothing to report):\n\n"
        "## 1. Primary Request and Intent\n"
        "The user's explicit goals and what they are trying to accomplish.\n\n"
        "## 2. Key Technical Concepts\n"
        "Technologies, frameworks, tools, and patterns being used or discussed.\n\n"
        "## 3. Files and Resources Involved\n"
        "Specific files examined or modified, with relevant snippets and identifiers. "
        "Include exact file paths, image paths from tool outputs, and resource URLs.\n\n"
        "## 4. Errors and Fixes\n"
        "Problems encountered, error messages, and their resolutions.\n\n"
        "## 5. All User Messages\n"
        "A complete list of all user inputs (excluding tool outputs) "
        "to preserve their exact requests.\n\n"
        "## 6. Pending Tasks\n"
        "Work items the user explicitly requested that have not yet been completed.\n\n"
        "## 7. Current State\n"
        "What was happening most recently in the conversation."
    )
    request_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Summarize:\n\n{conversation_text}"},
    ]

    timed_client = client.with_options(timeout=timeout)
    response = await timed_client.chat.completions.create(
        model=model,
        messages=request_messages,
        max_tokens=2000,
        temperature=0.3,
    )

    return response.choices[0].message.content or "No summary available."
|
||
|
||
|
||
async def compress_context(
    messages: list[dict],
    target_tokens: int = DEFAULT_TOKEN_THRESHOLD,
    *,
    model: str = "gpt-4o",
    client: AsyncOpenAI | None = None,
    keep_recent: int = DEFAULT_KEEP_RECENT,
    reserve: int = 2_048,
    start_cap: int = 8_192,
    floor_cap: int = 128,
) -> CompressResult:
    """
    Unified context compression that combines summarization and truncation strategies.

    Strategy (in order; later steps only do work while the estimate plus
    *reserve* still exceeds *target_tokens*):
    1. **LLM summarization** – If client provided, summarize old messages into a
       single context message while keeping recent messages intact. A failure
       here is logged and the remaining strategies still run.
    2. **Content normalization / truncation** – Serialise non-string content of
       middle, non-tool messages to JSON, then progressively halve a
       per-message cap and truncate bloated middle-message content (tool
       outputs, large pastes). Preserves all messages but shortens their
       content. Primary strategy when client=None.
    3. **Middle-out deletion** – Delete whole messages one at a time from the center
       outward, skipping tool messages and objective messages.
    4. **First/last trim** – Truncate first and last message content as last resort.
    5. **Tool-pair validation** – Remove tool responses left without a
       preceding matching tool call.

    Parameters
    ----------
    messages        Complete chat history (deep-copied; the input is not mutated).
    target_tokens   Hard ceiling for prompt size.
    model           Model name for tokenization and summarization.
    client          AsyncOpenAI client. If provided, enables LLM summarization
                    as the first strategy. If None, skips to truncation strategies.
    keep_recent     Number of recent messages to preserve during summarization.
                    0 (or a negative value, treated as 0) preserves none, so
                    every non-system message is summarized.
    reserve         Tokens to reserve for model response.
    start_cap       Initial per-message truncation ceiling (tokens).
    floor_cap       Lowest cap before moving to deletions.

    Returns
    -------
    CompressResult with compressed messages and metadata. ``error`` is set
    when the result still does not fit after all strategies.
    """
    # Guard clause for empty messages
    if not messages:
        return CompressResult(
            messages=[],
            token_count=0,
            was_compacted=False,
            original_token_count=0,
        )

    token_model = _normalize_model_for_tokenizer(model)
    enc = encoding_for_model(token_model)
    msgs = deepcopy(messages)

    def total_tokens() -> int:
        # Reads the *current* binding of msgs, which step 1 may replace.
        return sum(_msg_tokens(m, enc) for m in msgs)

    original_count = total_tokens()

    # Already under limit
    if original_count + reserve <= target_tokens:
        return CompressResult(
            messages=msgs,
            token_count=original_count,
            was_compacted=False,
            original_token_count=original_count,
        )

    messages_summarized = 0
    messages_dropped = 0

    # ---- STEP 1: LLM summarization (if client provided) -------------------
    # Summarize old messages while keeping recent ones intact.
    if client is not None:
        has_system = msgs[0].get("role") == "system"
        system_msg = msgs[0] if has_system else None

        # Split by explicit index rather than negative slices: msgs[:-0] is
        # empty and msgs[-0:] is the whole list, which would silently
        # disable summarization for keep_recent == 0.
        body_start = 1 if has_system else 0
        split = max(body_start, len(msgs) - max(keep_recent, 0))
        old_msgs = msgs[body_start:split]
        recent_msgs = msgs[split:]

        # Ensure tool pairs stay intact
        recent_msgs = _ensure_tool_pairs_intact(recent_msgs, msgs, split)

        if old_msgs:
            try:
                summary_text = await _summarize_messages_llm(old_msgs, client, model)
                summary_msg = {
                    "role": "assistant",
                    "content": f"[Previous conversation summary — for context only]: {summary_text}",
                }
                messages_summarized = len(old_msgs)

                if has_system:
                    msgs = [system_msg, summary_msg] + recent_msgs
                else:
                    msgs = [summary_msg] + recent_msgs

                logger.info(
                    "Context summarized: %d -> %d tokens, summarized %d messages",
                    original_count,
                    total_tokens(),
                    messages_summarized,
                )
            except Exception as e:
                # Best effort: fall through to content truncation
                logger.warning(
                    "Summarization failed, continuing with truncation: %s", e
                )

    # ---- STEP 2: Normalize content ----------------------------------------
    # Convert non-string payloads to strings so token counting is coherent.
    # Tool messages and the first/last message are left in their original form.
    for i, m in enumerate(msgs):
        if not isinstance(m.get("content"), str) and m.get("content") is not None:
            if _is_tool_message(m):
                continue
            if i == 0 or i == len(msgs) - 1:
                continue
            content_str = json.dumps(m["content"], separators=(",", ":"))
            # Cheap character-length pre-check before token-based truncation
            if len(content_str) > 20_000:
                content_str = _truncate_middle_tokens(content_str, enc, 20_000)
            m["content"] = content_str

    # ---- STEP 3: Token-aware content truncation ---------------------------
    # Progressively halve per-message cap and truncate bloated content.
    # This preserves all messages but shortens their content.
    cap = start_cap
    while total_tokens() + reserve > target_tokens and cap >= floor_cap:
        for m in msgs[1:-1]:
            if _is_tool_message(m):
                _truncate_tool_message_content(m, enc, cap)
                continue
            if _is_objective_message(m):
                continue
            content = m.get("content") or ""
            if _tok_len(content, enc) > cap:
                m["content"] = _truncate_middle_tokens(content, enc, cap)
        cap //= 2

    # ---- STEP 4: Middle-out deletion --------------------------------------
    # Delete messages one at a time from the center outward.
    # This is more granular than dropping all old messages at once.
    while total_tokens() + reserve > target_tokens and len(msgs) > 2:
        deletable: list[int] = []
        for i in range(1, len(msgs) - 1):
            msg = msgs[i]
            if (
                msg is not None
                and not _is_tool_message(msg)
                and not _is_objective_message(msg)
            ):
                deletable.append(i)
        if not deletable:
            break
        centre = len(msgs) // 2
        to_delete = min(deletable, key=lambda i: abs(i - centre))
        del msgs[to_delete]
        messages_dropped += 1

    # ---- STEP 5: Final trim on first/last ---------------------------------
    cap = start_cap
    while total_tokens() + reserve > target_tokens and cap >= floor_cap:
        for idx in (0, -1):
            msg = msgs[idx]
            if msg is None:
                continue
            if _is_tool_message(msg):
                _truncate_tool_message_content(msg, enc, cap)
                continue
            text = msg.get("content") or ""
            if _tok_len(text, enc) > cap:
                msg["content"] = _truncate_middle_tokens(text, enc, cap)
        cap //= 2

    # Filter out any None values that may have been introduced
    final_msgs: list[dict] = [m for m in msgs if m is not None]

    # ---- STEP 6: Final tool-pair validation ---------------------------------
    # After all compression steps, verify that every tool response has a
    # matching tool_call in a preceding message. Remove orphans
    # to prevent API errors (e.g., Anthropic's "unexpected tool_use_id").
    final_msgs = validate_and_remove_orphan_tool_responses(final_msgs)

    final_count = sum(_msg_tokens(m, enc) for m in final_msgs)
    error = None
    if final_count + reserve > target_tokens:
        error = f"Could not compress below target ({final_count + reserve} > {target_tokens})"
        logger.warning(error)

    return CompressResult(
        messages=final_msgs,
        token_count=final_count,
        was_compacted=True,
        error=error,
        original_token_count=original_count,
        messages_summarized=messages_summarized,
        messages_dropped=messages_dropped,
    )
|