fix(copilot): scope fallback token estimation to current turn only

The fallback estimator was counting the entire openai_messages history
(system prompt + all previous turns) instead of just the messages added
during the current turn. This caused overcounting and overly strict
rate limiting when providers don't return streaming usage data.
This commit is contained in:
Zamil Majdy
2026-03-13 03:44:30 +07:00
parent 3096f94996
commit 4ceb15b3f1

View File

@@ -227,6 +227,9 @@ async def stream_chat_completion_baseline(
# Token usage accumulators — populated from streaming chunks
turn_prompt_tokens = 0
turn_completion_tokens = 0
# Record the message count before the tool loop so the fallback estimator
# can restrict itself to *this* turn: the last message already in the list
# at this point plus everything the tool rounds append, not the full history.
_msgs_before_turn = len(openai_messages)
try:
for _round in range(_MAX_TOOL_ROUNDS):
# Open a new step for each LLM round
@@ -429,9 +432,12 @@ async def stream_chat_completion_baseline(
# Fallback: estimate tokens from text length when the provider
# does not honour stream_options={"include_usage": True}.
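# Only count messages belonging to *this* turn, not the full conversation
# history: the slice starts one before the pre-loop message count so that
# the final pre-loop message (the user message) is included along with
# everything appended by the tool rounds.
# NOTE(review): if the list was empty before the loop the start index is
# -1 and only the last message is counted — confirm that cannot happen.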
# Rough estimate: 1 token ≈ 4 characters.
if turn_prompt_tokens == 0 and turn_completion_tokens == 0:
prompt_chars = sum(len(m.get("content") or "") for m in openai_messages)
turn_messages = openai_messages[_msgs_before_turn - 1 :]
prompt_chars = sum(len(m.get("content") or "") for m in turn_messages)
turn_prompt_tokens = max(prompt_chars // 4, 1)
turn_completion_tokens = max(len(assistant_text) // 4, 1)
logger.info(