fix(backend/copilot): skip fallback token estimation for failed API requests

When a streaming error occurs before any output is produced (e.g.,
connection timeout), the fallback token estimation in the finally block
would incorrectly charge rate-limit tokens — including an artificial
minimum of 1 completion token via max(..., 1). This penalises users for
requests that produced no output.

Track whether an error occurred via a new `_stream_error` flag, and skip
the fallback estimation when the request failed and no assistant text
was generated.
This commit is contained in:
Zamil Majdy
2026-03-14 22:46:12 +07:00
parent b9951a3c53
commit b9be577904

View File

@@ -227,6 +227,7 @@ async def stream_chat_completion_baseline(
# Token usage accumulators — populated from streaming chunks
turn_prompt_tokens = 0
turn_completion_tokens = 0
_stream_error = False # Track whether an error occurred during streaming
try:
for _round in range(_MAX_TOOL_ROUNDS):
# Open a new step for each LLM round
@@ -410,6 +411,7 @@ async def stream_chat_completion_baseline(
)
except Exception as e:
_stream_error = True
error_msg = str(e) or type(e).__name__
logger.error("[Baseline] Streaming error: %s", error_msg, exc_info=True)
# Close any open text/step before emitting error
@@ -431,7 +433,15 @@ async def stream_chat_completion_baseline(
# not honour stream_options={"include_usage": True}.
# Count the full message list (system + history + turn) since
# each API call sends the complete context window.
if turn_prompt_tokens == 0 and turn_completion_tokens == 0:
# NOTE: This estimates one round's prompt tokens. Multi-round tool-calling
# turns consume prompt tokens on each API call, so the total is underestimated.
# Skip fallback when an error occurred and no output was produced —
# charging rate-limit tokens for completely failed requests is unfair.
if (
turn_prompt_tokens == 0
and turn_completion_tokens == 0
and not (_stream_error and not assistant_text)
):
from backend.util.prompt import (
estimate_token_count,
estimate_token_count_str,
@@ -467,6 +477,9 @@ async def stream_chat_completion_baseline(
total_tokens,
)
# Record for rate limiting counters
# NOTE: OpenRouter folds cached tokens into prompt_tokens, so we cannot
# break out cache_read/cache_creation weights. Users on the baseline
# path may be slightly over-counted vs the SDK path.
if user_id:
try:
await record_token_usage(