fix(copilot): move session.usage.append to finally block in SDK service

Ensures session usage persistence and rate-limit recording stay consistent even when an exception interrupts the try block. Mirrors the baseline service pattern.
2026-04-08 03:00:28 -04:00 · 2026-03-13 00:35:09 +07:00
parent dcb71ab0b9
commit 1d2f641a26
1 changed files with 25 additions and 19 deletions
--- a/autogpt_platform/backend/backend/copilot/sdk/service.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/service.py
@@ -1350,7 +1350,10 @@ async def stream_chat_completion_sdk(
            ) and not has_appended_assistant:
                session.messages.append(assistant_response)

-        # Emit token usage and update session for persistence
+        # Emit token usage to the client (must be in try to reach SSE stream).
+        # Session persistence of usage is in finally to stay consistent with
+        # rate-limit recording even if an exception interrupts between here
+        # and the finally block.
        if turn_prompt_tokens > 0 or turn_completion_tokens > 0:
            total_tokens = turn_prompt_tokens + turn_completion_tokens
            yield StreamUsage(
@@ -1358,21 +1361,6 @@ async def stream_chat_completion_sdk(
                completionTokens=turn_completion_tokens,
                totalTokens=total_tokens,
            )
-            session.usage.append(
-                Usage(
-                    prompt_tokens=turn_prompt_tokens,
-                    completion_tokens=turn_completion_tokens,
-                    total_tokens=total_tokens,
-                )
-            )
-            logger.info(
-                "%s Turn usage: prompt=%d, completion=%d, total=%d, cost_usd=%s",
-                log_prefix,
-                turn_prompt_tokens,
-                turn_completion_tokens,
-                total_tokens,
-                turn_cost_usd,
-            )

        # Transcript upload is handled exclusively in the finally block
        # to avoid double-uploads (the success path used to upload the
@@ -1438,9 +1426,27 @@ async def stream_chat_completion_sdk(
            except Exception:
                logger.warning("OTEL context teardown failed", exc_info=True)

-        # --- Record token usage for rate limiting ---
-        # Must run in finally to ensure tokens are always recorded, even when
-        # exceptions interrupt the stream (prevents rate limit bypass).
+        # --- Persist token usage to session + rate-limit counters ---
+        # Both must live in finally so they stay consistent even when an
+        # exception interrupts the try block after StreamUsage was yielded.
+        if turn_prompt_tokens > 0 or turn_completion_tokens > 0:
+            total_tokens = turn_prompt_tokens + turn_completion_tokens
+            if session is not None:
+                session.usage.append(
+                    Usage(
+                        prompt_tokens=turn_prompt_tokens,
+                        completion_tokens=turn_completion_tokens,
+                        total_tokens=total_tokens,
+                    )
+                )
+            logger.info(
+                "%s Turn usage: prompt=%d, completion=%d, total=%d, cost_usd=%s",
+                log_prefix,
+                turn_prompt_tokens,
+                turn_completion_tokens,
+                total_tokens,
+                turn_cost_usd,
+            )
        if user_id and (turn_prompt_tokens > 0 or turn_completion_tokens > 0):
            try:
                await record_token_usage(