fix(copilot): fix multi-turn cost over-estimation and add cache_creation_tokens extraction

Bug 1: Fallback cost estimation was using accumulated turn_prompt_tokens /
turn_completion_tokens across all tool-call rounds, causing compounding
over-estimation on the 2nd+ turn. Snapshot token counts before each call and
pass only the per-call delta to _estimate_cost_from_tokens.
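
A toy, self-contained sketch (illustrative only, with assumed per-token rates; not the service code) of how billing the running totals compounds the cost, versus billing only the per-call delta:

# Illustrative only: assumed example rates, not taken from the real pricing table.
PROMPT_RATE = 3.0 / 1_000_000       # $ per prompt token (assumed)
COMPLETION_RATE = 15.0 / 1_000_000  # $ per completion token (assumed)

def estimate(prompt_tokens: int, completion_tokens: int) -> float:
    return prompt_tokens * PROMPT_RATE + completion_tokens * COMPLETION_RATE

turns = [(1000, 200), (1100, 300)]  # per-call (prompt, completion) usage

# Buggy: the estimator is handed the running totals each turn,
# so turn 1's tokens are billed again on turn 2.
total_prompt = total_completion = 0
buggy = 0.0
for prompt, completion in turns:
    total_prompt += prompt
    total_completion += completion
    buggy += estimate(total_prompt, total_completion)

# Fixed: snapshot before each call and bill only the delta.
fixed = sum(estimate(prompt, completion) for prompt, completion in turns)

print(round(buggy, 4), round(fixed, 4))  # 0.0198 vs 0.0138: turn 1 double-charged in the buggy path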

Bug 2: turn_cache_creation_tokens was defined but never populated. Extract
cache_creation_input_tokens from prompt_tokens_details (available from some
providers such as Anthropic via OpenRouter).
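
A minimal, self-contained sketch of the defensive read (the helper name is hypothetical, not part of the diff): the details object may be absent entirely, and the field may be missing or None on OpenAI-style usage, so both levels fall back to 0.

from types import SimpleNamespace

def extract_cache_creation_tokens(usage) -> int:
    """Hypothetical helper: read cache_creation_input_tokens, defaulting to 0."""
    ptd = getattr(usage, "prompt_tokens_details", None)
    if ptd is None:
        return 0
    # Provider-specific field (e.g. Anthropic via OpenRouter); may be missing or None.
    return getattr(ptd, "cache_creation_input_tokens", 0) or 0

# OpenAI-style usage lacks the field; Anthropic-style reports it.
openai_like = SimpleNamespace(prompt_tokens_details=SimpleNamespace(cached_tokens=800))
anthropic_like = SimpleNamespace(
    prompt_tokens_details=SimpleNamespace(cached_tokens=0, cache_creation_input_tokens=500)
)
assert extract_cache_creation_tokens(openai_like) == 0
assert extract_cache_creation_tokens(anthropic_like) == 500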

Add regression tests for both fixes.
Author: majdyz
Date: 2026-04-13 09:53:05 +00:00
Parent: f6c7d1eaf7
Commit: c6af52033d
2 changed files with 132 additions and 3 deletions


@@ -415,6 +415,13 @@ async def _baseline_llm_caller(
     )
     tool_calls_by_index: dict[int, dict[str, str]] = {}
+    # Snapshot token counts before this call so we can compute the delta
+    # used for fallback cost estimation (state fields are accumulated across
+    # all tool-call turns, so we must not pass the cumulative total to the
+    # per-call cost estimator).
+    prompt_tokens_before = state.turn_prompt_tokens
+    completion_tokens_before = state.turn_completion_tokens
     async for chunk in response:
         if chunk.usage:
             state.turn_prompt_tokens += chunk.usage.prompt_tokens or 0
@@ -426,6 +433,11 @@ async def _baseline_llm_caller(
                 state.turn_cache_read_tokens += (
                     getattr(ptd, "cached_tokens", 0) or 0
                 )
+                # cache_creation_input_tokens is reported by some providers
+                # (e.g. Anthropic native) but not standard OpenAI streaming.
+                state.turn_cache_creation_tokens += (
+                    getattr(ptd, "cache_creation_input_tokens", 0) or 0
+                )
         delta = chunk.choices[0].delta if chunk.choices else None
         if not delta:
@@ -495,13 +507,18 @@ async def _baseline_llm_caller(
     # Fallback: estimate cost from token counts when x-total-cost is
     # missing (e.g. some OpenRouter models don't report it).
+    # Use the delta for this call only: the state accumulators grow across
+    # all tool-call turns, so passing the cumulative total would
+    # compound-overestimate costs on the 2nd+ turn.
+    call_prompt_tokens = state.turn_prompt_tokens - prompt_tokens_before
+    call_completion_tokens = state.turn_completion_tokens - completion_tokens_before
     if not got_header_cost and (
-        state.turn_prompt_tokens > 0 or state.turn_completion_tokens > 0
+        call_prompt_tokens > 0 or call_completion_tokens > 0
     ):
         estimated = _estimate_cost_from_tokens(
             state.model,
-            state.turn_prompt_tokens,
-            state.turn_completion_tokens,
+            call_prompt_tokens,
+            call_completion_tokens,
         )
         if estimated is not None:
             state.cost_usd = (state.cost_usd or 0.0) + estimated


@@ -938,3 +938,115 @@ class TestBaselineCostExtraction:
         assert state.turn_cache_read_tokens == 800
         assert state.turn_prompt_tokens == 1000
+
+    @pytest.mark.asyncio
+    async def test_cache_creation_tokens_extracted_from_usage_details(self):
+        """cache_creation_tokens are extracted from prompt_tokens_details."""
+        from backend.copilot.baseline.service import (
+            _baseline_llm_caller,
+            _BaselineStreamState,
+        )
+
+        state = _BaselineStreamState(model="openai/gpt-4o")
+
+        mock_raw = MagicMock()
+        mock_raw.headers = {"x-total-cost": "0.01"}
+        mock_stream = MagicMock()
+        mock_stream._response = mock_raw
+
+        mock_ptd = MagicMock()
+        mock_ptd.cached_tokens = 0
+        mock_ptd.cache_creation_input_tokens = 500
+
+        mock_chunk = MagicMock()
+        mock_chunk.usage = MagicMock()
+        mock_chunk.usage.prompt_tokens = 1000
+        mock_chunk.usage.completion_tokens = 200
+        mock_chunk.usage.prompt_tokens_details = mock_ptd
+        mock_chunk.choices = []
+
+        async def chunk_aiter():
+            yield mock_chunk
+
+        mock_stream.__aiter__ = lambda self: chunk_aiter()
+
+        mock_client = MagicMock()
+        mock_client.chat.completions.create = AsyncMock(return_value=mock_stream)
+
+        with patch(
+            "backend.copilot.baseline.service._get_openai_client",
+            return_value=mock_client,
+        ):
+            await _baseline_llm_caller(
+                messages=[{"role": "user", "content": "hi"}],
+                tools=[],
+                state=state,
+            )
+
+        assert state.turn_cache_creation_tokens == 500
+
+    @pytest.mark.asyncio
+    async def test_multiturn_fallback_cost_uses_per_call_delta(self):
+        """Fallback cost estimation uses per-call token delta, not session total.
+
+        On the second tool-call turn, the state accumulators already hold
+        tokens from turn 1. The estimator must charge only for the new tokens
+        reported in the current call, not the running total.
+        """
+        from backend.copilot.baseline.service import (
+            _baseline_llm_caller,
+            _BaselineStreamState,
+        )
+
+        state = _BaselineStreamState(model="anthropic/claude-sonnet-4")
+
+        def make_stream(prompt_tokens: int, completion_tokens: int):
+            mock_raw = MagicMock()
+            mock_raw.headers = {}  # no x-total-cost
+            mock_stream = MagicMock()
+            mock_stream._response = mock_raw
+            mock_chunk = MagicMock()
+            mock_chunk.usage = MagicMock()
+            mock_chunk.usage.prompt_tokens = prompt_tokens
+            mock_chunk.usage.completion_tokens = completion_tokens
+            mock_chunk.usage.prompt_tokens_details = None
+            mock_chunk.choices = []
+
+            async def chunk_aiter():
+                yield mock_chunk
+
+            mock_stream.__aiter__ = lambda self: chunk_aiter()
+            return mock_stream
+
+        mock_client = MagicMock()
+        mock_client.chat.completions.create = AsyncMock(
+            side_effect=[
+                make_stream(1000, 200),  # turn 1
+                make_stream(1100, 300),  # turn 2 (accumulators now hold 1000+1100, 200+300)
+            ]
+        )
+
+        with patch(
+            "backend.copilot.baseline.service._get_openai_client",
+            return_value=mock_client,
+        ):
+            await _baseline_llm_caller(
+                messages=[{"role": "user", "content": "hi"}],
+                tools=[],
+                state=state,
+            )
+            await _baseline_llm_caller(
+                messages=[{"role": "user", "content": "follow up"}],
+                tools=[],
+                state=state,
+            )
+        # Turn 1: 1000 * 3.0/1M + 200 * 15.0/1M = 0.003  + 0.003  = 0.006
+        # Turn 2: 1100 * 3.0/1M + 300 * 15.0/1M = 0.0033 + 0.0045 = 0.0078
+        # Correct total: 0.0138. The buggy cumulative path would instead add
+        # turn 1's 0.006 plus (2100 * 3.0/1M + 500 * 15.0/1M) = 0.0138 on
+        # turn 2, i.e. 0.0198 in total.
+        expected = pytest.approx(0.006 + 0.0078, rel=1e-5)
+        assert state.cost_usd == expected
+
+        # Accumulators hold all tokens across both turns
+        assert state.turn_prompt_tokens == 2100
+        assert state.turn_completion_tokens == 500