diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py
index 15d173cedc..11d247cd03 100644
--- a/autogpt_platform/backend/backend/copilot/baseline/service.py
+++ b/autogpt_platform/backend/backend/copilot/baseline/service.py
@@ -415,6 +415,13 @@ async def _baseline_llm_caller(
     )
     tool_calls_by_index: dict[int, dict[str, str]] = {}
 
+    # Snapshot token counts before this call so we can compute the delta
+    # used for fallback cost estimation (state fields are accumulated across
+    # all tool-call turns, so we must not pass the cumulative total to the
+    # per-call cost estimator).
+    prompt_tokens_before = state.turn_prompt_tokens
+    completion_tokens_before = state.turn_completion_tokens
+
     async for chunk in response:
         if chunk.usage:
             state.turn_prompt_tokens += chunk.usage.prompt_tokens or 0
@@ -426,6 +433,11 @@
                 state.turn_cache_read_tokens += (
                     getattr(ptd, "cached_tokens", 0) or 0
                 )
+                # cache_creation_input_tokens is reported by some providers
+                # (e.g. Anthropic native) but not standard OpenAI streaming.
+                state.turn_cache_creation_tokens += (
+                    getattr(ptd, "cache_creation_input_tokens", 0) or 0
+                )
 
         delta = chunk.choices[0].delta if chunk.choices else None
         if not delta:
@@ -495,13 +507,18 @@
 
     # Fallback: estimate cost from token counts when x-total-cost is
     # missing (e.g. some OpenRouter models don't report it).
+    # Use the delta for this call only — the state accumulators grow across
+    # all tool-call turns, so passing the cumulative total would
+    # compound-overestimate costs on the 2nd+ turn.
+    call_prompt_tokens = state.turn_prompt_tokens - prompt_tokens_before
+    call_completion_tokens = state.turn_completion_tokens - completion_tokens_before
     if not got_header_cost and (
-        state.turn_prompt_tokens > 0 or state.turn_completion_tokens > 0
+        call_prompt_tokens > 0 or call_completion_tokens > 0
     ):
         estimated = _estimate_cost_from_tokens(
             state.model,
-            state.turn_prompt_tokens,
-            state.turn_completion_tokens,
+            call_prompt_tokens,
+            call_completion_tokens,
         )
         if estimated is not None:
             state.cost_usd = (state.cost_usd or 0.0) + estimated
diff --git a/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py b/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py
index cc2d58ca82..2a15065679 100644
--- a/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py
+++ b/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py
@@ -938,3 +938,115 @@ class TestBaselineCostExtraction:
 
         assert state.turn_cache_read_tokens == 800
         assert state.turn_prompt_tokens == 1000
+
+    @pytest.mark.asyncio
+    async def test_cache_creation_tokens_extracted_from_usage_details(self):
+        """cache_creation_tokens are extracted from prompt_tokens_details."""
+        from backend.copilot.baseline.service import (
+            _baseline_llm_caller,
+            _BaselineStreamState,
+        )
+
+        state = _BaselineStreamState(model="openai/gpt-4o")
+
+        mock_raw = MagicMock()
+        mock_raw.headers = {"x-total-cost": "0.01"}
+        mock_stream = MagicMock()
+        mock_stream._response = mock_raw
+
+        mock_ptd = MagicMock()
+        mock_ptd.cached_tokens = 0
+        mock_ptd.cache_creation_input_tokens = 500
+
+        mock_chunk = MagicMock()
+        mock_chunk.usage = MagicMock()
+        mock_chunk.usage.prompt_tokens = 1000
+        mock_chunk.usage.completion_tokens = 200
+        mock_chunk.usage.prompt_tokens_details = mock_ptd
+        mock_chunk.choices = []
+
+        async def chunk_aiter():
+            yield mock_chunk
+
+        mock_stream.__aiter__ = lambda self: chunk_aiter()
+
+        mock_client = MagicMock()
+        mock_client.chat.completions.create = AsyncMock(return_value=mock_stream)
+
+        with patch(
+            "backend.copilot.baseline.service._get_openai_client",
+            return_value=mock_client,
+        ):
+            await _baseline_llm_caller(
+                messages=[{"role": "user", "content": "hi"}],
+                tools=[],
+                state=state,
+            )
+
+        assert state.turn_cache_creation_tokens == 500
+
+    @pytest.mark.asyncio
+    async def test_multiturn_fallback_cost_uses_per_call_delta(self):
+        """Fallback cost estimation uses per-call token delta, not session total.
+
+        On the second tool-call turn, the state accumulators already hold
+        tokens from turn 1. The estimator must charge only for the new tokens
+        reported in the current call, not the running total.
+        """
+        from backend.copilot.baseline.service import (
+            _baseline_llm_caller,
+            _BaselineStreamState,
+        )
+
+        state = _BaselineStreamState(model="anthropic/claude-sonnet-4")
+
+        def make_stream(prompt_tokens: int, completion_tokens: int):
+            mock_raw = MagicMock()
+            mock_raw.headers = {}  # no x-total-cost
+            mock_stream = MagicMock()
+            mock_stream._response = mock_raw
+
+            mock_chunk = MagicMock()
+            mock_chunk.usage = MagicMock()
+            mock_chunk.usage.prompt_tokens = prompt_tokens
+            mock_chunk.usage.completion_tokens = completion_tokens
+            mock_chunk.usage.prompt_tokens_details = None
+            mock_chunk.choices = []
+
+            async def chunk_aiter():
+                yield mock_chunk
+
+            mock_stream.__aiter__ = lambda self: chunk_aiter()
+            return mock_stream
+
+        mock_client = MagicMock()
+        mock_client.chat.completions.create = AsyncMock(
+            side_effect=[
+                make_stream(1000, 200),  # turn 1
+                make_stream(1100, 300),  # turn 2 (accumulators now hold 1000+1100, 200+300)
+            ]
+        )
+
+        with patch(
+            "backend.copilot.baseline.service._get_openai_client",
+            return_value=mock_client,
+        ):
+            await _baseline_llm_caller(
+                messages=[{"role": "user", "content": "hi"}],
+                tools=[],
+                state=state,
+            )
+            await _baseline_llm_caller(
+                messages=[{"role": "user", "content": "follow up"}],
+                tools=[],
+                state=state,
+            )
+
+        # Turn 1: 1000 * 3.0/1M + 200 * 15.0/1M = 0.003 + 0.003 = 0.006
+        # Turn 2: 1100 * 3.0/1M + 300 * 15.0/1M = 0.0033 + 0.0045 = 0.0078
+        # Total: 0.0138 (cumulative 2100/500 tokens would add 0.0138 again on turn 2, giving 0.0198)
+        expected = pytest.approx(0.006 + 0.0078, rel=1e-5)
+        assert state.cost_usd == expected
+        # Accumulators hold all tokens across both turns
+        assert state.turn_prompt_tokens == 2100
+        assert state.turn_completion_tokens == 500
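Note for reviewers: the expected values in test_multiturn_fallback_cost_uses_per_call_delta follow the pricing stated in the test's comments, $3.00 per 1M input tokens and $15.00 per 1M output tokens for Claude Sonnet. The snippet below is only a minimal sketch of the per-call-delta math the patch enforces; PRICES_PER_MTOK and estimate_cost are hypothetical stand-ins, not the actual _estimate_cost_from_tokens implementation in service.py.

# Illustrative only -- assumed price table, not the real lookup in service.py.
PRICES_PER_MTOK = {"anthropic/claude-sonnet-4": (3.0, 15.0)}  # (input, output) USD per 1M tokens

def estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float | None:
    prices = PRICES_PER_MTOK.get(model)
    if prices is None:
        return None
    input_price, output_price = prices
    return (prompt_tokens * input_price + completion_tokens * output_price) / 1_000_000

# Post-patch behavior: each call is charged only for its own token delta.
turn1 = estimate_cost("anthropic/claude-sonnet-4", 1000, 200)  # 0.006
turn2 = estimate_cost("anthropic/claude-sonnet-4", 1100, 300)  # 0.0078
assert turn1 is not None and turn2 is not None
assert round(turn1 + turn2, 6) == 0.0138

# Pre-patch behavior: turn 2 was charged for the cumulative 2100/500 tokens
# (another 0.0138), so the session total came out as 0.0198 instead of 0.0138.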