fix(copilot): fix multi-turn cost over-estimation and add cache_creation_tokens extraction

Bug 1: Fallback cost estimation was using accumulated turn_prompt_tokens /
turn_completion_tokens across all tool-call rounds, causing compounding
over-estimation on the 2nd+ turn. Snapshot token counts before each call and
pass only the per-call delta to _estimate_cost_from_tokens.
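
A toy, self-contained sketch (illustrative only, with assumed per-token rates; not the service code) of how billing the running totals compounds the cost, versus billing only the per-call delta:

# Illustrative only: assumed example rates, not taken from the real pricing table.
PROMPT_RATE = 3.0 / 1_000_000       # $ per prompt token (assumed)
COMPLETION_RATE = 15.0 / 1_000_000  # $ per completion token (assumed)

def estimate(prompt_tokens: int, completion_tokens: int) -> float:
    return prompt_tokens * PROMPT_RATE + completion_tokens * COMPLETION_RATE

turns = [(1000, 200), (1100, 300)]  # per-call (prompt, completion) usage

# Buggy: the estimator is handed the running totals each turn,
# so turn 1's tokens are billed again on turn 2.
total_prompt = total_completion = 0
buggy = 0.0
for prompt, completion in turns:
    total_prompt += prompt
    total_completion += completion
    buggy += estimate(total_prompt, total_completion)

# Fixed: snapshot before each call and bill only the delta.
fixed = sum(estimate(prompt, completion) for prompt, completion in turns)

print(round(buggy, 4), round(fixed, 4))  # 0.0198 vs 0.0138: turn 1 double-charged in the buggy path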

Bug 2: turn_cache_creation_tokens was defined but never populated. Extract
cache_creation_input_tokens from prompt_tokens_details (available from some
providers such as Anthropic via OpenRouter).
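
A minimal, self-contained sketch of the defensive read (the helper name is hypothetical, not part of the diff): the details object may be absent entirely, and the field may be missing or None on OpenAI-style usage, so both levels fall back to 0.

from types import SimpleNamespace

def extract_cache_creation_tokens(usage) -> int:
    """Hypothetical helper: read cache_creation_input_tokens, defaulting to 0."""
    ptd = getattr(usage, "prompt_tokens_details", None)
    if ptd is None:
        return 0
    # Provider-specific field (e.g. Anthropic via OpenRouter); may be missing or None.
    return getattr(ptd, "cache_creation_input_tokens", 0) or 0

# OpenAI-style usage lacks the field; Anthropic-style reports it.
openai_like = SimpleNamespace(prompt_tokens_details=SimpleNamespace(cached_tokens=800))
anthropic_like = SimpleNamespace(
    prompt_tokens_details=SimpleNamespace(cached_tokens=0, cache_creation_input_tokens=500)
)
assert extract_cache_creation_tokens(openai_like) == 0
assert extract_cache_creation_tokens(anthropic_like) == 500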

Add regression tests for both fixes.
Author: majdyz
Date: 2026-04-13 09:53:05 +00:00
Parent: f6c7d1eaf7
Commit: c6af52033d
2 changed files with 132 additions and 3 deletions


@@ -415,6 +415,13 @@ async def _baseline_llm_caller(
     )
     tool_calls_by_index: dict[int, dict[str, str]] = {}
+    # Snapshot token counts before this call so we can compute the delta
+    # used for fallback cost estimation (state fields are accumulated across
+    # all tool-call turns, so we must not pass the cumulative total to the
+    # per-call cost estimator).
+    prompt_tokens_before = state.turn_prompt_tokens
+    completion_tokens_before = state.turn_completion_tokens
     async for chunk in response:
         if chunk.usage:
             state.turn_prompt_tokens += chunk.usage.prompt_tokens or 0
@@ -426,6 +433,11 @@ async def _baseline_llm_caller(
                 state.turn_cache_read_tokens += (
                     getattr(ptd, "cached_tokens", 0) or 0
                 )
+                # cache_creation_input_tokens is reported by some providers
+                # (e.g. Anthropic native) but not standard OpenAI streaming.
+                state.turn_cache_creation_tokens += (
+                    getattr(ptd, "cache_creation_input_tokens", 0) or 0
+                )
         delta = chunk.choices[0].delta if chunk.choices else None
         if not delta:
@@ -495,13 +507,18 @@ async def _baseline_llm_caller(
     # Fallback: estimate cost from token counts when x-total-cost is
     # missing (e.g. some OpenRouter models don't report it).
+    # Use the delta for this call only: the state accumulators grow across
+    # all tool-call turns, so passing the cumulative total would
+    # compound-overestimate costs on the 2nd+ turn.
+    call_prompt_tokens = state.turn_prompt_tokens - prompt_tokens_before
+    call_completion_tokens = state.turn_completion_tokens - completion_tokens_before
     if not got_header_cost and (
-        state.turn_prompt_tokens > 0 or state.turn_completion_tokens > 0
+        call_prompt_tokens > 0 or call_completion_tokens > 0
     ):
         estimated = _estimate_cost_from_tokens(
             state.model,
-            state.turn_prompt_tokens,
-            state.turn_completion_tokens,
+            call_prompt_tokens,
+            call_completion_tokens,
         )
         if estimated is not None:
             state.cost_usd = (state.cost_usd or 0.0) + estimated


@@ -938,3 +938,115 @@ class TestBaselineCostExtraction:
         assert state.turn_cache_read_tokens == 800
         assert state.turn_prompt_tokens == 1000
+
+    @pytest.mark.asyncio
+    async def test_cache_creation_tokens_extracted_from_usage_details(self):
+        """cache_creation_tokens are extracted from prompt_tokens_details."""
+        from backend.copilot.baseline.service import (
+            _baseline_llm_caller,
+            _BaselineStreamState,
+        )
+
+        state = _BaselineStreamState(model="openai/gpt-4o")
+
+        mock_raw = MagicMock()
+        mock_raw.headers = {"x-total-cost": "0.01"}
+        mock_stream = MagicMock()
+        mock_stream._response = mock_raw
+
+        mock_ptd = MagicMock()
+        mock_ptd.cached_tokens = 0
+        mock_ptd.cache_creation_input_tokens = 500
+
+        mock_chunk = MagicMock()
+        mock_chunk.usage = MagicMock()
+        mock_chunk.usage.prompt_tokens = 1000
+        mock_chunk.usage.completion_tokens = 200
+        mock_chunk.usage.prompt_tokens_details = mock_ptd
+        mock_chunk.choices = []
+
+        async def chunk_aiter():
+            yield mock_chunk
+
+        mock_stream.__aiter__ = lambda self: chunk_aiter()
+
+        mock_client = MagicMock()
+        mock_client.chat.completions.create = AsyncMock(return_value=mock_stream)
+
+        with patch(
+            "backend.copilot.baseline.service._get_openai_client",
+            return_value=mock_client,
+        ):
+            await _baseline_llm_caller(
+                messages=[{"role": "user", "content": "hi"}],
+                tools=[],
+                state=state,
+            )
+
+        assert state.turn_cache_creation_tokens == 500
+
+    @pytest.mark.asyncio
+    async def test_multiturn_fallback_cost_uses_per_call_delta(self):
+        """Fallback cost estimation uses per-call token delta, not session total.
+
+        On the second tool-call turn, the state accumulators already hold
+        tokens from turn 1. The estimator must charge only for the new tokens
+        reported in the current call, not the running total.
+        """
+        from backend.copilot.baseline.service import (
+            _baseline_llm_caller,
+            _BaselineStreamState,
+        )
+
+        state = _BaselineStreamState(model="anthropic/claude-sonnet-4")
+
+        def make_stream(prompt_tokens: int, completion_tokens: int):
+            mock_raw = MagicMock()
+            mock_raw.headers = {}  # no x-total-cost
+            mock_stream = MagicMock()
+            mock_stream._response = mock_raw
+            mock_chunk = MagicMock()
+            mock_chunk.usage = MagicMock()
+            mock_chunk.usage.prompt_tokens = prompt_tokens
+            mock_chunk.usage.completion_tokens = completion_tokens
+            mock_chunk.usage.prompt_tokens_details = None
+            mock_chunk.choices = []
+
+            async def chunk_aiter():
+                yield mock_chunk
+
+            mock_stream.__aiter__ = lambda self: chunk_aiter()
+            return mock_stream
+
+        mock_client = MagicMock()
+        mock_client.chat.completions.create = AsyncMock(
+            side_effect=[
+                make_stream(1000, 200),  # turn 1
+                make_stream(1100, 300),  # turn 2 (accumulators now hold 1000+1100, 200+300)
+            ]
+        )
+
+        with patch(
+            "backend.copilot.baseline.service._get_openai_client",
+            return_value=mock_client,
+        ):
+            await _baseline_llm_caller(
+                messages=[{"role": "user", "content": "hi"}],
+                tools=[],
+                state=state,
+            )
+            await _baseline_llm_caller(
+                messages=[{"role": "user", "content": "follow up"}],
+                tools=[],
+                state=state,
+            )
+        # Turn 1: 1000 * 3.0/1M + 200 * 15.0/1M = 0.003  + 0.003  = 0.006
+        # Turn 2: 1100 * 3.0/1M + 300 * 15.0/1M = 0.0033 + 0.0045 = 0.0078
+        # Correct total: 0.0138. The buggy cumulative path would instead add
+        # turn 1's 0.006 plus (2100 * 3.0/1M + 500 * 15.0/1M) = 0.0138 on
+        # turn 2, i.e. 0.0198 in total.
+        expected = pytest.approx(0.006 + 0.0078, rel=1e-5)
+        assert state.cost_usd == expected
+
+        # Accumulators hold all tokens across both turns
+        assert state.turn_prompt_tokens == 2100
+        assert state.turn_completion_tokens == 500