Mirror of https://github.com/Significant-Gravitas/AutoGPT.git, synced 2026-04-30 03:00:41 -04:00
fix(copilot): fix multi-turn cost over-estimation and add cache_creation_tokens extraction
Bug 1: Fallback cost estimation was using the accumulated turn_prompt_tokens / turn_completion_tokens across all tool-call rounds, causing compounding over-estimation on the 2nd+ turn. Fix: snapshot token counts before each call and pass only the per-call delta to _estimate_cost_from_tokens.

Bug 2: turn_cache_creation_tokens was defined but never populated. Fix: extract cache_creation_input_tokens from prompt_tokens_details (available from some providers, such as Anthropic via OpenRouter).

Add regression tests for both fixes.
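To illustrate Bug 1, here is a minimal standalone sketch of the compounding effect and the snapshot/delta pattern the fix uses. The numbers and the estimate() helper are hypothetical and exist only for this illustration; the real logic lives in _baseline_llm_caller and _estimate_cost_from_tokens.

def estimate(prompt_tokens: int, completion_tokens: int) -> float:
    # Hypothetical flat rates: $3 per 1M prompt tokens, $15 per 1M completion tokens.
    return prompt_tokens * 3.0 / 1_000_000 + completion_tokens * 15.0 / 1_000_000

turn_prompt, turn_completion = 0, 0  # accumulators kept across tool-call turns
cost_buggy = cost_fixed = 0.0

for prompt, completion in [(1000, 200), (1100, 300)]:  # two tool-call rounds
    prompt_before, completion_before = turn_prompt, turn_completion  # snapshot
    turn_prompt += prompt
    turn_completion += completion

    # Buggy: re-charges the running total on every round.
    cost_buggy += estimate(turn_prompt, turn_completion)
    # Fixed: charges only the delta produced by this round.
    cost_fixed += estimate(turn_prompt - prompt_before,
                           turn_completion - completion_before)

print(round(cost_buggy, 4))  # 0.0198 (turn 1 tokens billed twice)
print(round(cost_fixed, 4))  # 0.0138 (matches per-round usage)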
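For Bug 2, the extraction added in the service change uses a defensive getattr so providers whose prompt_tokens_details lacks cache_creation_input_tokens (or report it as None) simply contribute nothing. A small illustration with hypothetical details objects; only the attribute names mirror the fields read in the diff below.

from types import SimpleNamespace

# Hypothetical prompt_tokens_details payloads for illustration only.
anthropic_style = SimpleNamespace(cached_tokens=0, cache_creation_input_tokens=500)
openai_style = SimpleNamespace(cached_tokens=800)  # no cache-creation field reported

for ptd in (anthropic_style, openai_style, None):
    if ptd is None:
        created = 0
    else:
        # Same defensive pattern as the service change: missing or None becomes 0.
        created = getattr(ptd, "cache_creation_input_tokens", 0) or 0
    print(created)  # prints 500, then 0, then 0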
@@ -415,6 +415,13 @@ async def _baseline_llm_caller(
     )
     tool_calls_by_index: dict[int, dict[str, str]] = {}
 
+    # Snapshot token counts before this call so we can compute the delta
+    # used for fallback cost estimation (state fields are accumulated across
+    # all tool-call turns, so we must not pass the cumulative total to the
+    # per-call cost estimator).
+    prompt_tokens_before = state.turn_prompt_tokens
+    completion_tokens_before = state.turn_completion_tokens
+
     async for chunk in response:
         if chunk.usage:
             state.turn_prompt_tokens += chunk.usage.prompt_tokens or 0
@@ -426,6 +433,11 @@ async def _baseline_llm_caller(
                 state.turn_cache_read_tokens += (
                     getattr(ptd, "cached_tokens", 0) or 0
                 )
+                # cache_creation_input_tokens is reported by some providers
+                # (e.g. Anthropic native) but not standard OpenAI streaming.
+                state.turn_cache_creation_tokens += (
+                    getattr(ptd, "cache_creation_input_tokens", 0) or 0
+                )
 
         delta = chunk.choices[0].delta if chunk.choices else None
         if not delta:
@@ -495,13 +507,18 @@ async def _baseline_llm_caller(
 
     # Fallback: estimate cost from token counts when x-total-cost is
    # missing (e.g. some OpenRouter models don't report it).
+    # Use the delta for this call only — the state accumulators grow across
+    # all tool-call turns, so passing the cumulative total would
+    # compound-overestimate costs on the 2nd+ turn.
+    call_prompt_tokens = state.turn_prompt_tokens - prompt_tokens_before
+    call_completion_tokens = state.turn_completion_tokens - completion_tokens_before
     if not got_header_cost and (
-        state.turn_prompt_tokens > 0 or state.turn_completion_tokens > 0
+        call_prompt_tokens > 0 or call_completion_tokens > 0
     ):
         estimated = _estimate_cost_from_tokens(
             state.model,
-            state.turn_prompt_tokens,
-            state.turn_completion_tokens,
+            call_prompt_tokens,
+            call_completion_tokens,
         )
         if estimated is not None:
             state.cost_usd = (state.cost_usd or 0.0) + estimated
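For context on the fallback path: _estimate_cost_from_tokens itself is not shown in this commit, but an estimator of this kind typically multiplies token counts by per-million-token prices. Below is a minimal sketch under that assumption; the function shape and price table are illustrative, and only the Claude Sonnet rates are taken from the comments in the regression test that follows.

# Illustrative sketch only; not the real _estimate_cost_from_tokens.
PRICES_PER_MTOK: dict[str, tuple[float, float]] = {
    # (prompt, completion) in USD per 1M tokens, matching the test arithmetic.
    "anthropic/claude-sonnet-4": (3.0, 15.0),
}

def estimate_cost_from_tokens(
    model: str, prompt_tokens: int, completion_tokens: int
) -> float | None:
    prices = PRICES_PER_MTOK.get(model)
    if prices is None:
        return None  # unknown model: caller leaves cost_usd unchanged
    prompt_price, completion_price = prices
    return (prompt_tokens * prompt_price + completion_tokens * completion_price) / 1_000_000

With Bug 1 fixed, an estimator like this is called once per tool-call round with only that round's delta, and a model missing from the price table yields None, so no cost is added.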
@@ -938,3 +938,115 @@ class TestBaselineCostExtraction:
 
         assert state.turn_cache_read_tokens == 800
         assert state.turn_prompt_tokens == 1000
+
+    @pytest.mark.asyncio
+    async def test_cache_creation_tokens_extracted_from_usage_details(self):
+        """cache_creation_tokens are extracted from prompt_tokens_details."""
+        from backend.copilot.baseline.service import (
+            _baseline_llm_caller,
+            _BaselineStreamState,
+        )
+
+        state = _BaselineStreamState(model="openai/gpt-4o")
+
+        mock_raw = MagicMock()
+        mock_raw.headers = {"x-total-cost": "0.01"}
+        mock_stream = MagicMock()
+        mock_stream._response = mock_raw
+
+        mock_ptd = MagicMock()
+        mock_ptd.cached_tokens = 0
+        mock_ptd.cache_creation_input_tokens = 500
+
+        mock_chunk = MagicMock()
+        mock_chunk.usage = MagicMock()
+        mock_chunk.usage.prompt_tokens = 1000
+        mock_chunk.usage.completion_tokens = 200
+        mock_chunk.usage.prompt_tokens_details = mock_ptd
+        mock_chunk.choices = []
+
+        async def chunk_aiter():
+            yield mock_chunk
+
+        mock_stream.__aiter__ = lambda self: chunk_aiter()
+
+        mock_client = MagicMock()
+        mock_client.chat.completions.create = AsyncMock(return_value=mock_stream)
+
+        with patch(
+            "backend.copilot.baseline.service._get_openai_client",
+            return_value=mock_client,
+        ):
+            await _baseline_llm_caller(
+                messages=[{"role": "user", "content": "hi"}],
+                tools=[],
+                state=state,
+            )
+
+        assert state.turn_cache_creation_tokens == 500
+
+    @pytest.mark.asyncio
+    async def test_multiturn_fallback_cost_uses_per_call_delta(self):
+        """Fallback cost estimation uses per-call token delta, not session total.
+
+        On the second tool-call turn, the state accumulators already hold
+        tokens from turn 1. The estimator must charge only for the new tokens
+        reported in the current call, not the running total.
+        """
+        from backend.copilot.baseline.service import (
+            _baseline_llm_caller,
+            _BaselineStreamState,
+        )
+
+        state = _BaselineStreamState(model="anthropic/claude-sonnet-4")
+
+        def make_stream(prompt_tokens: int, completion_tokens: int):
+            mock_raw = MagicMock()
+            mock_raw.headers = {}  # no x-total-cost
+            mock_stream = MagicMock()
+            mock_stream._response = mock_raw
+
+            mock_chunk = MagicMock()
+            mock_chunk.usage = MagicMock()
+            mock_chunk.usage.prompt_tokens = prompt_tokens
+            mock_chunk.usage.completion_tokens = completion_tokens
+            mock_chunk.usage.prompt_tokens_details = None
+            mock_chunk.choices = []
+
+            async def chunk_aiter():
+                yield mock_chunk
+
+            mock_stream.__aiter__ = lambda self: chunk_aiter()
+            return mock_stream
+
+        mock_client = MagicMock()
+        mock_client.chat.completions.create = AsyncMock(
+            side_effect=[
+                make_stream(1000, 200),  # turn 1
+                make_stream(1100, 300),  # turn 2 (accumulators now hold 1000+1100, 200+300)
+            ]
+        )
+
+        with patch(
+            "backend.copilot.baseline.service._get_openai_client",
+            return_value=mock_client,
+        ):
+            await _baseline_llm_caller(
+                messages=[{"role": "user", "content": "hi"}],
+                tools=[],
+                state=state,
+            )
+            await _baseline_llm_caller(
+                messages=[{"role": "user", "content": "follow up"}],
+                tools=[],
+                state=state,
+            )
+
+        # Turn 1: 1000 * 3.0/1M + 200 * 15.0/1M = 0.003 + 0.003 = 0.006
+        # Turn 2: 1100 * 3.0/1M + 300 * 15.0/1M = 0.0033 + 0.0045 = 0.0078
+        # Total: 0.0138 — NOT 0.006 + cumulative (2100*3/1M + 500*15/1M = 0.0138)
+        expected = pytest.approx(0.006 + 0.0078, rel=1e-5)
+        assert state.cost_usd == expected
+        # Accumulators hold all tokens across both turns
+        assert state.turn_prompt_tokens == 2100
+        assert state.turn_completion_tokens == 500