diff --git a/autogpt_platform/backend/backend/copilot/baseline/service.py b/autogpt_platform/backend/backend/copilot/baseline/service.py
index 15d173cedc..11d247cd03 100644
--- a/autogpt_platform/backend/backend/copilot/baseline/service.py
+++ b/autogpt_platform/backend/backend/copilot/baseline/service.py
@@ -415,6 +415,13 @@ async def _baseline_llm_caller(
     )
     tool_calls_by_index: dict[int, dict[str, str]] = {}
 
+    # Snapshot token counts before this call so we can compute the delta
+    # used for fallback cost estimation (state fields are accumulated across
+    # all tool-call turns, so we must not pass the cumulative total to the
+    # per-call cost estimator).
+    prompt_tokens_before = state.turn_prompt_tokens
+    completion_tokens_before = state.turn_completion_tokens
+
     async for chunk in response:
         if chunk.usage:
             state.turn_prompt_tokens += chunk.usage.prompt_tokens or 0
@@ -426,6 +433,11 @@
                 state.turn_cache_read_tokens += (
                     getattr(ptd, "cached_tokens", 0) or 0
                 )
+                # cache_creation_input_tokens is reported by some providers
+                # (e.g. Anthropic native) but not standard OpenAI streaming.
+                state.turn_cache_creation_tokens += (
+                    getattr(ptd, "cache_creation_input_tokens", 0) or 0
+                )
 
         delta = chunk.choices[0].delta if chunk.choices else None
         if not delta:
@@ -495,13 +507,18 @@
 
     # Fallback: estimate cost from token counts when x-total-cost is
     # missing (e.g. some OpenRouter models don't report it).
+    # Use the delta for this call only — the state accumulators grow across
+    # all tool-call turns, so passing the cumulative total would
+    # compound-overestimate costs on the 2nd+ turn.
+    call_prompt_tokens = state.turn_prompt_tokens - prompt_tokens_before
+    call_completion_tokens = state.turn_completion_tokens - completion_tokens_before
     if not got_header_cost and (
-        state.turn_prompt_tokens > 0 or state.turn_completion_tokens > 0
+        call_prompt_tokens > 0 or call_completion_tokens > 0
     ):
         estimated = _estimate_cost_from_tokens(
             state.model,
-            state.turn_prompt_tokens,
-            state.turn_completion_tokens,
+            call_prompt_tokens,
+            call_completion_tokens,
         )
         if estimated is not None:
             state.cost_usd = (state.cost_usd or 0.0) + estimated
diff --git a/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py b/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py
index cc2d58ca82..2a15065679 100644
--- a/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py
+++ b/autogpt_platform/backend/backend/copilot/baseline/service_unit_test.py
@@ -938,3 +938,115 @@ class TestBaselineCostExtraction:
 
         assert state.turn_cache_read_tokens == 800
         assert state.turn_prompt_tokens == 1000
+
+    @pytest.mark.asyncio
+    async def test_cache_creation_tokens_extracted_from_usage_details(self):
+        """cache_creation_tokens are extracted from prompt_tokens_details."""
+        from backend.copilot.baseline.service import (
+            _baseline_llm_caller,
+            _BaselineStreamState,
+        )
+
+        state = _BaselineStreamState(model="openai/gpt-4o")
+
+        mock_raw = MagicMock()
+        mock_raw.headers = {"x-total-cost": "0.01"}
+        mock_stream = MagicMock()
+        mock_stream._response = mock_raw
+
+        mock_ptd = MagicMock()
+        mock_ptd.cached_tokens = 0
+        mock_ptd.cache_creation_input_tokens = 500
+
+        mock_chunk = MagicMock()
+        mock_chunk.usage = MagicMock()
+        mock_chunk.usage.prompt_tokens = 1000
+        mock_chunk.usage.completion_tokens = 200
+        mock_chunk.usage.prompt_tokens_details = mock_ptd
+        mock_chunk.choices = []
+
+        async def chunk_aiter():
+            yield mock_chunk
+
+        mock_stream.__aiter__ = lambda self: chunk_aiter()
+
+        mock_client = MagicMock()
+        mock_client.chat.completions.create = AsyncMock(return_value=mock_stream)
+
+        with patch(
+            "backend.copilot.baseline.service._get_openai_client",
+            return_value=mock_client,
+        ):
+            await _baseline_llm_caller(
+                messages=[{"role": "user", "content": "hi"}],
+                tools=[],
+                state=state,
+            )
+
+        assert state.turn_cache_creation_tokens == 500
+
+    @pytest.mark.asyncio
+    async def test_multiturn_fallback_cost_uses_per_call_delta(self):
+        """Fallback cost estimation uses per-call token delta, not session total.
+
+        On the second tool-call turn, the state accumulators already hold
+        tokens from turn 1. The estimator must charge only for the new tokens
+        reported in the current call, not the running total.
+        """
+        from backend.copilot.baseline.service import (
+            _baseline_llm_caller,
+            _BaselineStreamState,
+        )
+
+        state = _BaselineStreamState(model="anthropic/claude-sonnet-4")
+
+        def make_stream(prompt_tokens: int, completion_tokens: int):
+            mock_raw = MagicMock()
+            mock_raw.headers = {}  # no x-total-cost
+            mock_stream = MagicMock()
+            mock_stream._response = mock_raw
+
+            mock_chunk = MagicMock()
+            mock_chunk.usage = MagicMock()
+            mock_chunk.usage.prompt_tokens = prompt_tokens
+            mock_chunk.usage.completion_tokens = completion_tokens
+            mock_chunk.usage.prompt_tokens_details = None
+            mock_chunk.choices = []
+
+            async def chunk_aiter():
+                yield mock_chunk
+
+            mock_stream.__aiter__ = lambda self: chunk_aiter()
+            return mock_stream
+
+        mock_client = MagicMock()
+        mock_client.chat.completions.create = AsyncMock(
+            side_effect=[
+                make_stream(1000, 200),  # turn 1
+                make_stream(1100, 300),  # turn 2 (accumulators now hold 1000+1100, 200+300)
+            ]
+        )
+
+        with patch(
+            "backend.copilot.baseline.service._get_openai_client",
+            return_value=mock_client,
+        ):
+            await _baseline_llm_caller(
+                messages=[{"role": "user", "content": "hi"}],
+                tools=[],
+                state=state,
+            )
+            await _baseline_llm_caller(
+                messages=[{"role": "user", "content": "follow up"}],
+                tools=[],
+                state=state,
+            )
+
+        # Turn 1: 1000 * 3.0/1M + 200 * 15.0/1M = 0.003 + 0.003 = 0.006
+        # Turn 2: 1100 * 3.0/1M + 300 * 15.0/1M = 0.0033 + 0.0045 = 0.0078
+        # Total: 0.0138 (cumulative 2100/500 tokens would add 0.0138 again on turn 2, giving 0.0198)
+        expected = pytest.approx(0.006 + 0.0078, rel=1e-5)
+        assert state.cost_usd == expected
+        # Accumulators hold all tokens across both turns
+        assert state.turn_prompt_tokens == 2100
+        assert state.turn_completion_tokens == 500
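Note for reviewers: the expected values in test_multiturn_fallback_cost_uses_per_call_delta follow the pricing stated in the test's comments, $3.00 per 1M input tokens and $15.00 per 1M output tokens for Claude Sonnet. The snippet below is only a minimal sketch of the per-call-delta math the patch enforces; PRICES_PER_MTOK and estimate_cost are hypothetical stand-ins, not the actual _estimate_cost_from_tokens implementation in service.py.

# Illustrative only -- assumed price table, not the real lookup in service.py.
PRICES_PER_MTOK = {"anthropic/claude-sonnet-4": (3.0, 15.0)}  # (input, output) USD per 1M tokens

def estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float | None:
    prices = PRICES_PER_MTOK.get(model)
    if prices is None:
        return None
    input_price, output_price = prices
    return (prompt_tokens * input_price + completion_tokens * output_price) / 1_000_000

# Post-patch behavior: each call is charged only for its own token delta.
turn1 = estimate_cost("anthropic/claude-sonnet-4", 1000, 200)  # 0.006
turn2 = estimate_cost("anthropic/claude-sonnet-4", 1100, 300)  # 0.0078
assert turn1 is not None and turn2 is not None
assert round(turn1 + turn2, 6) == 0.0138

# Pre-patch behavior: turn 2 was charged for the cumulative 2100/500 tokens
# (another 0.0138), so the session total came out as 0.0198 instead of 0.0138.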