Compare commits

...

12 Commits

Author SHA1 Message Date
Zamil Majdy
c50cee86d2 test: add E2E screenshots for PR #12725 2026-04-10 00:20:31 +07:00
Zamil Majdy
f2b8f81bb1 test(backend/copilot): add unit tests for update_message_content_by_sequence
Cover success, not-found (returns False + warning), and DB-error (returns
False + error log) paths to push patch coverage above the 80% threshold.
2026-04-09 23:52:39 +07:00
Zamil Majdy
ce0cb1e035 fix(backend/copilot): persist user-context prefix to DB in both SDK and baseline paths
The user message was saved to DB before the <user_context> prefix was added
to session.messages. Subsequent upsert_chat_session calls only append new
messages (slicing by existing_message_count), so the prefixed content was
never written to the DB. On page reload or --resume, the unprefixed version
was loaded, losing personalisation.

Fix: add update_message_content_by_sequence to db.py and call it after
injecting the prefix in both sdk/service.py and baseline/service.py.
2026-04-09 23:40:14 +07:00
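The failure mode is easy to reproduce in isolation. Below is a minimal, self-contained sketch of the append-only persistence described in this commit message — the helper and the in-memory "table" are hypothetical stand-ins, not the real upsert_chat_session:

```python
saved: list[str] = []  # stand-in for rows already written to the chat message table

def upsert_chat_session(messages: list[str]) -> None:
    # Append-only: only messages beyond the already-persisted count are written
    # ("slicing by existing_message_count"); earlier entries are never rewritten.
    saved.extend(messages[len(saved):])

session_messages = ["Build me a scraper"]
upsert_chat_session(session_messages)   # turn 1: user message persisted un-prefixed

# Prefix is injected into the in-memory copy only:
session_messages[0] = (
    "<user_context>\nACME Corp, e-commerce\n</user_context>\n\n" + session_messages[0]
)
upsert_chat_session(session_messages)   # no-op: len(saved) == len(session_messages)

assert not saved[0].startswith("<user_context>")  # DB copy never gets the prefix
```

Hence the fix issues a targeted update keyed by sequence number rather than relying on the upsert.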
Zamil Majdy
e5ea2e0d5b fix(backend/copilot): fix stale docstring referencing anthropic.omit instead of NOT_GIVEN 2026-04-09 23:24:43 +07:00
Zamil Majdy
b7f242f163 chore(backend/copilot): merge dev to pick up graphiti memory and update docs 2026-04-09 22:58:12 +07:00
Zamil Majdy
be86a911e1 fix(frontend): revert accidental openapi.json changes from export hook
The previous commit accidentally included SUBSCRIPTION in CreditTransactionType
via the local export-api-schema hook, which used a Prisma client generated
from a different worktree schema. Restore to the correct pre-commit state.
2026-04-09 22:43:15 +07:00
Zamil Majdy
54763b660b fix(backend/copilot): persist user_context prefix and guard empty Anthropic system block
- Guard Anthropic system block behind sysprompt.strip() to avoid 400 errors
  when sysprompt is empty (Anthropic rejects empty text blocks with 400)
- Fix anthropic.omit -> anthropic.NOT_GIVEN in convert_openai_tool_fmt_to_anthropic
- Persist <user_context> prefix into session.messages and transcript on first
  turn in both baseline and SDK paths so personalisation survives resume/reload
- Add test for empty-sysprompt -> system key omitted in Anthropic API call
2026-04-09 22:30:39 +07:00
Zamil Majdy
6d60265221 fix(backend/copilot): update retry_scenarios_test to use renamed function
`_build_system_prompt` was renamed to `_build_cacheable_system_prompt`
in the SDK path as part of the prompt caching PR. Update the patch
target in `retry_scenarios_test.py` to match the new name so the tests
can find the attribute.
2026-04-09 19:55:15 +07:00
Zamil Majdy
0b8997eb01 perf(backend/copilot): gate user-context DB fetch on is_user_message too
Aligns fetch logic with injection logic: `should_inject_user_context`
now requires both `is_first_turn` and `is_user_message`, so
assistant-role calls (e.g. tool-result submissions) on the first turn
no longer trigger a needless `_build_cacheable_system_prompt(user_id)`
DB lookup.

Addresses coderabbitai nitpick from review 4082258841.
2026-04-09 19:38:18 +07:00
Zamil Majdy
2ff036b86b fix(backend/copilot): resolve merge conflicts with dev branch
Keep caching changes (static system prompt + cache_control markers)
on top of dev's new features: transcript support, file attachments,
URL context in baseline path, and _update_title_async in SDK path.
2026-04-09 19:33:49 +07:00
Zamil Majdy
1fc3cc74ea fix(backend/copilot): skip user DB lookup on non-first turns
In the SDK path, pass user_id to _build_cacheable_system_prompt only
when has_history is False, matching the baseline path. Previously
user understanding was fetched from the DB on every turn even though
it is only injected into the first user message, causing an N+1 query.

Also add a defensive logger.warning in the baseline path when no user
message is found for context injection (guarded by is_first_turn, so
this edge case is nearly impossible but surfaces unexpected states).
2026-04-09 19:21:02 +07:00
Zamil Majdy
815659d188 perf(backend/copilot): enable LLM prompt caching to reduce token costs
Move user-specific context out of the system prompt into the first user
message, making the system prompt fully static across all users. Add
explicit Anthropic cache_control markers on both system prompt and tool
definitions in the direct API path (blocks/llm.py).
2026-04-09 19:02:33 +07:00
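For orientation, here is a minimal sketch of the request shape this change targets, calling the anthropic Python SDK directly; the API key, model id, and tool schema are placeholders, not values taken from the diffs below:

```python
import anthropic

client = anthropic.Anthropic(api_key="sk-ant-...")  # placeholder key

# Tool schemas in Anthropic's native format; "run_agent" is an illustrative name.
tools = [
    {
        "name": "run_agent",
        "description": "Run an existing agent by name.",
        "input_schema": {"type": "object", "properties": {}, "required": []},
    }
]
# cache_control on the last tool caches the whole tool-definition prefix.
tools[-1]["cache_control"] = {"type": "ephemeral"}

resp = client.messages.create(
    model="claude-sonnet-4-5",  # placeholder model id
    max_tokens=1024,
    # The static system prompt is sent as a structured block so it can carry
    # its own cache marker; it is identical for every user.
    system=[
        {
            "type": "text",
            "text": "You are an AI automation assistant...",
            "cache_control": {"type": "ephemeral"},
        }
    ],
    tools=tools,
    messages=[
        {
            "role": "user",
            # Per-user context rides in the (uncached) first user turn instead
            # of the system prompt, so cache hits work across users.
            "content": "<user_context>\nACME Corp, e-commerce\n</user_context>\n\n"
            "Help me automate weekly reporting.",
        }
    ],
)
print(resp.content[0].text)
```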
14 changed files with 587 additions and 31 deletions

View File

@@ -744,12 +744,12 @@ class LLMResponse(BaseModel):
def convert_openai_tool_fmt_to_anthropic(
openai_tools: list[dict] | None = None,
) -> Iterable[ToolParam] | anthropic.Omit:
) -> Iterable[ToolParam] | anthropic.NotGiven:
"""
Convert OpenAI tool format to Anthropic tool format.
"""
if not openai_tools or len(openai_tools) == 0:
return anthropic.omit
return anthropic.NOT_GIVEN
anthropic_tools = []
for tool in openai_tools:
@@ -972,6 +972,11 @@ async def llm_call(
elif provider == "anthropic":
an_tools = convert_openai_tool_fmt_to_anthropic(tools)
# Cache tool definitions alongside the system prompt.
# Placing cache_control on the last tool caches all tool schemas as a
# single prefix — reads cost 10% of normal input tokens.
if isinstance(an_tools, list) and an_tools:
an_tools[-1] = {**an_tools[-1], "cache_control": {"type": "ephemeral"}}
system_messages = [p["content"] for p in prompt if p["role"] == "system"]
sysprompt = " ".join(system_messages)
@@ -994,14 +999,22 @@ async def llm_call(
client = anthropic.AsyncAnthropic(
api_key=credentials.api_key.get_secret_value()
)
resp = await client.messages.create(
create_kwargs: dict[str, Any] = dict(
model=llm_model.value,
system=sysprompt,
messages=messages,
max_tokens=max_tokens,
tools=an_tools,
timeout=600,
)
if sysprompt.strip():
create_kwargs["system"] = [
{
"type": "text",
"text": sysprompt,
"cache_control": {"type": "ephemeral"},
}
]
resp = await client.messages.create(**create_kwargs)
if not resp.content:
raise ValueError("No content returned from Anthropic.")

View File

@@ -1111,3 +1111,181 @@ class TestExtractOpenRouterCost:
def test_returns_none_for_negative_cost(self):
response = self._mk_response({"x-total-cost": "-0.005"})
assert llm.extract_openrouter_cost(response) is None
class TestAnthropicCacheControl:
"""Verify that llm_call attaches cache_control to the system prompt block
and to the last tool definition when calling the Anthropic API."""
def _make_anthropic_credentials(self) -> llm.APIKeyCredentials:
from pydantic import SecretStr
return llm.APIKeyCredentials(
id="test-anthropic-id",
provider="anthropic",
api_key=SecretStr("mock-anthropic-key"),
title="Mock Anthropic key",
expires_at=None,
)
@pytest.mark.asyncio
async def test_system_prompt_sent_as_block_with_cache_control(self):
"""The system prompt is wrapped in a structured block with cache_control ephemeral."""
mock_resp = MagicMock()
mock_resp.content = [MagicMock(type="text", text="hello")]
mock_resp.usage = MagicMock(input_tokens=5, output_tokens=3)
captured_kwargs: dict = {}
async def fake_create(**kwargs):
captured_kwargs.update(kwargs)
return mock_resp
mock_client = MagicMock()
mock_client.messages.create = fake_create
credentials = self._make_anthropic_credentials()
with patch("anthropic.AsyncAnthropic", return_value=mock_client):
await llm.llm_call(
credentials=credentials,
llm_model=llm.LlmModel.CLAUDE_4_6_SONNET,
prompt=[
{"role": "system", "content": "You are an assistant."},
{"role": "user", "content": "Hello"},
],
max_tokens=100,
)
system_arg = captured_kwargs.get("system")
assert isinstance(system_arg, list), "system should be a list of blocks"
assert len(system_arg) == 1
block = system_arg[0]
assert block["type"] == "text"
assert block["text"] == "You are an assistant."
assert block.get("cache_control") == {"type": "ephemeral"}
@pytest.mark.asyncio
async def test_last_tool_gets_cache_control(self):
"""cache_control is placed on the last tool in the Anthropic tools list."""
mock_resp = MagicMock()
mock_resp.content = [MagicMock(type="text", text="ok")]
mock_resp.usage = MagicMock(input_tokens=10, output_tokens=5)
captured_kwargs: dict = {}
async def fake_create(**kwargs):
captured_kwargs.update(kwargs)
return mock_resp
mock_client = MagicMock()
mock_client.messages.create = fake_create
credentials = self._make_anthropic_credentials()
tools = [
{
"type": "function",
"function": {
"name": "tool_a",
"description": "First tool",
"parameters": {"type": "object", "properties": {}, "required": []},
},
},
{
"type": "function",
"function": {
"name": "tool_b",
"description": "Second tool",
"parameters": {"type": "object", "properties": {}, "required": []},
},
},
]
with patch("anthropic.AsyncAnthropic", return_value=mock_client):
await llm.llm_call(
credentials=credentials,
llm_model=llm.LlmModel.CLAUDE_4_6_SONNET,
prompt=[
{"role": "system", "content": "System."},
{"role": "user", "content": "Do something"},
],
max_tokens=100,
tools=tools,
)
an_tools = captured_kwargs.get("tools")
assert isinstance(an_tools, list)
assert len(an_tools) == 2
assert (
an_tools[0].get("cache_control") is None
), "Only last tool gets cache_control"
assert an_tools[-1].get("cache_control") == {"type": "ephemeral"}
@pytest.mark.asyncio
async def test_no_tools_no_cache_control_on_tools(self):
"""When there are no tools, the Anthropic call receives anthropic.NOT_GIVEN for tools."""
mock_resp = MagicMock()
mock_resp.content = [MagicMock(type="text", text="ok")]
mock_resp.usage = MagicMock(input_tokens=5, output_tokens=2)
captured_kwargs: dict = {}
async def fake_create(**kwargs):
captured_kwargs.update(kwargs)
return mock_resp
mock_client = MagicMock()
mock_client.messages.create = fake_create
credentials = self._make_anthropic_credentials()
with patch("anthropic.AsyncAnthropic", return_value=mock_client):
await llm.llm_call(
credentials=credentials,
llm_model=llm.LlmModel.CLAUDE_4_6_SONNET,
prompt=[
{"role": "system", "content": "System."},
{"role": "user", "content": "Hello"},
],
max_tokens=100,
tools=None,
)
tools_arg = captured_kwargs.get("tools")
assert tools_arg is llm.convert_openai_tool_fmt_to_anthropic(
None
), "Empty tools should pass anthropic.NOT_GIVEN sentinel"
@pytest.mark.asyncio
async def test_empty_system_prompt_omits_system_key(self):
"""When sysprompt is empty, the 'system' key must not be sent to Anthropic.
Anthropic rejects empty text blocks; the guard in llm_call must ensure
the system argument is omitted entirely when no system messages are present.
"""
mock_resp = MagicMock()
mock_resp.content = [MagicMock(type="text", text="ok")]
mock_resp.usage = MagicMock(input_tokens=3, output_tokens=2)
captured_kwargs: dict = {}
async def fake_create(**kwargs):
captured_kwargs.update(kwargs)
return mock_resp
mock_client = MagicMock()
mock_client.messages.create = fake_create
credentials = self._make_anthropic_credentials()
with patch("anthropic.AsyncAnthropic", return_value=mock_client):
await llm.llm_call(
credentials=credentials,
llm_model=llm.LlmModel.CLAUDE_4_6_SONNET,
prompt=[{"role": "user", "content": "Hi"}],
max_tokens=50,
)
assert (
"system" not in captured_kwargs
), "system must be omitted when sysprompt is empty to avoid Anthropic 400"

View File

@@ -27,6 +27,7 @@ from opentelemetry import trace as otel_trace
from backend.copilot.config import CopilotMode
from backend.copilot.context import get_workspace_manager, set_execution_context
from backend.copilot.db import update_message_content_by_sequence
from backend.copilot.graphiti.config import is_enabled_for_user
from backend.copilot.model import (
ChatMessage,
@@ -52,7 +53,7 @@ from backend.copilot.response_model import (
StreamUsage,
)
from backend.copilot.service import (
_build_system_prompt,
_build_cacheable_system_prompt,
_get_openai_client,
_update_title_async,
config,
@@ -69,6 +70,7 @@ from backend.copilot.transcript import (
validate_transcript,
)
from backend.copilot.transcript_builder import TranscriptBuilder
from backend.data.understanding import format_understanding_for_prompt
from backend.util.exceptions import NotFoundError
from backend.util.prompt import (
compress_context,
@@ -958,35 +960,34 @@ async def stream_chat_completion_baseline(
# Build system prompt only on the first turn to avoid mid-conversation
# changes from concurrent chats updating business understanding.
is_first_turn = len(session.messages) <= 1
if is_first_turn:
prompt_task = _build_system_prompt(user_id, has_conversation_history=False)
# Gate context fetch on both first turn AND user message so that assistant-
# role calls (e.g. tool-result submissions) on the first turn don't trigger
# a needless DB lookup for user understanding.
should_inject_user_context = is_first_turn and is_user_message
if should_inject_user_context:
prompt_task = _build_cacheable_system_prompt(user_id)
else:
prompt_task = _build_system_prompt(user_id=None, has_conversation_history=True)
prompt_task = _build_cacheable_system_prompt(None)
# Run download + prompt build concurrently — both are independent I/O
# on the request critical path.
if user_id and len(session.messages) > 1:
transcript_covers_prefix, (base_system_prompt, _) = await asyncio.gather(
_load_prior_transcript(
user_id=user_id,
session_id=session_id,
session_msg_count=len(session.messages),
transcript_builder=transcript_builder,
),
prompt_task,
transcript_covers_prefix, (base_system_prompt, understanding) = (
await asyncio.gather(
_load_prior_transcript(
user_id=user_id,
session_id=session_id,
session_msg_count=len(session.messages),
transcript_builder=transcript_builder,
),
prompt_task,
)
)
else:
base_system_prompt, _ = await prompt_task
base_system_prompt, understanding = await prompt_task
# Append user message to transcript.
# Always append when the message is present and is from the user,
# even on duplicate-suppressed retries (is_new_message=False).
# The loaded transcript may be stale (uploaded before the previous
# attempt stored this message), so skipping it would leave the
# transcript without the user turn, creating a malformed
# assistant-after-assistant structure when the LLM reply is added.
if message and is_user_message:
transcript_builder.append_user(content=message)
# Append user message to transcript after context injection below so the
# transcript receives the prefixed message when user context is available.
# Generate title for new sessions
if is_user_message and not session.title:
@@ -1047,6 +1048,48 @@ async def stream_chat_completion_baseline(
elif msg.role == "user" and msg.content:
openai_messages.append({"role": msg.role, "content": msg.content})
# Inject user context into the first user message on first turn.
# Done before attachment/URL injection so the context prefix lands at
# the very start of the message content.
# The prefixed content is also stored back into session.messages and the
# transcript so that resumed sessions and the transcript both carry the
# personalisation beyond the first request.
user_message_for_transcript = message
if should_inject_user_context and understanding:
user_ctx = format_understanding_for_prompt(understanding)
prefixed: str | None = None
for msg in openai_messages:
if msg["role"] == "user":
prefixed = (
f"<user_context>\n{user_ctx}\n</user_context>\n\n{msg['content']}"
)
msg["content"] = prefixed
break
if prefixed is not None:
# Persist the prefixed content so subsequent turns and --resume
# retain the user context.
# The user message was already saved to DB before context injection
# (at ~line 932); update the DB record so the prefixed content
# survives page reload.
for idx, session_msg in enumerate(session.messages):
if session_msg.role == "user":
session_msg.content = prefixed
await update_message_content_by_sequence(session_id, idx, prefixed)
break
user_message_for_transcript = prefixed
else:
logger.warning("[Baseline] No user message found for context injection")
# Append user message to transcript.
# Always append when the message is present and is from the user,
# even on duplicate-suppressed retries (is_new_message=False).
# The loaded transcript may be stale (uploaded before the previous
# attempt stored this message), so skipping it would leave the
# transcript without the user turn, creating a malformed
# assistant-after-assistant structure when the LLM reply is added.
if message and is_user_message:
transcript_builder.append_user(content=user_message_for_transcript or message)
# --- File attachments (feature parity with SDK path) ---
working_dir: str | None = None
attachment_hint = ""

View File

@@ -498,6 +498,42 @@ async def update_tool_message_content(
return False
async def update_message_content_by_sequence(
session_id: str,
sequence: int,
new_content: str,
) -> bool:
"""Update the content of a specific message by its sequence number.
Used to persist content modifications (e.g. user-context prefix injection)
to a message that was already saved to the DB.
Args:
session_id: The chat session ID.
sequence: The 0-based sequence number of the message to update.
new_content: The new content to set.
Returns:
True if a message was updated, False otherwise.
"""
try:
result = await PrismaChatMessage.prisma().update_many(
where={"sessionId": session_id, "sequence": sequence},
data={"content": sanitize_string(new_content)},
)
if result == 0:
logger.warning(
f"No message found to update for session {session_id}, sequence {sequence}"
)
return False
return True
except Exception as e:
logger.error(
f"Failed to update message for session {session_id}, sequence {sequence}: {e}"
)
return False
async def set_turn_duration(session_id: str, duration_ms: int) -> None:
"""Set durationMs on the last assistant message in a session.

View File

@@ -14,6 +14,7 @@ from backend.copilot.db import (
PaginatedMessages,
get_chat_messages_paginated,
set_turn_duration,
update_message_content_by_sequence,
)
from backend.copilot.model import ChatMessage as CopilotChatMessage
from backend.copilot.model import ChatSession, get_chat_session, upsert_chat_session
@@ -386,3 +387,53 @@ async def test_set_turn_duration_no_assistant_message(setup_test_user, test_user
assert cached is not None
# User message should not have durationMs
assert cached.messages[0].duration_ms is None
# ---------- update_message_content_by_sequence ----------
@pytest.mark.asyncio
async def test_update_message_content_by_sequence_success():
"""Returns True when update_many reports at least one row updated."""
with patch.object(PrismaChatMessage, "prisma") as mock_prisma:
mock_prisma.return_value.update_many = AsyncMock(return_value=1)
result = await update_message_content_by_sequence("sess-1", 0, "new content")
assert result is True
mock_prisma.return_value.update_many.assert_called_once_with(
where={"sessionId": "sess-1", "sequence": 0},
data={"content": "new content"},
)
@pytest.mark.asyncio
async def test_update_message_content_by_sequence_not_found():
"""Returns False and logs a warning when no rows are updated."""
with (
patch.object(PrismaChatMessage, "prisma") as mock_prisma,
patch("backend.copilot.db.logger") as mock_logger,
):
mock_prisma.return_value.update_many = AsyncMock(return_value=0)
result = await update_message_content_by_sequence("sess-1", 99, "content")
assert result is False
mock_logger.warning.assert_called_once()
@pytest.mark.asyncio
async def test_update_message_content_by_sequence_db_error():
"""Returns False and logs an error when the DB raises an exception."""
with (
patch.object(PrismaChatMessage, "prisma") as mock_prisma,
patch("backend.copilot.db.logger") as mock_logger,
):
mock_prisma.return_value.update_many = AsyncMock(
side_effect=RuntimeError("db error")
)
result = await update_message_content_by_sequence("sess-1", 0, "content")
assert result is False
mock_logger.error.assert_called_once()

View File

@@ -0,0 +1,146 @@
"""Unit tests for the cacheable system prompt building logic.
These tests verify that _build_cacheable_system_prompt:
- Returns the static _CACHEABLE_SYSTEM_PROMPT when no user_id is given
- Returns the static prompt + understanding when user_id is given
- Falls through to _CACHEABLE_SYSTEM_PROMPT when Langfuse is not configured
- Returns the Langfuse-compiled prompt when Langfuse is configured
- Handles DB errors and Langfuse errors gracefully
"""
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
_SVC = "backend.copilot.service"
class TestBuildCacheableSystemPrompt:
@pytest.mark.asyncio
async def test_no_user_id_returns_static_prompt(self):
"""When user_id is None, no DB lookup happens and the static prompt is returned."""
with (patch(f"{_SVC}._is_langfuse_configured", return_value=False),):
from backend.copilot.service import (
_CACHEABLE_SYSTEM_PROMPT,
_build_cacheable_system_prompt,
)
prompt, understanding = await _build_cacheable_system_prompt(None)
assert prompt == _CACHEABLE_SYSTEM_PROMPT
assert understanding is None
@pytest.mark.asyncio
async def test_with_user_id_fetches_understanding(self):
"""When user_id is provided, understanding is fetched and returned alongside prompt."""
fake_understanding = MagicMock()
mock_db = MagicMock()
mock_db.get_business_understanding = AsyncMock(return_value=fake_understanding)
with (
patch(f"{_SVC}._is_langfuse_configured", return_value=False),
patch(f"{_SVC}.understanding_db", return_value=mock_db),
):
from backend.copilot.service import (
_CACHEABLE_SYSTEM_PROMPT,
_build_cacheable_system_prompt,
)
prompt, understanding = await _build_cacheable_system_prompt("user-123")
assert prompt == _CACHEABLE_SYSTEM_PROMPT
assert understanding is fake_understanding
mock_db.get_business_understanding.assert_called_once_with("user-123")
@pytest.mark.asyncio
async def test_db_error_returns_prompt_with_no_understanding(self):
"""When the DB raises an exception, understanding is None and prompt is still returned."""
mock_db = MagicMock()
mock_db.get_business_understanding = AsyncMock(
side_effect=RuntimeError("db down")
)
with (
patch(f"{_SVC}._is_langfuse_configured", return_value=False),
patch(f"{_SVC}.understanding_db", return_value=mock_db),
):
from backend.copilot.service import (
_CACHEABLE_SYSTEM_PROMPT,
_build_cacheable_system_prompt,
)
prompt, understanding = await _build_cacheable_system_prompt("user-456")
assert prompt == _CACHEABLE_SYSTEM_PROMPT
assert understanding is None
@pytest.mark.asyncio
async def test_langfuse_compiled_prompt_returned(self):
"""When Langfuse is configured and returns a prompt, the compiled text is returned."""
fake_understanding = MagicMock()
mock_db = MagicMock()
mock_db.get_business_understanding = AsyncMock(return_value=fake_understanding)
langfuse_prompt_text = "You are a Langfuse-sourced assistant."
mock_prompt_obj = MagicMock()
mock_prompt_obj.compile.return_value = langfuse_prompt_text
mock_langfuse = MagicMock()
mock_langfuse.get_prompt.return_value = mock_prompt_obj
with (
patch(f"{_SVC}._is_langfuse_configured", return_value=True),
patch(f"{_SVC}.understanding_db", return_value=mock_db),
patch(f"{_SVC}._get_langfuse", return_value=mock_langfuse),
patch(
f"{_SVC}.asyncio.to_thread", new=AsyncMock(return_value=mock_prompt_obj)
),
):
from backend.copilot.service import _build_cacheable_system_prompt
prompt, understanding = await _build_cacheable_system_prompt("user-789")
assert prompt == langfuse_prompt_text
assert understanding is fake_understanding
mock_prompt_obj.compile.assert_called_once_with(users_information="")
@pytest.mark.asyncio
async def test_langfuse_error_falls_back_to_static_prompt(self):
"""When Langfuse raises an error, the fallback _CACHEABLE_SYSTEM_PROMPT is used."""
mock_db = MagicMock()
mock_db.get_business_understanding = AsyncMock(return_value=None)
with (
patch(f"{_SVC}._is_langfuse_configured", return_value=True),
patch(f"{_SVC}.understanding_db", return_value=mock_db),
patch(
f"{_SVC}.asyncio.to_thread",
new=AsyncMock(side_effect=RuntimeError("langfuse down")),
),
):
from backend.copilot.service import (
_CACHEABLE_SYSTEM_PROMPT,
_build_cacheable_system_prompt,
)
prompt, understanding = await _build_cacheable_system_prompt("user-000")
assert prompt == _CACHEABLE_SYSTEM_PROMPT
assert understanding is None
class TestCacheableSystemPromptContent:
"""Smoke-test the _CACHEABLE_SYSTEM_PROMPT constant for key structural requirements."""
def test_cacheable_prompt_has_no_placeholder(self):
"""The static cacheable prompt must not contain format placeholders."""
from backend.copilot.service import _CACHEABLE_SYSTEM_PROMPT
assert "{users_information}" not in _CACHEABLE_SYSTEM_PROMPT
assert "{" not in _CACHEABLE_SYSTEM_PROMPT
def test_cacheable_prompt_mentions_user_context(self):
"""The prompt instructs the model to parse <user_context> blocks."""
from backend.copilot.service import _CACHEABLE_SYSTEM_PROMPT
assert "user_context" in _CACHEABLE_SYSTEM_PROMPT

View File

@@ -988,7 +988,7 @@ def _make_sdk_patches(
dict(return_value=MagicMock(__enter__=MagicMock(), __exit__=MagicMock())),
),
(
f"{_SVC}._build_system_prompt",
f"{_SVC}._build_cacheable_system_prompt",
dict(new_callable=AsyncMock, return_value=("system prompt", None)),
),
(

View File

@@ -48,6 +48,7 @@ from backend.copilot.transcript import (
)
from backend.copilot.transcript_builder import TranscriptBuilder
from backend.data.redis_client import get_redis_async
from backend.data.understanding import format_understanding_for_prompt
from backend.executor.cluster_lock import AsyncClusterLock
from backend.util.exceptions import NotFoundError
from backend.util.settings import Settings
@@ -61,6 +62,7 @@ from ..constants import (
is_transient_api_error,
)
from ..context import encode_cwd_for_cli
from ..db import update_message_content_by_sequence
from ..graphiti.config import is_enabled_for_user
from ..model import (
ChatMessage,
@@ -85,7 +87,11 @@ from ..response_model import (
StreamToolOutputAvailable,
StreamUsage,
)
from ..service import _build_system_prompt, _is_langfuse_configured, _update_title_async
from ..service import (
_build_cacheable_system_prompt,
_is_langfuse_configured,
_update_title_async,
)
from ..token_tracking import persist_and_record_usage
from ..tools.e2b_sandbox import get_or_create_sandbox, pause_sandbox_direct
from ..tools.sandbox import WORKSPACE_PREFIX, make_session_path
@@ -2052,9 +2058,9 @@ async def stream_chat_completion_sdk(
)
return None
e2b_sandbox, (base_system_prompt, _), dl = await asyncio.gather(
e2b_sandbox, (base_system_prompt, understanding), dl = await asyncio.gather(
_setup_e2b(),
_build_system_prompt(user_id, has_conversation_history=has_history),
_build_cacheable_system_prompt(user_id if not has_history else None),
_fetch_transcript(),
)
@@ -2285,6 +2291,30 @@ async def stream_chat_completion_sdk(
transcript_msg_count,
session_id,
)
# On the first turn inject user context into the message instead of the
# system prompt — the system prompt is now static (same for all users)
# so the LLM can cache it across sessions.
# current_message is updated so the transcript and session.messages also
# store the prefixed content, preserving personalisation across turns and
# on --resume.
if not has_history and understanding:
user_ctx = format_understanding_for_prompt(understanding)
prefixed_message = (
f"<user_context>\n{user_ctx}\n</user_context>\n\n{current_message}"
)
current_message = prefixed_message
query_message = prefixed_message
# Persist the prefixed content so resumed sessions retain the context.
# The user message was already saved to DB before context injection;
# update the DB record so the prefixed content survives page reload
# and --resume (the save at line ~1926 used the un-prefixed content).
for idx, session_msg in enumerate(session.messages):
if session_msg.role == "user":
session_msg.content = prefixed_message
await update_message_content_by_sequence(
session_id, idx, prefixed_message
)
break
# If files are attached, prepare them: images become vision
# content blocks in the user message, other files go to sdk_cwd.
attachments = await _prepare_file_attachments(

View File

@@ -70,6 +70,21 @@ Your goal is to help users automate tasks by:
Be concise, proactive, and action-oriented. Bias toward showing working solutions over lengthy explanations."""
# Static system prompt for token caching — identical for all users.
# User-specific context is injected into the first user message instead,
# so the system prompt never changes and can be cached across all sessions.
_CACHEABLE_SYSTEM_PROMPT = """You are an AI automation assistant helping users build and run automations.
Your goal is to help users automate tasks by:
- Understanding their needs and business context
- Building and running working automations
- Delivering tangible value through action, not just explanation
Be concise, proactive, and action-oriented. Bias toward showing working solutions over lengthy explanations.
When the user provides a <user_context> block in their message, use it to personalise your responses.
For users you are meeting for the first time with no context provided, greet them warmly and introduce them to the AutoGPT platform."""
# ---------------------------------------------------------------------------
# Shared helpers (used by SDK service and baseline)
@@ -150,6 +165,50 @@ async def _build_system_prompt(
return compiled, understanding
async def _build_cacheable_system_prompt(
user_id: str | None,
) -> tuple[str, Any]:
"""Build a fully static system prompt suitable for LLM token caching.
Unlike _build_system_prompt, user-specific context is NOT embedded here.
Callers must inject the returned understanding into the first user message
via format_understanding_for_prompt() so the system prompt stays identical
across all users and sessions, enabling cross-session cache hits.
Returns:
Tuple of (static_prompt, understanding_object_or_None)
"""
understanding = None
if user_id:
try:
understanding = await understanding_db().get_business_understanding(user_id)
except Exception as e:
logger.warning(f"Failed to fetch business understanding: {e}")
if _is_langfuse_configured():
try:
label = (
None
if settings.config.app_env == AppEnvironment.PRODUCTION
else "latest"
)
prompt = await asyncio.to_thread(
_get_langfuse().get_prompt,
config.langfuse_prompt_name,
label=label,
cache_ttl_seconds=config.langfuse_prompt_cache_ttl,
)
# Pass empty string so existing Langfuse templates stay static
compiled = prompt.compile(users_information="")
return compiled, understanding
except Exception as e:
logger.warning(
f"Failed to fetch cacheable prompt from Langfuse, using default: {e}"
)
return _CACHEABLE_SYSTEM_PROMPT, understanding
async def _generate_session_title(
message: str,
user_id: str | None = None,
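Taken together, the baseline and SDK call sites shown earlier follow the same pattern around this helper; a condensed sketch (control flow simplified, the wrapper function itself is hypothetical):

```python
from backend.copilot.service import _build_cacheable_system_prompt
from backend.data.understanding import format_understanding_for_prompt


async def build_first_turn_prompt(
    user_id: str | None, is_first_user_turn: bool, message: str
) -> tuple[str, str]:
    # Fetch business understanding only when it will actually be injected.
    system_prompt, understanding = await _build_cacheable_system_prompt(
        user_id if is_first_user_turn else None
    )
    if is_first_user_turn and understanding:
        user_ctx = format_understanding_for_prompt(understanding)
        # The prefix lands in the user message, so the system prompt stays
        # byte-identical (and cacheable) across all users and sessions.
        message = f"<user_context>\n{user_ctx}\n</user_context>\n\n{message}"
    return system_prompt, message
```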

View File

@@ -58,7 +58,7 @@ Tool and block identifiers provided in `tools` and `blocks` are validated at run
| system_context | Optional additional context prepended to the prompt. Use this to constrain autopilot behavior, provide domain context, or set output format requirements. | str | No |
| session_id | Session ID to continue an existing autopilot conversation. Leave empty to start a new session. Use the session_id output from a previous run to continue. | str | No |
| max_recursion_depth | Maximum nesting depth when the autopilot calls this block recursively (sub-agent pattern). Prevents infinite loops. | int | No |
| tools | Tool names to filter. Works with tools_exclude to form an allow-list or deny-list. Leave empty to apply no tool filter. | List["add_understanding" \| "ask_question" \| "bash_exec" \| "browser_act" \| "browser_navigate" \| "browser_screenshot" \| "connect_integration" \| "continue_run_block" \| "create_agent" \| "create_feature_request" \| "create_folder" \| "customize_agent" \| "delete_folder" \| "delete_workspace_file" \| "edit_agent" \| "find_agent" \| "find_block" \| "find_library_agent" \| "fix_agent_graph" \| "get_agent_building_guide" \| "get_doc_page" \| "get_mcp_guide" \| "list_folders" \| "list_workspace_files" \| "move_agents_to_folder" \| "move_folder" \| "read_workspace_file" \| "run_agent" \| "run_block" \| "run_mcp_tool" \| "search_docs" \| "search_feature_requests" \| "update_folder" \| "validate_agent_graph" \| "view_agent_output" \| "web_fetch" \| "write_workspace_file" \| "Agent" \| "Edit" \| "Glob" \| "Grep" \| "Read" \| "Task" \| "TodoWrite" \| "WebSearch" \| "Write"] | No |
| tools | Tool names to filter. Works with tools_exclude to form an allow-list or deny-list. Leave empty to apply no tool filter. | List["add_understanding" \| "ask_question" \| "bash_exec" \| "browser_act" \| "browser_navigate" \| "browser_screenshot" \| "connect_integration" \| "continue_run_block" \| "create_agent" \| "create_feature_request" \| "create_folder" \| "customize_agent" \| "delete_folder" \| "delete_workspace_file" \| "edit_agent" \| "find_agent" \| "find_block" \| "find_library_agent" \| "fix_agent_graph" \| "get_agent_building_guide" \| "get_doc_page" \| "get_mcp_guide" \| "list_folders" \| "list_workspace_files" \| "memory_search" \| "memory_store" \| "move_agents_to_folder" \| "move_folder" \| "read_workspace_file" \| "run_agent" \| "run_block" \| "run_mcp_tool" \| "search_docs" \| "search_feature_requests" \| "update_folder" \| "validate_agent_graph" \| "view_agent_output" \| "web_fetch" \| "write_workspace_file" \| "Agent" \| "Edit" \| "Glob" \| "Grep" \| "Read" \| "Task" \| "TodoWrite" \| "WebSearch" \| "Write"] | No |
| tools_exclude | Controls how the 'tools' list is interpreted. True (default): 'tools' is a deny-list — listed tools are blocked, all others are allowed. An empty 'tools' list means allow everything. False: 'tools' is an allow-list — only listed tools are permitted. | bool | No |
| blocks | Block identifiers to filter when the copilot uses run_block. Each entry can be: a block name (e.g. 'HTTP Request'), a full block UUID, or the first 8 hex characters of the UUID (e.g. 'c069dc6b'). Works with blocks_exclude. Leave empty to apply no block filter. | List[str] | No |
| blocks_exclude | Controls how the 'blocks' list is interpreted. True (default): 'blocks' is a deny-list — listed blocks are blocked, all others are allowed. An empty 'blocks' list means allow everything. False: 'blocks' is an allow-list — only listed blocks are permitted. | bool | No |

4 binary image files added (E2E screenshots, not shown): 101 KiB, 67 KiB, 67 KiB, 66 KiB.