From 1b67d4cf8ad8baef67885d4ee90d1246971d8cb3 Mon Sep 17 00:00:00 2001 From: majdyz Date: Wed, 15 Apr 2026 14:38:39 +0700 Subject: [PATCH] fix(backend/copilot): make system prompt fully static for cross-user prompt caching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The system prompt was not cacheable across sessions or users due to two sources of per-session dynamic content leaking into it: 1. sdk_cwd (/tmp/copilot-<session-id>) was embedded in the storage supplement via get_sdk_supplement(cwd=sdk_cwd). Every session has a unique UUID, making the system prompt unique per session — cache miss every first message. 2. Graphiti warm_ctx (user-specific memory facts) was appended directly to the system prompt on the first turn, making it unique per user per turn. Fix both by keeping the system prompt fully static: - get_sdk_supplement now ignores cwd and uses the constant placeholder "/tmp/copilot-<session-id>" in the supplement text. The actual cwd is still passed to ClaudeAgentOptions.cwd so the subprocess uses the right directory. - warm_ctx is now injected into the first user message as a trusted <graphiti-memory> block (before inject_user_context runs), so it is persisted to DB alongside the <user-context> prefix and replayed correctly on --resume without re-fetching. After this change all users share the same system prompt text — one cache write globally per model, then cache reads for everyone. 
--- .../backend/backend/copilot/prompting.py | 22 ++++++++++++++-- .../backend/backend/copilot/sdk/service.py | 26 +++++++++++++------ 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/autogpt_platform/backend/backend/copilot/prompting.py b/autogpt_platform/backend/backend/copilot/prompting.py index c500a2b865..d93acd52d4 100644 --- a/autogpt_platform/backend/backend/copilot/prompting.py +++ b/autogpt_platform/backend/backend/copilot/prompting.py @@ -331,6 +331,9 @@ def _generate_tool_documentation() -> str: return docs +_LOCAL_STORAGE_SUPPLEMENT: str | None = None + + def get_sdk_supplement(use_e2b: bool, cwd: str = "") -> str: """Get the supplement for SDK mode (Claude Agent SDK). @@ -338,16 +341,31 @@ def get_sdk_supplement(use_e2b: bool, cwd: str = "") -> str: receives tool schemas from the SDK. Only includes technical notes about storage systems and execution environment. + The system prompt must be **identical across all sessions and users** to + enable cross-session LLM prompt-cache hits (Anthropic caches on exact + content). To preserve this invariant, the local-mode supplement uses a + generic placeholder for the working directory instead of the real + session-specific UUID path. The actual ``cwd`` is passed to the CLI + subprocess via ``ClaudeAgentOptions.cwd`` so the model's shell commands + land in the right directory; the model can run ``pwd`` to confirm the + exact path. + Args: use_e2b: Whether E2B cloud sandbox is being used - cwd: Current working directory (only used in local_storage mode) + cwd: Unused — kept for call-site compatibility. 
Returns: The supplement string to append to the system prompt """ + del cwd # intentionally unused — see docstring if use_e2b: return _get_cloud_sandbox_supplement() - return _get_local_storage_supplement(cwd) + global _LOCAL_STORAGE_SUPPLEMENT + if _LOCAL_STORAGE_SUPPLEMENT is None: + _LOCAL_STORAGE_SUPPLEMENT = _get_local_storage_supplement( + "/tmp/copilot-<session-id>" + ) + return _LOCAL_STORAGE_SUPPLEMENT def get_graphiti_supplement() -> str: diff --git a/autogpt_platform/backend/backend/copilot/sdk/service.py b/autogpt_platform/backend/backend/copilot/sdk/service.py index f291d96431..b319316520 100644 --- a/autogpt_platform/backend/backend/copilot/sdk/service.py +++ b/autogpt_platform/backend/backend/copilot/sdk/service.py @@ -2172,13 +2172,15 @@ async def stream_chat_completion_sdk( + graphiti_supplement ) - # Warm context: pre-load relevant facts from Graphiti on first turn + # Warm context: pre-load relevant facts from Graphiti on first turn. + # Stored here and injected into the first user message (not the system + # prompt) so the system prompt stays identical across all users and + # sessions, enabling cross-session Anthropic prompt-cache hits. + warm_ctx = "" if graphiti_enabled and user_id and len(session.messages) <= 1: from backend.copilot.graphiti.context import fetch_warm_context - warm_ctx = await fetch_warm_context(user_id, message or "") - if warm_ctx: - system_prompt += f"\n\n{warm_ctx}" + warm_ctx = await fetch_warm_context(user_id, message or "") or "" # Process transcript download result and restore CLI native session. # The CLI native session file (uploaded after each turn) is the @@ -2434,11 +2436,19 @@ # cache it across sessions. # # On resume (has_history=True) we intentionally skip re-injection: the - # transcript already contains the <user-context> prefix from the original - # turn (persisted to the DB in inject_user_context), so the SDK replay - # carries context continuity without us prepending it again. 
Adding it - # a second time would duplicate the block and inflate tokens. + # transcript already contains the and + # prefixes from the original turn (persisted to the DB via + # inject_user_context), so the SDK replay carries context continuity + # without us prepending them again. if not has_history: + # Prepend Graphiti warm context as a trusted block + # so it reaches the LLM without polluting the (cached) system prompt. + # inject_user_context will persist the full prefixed message to DB. + if warm_ctx: + current_message = ( + f"\n{warm_ctx}\n\n\n" + + current_message + ) prefixed_message = await inject_user_context( understanding, current_message, session_id, session.messages )