Compare commits

..

7 Commits

Author SHA1 Message Date
Zamil Majdy
d99b613b1d fix(backend/copilot): always download CLI session from GCS to prevent cross-pod context loss
Removes the same-pod optimisation in restore_cli_session that returned early
when a local session file existed.  In a multi-pod environment the local file
may be from a previous turn that ran on this same pod; using it silently hides
the most-recent turn (which ran on a different pod and uploaded to GCS) from
the model.

Root cause confirmed via GCloud logs on session 93ecf7c9: T14 uploaded 181,805 B
to GCS, but T15, which ran back on Pod A's stale file, uploaded only 176,635 B,
proving it never saw T14's content.

The fix always overwrites the local file from GCS so --resume always starts from
the authoritative, up-to-date session regardless of which pod serves the request.
2026-04-16 00:57:23 +07:00
Zamil Majdy
429ef36568 fix(backend/copilot): split compaction no-op log messages and add identical-content test
Split the combined `not compacted or compacted == content` guard into two
separate conditions with distinct log messages so logs clearly distinguish
between compact_transcript returning None (failure) and returning the
identical string (no-op). Add test_keeps_original_when_compaction_returns_identical_content
to cover the new branch.
2026-04-16 00:21:30 +07:00
Zamil Majdy
6dc24b3766 fix(backend/copilot): proactively compact CLI session before each turn to prevent silent auto-compaction
Sonnet sessions (no thinking blocks) were losing context when the CLI silently
auto-compacted mid-turn at ~200K tokens, bypassing the PreCompact hook. Add
maybe_compact_cli_session(), which runs LLM summarization before the turn starts
whenever the CLI session file exceeds 120KB (~105K tokens), and wire the call
into service.py after restore_cli_session() succeeds.
2026-04-16 00:09:24 +07:00
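The threshold arithmetic behind this commit can be sketched as below. The 204K-tokens-per-233KB ratio comes from the production session cited in the commit; the helper names are illustrative, not the project's API.

```python
# Measured in production session 93ecf7c9: a 233KB session file held
# ~204K tokens, i.e. roughly 0.88 tokens per byte.
TOKENS_PER_BYTE = 204_000 / 233_000  # ~0.88
THRESHOLD_BYTES = 120_000            # ~105K tokens, about half the ~200K CLI limit


def should_compact(session_size_bytes: int) -> bool:
    """Gate proactive compaction: run it once the file nears the danger zone."""
    return session_size_bytes >= THRESHOLD_BYTES


def estimated_tokens(session_size_bytes: int) -> int:
    """Rough token estimate for a CLI session file of the given byte size."""
    return round(session_size_bytes * TOKENS_PER_BYTE)
```

Compacting at roughly half the auto-compaction threshold leaves ample headroom for the turn itself to grow the session before the CLI would ever compact on its own.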
Zamil Majdy
d27d22159d Merge branch 'master' of github.com:Significant-Gravitas/AutoGPT into dev 2026-04-16 00:05:32 +07:00
Nicholas Tindle
fffbe0aad8 fix(backend): default copilot sonnet to 4.6 (#12799)
### Why / What / How

Why: Copilot/Autopilot standard requests were still defaulting to Claude
Sonnet 4, while the expected default for this path is Sonnet 4.6.

What: This PR updates the backend Copilot defaults so the
standard/default path and fast path use Sonnet 4.6, and aligns the SDK
fallback model and related test expectations.

How: It changes `ChatConfig.model`, `ChatConfig.fast_model`, and
`ChatConfig.claude_agent_fallback_model` to Sonnet 4.6 values, then
updates backend tests that assert the default Sonnet model strings.

### Changes 🏗️

- Switch `ChatConfig.model` from `anthropic/claude-sonnet-4` to
`anthropic/claude-sonnet-4-6`
- Switch `ChatConfig.fast_model` from `anthropic/claude-sonnet-4` to
`anthropic/claude-sonnet-4-6`
- Switch `ChatConfig.claude_agent_fallback_model` from
`claude-sonnet-4-20250514` to `claude-sonnet-4-6`
- Update backend Copilot tests that assert the default Sonnet model
strings
- Configuration changes:
  - No new environment variables or docker-compose changes are required
  - Existing `.env.default` and compose files remain compatible because this only changes backend default model values in code

### Checklist 📋

#### For code changes:
- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan:
  - [x] `poetry run format`
  - [x] `poetry run pytest backend/copilot/baseline/transcript_integration_test.py`
  - [x] `poetry run pytest backend/copilot/sdk/service_helpers_test.py`
  - [x] `poetry run pytest backend/copilot/sdk/service_test.py`
  - [x] `poetry run pytest backend/copilot/sdk/p0_guardrails_test.py`

<details>
  <summary>Example test plan</summary>
  
  - [ ] Create from scratch and execute an agent with at least 3 blocks
  - [ ] Import an agent from file upload, and confirm it executes correctly
  - [ ] Upload agent to marketplace
  - [ ] Import an agent from marketplace and confirm it executes correctly
  - [ ] Edit an agent from monitor, and confirm it executes correctly
</details>

#### For configuration changes:

- [x] `.env.default` is updated or already compatible with my changes
- [x] `docker-compose.yml` is updated or already compatible with my changes
- [x] I have included a list of my configuration changes in the PR description (under **Changes**)

<details>
  <summary>Examples of configuration changes</summary>

  - Changing ports
  - Adding new services that need to communicate with each other
  - Secrets or environment variable changes
  - New or infrastructure changes such as databases
</details>

<!-- CURSOR_SUMMARY -->
---

> [!NOTE]
> **Medium Risk**
> Changes default/fallback LLM model identifiers for Copilot requests, which can affect runtime behavior, cost, and availability characteristics across both baseline and SDK paths. Risk is mitigated by being a small, config-only change with updated tests.
>
> **Overview**
> Updates Copilot backend defaults so both the standard (`model`) and fast (`fast_model`) paths use `anthropic/claude-sonnet-4-6`, and aligns the Claude Agent SDK fallback model to `claude-sonnet-4-6`.
>
> Adjusts related test expectations in baseline transcript integration and SDK helper tests to match the new Sonnet 4.6 model strings.
>
> <sup>Reviewed by [Cursor Bugbot](https://cursor.com/bugbot) for commit 563361ac11. Bugbot is set up for automated code reviews on this repo. Configure [here](https://www.cursor.com/dashboard/bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
2026-04-15 16:53:30 +00:00
Zamil Majdy
df205b5444 fix(backend/copilot): strip CLI session file to prevent auto-compaction context loss
The Claude Code CLI auto-compacts its native session JSONL when the context
approaches the model's token limit (~200K for Sonnet).  After compaction the
detailed conversation history is replaced by a ~27K-token summary, causing
the silent context loss users see as memory failures in long sessions.

Root cause identified from production logs for session 93ecf7c9:
- T6 CLI session: 233KB / ~207K tokens (near Sonnet limit)
- T7 CLI compacted session -> ~167KB / ~47K tokens (PreCompact hook missed)
- T12 second compaction -> ~176KB / ~27K tokens (just system prompt + summary)
- T14-T21: cache_read=26714 constantly -- only system prompt visible to Claude

The same stripping we already apply to our transcript (stale thinking blocks,
progress/metadata entries) now also runs on the CLI native session file.  At
~2x the size of the stripped transcript, unstripped sessions routinely hit the
compaction threshold within 6-10 turns of a heavy Opus/thinking session.
After stripping:
- same-pod turns reuse the stripped local file (no compaction trigger)
- cross-pod turns restore the stripped GCS file (same benefit)
2026-04-15 23:19:12 +07:00
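The stripping pass this commit extends to the CLI session file can be illustrated as follows. This is a simplified reimplementation under stated assumptions: the real `strip_for_upload` handles more entry types and edge cases, and `strip_session` / `STRIP_TYPES` are illustrative names.

```python
import json

# Metadata entry types that are never needed for --resume.
STRIP_TYPES = {"progress", "file-history-snapshot", "queue-operation"}


def strip_session(jsonl: str) -> str:
    """Drop metadata entries and stale thinking blocks from a session JSONL."""
    entries = [json.loads(line) for line in jsonl.splitlines() if line.strip()]
    # The last assistant entry keeps its thinking blocks: --resume may need
    # them to continue the most recent turn.
    last_asst = max(
        (i for i, e in enumerate(entries) if e.get("type") == "assistant"),
        default=-1,
    )
    kept = []
    for i, e in enumerate(entries):
        if e.get("type") in STRIP_TYPES:
            continue  # metadata entries contribute nothing to the context
        if e.get("type") == "assistant" and i != last_asst:
            content = e.get("message", {}).get("content")
            if isinstance(content, list):
                # Thinking blocks in non-last assistant turns are stale and
                # can be tens of thousands of tokens each.
                e["message"]["content"] = [
                    b for b in content if b.get("type") != "thinking"
                ]
        kept.append(e)
    return "".join(json.dumps(e) + "\n" for e in kept)
```

Because both the local write-back and the GCS upload use the stripped bytes, same-pod and cross-pod turns alike start from a session well below the compaction threshold.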
majdyz
4efa1c4310 fix(copilot): set session_id on mode-switch T1 to enable --resume on subsequent turns
When a user switches from baseline (fast) mode to SDK (extended_thinking)
mode mid-session, the first SDK turn has `has_history=True` (prior baseline
messages in DB) but no CLI session file in storage.

The old code gated session_id on `not has_history`, so mode-switch T1
never received a session_id — the CLI generated a random ID that wasn't
uploaded under the expected key.  Every subsequent SDK turn would fail to
restore the CLI session and run without --resume, injecting the full
compressed history on each turn, causing model confusion.

Fix: set session_id whenever not using --resume (the `else` branch),
covering T1 fresh, mode-switch T1, and T2+ fallback turns.  The retry
path is updated to use `"session_id" in sdk_options_kwargs` as the
discriminator (instead of `not has_history`) so mode-switch T1 retries
also keep the session_id while T2+ retries (where T1 restored a session
file via restore_cli_session) still remove it to avoid "Session ID
already in use".
2026-04-15 23:19:11 +07:00
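The gating described above can be sketched like this. The function names and the minimal option dicts are illustrative stand-ins; the real code builds `sdk_options_kwargs` inside service.py.

```python
def build_options(session_id: str, cli_restored: bool) -> dict:
    """Choose between --resume and an explicit session_id for the initial call."""
    if cli_restored:
        # T2+ with a restored CLI session file: resume by UUID.
        return {"resume": session_id}
    # T1 fresh, mode-switch T1 (history in DB but no CLI session file), and
    # T2+ fallback turns all pin session_id so the CLI writes its session
    # file under the key we later upload to GCS.
    return {"session_id": session_id}


def build_retry_options(opts: dict, session_id: str) -> dict:
    """Retry discriminator: 'session_id' in opts, not 'not has_history'."""
    retry = dict(opts)
    retry.pop("resume", None)
    if "session_id" in opts:
        # Initial invocation never used --resume: keep the pinned session_id.
        retry["session_id"] = session_id
    else:
        # Initial invocation used --resume, which restored the session file
        # locally; re-using the ID now would hit "Session ID already in use".
        retry.pop("session_id", None)
    return retry
```

Keying the retry path off the presence of `session_id` rather than `has_history` is what lets mode-switch T1 retries keep their pinned ID while T2+ retries drop it.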
5 changed files with 581 additions and 82 deletions

View File

@@ -156,10 +156,9 @@ class ChatConfig(BaseSettings):
         "history compression. Falls back to compression when unavailable.",
     )
     claude_agent_fallback_model: str = Field(
-        default="claude-sonnet-4-20250514",
+        default="claude-sonnet-4-6",
         description="Fallback model when the primary model is unavailable (e.g. 529 "
-        "overloaded). The SDK automatically retries with this alternate model. "
-        "It must differ from the primary model.",
+        "overloaded). The SDK automatically retries with this cheaper model.",
     )
     claude_agent_max_turns: int = Field(
         default=50,

View File

@@ -96,39 +96,6 @@ class TestResolveFallbackModel:
         assert result is not None
         assert "sonnet" in result.lower() or "claude" in result.lower()
-
-    def test_distinct_helper_drops_same_model(self):
-        """CLI fallback is omitted when it matches the resolved primary model."""
-        cfg = _make_config(
-            model="anthropic/claude-sonnet-4-6",
-            claude_agent_fallback_model="claude-sonnet-4-6",
-            use_openrouter=False,
-        )
-        with patch(f"{_SVC}.config", cfg):
-            from backend.copilot.sdk.service import (
-                _resolve_distinct_fallback_model,
-                _resolve_sdk_model,
-            )
-
-            assert _resolve_distinct_fallback_model(_resolve_sdk_model()) is None
-
-    def test_distinct_helper_keeps_different_model(self):
-        """CLI fallback is preserved when it differs from the primary model."""
-        cfg = _make_config(
-            model="anthropic/claude-sonnet-4-6",
-            claude_agent_fallback_model="claude-sonnet-4-20250514",
-            use_openrouter=False,
-        )
-        with patch(f"{_SVC}.config", cfg):
-            from backend.copilot.sdk.service import (
-                _resolve_distinct_fallback_model,
-                _resolve_sdk_model,
-            )
-
-            assert (
-                _resolve_distinct_fallback_model(_resolve_sdk_model())
-                == "claude-sonnet-4-20250514"
-            )

 # ---------------------------------------------------------------------------
 # Security & isolation env vars

View File

@@ -95,6 +95,7 @@ from ..transcript import (
     cleanup_stale_project_dirs,
     compact_transcript,
     download_transcript,
+    maybe_compact_cli_session,
     read_compacted_entries,
     restore_cli_session,
     upload_cli_session,
@@ -686,21 +687,6 @@ def _resolve_fallback_model() -> str | None:
     return _normalize_model_name(raw)
-
-
-def _resolve_distinct_fallback_model(primary_model: str | None) -> str | None:
-    """Resolve a fallback model that does not collide with *primary_model*."""
-    fallback_model = _resolve_fallback_model()
-    if not fallback_model or not primary_model:
-        return fallback_model
-    if fallback_model == primary_model:
-        logger.warning(
-            "[SDK] Fallback model %s matches primary model %s; disabling fallback",
-            fallback_model,
-            primary_model,
-        )
-        return None
-    return fallback_model
async def _resolve_model_and_multiplier(
model: "CopilotLlmModel | None",
session_id: str,
@@ -2516,6 +2502,14 @@ async def stream_chat_completion_sdk(
             user_id, session_id, sdk_cwd, log_prefix=log_prefix
         )
         if cli_restored:
+            # Proactively compact the CLI session if it's large enough
+            # to risk triggering auto-compaction mid-turn. The CLI's
+            # silent auto-compact bypasses our PreCompact hook and
+            # loses context uncontrollably; compacting proactively here
+            # keeps the session well below the ~200K-token threshold.
+            await maybe_compact_cli_session(
+                sdk_cwd, session_id, config.model, log_prefix
+            )
             use_resume = True
             resume_file = session_id  # CLI --resume expects UUID, not file path
             transcript_msg_count = dl.message_count
@@ -2652,8 +2646,6 @@ async def stream_chat_completion_sdk(
         cross_user_cache=_cross_user,
     )
-
-    fallback_model = _resolve_distinct_fallback_model(sdk_model)
     sdk_options_kwargs: dict[str, Any] = {
         "system_prompt": system_prompt_value,
         "mcp_servers": {"copilot": mcp_server},
@@ -2663,6 +2655,10 @@ async def stream_chat_completion_sdk(
         "cwd": sdk_cwd,
         "max_buffer_size": config.claude_agent_max_buffer_size,
         "stderr": _on_stderr,
+        # --- P0 guardrails ---
+        # fallback_model: SDK auto-retries with this cheaper model on
+        # 529 (overloaded) errors, avoiding user-visible failures.
+        "fallback_model": _resolve_fallback_model(),
         # max_turns: hard cap on agentic tool-use loops per query to
         # prevent runaway execution from burning budget.
         "max_turns": config.claude_agent_max_turns,
@@ -2676,11 +2672,6 @@ async def stream_chat_completion_sdk(
         # native extended thinking), so it is safe to pass unconditionally.
         "max_thinking_tokens": config.claude_agent_max_thinking_tokens,
     }
-    if fallback_model:
-        # fallback_model: SDK auto-retries with this alternate model on
-        # 529 (overloaded) errors. Omit it entirely when it resolves to
-        # the same value as the primary model because the CLI rejects that.
-        sdk_options_kwargs["fallback_model"] = fallback_model
     # effort: only set for models with extended thinking (Opus).
     # Setting effort on Sonnet causes <internal_reasoning> tag leaks.
     if config.claude_agent_thinking_effort:
@@ -2933,9 +2924,10 @@ async def stream_chat_completion_sdk(
             sdk_options_kwargs_retry.pop("resume", None)
             sdk_options_kwargs_retry["session_id"] = session_id
         else:
-            # T2+ retry without --resume: do not pass --session-id.
-            # The T1 session file already exists at that path; re-using
-            # the same ID would fail with "Session ID already in use".
+            # T2+ retry without --resume: initial invocation used
+            # --resume, which restored the T1 session file to local
+            # storage. Re-using session_id without --resume would
+            # fail with "Session ID already in use".
             sdk_options_kwargs_retry.pop("resume", None)
             sdk_options_kwargs_retry.pop("session_id", None)
# Recompute system_prompt for retry — ctx.use_resume may have

View File

@@ -689,6 +689,15 @@ def _cli_session_storage_path_parts(
)
# Byte-size threshold for proactive pre-query CLI session compaction.
# Measured from production session 93ecf7c9: 233KB file → 204K tokens
# (≈0.88 tokens/byte). 120KB ≈ 105K tokens — roughly half the CLI's
# ~200K auto-compaction threshold. Compacting proactively here prevents
# the CLI from silently auto-compacting mid-turn, which bypasses our
# PreCompact hook and causes uncontrolled context loss.
_PROACTIVE_COMPACT_THRESHOLD_BYTES = 120_000
async def upload_cli_session(
user_id: str,
session_id: str,
@@ -716,7 +725,7 @@ async def upload_cli_session(
         return
     try:
-        content = Path(real_path).read_bytes()
+        raw_bytes = Path(real_path).read_bytes()
     except FileNotFoundError:
         logger.debug(
             "%s CLI session file not found, skipping upload: %s",
@@ -728,6 +737,32 @@ async def upload_cli_session(
logger.warning("%s Failed to read CLI session file: %s", log_prefix, e)
return
    # Strip stale thinking blocks and metadata entries (progress, file-history-snapshot,
    # queue-operation) from the CLI session before writing it back locally and uploading
    # to GCS. Thinking blocks from non-last assistant turns are not needed for --resume
    # but can be massive (tens of thousands of tokens each), causing the CLI to auto-compact
    # its session when the context window fills up. Stripping keeps the session well below
    # the ~200K-token compaction threshold and prevents silent context loss.
    try:
        raw_text = raw_bytes.decode("utf-8")
        stripped_text = strip_for_upload(raw_text)
        stripped_bytes = stripped_text.encode("utf-8")
        if len(stripped_bytes) < len(raw_bytes):
            # Write the stripped version back locally so same-pod turns also benefit.
            Path(real_path).write_bytes(stripped_bytes)
            logger.info(
                "%s Stripped CLI session file: %dB → %dB",
                log_prefix,
                len(raw_bytes),
                len(stripped_bytes),
            )
        content = stripped_bytes
    except Exception as e:
        logger.warning(
            "%s Failed to strip CLI session file, uploading raw: %s", log_prefix, e
        )
        content = raw_bytes
storage = await get_workspace_storage()
wid, fid, fname = _cli_session_storage_path_parts(user_id, session_id)
try:
@@ -767,17 +802,10 @@ async def restore_cli_session(
         )
         return False
-    # If the session file already exists locally (same-pod reuse), use it directly.
-    # Downloading from storage could overwrite a newer local version when a previous
-    # turn's upload failed: stored content is stale while the local file already
-    # contains extended history from that turn.
-    if Path(real_path).exists():
-        logger.debug(
-            "%s CLI session file already exists locally — using it for --resume",
-            log_prefix,
-        )
-        return True
+    # Always download from GCS, even if a local file exists.
+    # In a multi-pod load-balanced environment the local file may belong to a
+    # different (older) turn that ran on this same pod — using it would silently
+    # restore a stale context and hide the most-recent turn from the model.
     storage = await get_workspace_storage()
     path = _build_path_from_parts(
         _cli_session_storage_path_parts(user_id, session_id), storage
@@ -806,6 +834,103 @@ async def restore_cli_session(
return False
async def maybe_compact_cli_session(
    sdk_cwd: str,
    session_id: str,
    model: str,
    log_prefix: str = "[Transcript]",
) -> bool:
    """Proactively compact the CLI native session if it risks triggering auto-compaction.

    Called after ``restore_cli_session()`` succeeds and before the SDK turn starts.
    If the session file exceeds ``_PROACTIVE_COMPACT_THRESHOLD_BYTES``, runs LLM
    summarization via ``compact_transcript()`` and writes the smaller result back
    to disk so ``--resume`` uses the compacted context.

    This prevents the CLI from silently auto-compacting mid-turn — a path that
    bypasses our PreCompact hook and causes uncontrolled context loss for long
    sessions, including pure-Sonnet sessions where thinking-block stripping
    provides no relief.

    Returns ``True`` if compaction was performed and the file was updated.
    """
    session_file = _cli_session_path(sdk_cwd, session_id)
    real_path = os.path.realpath(session_file)
    projects_base = _projects_base()
    if not real_path.startswith(projects_base + os.sep):
        logger.warning(
            "%s CLI session path outside projects base, skipping proactive compaction",
            log_prefix,
        )
        return False

    try:
        raw_bytes = Path(real_path).read_bytes()
    except FileNotFoundError:
        logger.debug(
            "%s CLI session not found, skipping proactive compaction", log_prefix
        )
        return False
    except OSError as e:
        logger.warning(
            "%s Failed to read CLI session for compaction: %s", log_prefix, e
        )
        return False

    if len(raw_bytes) < _PROACTIVE_COMPACT_THRESHOLD_BYTES:
        logger.debug(
            "%s CLI session %dB < threshold %dB — no proactive compaction",
            log_prefix,
            len(raw_bytes),
            _PROACTIVE_COMPACT_THRESHOLD_BYTES,
        )
        return False

    logger.info(
        "%s CLI session %dB >= threshold %dB — running proactive compaction",
        log_prefix,
        len(raw_bytes),
        _PROACTIVE_COMPACT_THRESHOLD_BYTES,
    )
    try:
        content = raw_bytes.decode("utf-8")
    except UnicodeDecodeError as e:
        logger.warning(
            "%s CLI session is not valid UTF-8, skipping compaction: %s", log_prefix, e
        )
        return False

    compacted = await compact_transcript(content, model=model, log_prefix=log_prefix)
    if not compacted:
        logger.warning(
            "%s Proactive compaction failed or returned None — keeping original",
            log_prefix,
        )
        return False
    if compacted == content:
        logger.warning(
            "%s Proactive compaction returned identical content — keeping original",
            log_prefix,
        )
        return False

    compacted_bytes = compacted.encode("utf-8")
    try:
        Path(real_path).write_bytes(compacted_bytes)
        logger.info(
            "%s Proactively compacted CLI session: %dB → %dB",
            log_prefix,
            len(raw_bytes),
            len(compacted_bytes),
        )
        return True
    except OSError as e:
        logger.warning("%s Failed to write compacted CLI session: %s", log_prefix, e)
        return False
async def upload_transcript(
user_id: str,
session_id: str,

View File

@@ -918,6 +918,202 @@ class TestUploadCliSession:
mock_storage.store.assert_not_called()
def test_strips_session_before_upload_and_writes_back(self, tmp_path):
"""Strippable entries (progress, thinking blocks) are removed before upload.
The stripped content is written back to disk (so same-pod turns benefit)
and the smaller bytes are uploaded to GCS.
"""
import asyncio
import os
import re
from unittest.mock import AsyncMock, patch
from .transcript import _sanitize_id, upload_cli_session
projects_base = str(tmp_path)
session_id = "12345678-0000-0000-0000-000000000010"
sdk_cwd = str(tmp_path)
encoded_cwd = re.sub(r"[^a-zA-Z0-9]", "-", os.path.realpath(sdk_cwd))
session_dir = tmp_path / encoded_cwd
session_dir.mkdir(parents=True, exist_ok=True)
session_file = session_dir / f"{_sanitize_id(session_id)}.jsonl"
# A CLI session with a progress entry (strippable) and a real assistant message.
import json
progress_entry = {
"type": "progress",
"uuid": "p1",
"parentUuid": "u1",
"data": {"type": "bash_progress", "stdout": "running..."},
}
user_entry = {
"type": "user",
"uuid": "u1",
"message": {"role": "user", "content": "hello"},
}
asst_entry = {
"type": "assistant",
"uuid": "a1",
"parentUuid": "u1",
"message": {"role": "assistant", "content": "world"},
}
raw_content = (
json.dumps(progress_entry)
+ "\n"
+ json.dumps(user_entry)
+ "\n"
+ json.dumps(asst_entry)
+ "\n"
)
raw_bytes = raw_content.encode("utf-8")
session_file.write_bytes(raw_bytes)
mock_storage = AsyncMock()
with (
patch(
"backend.copilot.transcript._projects_base",
return_value=projects_base,
),
patch(
"backend.copilot.transcript.get_workspace_storage",
new_callable=AsyncMock,
return_value=mock_storage,
),
):
asyncio.run(
upload_cli_session(
user_id="user-1",
session_id=session_id,
sdk_cwd=sdk_cwd,
)
)
# Upload should have been called with stripped bytes (no progress entry).
mock_storage.store.assert_called_once()
stored_content: bytes = mock_storage.store.call_args.kwargs["content"]
stored_lines = stored_content.decode("utf-8").strip().split("\n")
stored_types = [json.loads(line).get("type") for line in stored_lines]
assert "progress" not in stored_types
assert "user" in stored_types
assert "assistant" in stored_types
# Stripped bytes should be smaller than raw.
assert len(stored_content) < len(raw_bytes)
# File on disk should also be the stripped version.
disk_content = session_file.read_bytes()
assert disk_content == stored_content
def test_strips_stale_thinking_blocks_before_upload(self, tmp_path):
"""Thinking blocks in non-last assistant turns are stripped to reduce size."""
import asyncio
import json
import os
import re
from unittest.mock import AsyncMock, patch
from .transcript import _sanitize_id, upload_cli_session
projects_base = str(tmp_path)
session_id = "12345678-0000-0000-0000-000000000011"
sdk_cwd = str(tmp_path)
encoded_cwd = re.sub(r"[^a-zA-Z0-9]", "-", os.path.realpath(sdk_cwd))
session_dir = tmp_path / encoded_cwd
session_dir.mkdir(parents=True, exist_ok=True)
session_file = session_dir / f"{_sanitize_id(session_id)}.jsonl"
# Two turns: first assistant has thinking block (stale), second doesn't.
u1 = {
"type": "user",
"uuid": "u1",
"message": {"role": "user", "content": "q1"},
}
a1_with_thinking = {
"type": "assistant",
"uuid": "a1",
"parentUuid": "u1",
"message": {
"id": "msg_a1",
"role": "assistant",
"content": [
{"type": "thinking", "thinking": "A" * 5000},
{"type": "text", "text": "answer1"},
],
},
}
u2 = {
"type": "user",
"uuid": "u2",
"parentUuid": "a1",
"message": {"role": "user", "content": "q2"},
}
a2_no_thinking = {
"type": "assistant",
"uuid": "a2",
"parentUuid": "u2",
"message": {
"id": "msg_a2",
"role": "assistant",
"content": [{"type": "text", "text": "answer2"}],
},
}
raw_content = (
json.dumps(u1)
+ "\n"
+ json.dumps(a1_with_thinking)
+ "\n"
+ json.dumps(u2)
+ "\n"
+ json.dumps(a2_no_thinking)
+ "\n"
)
raw_bytes = raw_content.encode("utf-8")
session_file.write_bytes(raw_bytes)
mock_storage = AsyncMock()
with (
patch(
"backend.copilot.transcript._projects_base",
return_value=projects_base,
),
patch(
"backend.copilot.transcript.get_workspace_storage",
new_callable=AsyncMock,
return_value=mock_storage,
),
):
asyncio.run(
upload_cli_session(
user_id="user-1",
session_id=session_id,
sdk_cwd=sdk_cwd,
)
)
stored_content: bytes = mock_storage.store.call_args.kwargs["content"]
stored_lines = stored_content.decode("utf-8").strip().split("\n")
# a1 should have its thinking block stripped (it's not the last assistant turn).
a1_stored = json.loads(stored_lines[1])
a1_content = a1_stored["message"]["content"]
assert all(
b["type"] != "thinking" for b in a1_content
), "stale thinking block should be stripped from a1"
assert any(
b["type"] == "text" for b in a1_content
), "text block should be kept in a1"
# a2 (last turn) should be unchanged.
a2_stored = json.loads(stored_lines[3])
assert a2_stored["message"]["content"] == [{"type": "text", "text": "answer2"}]
# Stripped bytes smaller than raw.
assert len(stored_content) < len(raw_bytes)
class TestRestoreCliSession:
def test_returns_false_when_file_not_found_in_storage(self):
@@ -981,8 +1177,14 @@ class TestRestoreCliSession:
         assert result is False

-    def test_returns_true_when_local_file_already_exists(self, tmp_path):
-        """Same-pod reuse: if local file exists, skip storage download and return True."""
+    def test_gcs_overwrites_stale_local_file(self, tmp_path):
+        """Cross-pod staleness fix: GCS content always overwrites any local file.
+
+        Previously a same-pod optimisation returned early when a local file existed,
+        which caused Pod A to silently reuse a stale file from an earlier turn while
+        the canonical (newer) session lived in GCS — hiding the most-recent turn from
+        the model. The fix removes that early-return so GCS is always consulted.
+        """
         import asyncio
         import os
         import re
@@ -994,15 +1196,18 @@ class TestRestoreCliSession:
session_id = "12345678-0000-0000-0000-000000000099"
sdk_cwd = str(tmp_path)
-        # Pre-create the local session file (simulates previous turn on same pod)
+        # Pre-create a STALE local session file (simulates a previous turn on this pod)
         projects_base = os.path.realpath(str(tmp_path))
         encoded_cwd = re.sub(r"[^a-zA-Z0-9]", "-", projects_base)
         session_dir = Path(projects_base) / encoded_cwd
         session_dir.mkdir(parents=True, exist_ok=True)
-        existing_content = b'{"type":"user"}\n{"type":"assistant"}\n'
-        (session_dir / f"{session_id}.jsonl").write_bytes(existing_content)
+        stale_content = b'{"type":"user"}\n{"type":"assistant","stale":true}\n'
+        (session_dir / f"{session_id}.jsonl").write_bytes(stale_content)
+
+        # GCS has the FRESH content from the turn that ran on a different pod
+        fresh_gcs_content = b'{"type":"user"}\n{"type":"assistant","fresh":true}\n'
         mock_storage = AsyncMock()
+        mock_storage.retrieve.return_value = fresh_gcs_content
         with (
             patch(
@@ -1024,10 +1229,12 @@ class TestRestoreCliSession:
)
         assert result is True
-        # Storage should NOT have been accessed (local file was used as-is)
-        mock_storage.retrieve.assert_not_called()
-        # Local file should be unchanged
-        assert (session_dir / f"{session_id}.jsonl").read_bytes() == existing_content
+        # GCS MUST have been consulted — the local file must not be trusted blindly
+        mock_storage.retrieve.assert_called_once()
+        # Local file must now contain the fresh GCS content, not the stale version
+        written = (session_dir / f"{session_id}.jsonl").read_bytes()
+        assert written == fresh_gcs_content
+        assert written != stale_content
def test_returns_true_on_success(self, tmp_path):
"""Happy path: storage has the session → file written → returns True."""
@@ -1089,3 +1296,212 @@ class TestRestoreCliSession:
)
assert result is False
class TestMaybeCompactCliSession:
def _make_session_file(self, tmp_path, session_id: str, sdk_cwd: str, content: str):
import os
import re
from .transcript import _sanitize_id
encoded_cwd = re.sub(r"[^a-zA-Z0-9]", "-", os.path.realpath(sdk_cwd))
session_dir = tmp_path / encoded_cwd
session_dir.mkdir(parents=True, exist_ok=True)
session_file = session_dir / f"{_sanitize_id(session_id)}.jsonl"
session_file.write_text(content, encoding="utf-8")
return session_file
def test_skips_small_file(self, tmp_path):
"""Files below the threshold are not compacted."""
import asyncio
from unittest.mock import patch
from .transcript import maybe_compact_cli_session
session_id = "12345678-0000-0000-0000-000000000020"
sdk_cwd = str(tmp_path)
small_content = (
'{"type":"user","uuid":"u1","message":{"role":"user","content":"hi"}}\n'
)
session_file = self._make_session_file(
tmp_path, session_id, sdk_cwd, small_content
)
with patch(
"backend.copilot.transcript._projects_base",
return_value=str(tmp_path),
):
result = asyncio.run(
maybe_compact_cli_session(
sdk_cwd=sdk_cwd,
session_id=session_id,
model="claude-sonnet-4",
)
)
assert result is False
# File should be untouched.
assert session_file.read_text(encoding="utf-8") == small_content
def test_compacts_large_file_and_writes_back(self, tmp_path):
"""Files above the threshold are compacted and the result written to disk."""
import asyncio
from unittest.mock import AsyncMock, patch
from .transcript import (
_PROACTIVE_COMPACT_THRESHOLD_BYTES,
maybe_compact_cli_session,
)
session_id = "12345678-0000-0000-0000-000000000021"
sdk_cwd = str(tmp_path)
# Build a valid JSONL session large enough to exceed the threshold.
u1 = (
'{"type":"user","uuid":"u1","parentUuid":"","message":{"role":"user","content":"'
+ ("x" * 1000)
+ '"}}'
)
a1 = (
'{"type":"assistant","uuid":"a1","parentUuid":"u1","message":{"id":"msg_a1","role":"assistant","model":"","type":"message","content":[{"type":"text","text":"'
+ ("y" * 1000)
+ '"}],"stop_reason":"end_turn","stop_sequence":null}}'
)
single_pair = u1 + "\n" + a1 + "\n"
repeat = (_PROACTIVE_COMPACT_THRESHOLD_BYTES // len(single_pair.encode())) + 2
large_content = single_pair * repeat
session_file = self._make_session_file(
tmp_path, session_id, sdk_cwd, large_content
)
compacted_content = (
'{"type":"user","uuid":"c1","parentUuid":"","message":{"role":"user","content":"summary"}}\n'
'{"type":"assistant","uuid":"c2","parentUuid":"c1","message":{"id":"msg_c2","role":"assistant","model":"","type":"message","content":[{"type":"text","text":"compacted"}],"stop_reason":"end_turn","stop_sequence":null}}\n'
)
with (
patch(
"backend.copilot.transcript._projects_base",
return_value=str(tmp_path),
),
patch(
"backend.copilot.transcript.compact_transcript",
new_callable=AsyncMock,
return_value=compacted_content,
),
):
result = asyncio.run(
maybe_compact_cli_session(
sdk_cwd=sdk_cwd,
session_id=session_id,
model="claude-sonnet-4",
)
)
assert result is True
# File on disk should now contain the compacted content.
assert session_file.read_text(encoding="utf-8") == compacted_content
def test_keeps_original_when_compaction_fails(self, tmp_path):
"""If compact_transcript returns None, the original file is left intact."""
import asyncio
from unittest.mock import AsyncMock, patch
from .transcript import (
_PROACTIVE_COMPACT_THRESHOLD_BYTES,
maybe_compact_cli_session,
)
session_id = "12345678-0000-0000-0000-000000000022"
sdk_cwd = str(tmp_path)
single_pair = (
'{"type":"user","uuid":"u1","parentUuid":"","message":{"role":"user","content":"'
+ ("x" * 1000)
+ '"}}\n'
'{"type":"assistant","uuid":"a1","parentUuid":"u1","message":{"id":"msg_a1","role":"assistant","model":"","type":"message","content":[{"type":"text","text":"'
+ ("y" * 1000)
+ '"}],"stop_reason":"end_turn","stop_sequence":null}}\n'
)
repeat = (_PROACTIVE_COMPACT_THRESHOLD_BYTES // len(single_pair.encode())) + 2
large_content = single_pair * repeat
session_file = self._make_session_file(
tmp_path, session_id, sdk_cwd, large_content
)
with (
patch(
"backend.copilot.transcript._projects_base",
return_value=str(tmp_path),
),
patch(
"backend.copilot.transcript.compact_transcript",
new_callable=AsyncMock,
return_value=None,
),
):
result = asyncio.run(
maybe_compact_cli_session(
sdk_cwd=sdk_cwd,
session_id=session_id,
model="claude-sonnet-4",
)
)
assert result is False
# File should be unchanged.
assert session_file.read_text(encoding="utf-8") == large_content
def test_keeps_original_when_compaction_returns_identical_content(self, tmp_path):
"""If compact_transcript returns the same string as the input, the file is left intact."""
import asyncio
from unittest.mock import AsyncMock, patch
from .transcript import (
_PROACTIVE_COMPACT_THRESHOLD_BYTES,
maybe_compact_cli_session,
)
session_id = "12345678-0000-0000-0000-000000000023"
sdk_cwd = str(tmp_path)
single_pair = (
'{"type":"user","uuid":"u1","parentUuid":"","message":{"role":"user","content":"'
+ ("x" * 1000)
+ '"}}\n'
'{"type":"assistant","uuid":"a1","parentUuid":"u1","message":{"id":"msg_a1","role":"assistant","model":"","type":"message","content":[{"type":"text","text":"'
+ ("y" * 1000)
+ '"}],"stop_reason":"end_turn","stop_sequence":null}}\n'
)
repeat = (_PROACTIVE_COMPACT_THRESHOLD_BYTES // len(single_pair.encode())) + 2
large_content = single_pair * repeat
session_file = self._make_session_file(
tmp_path, session_id, sdk_cwd, large_content
)
with (
patch(
"backend.copilot.transcript._projects_base",
return_value=str(tmp_path),
),
patch(
"backend.copilot.transcript.compact_transcript",
new_callable=AsyncMock,
return_value=large_content,
),
):
result = asyncio.run(
maybe_compact_cli_session(
sdk_cwd=sdk_cwd,
session_id=session_id,
model="claude-sonnet-4",
)
)
assert result is False
# File should be unchanged.
assert session_file.read_text(encoding="utf-8") == large_content