fix(backend/copilot): strip CLI session file to prevent auto-compaction context loss

The Claude Code CLI auto-compacts its native session JSONL when the context approaches the model's token limit (~200K for Sonnet). After compaction, the detailed conversation history is replaced by a ~27K-token summary, causing the silent context loss that users see as memory failures in long sessions.

Root cause identified from production logs for session 93ecf7c9:
- T6: CLI session at 233KB / ~207K tokens (near the Sonnet limit)
- T7: CLI compacted the session -> ~167KB / ~47K tokens (PreCompact hook missed)
- T12: second compaction -> ~176KB / ~27K tokens (just system prompt + summary)
- T14-T21: cache_read=26714 on every turn -- only system prompt visible to Claude

The same stripping we already apply to our transcript (stale thinking blocks, progress/metadata entries) now also runs on the CLI native session file. At ~2x the size of the stripped transcript, unstripped sessions routinely hit the compaction threshold within 6-10 turns of a heavy Opus/thinking session.

After stripping:
- same-pod turns reuse the stripped local file (no compaction trigger)
- cross-pod turns restore the stripped GCS file (same benefit)
2026-04-30 03:00:41 -04:00 · 2026-04-15 23:18:59 +07:00
parent 4efa1c4310
commit df205b5444
2 changed files with 223 additions and 1 deletions
--- a/autogpt_platform/backend/backend/copilot/transcript.py
+++ b/autogpt_platform/backend/backend/copilot/transcript.py
@@ -716,7 +716,7 @@ async def upload_cli_session(
        return
    try:
-        content = Path(real_path).read_bytes()
+        raw_bytes = Path(real_path).read_bytes()
    except FileNotFoundError:
        logger.debug(
            "%s CLI session file not found, skipping upload: %s",
@@ -728,6 +728,32 @@ async def upload_cli_session(
        logger.warning("%s Failed to read CLI session file: %s", log_prefix, e)
        return
    # Strip stale thinking blocks and metadata entries (progress, file-history-snapshot,
    # queue-operation) from the CLI session before writing it back locally and uploading
    # to GCS.  Thinking blocks from non-last assistant turns are not needed for --resume
    # but can be massive (tens of thousands of tokens each), causing the CLI to auto-compact
    # its session when the context window fills up.  Stripping keeps the session well below
    # the ~200K-token compaction threshold and prevents silent context loss.
    try:
        raw_text = raw_bytes.decode("utf-8")
        stripped_text = strip_for_upload(raw_text)
        stripped_bytes = stripped_text.encode("utf-8")
        if len(stripped_bytes) < len(raw_bytes):
            # Write the stripped version back locally so same-pod turns also benefit.
            Path(real_path).write_bytes(stripped_bytes)
            logger.info(
                "%s Stripped CLI session file: %dB → %dB",
                log_prefix,
                len(raw_bytes),
                len(stripped_bytes),
            )
        content = stripped_bytes
    except Exception as e:
        logger.warning(
            "%s Failed to strip CLI session file, uploading raw: %s", log_prefix, e
        )
        content = raw_bytes
    storage = await get_workspace_storage()
    wid, fid, fname = _cli_session_storage_path_parts(user_id, session_id)
    try:
--- a/autogpt_platform/backend/backend/copilot/transcript_test.py
+++ b/autogpt_platform/backend/backend/copilot/transcript_test.py
@@ -918,6 +918,202 @@ class TestUploadCliSession:
        mock_storage.store.assert_not_called()
    def test_strips_session_before_upload_and_writes_back(self, tmp_path):
        """Upload drops strippable entries and rewrites the local session file.

        A session file holding one ``progress`` entry followed by a
        user/assistant pair is run through ``upload_cli_session``.  The bytes
        handed to storage must omit the ``progress`` entry, be smaller than
        the original file, and be identical to what is left on disk.
        """
        import asyncio
        import json
        import os
        import re
        from unittest.mock import AsyncMock, patch

        from .transcript import _sanitize_id, upload_cli_session

        session_id = "12345678-0000-0000-0000-000000000010"
        sdk_cwd = str(tmp_path)
        projects_base = str(tmp_path)

        # The session file is placed at
        # <projects_base>/<real cwd with non-alphanumerics turned into "-">/<id>.jsonl
        encoded_cwd = re.sub(r"[^a-zA-Z0-9]", "-", os.path.realpath(sdk_cwd))
        session_dir = tmp_path / encoded_cwd
        session_dir.mkdir(parents=True, exist_ok=True)
        session_file = session_dir / f"{_sanitize_id(session_id)}.jsonl"

        # One strippable progress entry, then a genuine user/assistant exchange.
        entries = [
            {
                "type": "progress",
                "uuid": "p1",
                "parentUuid": "u1",
                "data": {"type": "bash_progress", "stdout": "running..."},
            },
            {
                "type": "user",
                "uuid": "u1",
                "message": {"role": "user", "content": "hello"},
            },
            {
                "type": "assistant",
                "uuid": "a1",
                "parentUuid": "u1",
                "message": {"role": "assistant", "content": "world"},
            },
        ]
        original_bytes = "".join(
            json.dumps(entry) + "\n" for entry in entries
        ).encode("utf-8")
        session_file.write_bytes(original_bytes)

        storage_mock = AsyncMock()
        base_patcher = patch(
            "backend.copilot.transcript._projects_base",
            return_value=projects_base,
        )
        storage_patcher = patch(
            "backend.copilot.transcript.get_workspace_storage",
            new_callable=AsyncMock,
            return_value=storage_mock,
        )
        with base_patcher, storage_patcher:
            asyncio.run(
                upload_cli_session(
                    user_id="user-1",
                    session_id=session_id,
                    sdk_cwd=sdk_cwd,
                )
            )

        # Exactly one upload, carrying the stripped payload (no progress entry).
        storage_mock.store.assert_called_once()
        uploaded: bytes = storage_mock.store.call_args.kwargs["content"]
        uploaded_types = [
            json.loads(line).get("type")
            for line in uploaded.decode("utf-8").strip().split("\n")
        ]
        assert "progress" not in uploaded_types
        assert "user" in uploaded_types
        assert "assistant" in uploaded_types

        # The uploaded payload must be smaller than the raw session.
        assert len(uploaded) < len(original_bytes)

        # The local file must have been overwritten with the same stripped bytes.
        assert session_file.read_bytes() == uploaded
    def test_strips_stale_thinking_blocks_before_upload(self, tmp_path):
        """Thinking blocks in non-last assistant turns are stripped to reduce size.

        Writes a two-turn session whose first assistant entry carries a large
        ``thinking`` block, runs ``upload_cli_session`` against it, and checks
        that the uploaded payload keeps the first turn's text block but not
        its thinking block, leaves the last assistant entry's content
        unchanged, and is smaller than the raw session.
        """
        import asyncio
        import json
        import os
        import re
        from unittest.mock import AsyncMock, patch

        from .transcript import _sanitize_id, upload_cli_session

        projects_base = str(tmp_path)
        session_id = "12345678-0000-0000-0000-000000000011"
        sdk_cwd = str(tmp_path)
        # Session directory name: the real cwd with every non-alphanumeric
        # character replaced by "-".
        encoded_cwd = re.sub(r"[^a-zA-Z0-9]", "-", os.path.realpath(sdk_cwd))
        session_dir = tmp_path / encoded_cwd
        session_dir.mkdir(parents=True, exist_ok=True)
        session_file = session_dir / f"{_sanitize_id(session_id)}.jsonl"

        # Two turns: first assistant has thinking block (stale), second doesn't.
        u1 = {
            "type": "user",
            "uuid": "u1",
            "message": {"role": "user", "content": "q1"},
        }
        a1_with_thinking = {
            "type": "assistant",
            "uuid": "a1",
            "parentUuid": "u1",
            "message": {
                "id": "msg_a1",
                "role": "assistant",
                "content": [
                    {"type": "thinking", "thinking": "A" * 5000},
                    {"type": "text", "text": "answer1"},
                ],
            },
        }
        u2 = {
            "type": "user",
            "uuid": "u2",
            "parentUuid": "a1",
            "message": {"role": "user", "content": "q2"},
        }
        a2_no_thinking = {
            "type": "assistant",
            "uuid": "a2",
            "parentUuid": "u2",
            "message": {
                "id": "msg_a2",
                "role": "assistant",
                "content": [{"type": "text", "text": "answer2"}],
            },
        }
        raw_content = (
            json.dumps(u1)
            + "\n"
            + json.dumps(a1_with_thinking)
            + "\n"
            + json.dumps(u2)
            + "\n"
            + json.dumps(a2_no_thinking)
            + "\n"
        )
        raw_bytes = raw_content.encode("utf-8")
        session_file.write_bytes(raw_bytes)

        mock_storage = AsyncMock()
        with (
            patch(
                "backend.copilot.transcript._projects_base",
                return_value=projects_base,
            ),
            patch(
                "backend.copilot.transcript.get_workspace_storage",
                new_callable=AsyncMock,
                return_value=mock_storage,
            ),
        ):
            asyncio.run(
                upload_cli_session(
                    user_id="user-1",
                    session_id=session_id,
                    sdk_cwd=sdk_cwd,
                )
            )

        # Fail with a clear assertion if nothing was uploaded; otherwise
        # ``call_args`` would be None and the lookup below would raise an
        # opaque AttributeError.
        mock_storage.store.assert_called_once()
        stored_content: bytes = mock_storage.store.call_args.kwargs["content"]
        stored_lines = stored_content.decode("utf-8").strip().split("\n")

        # a1 should have its thinking block stripped (it's not the last assistant turn).
        a1_stored = json.loads(stored_lines[1])
        a1_content = a1_stored["message"]["content"]
        assert all(
            b["type"] != "thinking" for b in a1_content
        ), "stale thinking block should be stripped from a1"
        assert any(
            b["type"] == "text" for b in a1_content
        ), "text block should be kept in a1"

        # a2 (last turn) should be unchanged.
        a2_stored = json.loads(stored_lines[3])
        assert a2_stored["message"]["content"] == [{"type": "text", "text": "answer2"}]

        # Stripped bytes smaller than raw.
        assert len(stored_content) < len(raw_bytes)
 class TestRestoreCliSession:
    def test_returns_false_when_file_not_found_in_storage(self):