fix(backend/copilot): strip CLI session file to prevent auto-compaction context loss

The Claude Code CLI auto-compacts its native session JSONL when the context
approaches the model's token limit (~200K for Sonnet).  After compaction the
detailed conversation history is replaced by a ~27K-token summary, causing
the silent context loss users see as memory failures in long sessions.

Root cause identified from production logs for session 93ecf7c9:
- T6 CLI session: 233KB / ~207K tokens (near Sonnet limit)
- T7 CLI compacted session -> ~167KB / ~47K tokens (PreCompact hook missed)
- T12 second compaction -> ~176KB / ~27K tokens (just system prompt + summary)
- T14-T21: cache_read=26714 constantly -- only system prompt visible to Claude

The same stripping we already apply to our transcript (stale thinking blocks,
progress/metadata entries) now also runs on the CLI native session file.
Because an unstripped session is roughly 2x the size of the stripped transcript,
it routinely hits the compaction threshold within 6-10 turns of a heavy
Opus/thinking session.
After stripping:
- same-pod turns reuse the stripped local file (no compaction trigger)
- cross-pod turns restore the stripped GCS file (same benefit)
This commit is contained in:
Zamil Majdy
2026-04-15 23:18:59 +07:00
parent 4efa1c4310
commit df205b5444
2 changed files with 223 additions and 1 deletions

View File

@@ -716,7 +716,7 @@ async def upload_cli_session(
return
try:
content = Path(real_path).read_bytes()
raw_bytes = Path(real_path).read_bytes()
except FileNotFoundError:
logger.debug(
"%s CLI session file not found, skipping upload: %s",
@@ -728,6 +728,32 @@ async def upload_cli_session(
logger.warning("%s Failed to read CLI session file: %s", log_prefix, e)
return
# Strip stale thinking blocks and metadata entries (progress, file-history-snapshot,
# queue-operation) from the CLI session before writing it back locally and uploading
# to GCS. Thinking blocks from non-last assistant turns are not needed for --resume
# but can be massive (tens of thousands of tokens each), causing the CLI to auto-compact
# its session when the context window fills up. Stripping keeps the session well below
# the ~200K-token compaction threshold and prevents silent context loss.
try:
raw_text = raw_bytes.decode("utf-8")
stripped_text = strip_for_upload(raw_text)
stripped_bytes = stripped_text.encode("utf-8")
if len(stripped_bytes) < len(raw_bytes):
# Write the stripped version back locally so same-pod turns also benefit.
Path(real_path).write_bytes(stripped_bytes)
logger.info(
"%s Stripped CLI session file: %dB → %dB",
log_prefix,
len(raw_bytes),
len(stripped_bytes),
)
content = stripped_bytes
except Exception as e:
logger.warning(
"%s Failed to strip CLI session file, uploading raw: %s", log_prefix, e
)
content = raw_bytes
storage = await get_workspace_storage()
wid, fid, fname = _cli_session_storage_path_parts(user_id, session_id)
try:

View File

@@ -918,6 +918,202 @@ class TestUploadCliSession:
mock_storage.store.assert_not_called()
def test_strips_session_before_upload_and_writes_back(self, tmp_path):
    """Strippable entries (progress, thinking blocks) are removed before upload.

    The stripped content is written back to disk (so same-pod turns benefit)
    and the smaller bytes are uploaded to GCS.
    """
    import asyncio
    import json
    import os
    import re
    from unittest.mock import AsyncMock, patch

    from .transcript import _sanitize_id, upload_cli_session

    session_id = "12345678-0000-0000-0000-000000000010"
    sdk_cwd = str(tmp_path)
    projects_base = str(tmp_path)

    # The CLI encodes the session cwd into the directory name by replacing
    # every non-alphanumeric character with "-"; mirror that here.
    encoded_cwd = re.sub(r"[^a-zA-Z0-9]", "-", os.path.realpath(sdk_cwd))
    session_dir = tmp_path / encoded_cwd
    session_dir.mkdir(parents=True, exist_ok=True)
    session_file = session_dir / f"{_sanitize_id(session_id)}.jsonl"

    # One strippable progress entry followed by a real user/assistant
    # exchange that must survive stripping.
    entries = [
        {
            "type": "progress",
            "uuid": "p1",
            "parentUuid": "u1",
            "data": {"type": "bash_progress", "stdout": "running..."},
        },
        {
            "type": "user",
            "uuid": "u1",
            "message": {"role": "user", "content": "hello"},
        },
        {
            "type": "assistant",
            "uuid": "a1",
            "parentUuid": "u1",
            "message": {"role": "assistant", "content": "world"},
        },
    ]
    raw_bytes = "".join(json.dumps(e) + "\n" for e in entries).encode("utf-8")
    session_file.write_bytes(raw_bytes)

    mock_storage = AsyncMock()
    with (
        patch(
            "backend.copilot.transcript._projects_base",
            return_value=projects_base,
        ),
        patch(
            "backend.copilot.transcript.get_workspace_storage",
            new_callable=AsyncMock,
            return_value=mock_storage,
        ),
    ):
        asyncio.run(
            upload_cli_session(
                user_id="user-1",
                session_id=session_id,
                sdk_cwd=sdk_cwd,
            )
        )

    # The uploaded payload must no longer contain the progress entry,
    # while the real conversation entries remain.
    mock_storage.store.assert_called_once()
    stored_content: bytes = mock_storage.store.call_args.kwargs["content"]
    types_seen = [
        json.loads(line).get("type")
        for line in stored_content.decode("utf-8").strip().split("\n")
    ]
    assert "progress" not in types_seen
    assert "user" in types_seen
    assert "assistant" in types_seen
    # Stripping must actually shrink the payload.
    assert len(stored_content) < len(raw_bytes)
    # And the identical stripped bytes must have been written back to disk.
    assert session_file.read_bytes() == stored_content
def test_strips_stale_thinking_blocks_before_upload(self, tmp_path):
    """Thinking blocks in non-last assistant turns are stripped to reduce size."""
    import asyncio
    import json
    import os
    import re
    from unittest.mock import AsyncMock, patch

    from .transcript import _sanitize_id, upload_cli_session

    session_id = "12345678-0000-0000-0000-000000000011"
    sdk_cwd = str(tmp_path)
    projects_base = str(tmp_path)

    # Mirror the CLI's cwd-to-directory-name encoding.
    encoded_cwd = re.sub(r"[^a-zA-Z0-9]", "-", os.path.realpath(sdk_cwd))
    session_dir = tmp_path / encoded_cwd
    session_dir.mkdir(parents=True, exist_ok=True)
    session_file = session_dir / f"{_sanitize_id(session_id)}.jsonl"

    # Two turns: the first assistant reply carries a large (stale) thinking
    # block; the second — the last turn — has none and must pass through
    # untouched.
    turns = [
        {
            "type": "user",
            "uuid": "u1",
            "message": {"role": "user", "content": "q1"},
        },
        {
            "type": "assistant",
            "uuid": "a1",
            "parentUuid": "u1",
            "message": {
                "id": "msg_a1",
                "role": "assistant",
                "content": [
                    {"type": "thinking", "thinking": "A" * 5000},
                    {"type": "text", "text": "answer1"},
                ],
            },
        },
        {
            "type": "user",
            "uuid": "u2",
            "parentUuid": "a1",
            "message": {"role": "user", "content": "q2"},
        },
        {
            "type": "assistant",
            "uuid": "a2",
            "parentUuid": "u2",
            "message": {
                "id": "msg_a2",
                "role": "assistant",
                "content": [{"type": "text", "text": "answer2"}],
            },
        },
    ]
    raw_bytes = "".join(json.dumps(t) + "\n" for t in turns).encode("utf-8")
    session_file.write_bytes(raw_bytes)

    mock_storage = AsyncMock()
    with (
        patch(
            "backend.copilot.transcript._projects_base",
            return_value=projects_base,
        ),
        patch(
            "backend.copilot.transcript.get_workspace_storage",
            new_callable=AsyncMock,
            return_value=mock_storage,
        ),
    ):
        asyncio.run(
            upload_cli_session(
                user_id="user-1",
                session_id=session_id,
                sdk_cwd=sdk_cwd,
            )
        )

    stored_content: bytes = mock_storage.store.call_args.kwargs["content"]
    stored_lines = stored_content.decode("utf-8").strip().split("\n")

    # a1 is not the last assistant turn: it loses its thinking block but
    # keeps the text block.
    a1_content = json.loads(stored_lines[1])["message"]["content"]
    assert all(
        block["type"] != "thinking" for block in a1_content
    ), "stale thinking block should be stripped from a1"
    assert any(
        block["type"] == "text" for block in a1_content
    ), "text block should be kept in a1"

    # a2 is the last assistant turn and must be unchanged.
    a2_stored = json.loads(stored_lines[3])
    assert a2_stored["message"]["content"] == [{"type": "text", "text": "answer2"}]

    # Overall payload shrinks.
    assert len(stored_content) < len(raw_bytes)
class TestRestoreCliSession:
def test_returns_false_when_file_not_found_in_storage(self):