fix(backend/copilot): split broad except in _read_cli_session_from_disk, clean up dataclass field comment

- Split `except Exception` into `UnicodeDecodeError` (returns raw) + nested
  `OSError` on write-back (returns stripped for GCS despite local failure) +
  fallback `Exception` — eliminates path leak via OSError str() and clarifies
  each fallback path's intent
- Move write-back into its own nested try/except so pyright can verify
  `stripped_bytes` is always bound before use
- Simplify `TranscriptDownload.mode` default: use conventional comment on
  separate line instead of parenthesized expression
- Add `TestReadCliSessionFromDisk` tests covering both new exception branches
This commit is contained in:
Zamil Majdy
2026-04-16 06:58:04 +07:00
parent cbf71fddb2
commit 9415166ee0
3 changed files with 109 additions and 11 deletions

View File

@@ -939,8 +939,18 @@ def _read_cli_session_from_disk(
raw_text = raw_bytes.decode("utf-8")
stripped_text = strip_for_upload(raw_text)
stripped_bytes = stripped_text.encode("utf-8")
if len(stripped_bytes) < len(raw_bytes):
# Write back locally so same-pod turns also benefit.
except UnicodeDecodeError:
logger.warning("%s CLI session is not valid UTF-8, uploading raw", log_prefix)
return raw_bytes
except Exception as e:
logger.warning(
"%s Failed to strip CLI session, uploading raw: %s", log_prefix, e
)
return raw_bytes
if len(stripped_bytes) < len(raw_bytes):
# Write back locally so same-pod turns also benefit.
try:
Path(real_path).write_bytes(stripped_bytes)
logger.info(
"%s Stripped CLI session: %dB → %dB",
@@ -948,12 +958,15 @@ def _read_cli_session_from_disk(
len(raw_bytes),
len(stripped_bytes),
)
return stripped_bytes
except Exception as e:
logger.warning(
"%s Failed to strip CLI session, uploading raw: %s", log_prefix, e
)
return raw_bytes
except OSError as e:
# write_bytes failed — stripped content is still valid for GCS upload even
# though the local write-back failed (same-pod optimization silently skipped).
logger.warning(
"%s Failed to write back stripped CLI session: %s",
log_prefix,
e.strerror or str(e),
)
return stripped_bytes
def _process_cli_restore(

View File

@@ -1451,3 +1451,89 @@ class TestProcessCliRestore:
assert not ok
assert stripped_str == ""
class TestReadCliSessionFromDisk:
"""``_read_cli_session_from_disk`` reads, strips, and optionally writes back the session."""
def _build_session_file(self, tmp_path, session_id: str):
"""Build the session file path inside tmp_path using the same encoding as cli_session_path."""
import os
import re
from pathlib import Path
sdk_cwd = str(tmp_path)
encoded_cwd = re.sub(r"[^a-zA-Z0-9]", "-", os.path.realpath(sdk_cwd))
session_dir = Path(str(tmp_path)) / encoded_cwd
session_dir.mkdir(parents=True, exist_ok=True)
return sdk_cwd, session_dir / f"{session_id}.jsonl"
def test_returns_raw_bytes_for_invalid_utf8(self, tmp_path):
"""Non-UTF-8 bytes trigger UnicodeDecodeError — returns raw bytes (upload-raw fallback)."""
from unittest.mock import patch
from backend.copilot.sdk.service import _read_cli_session_from_disk
session_id = "12345678-0000-0000-0000-aabbccdd0001"
projects_base_dir = str(tmp_path)
sdk_cwd, session_file = self._build_session_file(tmp_path, session_id)
# Write raw invalid UTF-8 bytes
session_file.write_bytes(b"\xff\xfe invalid utf-8\n")
with (
patch(
"backend.copilot.sdk.service.projects_base",
return_value=projects_base_dir,
),
patch(
"backend.copilot.transcript.projects_base",
return_value=projects_base_dir,
),
):
result = _read_cli_session_from_disk(sdk_cwd, session_id, "[Test]")
# UnicodeDecodeError path returns the raw bytes (upload-raw fallback)
assert result == b"\xff\xfe invalid utf-8\n"
def test_write_back_oserror_still_returns_stripped_bytes(self, tmp_path):
"""OSError on write-back returns stripped bytes for GCS upload (not raw)."""
from unittest.mock import patch
from backend.copilot.sdk.service import _read_cli_session_from_disk
session_id = "12345678-0000-0000-0000-aabbccdd0002"
projects_base_dir = str(tmp_path)
sdk_cwd, session_file = self._build_session_file(tmp_path, session_id)
# Content with a strippable progress entry so stripped_bytes < raw_bytes
raw_content = (
'{"type":"progress","uuid":"p1","subtype":"agent_progress","parentUuid":null}\n'
'{"type":"user","uuid":"u1","parentUuid":null,"message":{"role":"user","content":"hi"}}\n'
'{"type":"assistant","uuid":"a1","parentUuid":"u1","message":{"role":"assistant","content":[{"type":"text","text":"hello"}]}}\n'
)
session_file.write_bytes(raw_content.encode("utf-8"))
# Make the file read-only so write_bytes raises OSError on the write-back
session_file.chmod(0o444)
try:
with (
patch(
"backend.copilot.sdk.service.projects_base",
return_value=projects_base_dir,
),
patch(
"backend.copilot.transcript.projects_base",
return_value=projects_base_dir,
),
):
result = _read_cli_session_from_disk(sdk_cwd, session_id, "[Test]")
finally:
session_file.chmod(0o644)
# Must return stripped bytes (not raw, not None) so GCS gets the clean version
assert result is not None
assert (
b"progress" not in result
), "Stripped bytes must not contain progress entry"
assert b"hello" in result, "Stripped bytes should contain assistant turn"

View File

@@ -55,9 +55,8 @@ TranscriptMode = Literal["sdk", "baseline"]
class TranscriptDownload:
content: bytes
message_count: int = 0
mode: TranscriptMode = (
"sdk" # "sdk" = Claude CLI native, "baseline" = TranscriptBuilder
)
# "sdk" = Claude CLI native, "baseline" = TranscriptBuilder
mode: TranscriptMode = "sdk"
# Storage prefix for the CLI's native session JSONL files (for cross-pod --resume).