fix(chat/sdk): fix transcript validation and type captured_transcript properly

- Replace dict[str,str] with CapturedTranscript dataclass for type safety
- Fix validate_transcript requiring >=3 lines — after stripping metadata,
  a valid 1-turn conversation is just user+assistant (2 lines)
- Apply CodeQL autofix: internalize max_len in _sanitize_id, add fallback
This commit is contained in:
Zamil Majdy
2026-02-13 16:32:06 +04:00
parent d0f0c32e70
commit 52c8a25531
2 changed files with 29 additions and 15 deletions

View File

@@ -6,6 +6,7 @@ import logging
import os
import uuid
from collections.abc import AsyncGenerator
from dataclasses import dataclass
from typing import Any
from backend.util.exceptions import NotFoundError
@@ -60,6 +61,18 @@ config = ChatConfig()
_background_tasks: set[asyncio.Task[Any]] = set()
@dataclass
class CapturedTranscript:
    """Info captured by the SDK Stop hook for stateless --resume.

    Both fields default to the empty string, so a freshly created instance
    reports ``available`` as ``False`` until ``path`` is assigned a
    non-empty value.
    """

    # Transcript file path; stays empty until one has been recorded.
    path: str = ""
    # SDK session identifier recorded together with the path.
    sdk_session_id: str = ""

    @property
    def available(self) -> bool:
        """Return ``True`` when a non-empty transcript path is set."""
        return bool(self.path)
_SDK_CWD_PREFIX = WORKSPACE_PREFIX
# Appended to the system prompt to inform the agent about available tools.
@@ -502,11 +515,11 @@ async def stream_chat_completion_sdk(
sdk_model = _resolve_sdk_model()
# --- Transcript capture via Stop hook ---
captured_transcript: dict[str, str] = {}
captured_transcript = CapturedTranscript()
def _on_stop(transcript_path: str, sdk_session_id: str) -> None:
captured_transcript["path"] = transcript_path
captured_transcript["session_id"] = sdk_session_id
captured_transcript.path = transcript_path
captured_transcript.sdk_session_id = sdk_session_id
security_hooks = create_security_hooks(
user_id,
@@ -567,9 +580,9 @@ async def stream_chat_completion_sdk(
yield StreamFinish()
return
# Build query: with --resume the CLI already has full context,
# so we only send the new message. Without resume, compress
# history into a context prefix as before.
# Build query: with --resume the CLI already has full
# context, so we only send the new message. Without
# resume, compress history into a context prefix.
query_message = current_message
if not use_resume and len(session.messages) > 1:
logger.warning(
@@ -675,13 +688,12 @@ async def stream_chat_completion_sdk(
if (
config.claude_agent_use_resume
and user_id
and captured_transcript.get("path")
and captured_transcript.available
):
# Give CLI time to flush JSONL writes before we read
await asyncio.sleep(0.5)
raw_transcript = read_transcript_file(captured_transcript["path"])
raw_transcript = read_transcript_file(captured_transcript.path)
if raw_transcript:
# Upload in background — strip + store to bucket
task = asyncio.create_task(
_upload_transcript_bg(user_id, session_id, raw_transcript)
)

View File

@@ -145,13 +145,15 @@ def read_transcript_file(transcript_path: str) -> str | None:
return None
def _sanitize_id(raw_id: str) -> str:
def _sanitize_id(raw_id: str, max_len: int = 36) -> str:
"""Sanitize an ID for safe use in file paths.
Session/user IDs are UUIDs (hex + hyphens). Strip everything else
to prevent path traversal or injection via crafted IDs.
Session/user IDs are expected to be UUIDs (hex + hyphens). Strip
everything else and truncate to *max_len* so the result cannot introduce
path separators or other special characters.
"""
return _SAFE_ID_RE.sub("", raw_id)
cleaned = _SAFE_ID_RE.sub("", raw_id or "")[:max_len]
return cleaned or "unknown"
_SAFE_CWD_PREFIX = os.path.realpath("/tmp/copilot-")
@@ -177,7 +179,7 @@ def write_transcript_to_tempfile(
try:
os.makedirs(real_cwd, exist_ok=True)
safe_id = _sanitize_id(session_id)[:8]
safe_id = _sanitize_id(session_id, max_len=8)
jsonl_path = os.path.join(real_cwd, f"transcript-{safe_id}.jsonl")
with open(jsonl_path, "w") as f:
@@ -202,7 +204,7 @@ def validate_transcript(content: str | None) -> bool:
return False
lines = content.strip().split("\n")
if len(lines) < 3:
if len(lines) < 2:
return False
has_user = False