fix(backend): Extract binary files from ClaudeCodeBlock sandbox

Enables binary file extraction (images, PDFs, etc.) for the Claude Code block
by setting text_only=False in extract_and_store_sandbox_files.

Changes:
- sandbox_files.py: Add BINARY_EXTENSIONS set with supported formats
- sandbox_files.py: Add MAX_BINARY_FILE_SIZE (50MB) limit to prevent OOM
- sandbox_files.py: Add size check before reading binary files
- sandbox_files.py: Add .svg to TEXT_EXTENSIONS (XML-based)
- sandbox_files.py: Make extension matching case-insensitive
- claude_code.py: Enable binary file extraction (text_only=False)
- claude_code.py: Update output description to mention binary support
- claude_code.md: Update docs to reflect binary file support

Binary files are stored via store_media_file which handles:
- Virus scanning via scan_content_safe()
- Workspace storage (returns workspace:// URI in CoPilot)
- Data URI fallback for graph execution

Closes SECRT-1897
This commit is contained in:
Bentlybro
2026-02-16 14:10:05 +00:00
parent 9d4dcbd9e0
commit 5e554526e2
3 changed files with 86 additions and 11 deletions

View File

@@ -187,9 +187,11 @@ class ClaudeCodeBlock(Block):
)
files: list[SandboxFileOutput] = SchemaField(
description=(
"List of text files created/modified by Claude Code during this execution. "
"List of files created/modified by Claude Code during this execution. "
"Includes text files and binary files (images, PDFs, etc.). "
"Each file has 'path', 'relative_path', 'name', 'content', and 'workspace_ref' fields. "
"workspace_ref contains a workspace:// URI if the file was stored to workspace."
"workspace_ref contains a workspace:// URI for workspace storage. "
"For binary files, content contains a placeholder; use workspace_ref to access the file."
)
)
conversation_history: str = SchemaField(
@@ -453,12 +455,14 @@ class ClaudeCodeBlock(Block):
new_conversation_history = turn_entry
# Extract files created/modified during this run and store to workspace
# Include binary files (images, PDFs, etc.) - they'll be stored via
# store_media_file which handles virus scanning and workspace storage
sandbox_files = await extract_and_store_sandbox_files(
sandbox=sandbox,
working_directory=working_directory,
execution_context=execution_context,
since_timestamp=start_timestamp,
text_only=True,
text_only=False, # Extract both text and binary files
)
return (

View File

@@ -74,8 +74,51 @@ TEXT_EXTENSIONS = {
".tex",
".csv",
".log",
".svg", # SVG is XML-based text
}
# Binary file extensions we explicitly support extracting.
# These are common output formats that users expect to retrieve.
# Entries must be lowercase and start with a dot: extract_sandbox_files
# lowercases the path and matches each entry with str.endswith, so a
# mixed-case entry would never match. Because matching is by suffix, ".gz"
# also covers compound names such as "archive.tar.gz".
BINARY_EXTENSIONS = {
# Images
".png",
".jpg",
".jpeg",
".gif",
".webp",
".ico",
".bmp",
".tiff",
".tif",
# Documents
".pdf",
# Archives
".zip",
".tar",
".gz",
".7z",
# Audio
".mp3",
".wav",
".ogg",
".flac",
# Video
".mp4",
".webm",
".mov",
".avi",
# Fonts
".woff",
".woff2",
".ttf",
".otf",
".eot",
}
# Maximum file size for binary extraction: 50 MiB (50 * 1024 * 1024 bytes).
# Prevents OOM from accidentally extracting huge files.
# extract_sandbox_files compares this against the byte count reported by
# `stat -c %s` before reading a binary file; only files strictly larger than
# the limit are skipped, so a file of exactly this size is still read.
MAX_BINARY_FILE_SIZE = 50 * 1024 * 1024
class SandboxFileOutput(BaseModel):
"""A file extracted from a sandbox and optionally stored in workspace."""
@@ -120,7 +163,8 @@ async def extract_sandbox_files(
sandbox: The E2B sandbox instance
working_directory: Directory to search for files
since_timestamp: ISO timestamp - only return files modified after this time
text_only: If True, only extract text files (default). If False, extract all files.
text_only: If True, only extract text files. If False, also extract
supported binary files (images, PDFs, etc.).
Returns:
List of ExtractedFile objects with path, content, and metadata
@@ -149,14 +193,41 @@ async def extract_sandbox_files(
if not file_path:
continue
# Check if it's a text file
is_text = any(file_path.endswith(ext) for ext in TEXT_EXTENSIONS)
# Check file type (case-insensitive for extensions)
file_path_lower = file_path.lower()
is_text = any(file_path_lower.endswith(ext) for ext in TEXT_EXTENSIONS)
is_binary = any(file_path_lower.endswith(ext) for ext in BINARY_EXTENSIONS)
# Skip non-text files if text_only mode
if text_only and not is_text:
continue
# Determine if we should extract this file
if text_only:
# Only extract text files
if not is_text:
continue
else:
# Extract text files and supported binary files
if not is_text and not is_binary:
continue
try:
# For binary files, check size before reading to prevent OOM
if is_binary:
stat_result = await sandbox.commands.run(
f"stat -c %s {shlex.quote(file_path)} 2>/dev/null"
)
if stat_result.exit_code != 0 or not stat_result.stdout:
logger.debug(
f"Skipping {file_path}: could not determine file size"
)
continue
file_size = int(stat_result.stdout.strip())
if file_size > MAX_BINARY_FILE_SIZE:
logger.info(
f"Skipping {file_path}: size {file_size} bytes "
f"exceeds limit {MAX_BINARY_FILE_SIZE}"
)
continue
# Read file content as bytes
content = await sandbox.files.read(file_path, format="bytes")
if isinstance(content, str):