Compare commits

...

6 Commits

Author SHA1 Message Date
Bentlybro
e8b8cad97a fix: apply size check to text files too (OOM protection) 2026-02-17 14:11:44 +00:00
Bentlybro
be35c626ad fix: address review comments
- Remove redundant inline comment on text_only param
- Simplify file filtering logic per review suggestion
2026-02-17 14:03:55 +00:00
Bentlybro
719c4ee1d1 fix: add explicit ValueError guard for stat output parsing 2026-02-16 14:46:06 +00:00
Bentlybro
411c399e03 style: fix formatting and sync docs
- Fix Black formatting for is_text/is_binary checks
- Update llm.md to reflect binary file support in Claude Code block
2026-02-16 14:40:53 +00:00
Bentlybro
6ac011e36c fix: normalize extension case in sandbox file extraction
Fixes bug where 'Dockerfile' in TEXT_EXTENSIONS wouldn't match after
lowercasing file_path because the extension itself wasn't lowercased.
2026-02-16 14:18:25 +00:00
Bentlybro
5e554526e2 fix(backend): Extract binary files from ClaudeCodeBlock sandbox
Enables binary file extraction (images, PDFs, etc.) for the Claude Code block
by setting text_only=False in extract_and_store_sandbox_files.

Changes:
- sandbox_files.py: Add BINARY_EXTENSIONS set with supported formats
- sandbox_files.py: Add MAX_BINARY_FILE_SIZE (50MB) limit to prevent OOM
- sandbox_files.py: Add size check before reading binary files
- sandbox_files.py: Add .svg to TEXT_EXTENSIONS (XML-based)
- sandbox_files.py: Make extension matching case-insensitive
- claude_code.py: Enable binary file extraction (text_only=False)
- claude_code.py: Update output description to mention binary support
- claude_code.md: Update docs to reflect binary file support

Binary files are stored via store_media_file which handles:
- Virus scanning via scan_content_safe()
- Workspace storage (returns workspace:// URI in CoPilot)
- Data URI fallback for graph execution

Closes SECRT-1897
2026-02-16 14:10:05 +00:00
4 changed files with 93 additions and 13 deletions

View File

@@ -187,9 +187,11 @@ class ClaudeCodeBlock(Block):
)
files: list[SandboxFileOutput] = SchemaField(
description=(
"List of text files created/modified by Claude Code during this execution. "
"List of files created/modified by Claude Code during this execution. "
"Includes text files and binary files (images, PDFs, etc.). "
"Each file has 'path', 'relative_path', 'name', 'content', and 'workspace_ref' fields. "
"workspace_ref contains a workspace:// URI if the file was stored to workspace."
"workspace_ref contains a workspace:// URI for workspace storage. "
"For binary files, content contains a placeholder; use workspace_ref to access the file."
)
)
conversation_history: str = SchemaField(
@@ -452,13 +454,15 @@ class ClaudeCodeBlock(Block):
else:
new_conversation_history = turn_entry
# Extract files created/modified during this run and store to workspace
# Extract files created/modified during this run and store to workspace.
# Binary files (images, PDFs, etc.) are stored via store_media_file
# which handles virus scanning and workspace storage.
sandbox_files = await extract_and_store_sandbox_files(
sandbox=sandbox,
working_directory=working_directory,
execution_context=execution_context,
since_timestamp=start_timestamp,
text_only=True,
text_only=False,
)
return (

View File

@@ -74,8 +74,50 @@ TEXT_EXTENSIONS = {
".tex",
".csv",
".log",
".svg", # SVG is XML-based text
}
# Binary file extensions we explicitly support extracting
BINARY_EXTENSIONS = {
# Images
".png",
".jpg",
".jpeg",
".gif",
".webp",
".ico",
".bmp",
".tiff",
".tif",
# Documents
".pdf",
# Archives
".zip",
".tar",
".gz",
".7z",
# Audio
".mp3",
".wav",
".ogg",
".flac",
# Video
".mp4",
".webm",
".mov",
".avi",
# Fonts
".woff",
".woff2",
".ttf",
".otf",
".eot",
}
# Maximum file size for binary extraction (50MB)
# Prevents OOM from accidentally extracting huge files
MAX_BINARY_FILE_SIZE = 50 * 1024 * 1024
class SandboxFileOutput(BaseModel):
"""A file extracted from a sandbox and optionally stored in workspace."""
@@ -120,7 +162,8 @@ async def extract_sandbox_files(
sandbox: The E2B sandbox instance
working_directory: Directory to search for files
since_timestamp: ISO timestamp - only return files modified after this time
text_only: If True, only extract text files (default). If False, extract all files.
text_only: If True, only extract text files. If False, also extract
supported binary files (images, PDFs, etc.).
Returns:
List of ExtractedFile objects with path, content, and metadata
@@ -149,15 +192,48 @@ async def extract_sandbox_files(
if not file_path:
continue
# Check if it's a text file
is_text = any(file_path.endswith(ext) for ext in TEXT_EXTENSIONS)
# Check file type (case-insensitive for extensions)
file_path_lower = file_path.lower()
is_text = any(
file_path_lower.endswith(ext.lower()) for ext in TEXT_EXTENSIONS
)
is_binary = any(
file_path_lower.endswith(ext.lower()) for ext in BINARY_EXTENSIONS
)
# Skip non-text files if text_only mode
# Skip files with unrecognized extensions
if not is_text and not is_binary:
continue
# In text_only mode, skip binary files
if text_only and not is_text:
continue
try:
# Read file content as bytes
# Check file size before reading to prevent OOM
stat_result = await sandbox.commands.run(
f"stat -c %s {shlex.quote(file_path)} 2>/dev/null"
)
if stat_result.exit_code != 0 or not stat_result.stdout:
logger.debug(f"Skipping {file_path}: could not determine file size")
continue
try:
file_size = int(stat_result.stdout.strip())
except ValueError:
logger.debug(
f"Skipping {file_path}: unexpected stat output "
f"{stat_result.stdout.strip()!r}"
)
continue
if file_size > MAX_BINARY_FILE_SIZE:
logger.info(
f"Skipping {file_path}: size {file_size} bytes "
f"exceeds limit {MAX_BINARY_FILE_SIZE}"
)
continue
content = await sandbox.files.read(file_path, format="bytes")
if isinstance(content, str):
content = content.encode("utf-8")

View File

@@ -16,7 +16,7 @@ When activated, the block:
- Install dependencies (npm, pip, etc.)
- Run terminal commands
- Build and test applications
5. Extracts all text files created/modified during execution
5. Extracts all files created/modified during execution (text files and binary files like images, PDFs, etc.)
6. Returns the response and files, optionally keeping the sandbox alive for follow-up tasks
The block supports conversation continuation through three mechanisms:
@@ -42,7 +42,7 @@ The block supports conversation continuation through three mechanisms:
| Output | Description |
|--------|-------------|
| Response | The output/response from Claude Code execution |
| Files | List of text files created/modified during execution. Each file includes path, relative_path, name, and content fields |
| Files | List of files (text and binary) created/modified during execution. Includes images, PDFs, and other supported formats. Each file has path, relative_path, name, content, and workspace_ref fields. Binary files are stored in workspace and accessible via workspace_ref |
| Conversation History | Full conversation history including this turn. Use to restore context on a fresh sandbox |
| Session ID | Session ID for this conversation. Pass back with sandbox_id to continue the conversation |
| Sandbox ID | ID of the sandbox instance (null if disposed). Pass back with session_id to continue the conversation |

View File

@@ -535,7 +535,7 @@ When activated, the block:
2. Installs the latest version of Claude Code in the sandbox
3. Optionally runs setup commands to prepare the environment
4. Executes your prompt using Claude Code, which can create/edit files, install dependencies, run terminal commands, and build applications
5. Extracts all text files created/modified during execution
5. Extracts all files created/modified during execution (text files and binary files like images, PDFs, etc.)
6. Returns the response and files, optionally keeping the sandbox alive for follow-up tasks
The block supports conversation continuation through three mechanisms:
@@ -563,7 +563,7 @@ The block supports conversation continuation through three mechanisms:
|--------|-------------|------|
| error | Error message if execution failed | str |
| response | The output/response from Claude Code execution | str |
| files | List of text files created/modified by Claude Code during this execution. Each file has 'path', 'relative_path', 'name', 'content', and 'workspace_ref' fields. workspace_ref contains a workspace:// URI if the file was stored to workspace. | List[SandboxFileOutput] |
| files | List of files created/modified by Claude Code during this execution. Includes text files and binary files (images, PDFs, etc.). Each file has 'path', 'relative_path', 'name', 'content', and 'workspace_ref' fields. workspace_ref contains a workspace:// URI for workspace storage. For binary files, content contains a placeholder; use workspace_ref to access the file. | List[SandboxFileOutput] |
| conversation_history | Full conversation history including this turn. Pass this to conversation_history input to continue on a fresh sandbox if the previous sandbox timed out. | str |
| session_id | Session ID for this conversation. Pass this back along with sandbox_id to continue the conversation. | str |
| sandbox_id | ID of the sandbox instance. Pass this back along with session_id to continue the conversation. This is None if dispose_sandbox was True (sandbox was disposed). | str |