Compare commits

...

6 Commits

Author SHA1 Message Date
Bentlybro
e8b8cad97a fix: apply size check to text files too (OOM protection) 2026-02-17 14:11:44 +00:00
Bentlybro
be35c626ad fix: address review comments
- Remove redundant inline comment on text_only param
- Simplify file filtering logic per review suggestion
2026-02-17 14:03:55 +00:00
Bentlybro
719c4ee1d1 fix: add explicit ValueError guard for stat output parsing 2026-02-16 14:46:06 +00:00
Bentlybro
411c399e03 style: fix formatting and sync docs
- Fix Black formatting for is_text/is_binary checks
- Update llm.md to reflect binary file support in Claude Code block
2026-02-16 14:40:53 +00:00
Bentlybro
6ac011e36c fix: normalize extension case in sandbox file extraction
Fixes bug where 'Dockerfile' in TEXT_EXTENSIONS wouldn't match after
lowercasing file_path because the extension itself wasn't lowercased.
2026-02-16 14:18:25 +00:00
Bentlybro
5e554526e2 fix(backend): Extract binary files from ClaudeCodeBlock sandbox
Enables binary file extraction (images, PDFs, etc.) for the Claude Code block
by setting text_only=False in extract_and_store_sandbox_files.

Changes:
- sandbox_files.py: Add BINARY_EXTENSIONS set with supported formats
- sandbox_files.py: Add MAX_BINARY_FILE_SIZE (50MB) limit to prevent OOM
- sandbox_files.py: Add size check before reading binary files
- sandbox_files.py: Add .svg to TEXT_EXTENSIONS (XML-based)
- sandbox_files.py: Make extension matching case-insensitive
- claude_code.py: Enable binary file extraction (text_only=False)
- claude_code.py: Update output description to mention binary support
- claude_code.md: Update docs to reflect binary file support

Binary files are stored via store_media_file which handles:
- Virus scanning via scan_content_safe()
- Workspace storage (returns workspace:// URI in CoPilot)
- Data URI fallback for graph execution

Closes SECRT-1897
2026-02-16 14:10:05 +00:00
4 changed files with 93 additions and 13 deletions

View File

@@ -187,9 +187,11 @@ class ClaudeCodeBlock(Block):
) )
files: list[SandboxFileOutput] = SchemaField( files: list[SandboxFileOutput] = SchemaField(
description=( description=(
"List of text files created/modified by Claude Code during this execution. " "List of files created/modified by Claude Code during this execution. "
"Includes text files and binary files (images, PDFs, etc.). "
"Each file has 'path', 'relative_path', 'name', 'content', and 'workspace_ref' fields. " "Each file has 'path', 'relative_path', 'name', 'content', and 'workspace_ref' fields. "
"workspace_ref contains a workspace:// URI if the file was stored to workspace." "workspace_ref contains a workspace:// URI for workspace storage. "
"For binary files, content contains a placeholder; use workspace_ref to access the file."
) )
) )
conversation_history: str = SchemaField( conversation_history: str = SchemaField(
@@ -452,13 +454,15 @@ class ClaudeCodeBlock(Block):
else: else:
new_conversation_history = turn_entry new_conversation_history = turn_entry
# Extract files created/modified during this run and store to workspace # Extract files created/modified during this run and store to workspace.
# Binary files (images, PDFs, etc.) are stored via store_media_file
# which handles virus scanning and workspace storage.
sandbox_files = await extract_and_store_sandbox_files( sandbox_files = await extract_and_store_sandbox_files(
sandbox=sandbox, sandbox=sandbox,
working_directory=working_directory, working_directory=working_directory,
execution_context=execution_context, execution_context=execution_context,
since_timestamp=start_timestamp, since_timestamp=start_timestamp,
text_only=True, text_only=False,
) )
return ( return (

View File

@@ -74,8 +74,50 @@ TEXT_EXTENSIONS = {
".tex", ".tex",
".csv", ".csv",
".log", ".log",
".svg", # SVG is XML-based text
} }
# Binary file extensions we explicitly support extracting
BINARY_EXTENSIONS = {
# Images
".png",
".jpg",
".jpeg",
".gif",
".webp",
".ico",
".bmp",
".tiff",
".tif",
# Documents
".pdf",
# Archives
".zip",
".tar",
".gz",
".7z",
# Audio
".mp3",
".wav",
".ogg",
".flac",
# Video
".mp4",
".webm",
".mov",
".avi",
# Fonts
".woff",
".woff2",
".ttf",
".otf",
".eot",
}
# Maximum file size for extraction (50MB); the size check is applied to text and binary files alike
# Prevents OOM from accidentally extracting huge files
MAX_BINARY_FILE_SIZE = 50 * 1024 * 1024
class SandboxFileOutput(BaseModel): class SandboxFileOutput(BaseModel):
"""A file extracted from a sandbox and optionally stored in workspace.""" """A file extracted from a sandbox and optionally stored in workspace."""
@@ -120,7 +162,8 @@ async def extract_sandbox_files(
sandbox: The E2B sandbox instance sandbox: The E2B sandbox instance
working_directory: Directory to search for files working_directory: Directory to search for files
since_timestamp: ISO timestamp - only return files modified after this time since_timestamp: ISO timestamp - only return files modified after this time
text_only: If True, only extract text files (default). If False, extract all files. text_only: If True, only extract text files. If False, also extract
supported binary files (images, PDFs, etc.).
Returns: Returns:
List of ExtractedFile objects with path, content, and metadata List of ExtractedFile objects with path, content, and metadata
@@ -149,15 +192,48 @@ async def extract_sandbox_files(
if not file_path: if not file_path:
continue continue
# Check if it's a text file # Check file type (case-insensitive for extensions)
is_text = any(file_path.endswith(ext) for ext in TEXT_EXTENSIONS) file_path_lower = file_path.lower()
is_text = any(
file_path_lower.endswith(ext.lower()) for ext in TEXT_EXTENSIONS
)
is_binary = any(
file_path_lower.endswith(ext.lower()) for ext in BINARY_EXTENSIONS
)
# Skip non-text files if text_only mode # Skip files with unrecognized extensions
if not is_text and not is_binary:
continue
# In text_only mode, skip binary files
if text_only and not is_text: if text_only and not is_text:
continue continue
try: try:
# Read file content as bytes # Check file size before reading to prevent OOM
stat_result = await sandbox.commands.run(
f"stat -c %s {shlex.quote(file_path)} 2>/dev/null"
)
if stat_result.exit_code != 0 or not stat_result.stdout:
logger.debug(f"Skipping {file_path}: could not determine file size")
continue
try:
file_size = int(stat_result.stdout.strip())
except ValueError:
logger.debug(
f"Skipping {file_path}: unexpected stat output "
f"{stat_result.stdout.strip()!r}"
)
continue
if file_size > MAX_BINARY_FILE_SIZE:
logger.info(
f"Skipping {file_path}: size {file_size} bytes "
f"exceeds limit {MAX_BINARY_FILE_SIZE}"
)
continue
content = await sandbox.files.read(file_path, format="bytes") content = await sandbox.files.read(file_path, format="bytes")
if isinstance(content, str): if isinstance(content, str):
content = content.encode("utf-8") content = content.encode("utf-8")

View File

@@ -16,7 +16,7 @@ When activated, the block:
- Install dependencies (npm, pip, etc.) - Install dependencies (npm, pip, etc.)
- Run terminal commands - Run terminal commands
- Build and test applications - Build and test applications
5. Extracts all text files created/modified during execution 5. Extracts files created/modified during execution that have a supported extension (text files and binary files like images, PDFs, etc.); files with unrecognized extensions or larger than 50MB are skipped
6. Returns the response and files, optionally keeping the sandbox alive for follow-up tasks 6. Returns the response and files, optionally keeping the sandbox alive for follow-up tasks
The block supports conversation continuation through three mechanisms: The block supports conversation continuation through three mechanisms:
@@ -42,7 +42,7 @@ The block supports conversation continuation through three mechanisms:
| Output | Description | | Output | Description |
|--------|-------------| |--------|-------------|
| Response | The output/response from Claude Code execution | | Response | The output/response from Claude Code execution |
| Files | List of text files created/modified during execution. Each file includes path, relative_path, name, and content fields | | Files | List of files (text and binary) created/modified during execution. Includes images, PDFs, and other supported formats. Each file has path, relative_path, name, content, and workspace_ref fields. Binary files are stored in workspace and accessible via workspace_ref |
| Conversation History | Full conversation history including this turn. Use to restore context on a fresh sandbox | | Conversation History | Full conversation history including this turn. Use to restore context on a fresh sandbox |
| Session ID | Session ID for this conversation. Pass back with sandbox_id to continue the conversation | | Session ID | Session ID for this conversation. Pass back with sandbox_id to continue the conversation |
| Sandbox ID | ID of the sandbox instance (null if disposed). Pass back with session_id to continue the conversation | | Sandbox ID | ID of the sandbox instance (null if disposed). Pass back with session_id to continue the conversation |

View File

@@ -535,7 +535,7 @@ When activated, the block:
2. Installs the latest version of Claude Code in the sandbox 2. Installs the latest version of Claude Code in the sandbox
3. Optionally runs setup commands to prepare the environment 3. Optionally runs setup commands to prepare the environment
4. Executes your prompt using Claude Code, which can create/edit files, install dependencies, run terminal commands, and build applications 4. Executes your prompt using Claude Code, which can create/edit files, install dependencies, run terminal commands, and build applications
5. Extracts all text files created/modified during execution 5. Extracts files created/modified during execution that have a supported extension (text files and binary files like images, PDFs, etc.); files with unrecognized extensions or larger than 50MB are skipped
6. Returns the response and files, optionally keeping the sandbox alive for follow-up tasks 6. Returns the response and files, optionally keeping the sandbox alive for follow-up tasks
The block supports conversation continuation through three mechanisms: The block supports conversation continuation through three mechanisms:
@@ -563,7 +563,7 @@ The block supports conversation continuation through three mechanisms:
|--------|-------------|------| |--------|-------------|------|
| error | Error message if execution failed | str | | error | Error message if execution failed | str |
| response | The output/response from Claude Code execution | str | | response | The output/response from Claude Code execution | str |
| files | List of text files created/modified by Claude Code during this execution. Each file has 'path', 'relative_path', 'name', 'content', and 'workspace_ref' fields. workspace_ref contains a workspace:// URI if the file was stored to workspace. | List[SandboxFileOutput] | | files | List of files created/modified by Claude Code during this execution. Includes text files and binary files (images, PDFs, etc.). Each file has 'path', 'relative_path', 'name', 'content', and 'workspace_ref' fields. workspace_ref contains a workspace:// URI for workspace storage. For binary files, content contains a placeholder; use workspace_ref to access the file. | List[SandboxFileOutput] |
| conversation_history | Full conversation history including this turn. Pass this to conversation_history input to continue on a fresh sandbox if the previous sandbox timed out. | str | | conversation_history | Full conversation history including this turn. Pass this to conversation_history input to continue on a fresh sandbox if the previous sandbox timed out. | str |
| session_id | Session ID for this conversation. Pass this back along with sandbox_id to continue the conversation. | str | | session_id | Session ID for this conversation. Pass this back along with sandbox_id to continue the conversation. | str |
| sandbox_id | ID of the sandbox instance. Pass this back along with session_id to continue the conversation. This is None if dispose_sandbox was True (sandbox was disposed). | str | | sandbox_id | ID of the sandbox instance. Pass this back along with session_id to continue the conversation. This is None if dispose_sandbox was True (sandbox was disposed). | str |