mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-02-17 18:21:46 -05:00
fix(backend): Extract binary files from ClaudeCodeBlock sandbox
Enables binary file extraction (images, PDFs, etc.) for the Claude Code block by setting text_only=False in extract_and_store_sandbox_files.

Changes:
- sandbox_files.py: Add BINARY_EXTENSIONS set with supported formats
- sandbox_files.py: Add MAX_BINARY_FILE_SIZE (50MB) limit to prevent OOM
- sandbox_files.py: Add size check before reading binary files
- sandbox_files.py: Add .svg to TEXT_EXTENSIONS (XML-based)
- sandbox_files.py: Make extension matching case-insensitive
- claude_code.py: Enable binary file extraction (text_only=False)
- claude_code.py: Update output description to mention binary support
- claude_code.md: Update docs to reflect binary file support

Binary files are stored via store_media_file, which handles:
- Virus scanning via scan_content_safe()
- Workspace storage (returns workspace:// URI in CoPilot)
- Data URI fallback for graph execution

Closes SECRT-1897
This commit is contained in:
@@ -187,9 +187,11 @@ class ClaudeCodeBlock(Block):
|
||||
)
|
||||
files: list[SandboxFileOutput] = SchemaField(
|
||||
description=(
|
||||
"List of text files created/modified by Claude Code during this execution. "
|
||||
"List of files created/modified by Claude Code during this execution. "
|
||||
"Includes text files and binary files (images, PDFs, etc.). "
|
||||
"Each file has 'path', 'relative_path', 'name', 'content', and 'workspace_ref' fields. "
|
||||
"workspace_ref contains a workspace:// URI if the file was stored to workspace."
|
||||
"workspace_ref contains a workspace:// URI for workspace storage. "
|
||||
"For binary files, content contains a placeholder; use workspace_ref to access the file."
|
||||
)
|
||||
)
|
||||
conversation_history: str = SchemaField(
|
||||
@@ -453,12 +455,14 @@ class ClaudeCodeBlock(Block):
|
||||
new_conversation_history = turn_entry
|
||||
|
||||
# Extract files created/modified during this run and store to workspace
|
||||
# Include binary files (images, PDFs, etc.) - they'll be stored via
|
||||
# store_media_file which handles virus scanning and workspace storage
|
||||
sandbox_files = await extract_and_store_sandbox_files(
|
||||
sandbox=sandbox,
|
||||
working_directory=working_directory,
|
||||
execution_context=execution_context,
|
||||
since_timestamp=start_timestamp,
|
||||
text_only=True,
|
||||
text_only=False, # Extract both text and binary files
|
||||
)
|
||||
|
||||
return (
|
||||
|
||||
@@ -74,8 +74,51 @@ TEXT_EXTENSIONS = {
|
||||
".tex",
|
||||
".csv",
|
||||
".log",
|
||||
".svg", # SVG is XML-based text
|
||||
}
|
||||
|
||||
# Binary file extensions we explicitly support extracting
|
||||
# These are common output formats that users expect to retrieve
|
||||
BINARY_EXTENSIONS = {
|
||||
# Images
|
||||
".png",
|
||||
".jpg",
|
||||
".jpeg",
|
||||
".gif",
|
||||
".webp",
|
||||
".ico",
|
||||
".bmp",
|
||||
".tiff",
|
||||
".tif",
|
||||
# Documents
|
||||
".pdf",
|
||||
# Archives
|
||||
".zip",
|
||||
".tar",
|
||||
".gz",
|
||||
".7z",
|
||||
# Audio
|
||||
".mp3",
|
||||
".wav",
|
||||
".ogg",
|
||||
".flac",
|
||||
# Video
|
||||
".mp4",
|
||||
".webm",
|
||||
".mov",
|
||||
".avi",
|
||||
# Fonts
|
||||
".woff",
|
||||
".woff2",
|
||||
".ttf",
|
||||
".otf",
|
||||
".eot",
|
||||
}
|
||||
|
||||
# Maximum file size for binary extraction (50MB)
|
||||
# Prevents OOM from accidentally extracting huge files
|
||||
MAX_BINARY_FILE_SIZE = 50 * 1024 * 1024
|
||||
|
||||
|
||||
class SandboxFileOutput(BaseModel):
|
||||
"""A file extracted from a sandbox and optionally stored in workspace."""
|
||||
@@ -120,7 +163,8 @@ async def extract_sandbox_files(
|
||||
sandbox: The E2B sandbox instance
|
||||
working_directory: Directory to search for files
|
||||
since_timestamp: ISO timestamp - only return files modified after this time
|
||||
text_only: If True, only extract text files (default). If False, extract all files.
|
||||
text_only: If True, only extract text files. If False, also extract
|
||||
supported binary files (images, PDFs, etc.).
|
||||
|
||||
Returns:
|
||||
List of ExtractedFile objects with path, content, and metadata
|
||||
@@ -149,14 +193,41 @@ async def extract_sandbox_files(
|
||||
if not file_path:
|
||||
continue
|
||||
|
||||
# Check if it's a text file
|
||||
is_text = any(file_path.endswith(ext) for ext in TEXT_EXTENSIONS)
|
||||
# Check file type (case-insensitive for extensions)
|
||||
file_path_lower = file_path.lower()
|
||||
is_text = any(file_path_lower.endswith(ext) for ext in TEXT_EXTENSIONS)
|
||||
is_binary = any(file_path_lower.endswith(ext) for ext in BINARY_EXTENSIONS)
|
||||
|
||||
# Skip non-text files if text_only mode
|
||||
if text_only and not is_text:
|
||||
continue
|
||||
# Determine if we should extract this file
|
||||
if text_only:
|
||||
# Only extract text files
|
||||
if not is_text:
|
||||
continue
|
||||
else:
|
||||
# Extract text files and supported binary files
|
||||
if not is_text and not is_binary:
|
||||
continue
|
||||
|
||||
try:
|
||||
# For binary files, check size before reading to prevent OOM
|
||||
if is_binary:
|
||||
stat_result = await sandbox.commands.run(
|
||||
f"stat -c %s {shlex.quote(file_path)} 2>/dev/null"
|
||||
)
|
||||
if stat_result.exit_code != 0 or not stat_result.stdout:
|
||||
logger.debug(
|
||||
f"Skipping {file_path}: could not determine file size"
|
||||
)
|
||||
continue
|
||||
|
||||
file_size = int(stat_result.stdout.strip())
|
||||
if file_size > MAX_BINARY_FILE_SIZE:
|
||||
logger.info(
|
||||
f"Skipping {file_path}: size {file_size} bytes "
|
||||
f"exceeds limit {MAX_BINARY_FILE_SIZE}"
|
||||
)
|
||||
continue
|
||||
|
||||
# Read file content as bytes
|
||||
content = await sandbox.files.read(file_path, format="bytes")
|
||||
if isinstance(content, str):
|
||||
|
||||
@@ -16,7 +16,7 @@ When activated, the block:
|
||||
- Install dependencies (npm, pip, etc.)
|
||||
- Run terminal commands
|
||||
- Build and test applications
|
||||
5. Extracts all text files created/modified during execution
|
||||
5. Extracts files created/modified during execution: text files and supported binary files (images, PDFs, etc.; binary files larger than 50MB are skipped)
|
||||
6. Returns the response and files, optionally keeping the sandbox alive for follow-up tasks
|
||||
|
||||
The block supports conversation continuation through three mechanisms:
|
||||
@@ -42,7 +42,7 @@ The block supports conversation continuation through three mechanisms:
|
||||
| Output | Description |
|
||||
|--------|-------------|
|
||||
| Response | The output/response from Claude Code execution |
|
||||
| Files | List of text files created/modified during execution. Each file includes path, relative_path, name, and content fields |
|
||||
| Files | List of files (text and binary) created/modified during execution. Includes images, PDFs, and other supported formats. Each file has path, relative_path, name, content, and workspace_ref fields. Binary files are stored in workspace and accessible via workspace_ref |
|
||||
| Conversation History | Full conversation history including this turn. Use to restore context on a fresh sandbox |
|
||||
| Session ID | Session ID for this conversation. Pass back with sandbox_id to continue the conversation |
|
||||
| Sandbox ID | ID of the sandbox instance (null if disposed). Pass back with session_id to continue the conversation |
|
||||
|
||||
Reference in New Issue
Block a user