Compare commits

...

10 Commits

Author SHA1 Message Date
Bentlybro
0b2fb655bc style: black formatting 2026-02-12 12:46:20 +00:00
Bentlybro
99f8bf5f0c fix: skip binary file if stat fails to prevent OOM
If the stat command fails (file deleted, permissions issue, etc.),
we now skip the file rather than proceeding to read it with an
unknown size. This prevents potential OOM crashes from large files
where size verification failed.
2026-02-12 12:32:13 +00:00
Bentlybro
3f76f1318b docs: Fix llm.md to match exact schema description 2026-02-12 12:25:29 +00:00
Bentlybro
b011289dd2 fix: Address code review feedback
- Add 50MB size guard for binary files to prevent OOM
- Extract helper function for path resolution (DRY)
- Add logging for file extraction errors
- Remove dead 'Dockerfile' entry from text_extensions
2026-02-12 12:02:45 +00:00
Bentlybro
49c2f578b4 docs: Update llm.md for binary file support in Claude Code block 2026-02-12 11:58:35 +00:00
Bentlybro
7150b7768d fix: Make Dockerfile check case-insensitive 2026-02-12 11:53:57 +00:00
Bentlybro
8c95b03636 fix: Update tests and address code review feedback
- Update test fixtures with is_binary and content_base64 fields
- Move .svg to text_extensions (it's XML-based)
- Make extension matching case-insensitive for both text and binary
2026-02-12 11:45:52 +00:00
Bentlybro
4a8368887f fix: Use format='bytes' for reading binary files from E2B sandbox
Fixes the critical bug where binary files would fail to read because
files.read() defaults to text mode (UTF-8 decoding). Now explicitly
uses format='bytes' which returns a bytearray.
2026-02-12 11:29:43 +00:00
Bentlybro
d46e5e6b6a docs: Update claude_code.md for binary file support 2026-02-12 11:26:58 +00:00
Bentlybro
4e632bbd60 fix(backend): Extract binary files from ClaudeCodeBlock sandbox
Add support for extracting binary files (PDFs, images, etc.) from the E2B
sandbox in ClaudeCodeBlock.

Changes:
- Add binary_extensions set for common binary file types (.pdf, .png, .jpg, etc.)
- Update FileOutput schema with is_binary and content_base64 fields
- Binary files are read as bytes and base64-encoded before returning
- Text files continue to work as before with is_binary=False

Closes SECRT-1897
2026-02-12 11:23:05 +00:00
3 changed files with 137 additions and 28 deletions

View File

@@ -1,4 +1,6 @@
import base64
import json import json
import logging
import shlex import shlex
import uuid import uuid
from typing import Literal, Optional from typing import Literal, Optional
@@ -21,6 +23,11 @@ from backend.data.model import (
) )
from backend.integrations.providers import ProviderName from backend.integrations.providers import ProviderName
logger = logging.getLogger(__name__)
# Maximum size for binary files to extract (50MB)
MAX_BINARY_FILE_SIZE = 50 * 1024 * 1024
class ClaudeCodeExecutionError(Exception): class ClaudeCodeExecutionError(Exception):
"""Exception raised when Claude Code execution fails. """Exception raised when Claude Code execution fails.
@@ -180,7 +187,9 @@ class ClaudeCodeBlock(Block):
path: str path: str
relative_path: str # Path relative to working directory (for GitHub, etc.) relative_path: str # Path relative to working directory (for GitHub, etc.)
name: str name: str
content: str content: str # Text content for text files, empty string for binary files
is_binary: bool = False # True if this is a binary file
content_base64: Optional[str] = None # Base64-encoded content for binary files
class Output(BlockSchemaOutput): class Output(BlockSchemaOutput):
response: str = SchemaField( response: str = SchemaField(
@@ -188,8 +197,11 @@ class ClaudeCodeBlock(Block):
) )
files: list["ClaudeCodeBlock.FileOutput"] = SchemaField( files: list["ClaudeCodeBlock.FileOutput"] = SchemaField(
description=( description=(
"List of text files created/modified by Claude Code during this execution. " "List of files created/modified by Claude Code during this execution. "
"Each file has 'path', 'relative_path', 'name', and 'content' fields." "Each file has 'path', 'relative_path', 'name', 'content', 'is_binary', "
"and 'content_base64' fields. For text files, 'content' contains the text "
"and 'is_binary' is False. For binary files (PDFs, images, etc.), "
"'is_binary' is True and 'content_base64' contains the base64-encoded data."
) )
) )
conversation_history: str = SchemaField( conversation_history: str = SchemaField(
@@ -252,6 +264,8 @@ class ClaudeCodeBlock(Block):
"relative_path": "index.html", "relative_path": "index.html",
"name": "index.html", "name": "index.html",
"content": "<html>Hello World</html>", "content": "<html>Hello World</html>",
"is_binary": False,
"content_base64": None,
} }
], ],
), ),
@@ -272,6 +286,8 @@ class ClaudeCodeBlock(Block):
relative_path="index.html", relative_path="index.html",
name="index.html", name="index.html",
content="<html>Hello World</html>", content="<html>Hello World</html>",
is_binary=False,
content_base64=None,
) )
], # files ], # files
"User: Create a hello world HTML file\n" "User: Create a hello world HTML file\n"
@@ -531,7 +547,6 @@ class ClaudeCodeBlock(Block):
".env", ".env",
".gitignore", ".gitignore",
".dockerfile", ".dockerfile",
"Dockerfile",
".vue", ".vue",
".svelte", ".svelte",
".astro", ".astro",
@@ -540,6 +555,44 @@ class ClaudeCodeBlock(Block):
".tex", ".tex",
".csv", ".csv",
".log", ".log",
".svg", # SVG is XML-based text
}
# Binary file extensions we can read and base64-encode
binary_extensions = {
# Images
".png",
".jpg",
".jpeg",
".gif",
".webp",
".ico",
".bmp",
".tiff",
".tif",
# Documents
".pdf",
# Archives (useful for downloads)
".zip",
".tar",
".gz",
".7z",
# Audio/Video (if small enough)
".mp3",
".wav",
".mp4",
".webm",
# Other binary formats
".woff",
".woff2",
".ttf",
".otf",
".eot",
".bin",
".exe",
".dll",
".so",
".dylib",
} }
try: try:
@@ -564,10 +617,26 @@ class ClaudeCodeBlock(Block):
if not file_path: if not file_path:
continue continue
# Check if it's a text file we can read # Check if it's a text file we can read (case-insensitive)
file_path_lower = file_path.lower()
is_text = any( is_text = any(
file_path.endswith(ext) for ext in text_extensions file_path_lower.endswith(ext) for ext in text_extensions
) or file_path.endswith("Dockerfile") ) or file_path_lower.endswith("dockerfile")
# Check if it's a binary file we should extract
is_binary = any(
file_path_lower.endswith(ext) for ext in binary_extensions
)
# Helper to extract filename and relative path
def get_file_info(path: str, work_dir: str) -> tuple[str, str]:
name = path.split("/")[-1]
rel_path = path
if path.startswith(work_dir):
rel_path = path[len(work_dir) :]
if rel_path.startswith("/"):
rel_path = rel_path[1:]
return name, rel_path
if is_text: if is_text:
try: try:
@@ -576,32 +645,72 @@ class ClaudeCodeBlock(Block):
if isinstance(content, bytes): if isinstance(content, bytes):
content = content.decode("utf-8", errors="replace") content = content.decode("utf-8", errors="replace")
# Extract filename from path file_name, relative_path = get_file_info(
file_name = file_path.split("/")[-1] file_path, working_directory
)
# Calculate relative path by stripping working directory
relative_path = file_path
if file_path.startswith(working_directory):
relative_path = file_path[len(working_directory) :]
# Remove leading slash if present
if relative_path.startswith("/"):
relative_path = relative_path[1:]
files.append( files.append(
ClaudeCodeBlock.FileOutput( ClaudeCodeBlock.FileOutput(
path=file_path, path=file_path,
relative_path=relative_path, relative_path=relative_path,
name=file_name, name=file_name,
content=content, content=content,
is_binary=False,
content_base64=None,
) )
) )
except Exception: except Exception as e:
# Skip files that can't be read logger.warning(f"Failed to read text file {file_path}: {e}")
pass elif is_binary:
try:
# Check file size before reading to avoid OOM
stat_result = await sandbox.commands.run(
f"stat -c %s {shlex.quote(file_path)} 2>/dev/null"
)
if stat_result.exit_code != 0 or not stat_result.stdout:
logger.warning(
f"Skipping binary file {file_path}: "
f"could not determine file size"
)
continue
file_size = int(stat_result.stdout.strip())
if file_size > MAX_BINARY_FILE_SIZE:
logger.warning(
f"Skipping binary file {file_path}: "
f"size {file_size} exceeds limit "
f"{MAX_BINARY_FILE_SIZE}"
)
continue
except Exception: # Read binary file as bytes using format="bytes"
# If file extraction fails, return empty results content_bytes = await sandbox.files.read(
pass file_path, format="bytes"
)
# Base64 encode the binary content
content_b64 = base64.b64encode(content_bytes).decode(
"ascii"
)
file_name, relative_path = get_file_info(
file_path, working_directory
)
files.append(
ClaudeCodeBlock.FileOutput(
path=file_path,
relative_path=relative_path,
name=file_name,
content="", # Empty for binary files
is_binary=True,
content_base64=content_b64,
)
)
except Exception as e:
logger.warning(
f"Failed to read binary file {file_path}: {e}"
)
except Exception as e:
logger.warning(f"File extraction failed: {e}")
return files return files

View File

@@ -16,7 +16,7 @@ When activated, the block:
- Install dependencies (npm, pip, etc.) - Install dependencies (npm, pip, etc.)
- Run terminal commands - Run terminal commands
- Build and test applications - Build and test applications
5. Extracts all text files created/modified during execution 5. Extracts all text and binary files created/modified during execution
6. Returns the response and files, optionally keeping the sandbox alive for follow-up tasks 6. Returns the response and files, optionally keeping the sandbox alive for follow-up tasks
The block supports conversation continuation through three mechanisms: The block supports conversation continuation through three mechanisms:
@@ -42,7 +42,7 @@ The block supports conversation continuation through three mechanisms:
| Output | Description | | Output | Description |
|--------|-------------| |--------|-------------|
| Response | The output/response from Claude Code execution | | Response | The output/response from Claude Code execution |
| Files | List of text files created/modified during execution. Each file includes path, relative_path, name, and content fields | | Files | List of files created/modified during execution. Each file includes path, relative_path, name, content, is_binary, and content_base64 fields. For text files, content contains the text and is_binary is False. For binary files (PDFs, images, etc.), is_binary is True and content_base64 contains the base64-encoded data |
| Conversation History | Full conversation history including this turn. Use to restore context on a fresh sandbox | | Conversation History | Full conversation history including this turn. Use to restore context on a fresh sandbox |
| Session ID | Session ID for this conversation. Pass back with sandbox_id to continue the conversation | | Session ID | Session ID for this conversation. Pass back with sandbox_id to continue the conversation |
| Sandbox ID | ID of the sandbox instance (null if disposed). Pass back with session_id to continue the conversation | | Sandbox ID | ID of the sandbox instance (null if disposed). Pass back with session_id to continue the conversation |

View File

@@ -535,7 +535,7 @@ When activated, the block:
2. Installs the latest version of Claude Code in the sandbox 2. Installs the latest version of Claude Code in the sandbox
3. Optionally runs setup commands to prepare the environment 3. Optionally runs setup commands to prepare the environment
4. Executes your prompt using Claude Code, which can create/edit files, install dependencies, run terminal commands, and build applications 4. Executes your prompt using Claude Code, which can create/edit files, install dependencies, run terminal commands, and build applications
5. Extracts all text files created/modified during execution 5. Extracts all text and binary files created/modified during execution
6. Returns the response and files, optionally keeping the sandbox alive for follow-up tasks 6. Returns the response and files, optionally keeping the sandbox alive for follow-up tasks
The block supports conversation continuation through three mechanisms: The block supports conversation continuation through three mechanisms:
@@ -563,7 +563,7 @@ The block supports conversation continuation through three mechanisms:
|--------|-------------|------| |--------|-------------|------|
| error | Error message if execution failed | str | | error | Error message if execution failed | str |
| response | The output/response from Claude Code execution | str | | response | The output/response from Claude Code execution | str |
| files | List of text files created/modified by Claude Code during this execution. Each file has 'path', 'relative_path', 'name', and 'content' fields. | List[FileOutput] | | files | List of files created/modified by Claude Code during this execution. Each file has 'path', 'relative_path', 'name', 'content', 'is_binary', and 'content_base64' fields. For text files, 'content' contains the text and 'is_binary' is False. For binary files (PDFs, images, etc.), 'is_binary' is True and 'content_base64' contains the base64-encoded data. | List[FileOutput] |
| conversation_history | Full conversation history including this turn. Pass this to conversation_history input to continue on a fresh sandbox if the previous sandbox timed out. | str | | conversation_history | Full conversation history including this turn. Pass this to conversation_history input to continue on a fresh sandbox if the previous sandbox timed out. | str |
| session_id | Session ID for this conversation. Pass this back along with sandbox_id to continue the conversation. | str | | session_id | Session ID for this conversation. Pass this back along with sandbox_id to continue the conversation. | str |
| sandbox_id | ID of the sandbox instance. Pass this back along with session_id to continue the conversation. This is None if dispose_sandbox was True (sandbox was disposed). | str | | sandbox_id | ID of the sandbox instance. Pass this back along with session_id to continue the conversation. This is None if dispose_sandbox was True (sandbox was disposed). | str |