feat(blocks): Store sandbox files to workspace

- Add shared sandbox_files.py utility for file extraction and workspace storage
- Update Claude Code block to use shared utility and add workspace_ref field
- Update Code Executor block to extract files from /output directory
- Files are stored via store_media_file() with virus scanning and size limits
- Backward compatible: content field preserved, workspace_ref added as optional

Closes SECRT-1931
This commit is contained in:
Nick Tindle
2026-02-11 10:18:07 -06:00
parent 062fe1aa70
commit b98fbc40ee
3 changed files with 377 additions and 146 deletions

View File

@@ -1,7 +1,7 @@
import json
import shlex
import uuid
from typing import Literal, Optional
from typing import TYPE_CHECKING, Literal, Optional
from e2b import AsyncSandbox as BaseAsyncSandbox
from pydantic import BaseModel, SecretStr
@@ -20,6 +20,10 @@ from backend.data.model import (
SchemaField,
)
from backend.integrations.providers import ProviderName
from backend.util.sandbox_files import extract_and_store_sandbox_files
if TYPE_CHECKING:
from backend.executor.utils import ExecutionContext
class ClaudeCodeExecutionError(Exception):
@@ -181,6 +185,7 @@ class ClaudeCodeBlock(Block):
relative_path: str # Path relative to working directory (for GitHub, etc.)
name: str
content: str
workspace_ref: Optional[str] = None # workspace://{id}#mime if stored
class Output(BlockSchemaOutput):
response: str = SchemaField(
@@ -189,7 +194,8 @@ class ClaudeCodeBlock(Block):
files: list["ClaudeCodeBlock.FileOutput"] = SchemaField(
description=(
"List of text files created/modified by Claude Code during this execution. "
"Each file has 'path', 'relative_path', 'name', and 'content' fields."
"Each file has 'path', 'relative_path', 'name', 'content', and 'workspace_ref' fields. "
"workspace_ref contains a workspace:// URI if the file was stored to workspace."
)
)
conversation_history: str = SchemaField(
@@ -294,6 +300,7 @@ class ClaudeCodeBlock(Block):
existing_sandbox_id: str,
conversation_history: str,
dispose_sandbox: bool,
execution_context: "ExecutionContext",
) -> tuple[str, list["ClaudeCodeBlock.FileOutput"], str, str, str]:
"""
Execute Claude Code in an E2B sandbox.
@@ -449,11 +456,27 @@ class ClaudeCodeBlock(Block):
else:
new_conversation_history = turn_entry
# Extract files created/modified during this run
files = await self._extract_files(
sandbox, working_directory, start_timestamp
# Extract files created/modified during this run and store to workspace
sandbox_files = await extract_and_store_sandbox_files(
sandbox=sandbox,
working_directory=working_directory,
execution_context=execution_context,
since_timestamp=start_timestamp,
text_only=True,
)
# Convert to FileOutput format
files = [
ClaudeCodeBlock.FileOutput(
path=f.path,
relative_path=f.relative_path,
name=f.name,
content=f.content,
workspace_ref=f.workspace_ref,
)
for f in sandbox_files
]
return (
response,
files,
@@ -471,140 +494,6 @@ class ClaudeCodeBlock(Block):
if dispose_sandbox and sandbox:
await sandbox.kill()
async def _extract_files(
self,
sandbox: BaseAsyncSandbox,
working_directory: str,
since_timestamp: str | None = None,
) -> list["ClaudeCodeBlock.FileOutput"]:
"""
Extract text files created/modified during this Claude Code execution.
Args:
sandbox: The E2B sandbox instance
working_directory: Directory to search for files
since_timestamp: ISO timestamp - only return files modified after this time
Returns:
List of FileOutput objects with path, relative_path, name, and content
"""
files: list[ClaudeCodeBlock.FileOutput] = []
# Text file extensions we can safely read as text
text_extensions = {
".txt",
".md",
".html",
".htm",
".css",
".js",
".ts",
".jsx",
".tsx",
".json",
".xml",
".yaml",
".yml",
".toml",
".ini",
".cfg",
".conf",
".py",
".rb",
".php",
".java",
".c",
".cpp",
".h",
".hpp",
".cs",
".go",
".rs",
".swift",
".kt",
".scala",
".sh",
".bash",
".zsh",
".sql",
".graphql",
".env",
".gitignore",
".dockerfile",
"Dockerfile",
".vue",
".svelte",
".astro",
".mdx",
".rst",
".tex",
".csv",
".log",
}
try:
# List files recursively using find command
# Exclude node_modules and .git directories, but allow hidden files
# like .env and .gitignore (they're filtered by text_extensions later)
# Filter by timestamp to only get files created/modified during this run
safe_working_dir = shlex.quote(working_directory)
timestamp_filter = ""
if since_timestamp:
timestamp_filter = f"-newermt {shlex.quote(since_timestamp)} "
find_result = await sandbox.commands.run(
f"find {safe_working_dir} -type f "
f"{timestamp_filter}"
f"-not -path '*/node_modules/*' "
f"-not -path '*/.git/*' "
f"2>/dev/null"
)
if find_result.stdout:
for file_path in find_result.stdout.strip().split("\n"):
if not file_path:
continue
# Check if it's a text file we can read
is_text = any(
file_path.endswith(ext) for ext in text_extensions
) or file_path.endswith("Dockerfile")
if is_text:
try:
content = await sandbox.files.read(file_path)
# Handle bytes or string
if isinstance(content, bytes):
content = content.decode("utf-8", errors="replace")
# Extract filename from path
file_name = file_path.split("/")[-1]
# Calculate relative path by stripping working directory
relative_path = file_path
if file_path.startswith(working_directory):
relative_path = file_path[len(working_directory) :]
# Remove leading slash if present
if relative_path.startswith("/"):
relative_path = relative_path[1:]
files.append(
ClaudeCodeBlock.FileOutput(
path=file_path,
relative_path=relative_path,
name=file_name,
content=content,
)
)
except Exception:
# Skip files that can't be read
pass
except Exception:
# If file extraction fails, return empty results
pass
return files
def _escape_prompt(self, prompt: str) -> str:
"""Escape the prompt for safe shell execution."""
# Use single quotes and escape any single quotes in the prompt
@@ -617,6 +506,7 @@ class ClaudeCodeBlock(Block):
*,
e2b_credentials: APIKeyCredentials,
anthropic_credentials: APIKeyCredentials,
execution_context: "ExecutionContext",
**kwargs,
) -> BlockOutput:
try:
@@ -637,6 +527,7 @@ class ClaudeCodeBlock(Block):
existing_sandbox_id=input_data.sandbox_id,
conversation_history=input_data.conversation_history,
dispose_sandbox=input_data.dispose_sandbox,
execution_context=execution_context,
)
yield "response", response

View File

@@ -1,5 +1,5 @@
from enum import Enum
from typing import Any, Literal, Optional
from typing import TYPE_CHECKING, Any, Literal, Optional
from e2b_code_interpreter import AsyncSandbox
from e2b_code_interpreter import Result as E2BExecutionResult
@@ -20,6 +20,13 @@ from backend.data.model import (
SchemaField,
)
from backend.integrations.providers import ProviderName
from backend.util.sandbox_files import (
SandboxFileOutput,
extract_and_store_sandbox_files,
)
if TYPE_CHECKING:
from backend.executor.utils import ExecutionContext
TEST_CREDENTIALS = APIKeyCredentials(
id="01234567-89ab-cdef-0123-456789abcdef",
@@ -85,6 +92,9 @@ class CodeExecutionResult(MainCodeExecutionResult):
class BaseE2BExecutorMixin:
"""Shared implementation methods for E2B executor blocks."""
# Default output directory for file extraction
OUTPUT_DIR = "/output"
async def execute_code(
self,
api_key: str,
@@ -95,14 +105,21 @@ class BaseE2BExecutorMixin:
timeout: Optional[int] = None,
sandbox_id: Optional[str] = None,
dispose_sandbox: bool = False,
execution_context: Optional["ExecutionContext"] = None,
extract_files: bool = False,
):
"""
Unified code execution method that handles all three use cases:
1. Create new sandbox and execute (ExecuteCodeBlock)
2. Create new sandbox, execute, and return sandbox_id (InstantiateCodeSandboxBlock)
3. Connect to existing sandbox and execute (ExecuteCodeStepBlock)
Args:
extract_files: If True and execution_context provided, extract files from
/output directory and store to workspace.
""" # noqa
sandbox = None
files: list[SandboxFileOutput] = []
try:
if sandbox_id:
# Connect to existing sandbox (ExecuteCodeStepBlock case)
@@ -114,6 +131,9 @@ class BaseE2BExecutorMixin:
sandbox = await AsyncSandbox.create(
api_key=api_key, template=template_id, timeout=timeout
)
# Create /output directory for file extraction
if extract_files:
await sandbox.commands.run(f"mkdir -p {self.OUTPUT_DIR}")
if setup_commands:
for cmd in setup_commands:
await sandbox.commands.run(cmd)
@@ -133,7 +153,24 @@ class BaseE2BExecutorMixin:
stdout_logs = "".join(execution.logs.stdout)
stderr_logs = "".join(execution.logs.stderr)
return results, text_output, stdout_logs, stderr_logs, sandbox.sandbox_id
# Extract files from /output if requested
if extract_files and execution_context:
files = await extract_and_store_sandbox_files(
sandbox=sandbox,
working_directory=self.OUTPUT_DIR,
execution_context=execution_context,
since_timestamp=None, # Get all files in /output
text_only=False, # Include binary files too
)
return (
results,
text_output,
stdout_logs,
stderr_logs,
sandbox.sandbox_id,
files,
)
finally:
# Dispose of sandbox if requested to reduce usage costs
if dispose_sandbox and sandbox:
@@ -238,6 +275,13 @@ class ExecuteCodeBlock(Block, BaseE2BExecutorMixin):
description="Standard output logs from execution"
)
stderr_logs: str = SchemaField(description="Standard error logs from execution")
files: list[SandboxFileOutput] = SchemaField(
description=(
"Files written to /output directory during execution. "
"Each file has path, name, content, and workspace_ref (if stored)."
),
default=[],
)
def __init__(self):
super().__init__(
@@ -261,21 +305,27 @@ class ExecuteCodeBlock(Block, BaseE2BExecutorMixin):
("stdout_logs", "Hello World\n"),
],
test_mock={
"execute_code": lambda api_key, code, language, template_id, setup_commands, timeout, dispose_sandbox: ( # noqa
"execute_code": lambda api_key, code, language, template_id, setup_commands, timeout, dispose_sandbox, execution_context, extract_files: ( # noqa
[], # results
"Hello World", # text_output
"Hello World\n", # stdout_logs
"", # stderr_logs
"sandbox_id", # sandbox_id
[], # files
),
},
)
async def run(
self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
self,
input_data: Input,
*,
credentials: APIKeyCredentials,
execution_context: "ExecutionContext",
**kwargs,
) -> BlockOutput:
try:
results, text_output, stdout, stderr, _ = await self.execute_code(
results, text_output, stdout, stderr, _, files = await self.execute_code(
api_key=credentials.api_key.get_secret_value(),
code=input_data.code,
language=input_data.language,
@@ -283,6 +333,8 @@ class ExecuteCodeBlock(Block, BaseE2BExecutorMixin):
setup_commands=input_data.setup_commands,
timeout=input_data.timeout,
dispose_sandbox=input_data.dispose_sandbox,
execution_context=execution_context,
extract_files=True,
)
# Determine result object shape & filter out empty formats
@@ -296,6 +348,8 @@ class ExecuteCodeBlock(Block, BaseE2BExecutorMixin):
yield "stdout_logs", stdout
if stderr:
yield "stderr_logs", stderr
# Always yield files (empty list if none)
yield "files", [f.model_dump() for f in files]
except Exception as e:
yield "error", str(e)
@@ -393,6 +447,7 @@ class InstantiateCodeSandboxBlock(Block, BaseE2BExecutorMixin):
"Hello World\n", # stdout_logs
"", # stderr_logs
"sandbox_id", # sandbox_id
[], # files
),
},
)
@@ -401,7 +456,7 @@ class InstantiateCodeSandboxBlock(Block, BaseE2BExecutorMixin):
self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
) -> BlockOutput:
try:
_, text_output, stdout, stderr, sandbox_id = await self.execute_code(
_, text_output, stdout, stderr, sandbox_id, _ = await self.execute_code(
api_key=credentials.api_key.get_secret_value(),
code=input_data.setup_code,
language=input_data.language,
@@ -500,6 +555,7 @@ class ExecuteCodeStepBlock(Block, BaseE2BExecutorMixin):
"Hello World\n", # stdout_logs
"", # stderr_logs
sandbox_id, # sandbox_id
[], # files
),
},
)
@@ -508,7 +564,7 @@ class ExecuteCodeStepBlock(Block, BaseE2BExecutorMixin):
self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
) -> BlockOutput:
try:
results, text_output, stdout, stderr, _ = await self.execute_code(
results, text_output, stdout, stderr, _, _ = await self.execute_code(
api_key=credentials.api_key.get_secret_value(),
code=input_data.step_code,
language=input_data.language,

View File

@@ -0,0 +1,284 @@
"""
Shared utilities for extracting and storing files from E2B sandboxes.
This module provides common file extraction and workspace storage functionality
for blocks that run code in E2B sandboxes (Claude Code, Code Executor, etc.).
"""
import logging
import shlex
from dataclasses import dataclass
from typing import TYPE_CHECKING
from pydantic import BaseModel
from backend.util.file import store_media_file
if TYPE_CHECKING:
from e2b import AsyncSandbox as BaseAsyncSandbox
from backend.executor.utils import ExecutionContext
logger = logging.getLogger(__name__)
# Text file extensions that can be safely read and stored as text
TEXT_EXTENSIONS = {
".txt",
".md",
".html",
".htm",
".css",
".js",
".ts",
".jsx",
".tsx",
".json",
".xml",
".yaml",
".yml",
".toml",
".ini",
".cfg",
".conf",
".py",
".rb",
".php",
".java",
".c",
".cpp",
".h",
".hpp",
".cs",
".go",
".rs",
".swift",
".kt",
".scala",
".sh",
".bash",
".zsh",
".sql",
".graphql",
".env",
".gitignore",
".dockerfile",
"Dockerfile",
".vue",
".svelte",
".astro",
".mdx",
".rst",
".tex",
".csv",
".log",
}
class SandboxFileOutput(BaseModel):
"""A file extracted from a sandbox and optionally stored in workspace."""
path: str
"""Full path in the sandbox."""
relative_path: str
"""Path relative to the working directory."""
name: str
"""Filename only."""
content: str
"""File content as text (for backward compatibility)."""
workspace_ref: str | None = None
"""Workspace reference (workspace://{id}#mime) if stored, None otherwise."""
@dataclass
class ExtractedFile:
"""Internal representation of an extracted file before storage."""
path: str
relative_path: str
name: str
content: bytes
is_text: bool
async def extract_sandbox_files(
sandbox: "BaseAsyncSandbox",
working_directory: str,
since_timestamp: str | None = None,
text_only: bool = True,
) -> list[ExtractedFile]:
"""
Extract files from an E2B sandbox.
Args:
sandbox: The E2B sandbox instance
working_directory: Directory to search for files
since_timestamp: ISO timestamp - only return files modified after this time
text_only: If True, only extract text files (default). If False, extract all files.
Returns:
List of ExtractedFile objects with path, content, and metadata
"""
files: list[ExtractedFile] = []
try:
# Build find command
safe_working_dir = shlex.quote(working_directory)
timestamp_filter = ""
if since_timestamp:
timestamp_filter = f"-newermt {shlex.quote(since_timestamp)} "
find_result = await sandbox.commands.run(
f"find {safe_working_dir} -type f "
f"{timestamp_filter}"
f"-not -path '*/node_modules/*' "
f"-not -path '*/.git/*' "
f"2>/dev/null"
)
if not find_result.stdout:
return files
for file_path in find_result.stdout.strip().split("\n"):
if not file_path:
continue
# Check if it's a text file
is_text = any(
file_path.endswith(ext) for ext in TEXT_EXTENSIONS
) or file_path.endswith("Dockerfile")
# Skip non-text files if text_only mode
if text_only and not is_text:
continue
try:
# Read file content as bytes
content = await sandbox.files.read(file_path, format="bytes")
if isinstance(content, str):
content = content.encode("utf-8")
# Extract filename from path
file_name = file_path.split("/")[-1]
# Calculate relative path
relative_path = file_path
if file_path.startswith(working_directory):
relative_path = file_path[len(working_directory) :]
if relative_path.startswith("/"):
relative_path = relative_path[1:]
files.append(
ExtractedFile(
path=file_path,
relative_path=relative_path,
name=file_name,
content=content,
is_text=is_text,
)
)
except Exception as e:
logger.debug(f"Failed to read file {file_path}: {e}")
continue
except Exception as e:
logger.warning(f"File extraction failed: {e}")
return files
async def store_sandbox_files(
extracted_files: list[ExtractedFile],
execution_context: "ExecutionContext",
) -> list[SandboxFileOutput]:
"""
Store extracted sandbox files to workspace and return output objects.
Args:
extracted_files: List of files extracted from sandbox
execution_context: Execution context for workspace storage
Returns:
List of SandboxFileOutput objects with workspace refs
"""
outputs: list[SandboxFileOutput] = []
for file in extracted_files:
# Decode content for text files (for backward compat content field)
if file.is_text:
try:
content_str = file.content.decode("utf-8", errors="replace")
except Exception:
content_str = ""
else:
content_str = f"[Binary file: {len(file.content)} bytes]"
# Try to store in workspace
workspace_ref: str | None = None
try:
# Convert bytes to data URI for store_media_file
import base64
import mimetypes
mime_type = mimetypes.guess_type(file.name)[0] or "application/octet-stream"
data_uri = (
f"data:{mime_type};base64,{base64.b64encode(file.content).decode()}"
)
result = await store_media_file(
file=data_uri,
execution_context=execution_context,
return_format="for_block_output",
)
# Result is workspace://... or data:... depending on context
if result.startswith("workspace://"):
workspace_ref = result
except Exception as e:
logger.debug(f"Failed to store file {file.name} to workspace: {e}")
outputs.append(
SandboxFileOutput(
path=file.path,
relative_path=file.relative_path,
name=file.name,
content=content_str,
workspace_ref=workspace_ref,
)
)
return outputs
async def extract_and_store_sandbox_files(
sandbox: "BaseAsyncSandbox",
working_directory: str,
execution_context: "ExecutionContext",
since_timestamp: str | None = None,
text_only: bool = True,
) -> list[SandboxFileOutput]:
"""
Extract files from sandbox and store them in workspace.
This is the main entry point combining extraction and storage.
Args:
sandbox: The E2B sandbox instance
working_directory: Directory to search for files
execution_context: Execution context for workspace storage
since_timestamp: ISO timestamp - only return files modified after this time
text_only: If True, only extract text files
Returns:
List of SandboxFileOutput objects with content and workspace refs
"""
extracted = await extract_sandbox_files(
sandbox=sandbox,
working_directory=working_directory,
since_timestamp=since_timestamp,
text_only=text_only,
)
return await store_sandbox_files(extracted, execution_context)