From b98fbc40ee542464027e929c545ac7344b3cfc9f Mon Sep 17 00:00:00 2001
From: Nick Tindle <nick@ntindle.com>
Date: Wed, 11 Feb 2026 10:18:07 -0600
Subject: [PATCH] feat(blocks): Store sandbox files to workspace

- Add shared sandbox_files.py utility for file extraction and workspace storage
- Update Claude Code block to use shared utility and add workspace_ref field
- Update Code Executor (ExecuteCodeBlock only) to extract text and binary files from /output
- Files are stored via store_media_file() with virus scanning and size limits
- Backward compatible: content field preserved, workspace_ref added as optional

Closes SECRT-1931
---
 .../backend/backend/blocks/claude_code.py     | 169 ++---------
 .../backend/backend/blocks/code_executor.py   |  70 ++++-
 .../backend/backend/util/sandbox_files.py     | 284 ++++++++++++++++++
 3 files changed, 377 insertions(+), 146 deletions(-)
 create mode 100644 autogpt_platform/backend/backend/util/sandbox_files.py

diff --git a/autogpt_platform/backend/backend/blocks/claude_code.py b/autogpt_platform/backend/backend/blocks/claude_code.py
index 4ef44603b2..311edf55b3 100644
--- a/autogpt_platform/backend/backend/blocks/claude_code.py
+++ b/autogpt_platform/backend/backend/blocks/claude_code.py
@@ -1,7 +1,7 @@
 import json
 import shlex
 import uuid
-from typing import Literal, Optional
+from typing import TYPE_CHECKING, Literal, Optional
 
 from e2b import AsyncSandbox as BaseAsyncSandbox
 from pydantic import BaseModel, SecretStr
@@ -20,6 +20,10 @@ from backend.data.model import (
     SchemaField,
 )
 from backend.integrations.providers import ProviderName
+from backend.util.sandbox_files import extract_and_store_sandbox_files
+
+if TYPE_CHECKING:
+    from backend.executor.utils import ExecutionContext
 
 
 class ClaudeCodeExecutionError(Exception):
@@ -181,6 +185,7 @@ class ClaudeCodeBlock(Block):
         relative_path: str  # Path relative to working directory (for GitHub, etc.)
         name: str
         content: str
+        workspace_ref: Optional[str] = None  # workspace://{id}#mime if stored
 
     class Output(BlockSchemaOutput):
         response: str = SchemaField(
@@ -189,7 +194,8 @@ class ClaudeCodeBlock(Block):
         files: list["ClaudeCodeBlock.FileOutput"] = SchemaField(
             description=(
                 "List of text files created/modified by Claude Code during this execution. "
-                "Each file has 'path', 'relative_path', 'name', and 'content' fields."
+                "Each file has 'path', 'relative_path', 'name', 'content', and 'workspace_ref' fields. "
+                "workspace_ref contains a workspace:// URI if the file was stored to workspace."
             )
         )
         conversation_history: str = SchemaField(
@@ -294,6 +300,7 @@ class ClaudeCodeBlock(Block):
         existing_sandbox_id: str,
         conversation_history: str,
         dispose_sandbox: bool,
+        execution_context: "ExecutionContext",
     ) -> tuple[str, list["ClaudeCodeBlock.FileOutput"], str, str, str]:
         """
         Execute Claude Code in an E2B sandbox.
@@ -449,11 +456,27 @@ class ClaudeCodeBlock(Block):
                 else:
                     new_conversation_history = turn_entry
 
-            # Extract files created/modified during this run
-            files = await self._extract_files(
-                sandbox, working_directory, start_timestamp
+            # Extract files created/modified during this run and store to workspace
+            sandbox_files = await extract_and_store_sandbox_files(
+                sandbox=sandbox,
+                working_directory=working_directory,
+                execution_context=execution_context,
+                since_timestamp=start_timestamp,
+                text_only=True,
             )
 
+            # Convert to FileOutput format
+            files = [
+                ClaudeCodeBlock.FileOutput(
+                    path=f.path,
+                    relative_path=f.relative_path,
+                    name=f.name,
+                    content=f.content,
+                    workspace_ref=f.workspace_ref,
+                )
+                for f in sandbox_files
+            ]
+
             return (
                 response,
                 files,
@@ -471,140 +494,6 @@ class ClaudeCodeBlock(Block):
             if dispose_sandbox and sandbox:
                 await sandbox.kill()
 
-    async def _extract_files(
-        self,
-        sandbox: BaseAsyncSandbox,
-        working_directory: str,
-        since_timestamp: str | None = None,
-    ) -> list["ClaudeCodeBlock.FileOutput"]:
-        """
-        Extract text files created/modified during this Claude Code execution.
-
-        Args:
-            sandbox: The E2B sandbox instance
-            working_directory: Directory to search for files
-            since_timestamp: ISO timestamp - only return files modified after this time
-
-        Returns:
-            List of FileOutput objects with path, relative_path, name, and content
-        """
-        files: list[ClaudeCodeBlock.FileOutput] = []
-
-        # Text file extensions we can safely read as text
-        text_extensions = {
-            ".txt",
-            ".md",
-            ".html",
-            ".htm",
-            ".css",
-            ".js",
-            ".ts",
-            ".jsx",
-            ".tsx",
-            ".json",
-            ".xml",
-            ".yaml",
-            ".yml",
-            ".toml",
-            ".ini",
-            ".cfg",
-            ".conf",
-            ".py",
-            ".rb",
-            ".php",
-            ".java",
-            ".c",
-            ".cpp",
-            ".h",
-            ".hpp",
-            ".cs",
-            ".go",
-            ".rs",
-            ".swift",
-            ".kt",
-            ".scala",
-            ".sh",
-            ".bash",
-            ".zsh",
-            ".sql",
-            ".graphql",
-            ".env",
-            ".gitignore",
-            ".dockerfile",
-            "Dockerfile",
-            ".vue",
-            ".svelte",
-            ".astro",
-            ".mdx",
-            ".rst",
-            ".tex",
-            ".csv",
-            ".log",
-        }
-
-        try:
-            # List files recursively using find command
-            # Exclude node_modules and .git directories, but allow hidden files
-            # like .env and .gitignore (they're filtered by text_extensions later)
-            # Filter by timestamp to only get files created/modified during this run
-            safe_working_dir = shlex.quote(working_directory)
-            timestamp_filter = ""
-            if since_timestamp:
-                timestamp_filter = f"-newermt {shlex.quote(since_timestamp)} "
-            find_result = await sandbox.commands.run(
-                f"find {safe_working_dir} -type f "
-                f"{timestamp_filter}"
-                f"-not -path '*/node_modules/*' "
-                f"-not -path '*/.git/*' "
-                f"2>/dev/null"
-            )
-
-            if find_result.stdout:
-                for file_path in find_result.stdout.strip().split("\n"):
-                    if not file_path:
-                        continue
-
-                    # Check if it's a text file we can read
-                    is_text = any(
-                        file_path.endswith(ext) for ext in text_extensions
-                    ) or file_path.endswith("Dockerfile")
-
-                    if is_text:
-                        try:
-                            content = await sandbox.files.read(file_path)
-                            # Handle bytes or string
-                            if isinstance(content, bytes):
-                                content = content.decode("utf-8", errors="replace")
-
-                            # Extract filename from path
-                            file_name = file_path.split("/")[-1]
-
-                            # Calculate relative path by stripping working directory
-                            relative_path = file_path
-                            if file_path.startswith(working_directory):
-                                relative_path = file_path[len(working_directory) :]
-                                # Remove leading slash if present
-                                if relative_path.startswith("/"):
-                                    relative_path = relative_path[1:]
-
-                            files.append(
-                                ClaudeCodeBlock.FileOutput(
-                                    path=file_path,
-                                    relative_path=relative_path,
-                                    name=file_name,
-                                    content=content,
-                                )
-                            )
-                        except Exception:
-                            # Skip files that can't be read
-                            pass
-
-        except Exception:
-            # If file extraction fails, return empty results
-            pass
-
-        return files
-
     def _escape_prompt(self, prompt: str) -> str:
         """Escape the prompt for safe shell execution."""
         # Use single quotes and escape any single quotes in the prompt
@@ -617,6 +506,7 @@ class ClaudeCodeBlock(Block):
         *,
         e2b_credentials: APIKeyCredentials,
         anthropic_credentials: APIKeyCredentials,
+        execution_context: "ExecutionContext",
         **kwargs,
     ) -> BlockOutput:
         try:
@@ -637,6 +527,7 @@ class ClaudeCodeBlock(Block):
                 existing_sandbox_id=input_data.sandbox_id,
                 conversation_history=input_data.conversation_history,
                 dispose_sandbox=input_data.dispose_sandbox,
+                execution_context=execution_context,
             )
 
             yield "response", response
diff --git a/autogpt_platform/backend/backend/blocks/code_executor.py b/autogpt_platform/backend/backend/blocks/code_executor.py
index be6f2bba55..7ab5bf375c 100644
--- a/autogpt_platform/backend/backend/blocks/code_executor.py
+++ b/autogpt_platform/backend/backend/blocks/code_executor.py
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Any, Literal, Optional
+from typing import TYPE_CHECKING, Any, Literal, Optional
 
 from e2b_code_interpreter import AsyncSandbox
 from e2b_code_interpreter import Result as E2BExecutionResult
@@ -20,6 +20,13 @@ from backend.data.model import (
     SchemaField,
 )
 from backend.integrations.providers import ProviderName
+from backend.util.sandbox_files import (
+    SandboxFileOutput,
+    extract_and_store_sandbox_files,
+)
+
+if TYPE_CHECKING:
+    from backend.executor.utils import ExecutionContext
 
 TEST_CREDENTIALS = APIKeyCredentials(
     id="01234567-89ab-cdef-0123-456789abcdef",
@@ -85,6 +92,9 @@ class CodeExecutionResult(MainCodeExecutionResult):
 class BaseE2BExecutorMixin:
     """Shared implementation methods for E2B executor blocks."""
 
+    # Default output directory for file extraction
+    OUTPUT_DIR = "/output"
+
     async def execute_code(
         self,
         api_key: str,
@@ -95,14 +105,21 @@ class BaseE2BExecutorMixin:
         timeout: Optional[int] = None,
         sandbox_id: Optional[str] = None,
         dispose_sandbox: bool = False,
+        execution_context: Optional["ExecutionContext"] = None,
+        extract_files: bool = False,
     ):
         """
         Unified code execution method that handles all three use cases:
         1. Create new sandbox and execute (ExecuteCodeBlock)
         2. Create new sandbox, execute, and return sandbox_id (InstantiateCodeSandboxBlock)
         3. Connect to existing sandbox and execute (ExecuteCodeStepBlock)
+
+        Args:
+            extract_files: If True and execution_context provided, extract files from
+                           /output directory and store to workspace.
         """  # noqa
         sandbox = None
+        files: list[SandboxFileOutput] = []
         try:
             if sandbox_id:
                 # Connect to existing sandbox (ExecuteCodeStepBlock case)
@@ -114,6 +131,9 @@ class BaseE2BExecutorMixin:
                 sandbox = await AsyncSandbox.create(
                     api_key=api_key, template=template_id, timeout=timeout
                 )
+                # Create /output directory for file extraction
+                if extract_files:
+                    await sandbox.commands.run(f"mkdir -p {self.OUTPUT_DIR}")
                 if setup_commands:
                     for cmd in setup_commands:
                         await sandbox.commands.run(cmd)
@@ -133,7 +153,24 @@ class BaseE2BExecutorMixin:
             stdout_logs = "".join(execution.logs.stdout)
             stderr_logs = "".join(execution.logs.stderr)
 
-            return results, text_output, stdout_logs, stderr_logs, sandbox.sandbox_id
+            # Extract files from /output if requested
+            if extract_files and execution_context:
+                files = await extract_and_store_sandbox_files(
+                    sandbox=sandbox,
+                    working_directory=self.OUTPUT_DIR,
+                    execution_context=execution_context,
+                    since_timestamp=None,  # Get all files in /output
+                    text_only=False,  # Include binary files too
+                )
+
+            return (
+                results,
+                text_output,
+                stdout_logs,
+                stderr_logs,
+                sandbox.sandbox_id,
+                files,
+            )
         finally:
             # Dispose of sandbox if requested to reduce usage costs
             if dispose_sandbox and sandbox:
@@ -238,6 +275,13 @@ class ExecuteCodeBlock(Block, BaseE2BExecutorMixin):
             description="Standard output logs from execution"
         )
         stderr_logs: str = SchemaField(description="Standard error logs from execution")
+        files: list[SandboxFileOutput] = SchemaField(
+            description=(
+                "Files written to /output directory during execution. "
+                "Each file has path, name, content, and workspace_ref (if stored)."
+            ),
+            default=[],
+        )
 
     def __init__(self):
         super().__init__(
@@ -261,21 +305,27 @@ class ExecuteCodeBlock(Block, BaseE2BExecutorMixin):
                 ("stdout_logs", "Hello World\n"),
             ],
             test_mock={
-                "execute_code": lambda api_key, code, language, template_id, setup_commands, timeout, dispose_sandbox: (  # noqa
+                "execute_code": lambda api_key, code, language, template_id, setup_commands, timeout, dispose_sandbox, execution_context, extract_files: (  # noqa
                     [],  # results
                     "Hello World",  # text_output
                     "Hello World\n",  # stdout_logs
                     "",  # stderr_logs
                     "sandbox_id",  # sandbox_id
+                    [],  # files
                 ),
             },
         )
 
     async def run(
-        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
+        self,
+        input_data: Input,
+        *,
+        credentials: APIKeyCredentials,
+        execution_context: "ExecutionContext",
+        **kwargs,
     ) -> BlockOutput:
         try:
-            results, text_output, stdout, stderr, _ = await self.execute_code(
+            results, text_output, stdout, stderr, _, files = await self.execute_code(
                 api_key=credentials.api_key.get_secret_value(),
                 code=input_data.code,
                 language=input_data.language,
@@ -283,6 +333,8 @@ class ExecuteCodeBlock(Block, BaseE2BExecutorMixin):
                 setup_commands=input_data.setup_commands,
                 timeout=input_data.timeout,
                 dispose_sandbox=input_data.dispose_sandbox,
+                execution_context=execution_context,
+                extract_files=True,
             )
 
             # Determine result object shape & filter out empty formats
@@ -296,6 +348,8 @@ class ExecuteCodeBlock(Block, BaseE2BExecutorMixin):
                 yield "stdout_logs", stdout
             if stderr:
                 yield "stderr_logs", stderr
+            # Always yield files (empty list if none)
+            yield "files", [f.model_dump() for f in files]
         except Exception as e:
             yield "error", str(e)
 
@@ -393,6 +447,7 @@ class InstantiateCodeSandboxBlock(Block, BaseE2BExecutorMixin):
                     "Hello World\n",  # stdout_logs
                     "",  # stderr_logs
                     "sandbox_id",  # sandbox_id
+                    [],  # files
                 ),
             },
         )
@@ -401,7 +456,7 @@ class InstantiateCodeSandboxBlock(Block, BaseE2BExecutorMixin):
         self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
     ) -> BlockOutput:
         try:
-            _, text_output, stdout, stderr, sandbox_id = await self.execute_code(
+            _, text_output, stdout, stderr, sandbox_id, _ = await self.execute_code(
                 api_key=credentials.api_key.get_secret_value(),
                 code=input_data.setup_code,
                 language=input_data.language,
@@ -500,6 +555,7 @@ class ExecuteCodeStepBlock(Block, BaseE2BExecutorMixin):
                     "Hello World\n",  # stdout_logs
                     "",  # stderr_logs
                     sandbox_id,  # sandbox_id
+                    [],  # files
                 ),
             },
         )
@@ -508,7 +564,7 @@ class ExecuteCodeStepBlock(Block, BaseE2BExecutorMixin):
         self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
     ) -> BlockOutput:
         try:
-            results, text_output, stdout, stderr, _ = await self.execute_code(
+            results, text_output, stdout, stderr, _, _ = await self.execute_code(
                 api_key=credentials.api_key.get_secret_value(),
                 code=input_data.step_code,
                 language=input_data.language,
diff --git a/autogpt_platform/backend/backend/util/sandbox_files.py b/autogpt_platform/backend/backend/util/sandbox_files.py
new file mode 100644
index 0000000000..2c45805519
--- /dev/null
+++ b/autogpt_platform/backend/backend/util/sandbox_files.py
@@ -0,0 +1,284 @@
+"""
+Shared utilities for extracting and storing files from E2B sandboxes.
+
+This module provides common file extraction and workspace storage functionality
+for blocks that run code in E2B sandboxes (Claude Code, Code Executor, etc.).
+"""
+
+import logging
+import shlex
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from pydantic import BaseModel
+
+from backend.util.file import store_media_file
+
+if TYPE_CHECKING:
+    from e2b import AsyncSandbox as BaseAsyncSandbox
+
+    from backend.executor.utils import ExecutionContext
+
+logger = logging.getLogger(__name__)
+
+# Filename suffixes treated as text: file extensions plus the bare name "Dockerfile"
+TEXT_EXTENSIONS = {
+    ".txt",
+    ".md",
+    ".html",
+    ".htm",
+    ".css",
+    ".js",
+    ".ts",
+    ".jsx",
+    ".tsx",
+    ".json",
+    ".xml",
+    ".yaml",
+    ".yml",
+    ".toml",
+    ".ini",
+    ".cfg",
+    ".conf",
+    ".py",
+    ".rb",
+    ".php",
+    ".java",
+    ".c",
+    ".cpp",
+    ".h",
+    ".hpp",
+    ".cs",
+    ".go",
+    ".rs",
+    ".swift",
+    ".kt",
+    ".scala",
+    ".sh",
+    ".bash",
+    ".zsh",
+    ".sql",
+    ".graphql",
+    ".env",
+    ".gitignore",
+    ".dockerfile",
+    "Dockerfile",
+    ".vue",
+    ".svelte",
+    ".astro",
+    ".mdx",
+    ".rst",
+    ".tex",
+    ".csv",
+    ".log",
+}
+
+
+class SandboxFileOutput(BaseModel):
+    """A file extracted from a sandbox and optionally stored in workspace."""
+
+    path: str
+    """Full path in the sandbox."""
+
+    relative_path: str
+    """Path relative to the working directory."""
+
+    name: str
+    """Filename only."""
+
+    content: str
+    """File content as text (for backward compatibility)."""
+
+    workspace_ref: str | None = None
+    """Workspace reference (workspace://{id}#mime) if stored, None otherwise."""
+
+
+@dataclass
+class ExtractedFile:
+    """Internal representation of an extracted file before storage."""
+
+    path: str
+    relative_path: str
+    name: str
+    content: bytes
+    is_text: bool
+
+
+async def extract_sandbox_files(
+    sandbox: "BaseAsyncSandbox",
+    working_directory: str,
+    since_timestamp: str | None = None,
+    text_only: bool = True,
+) -> list[ExtractedFile]:
+    """
+    Extract files from an E2B sandbox.
+
+    Args:
+        sandbox: The E2B sandbox instance
+        working_directory: Directory to search for files
+        since_timestamp: ISO timestamp - only return files modified after this time
+        text_only: If True, only extract text files (default). If False, extract all files.
+
+    Returns:
+        List of ExtractedFile objects with path, content, and metadata
+    """
+    files: list[ExtractedFile] = []
+
+    try:
+        # Build find command
+        safe_working_dir = shlex.quote(working_directory)
+        timestamp_filter = ""
+        if since_timestamp:
+            timestamp_filter = f"-newermt {shlex.quote(since_timestamp)} "
+
+        find_result = await sandbox.commands.run(
+            f"find {safe_working_dir} -type f "
+            f"{timestamp_filter}"
+            f"-not -path '*/node_modules/*' "
+            f"-not -path '*/.git/*' "
+            f"2>/dev/null"
+        )
+
+        if not find_result.stdout:
+            return files
+
+        for file_path in find_result.stdout.strip().split("\n"):
+            if not file_path:
+                continue
+
+            # Classify as text by filename suffix only; file content is not inspected
+            is_text = any(
+                file_path.endswith(ext) for ext in TEXT_EXTENSIONS
+            ) or file_path.endswith("Dockerfile")
+
+            # Skip non-text files if text_only mode
+            if text_only and not is_text:
+                continue
+
+            try:
+                # Read file content as bytes
+                content = await sandbox.files.read(file_path, format="bytes")
+                if isinstance(content, str):
+                    content = content.encode("utf-8")
+
+                # Extract filename from path
+                file_name = file_path.split("/")[-1]
+
+                # Calculate relative path
+                relative_path = file_path
+                if file_path.startswith(working_directory):
+                    relative_path = file_path[len(working_directory) :]
+                    if relative_path.startswith("/"):
+                        relative_path = relative_path[1:]
+
+                files.append(
+                    ExtractedFile(
+                        path=file_path,
+                        relative_path=relative_path,
+                        name=file_name,
+                        content=content,
+                        is_text=is_text,
+                    )
+                )
+            except Exception as e:
+                logger.debug(f"Failed to read file {file_path}: {e}")
+                continue
+
+    except Exception as e:
+        logger.warning(f"File extraction failed: {e}")
+
+    return files
+
+
+async def store_sandbox_files(
+    extracted_files: list[ExtractedFile],
+    execution_context: "ExecutionContext",
+) -> list[SandboxFileOutput]:
+    """
+    Store extracted sandbox files to workspace and return output objects.
+
+    Args:
+        extracted_files: List of files extracted from sandbox
+        execution_context: Execution context for workspace storage
+
+    Returns:
+        List of SandboxFileOutput objects; workspace_ref is None if not stored
+    """
+    outputs: list[SandboxFileOutput] = []
+
+    for file in extracted_files:
+        # Decode content for text files (for backward compat content field)
+        if file.is_text:
+            try:
+                content_str = file.content.decode("utf-8", errors="replace")
+            except Exception:
+                content_str = ""
+        else:
+            content_str = f"[Binary file: {len(file.content)} bytes]"
+
+        # Try to store in workspace
+        workspace_ref: str | None = None
+        try:
+            # Convert bytes to data URI for store_media_file
+            import base64
+            import mimetypes
+
+            mime_type = mimetypes.guess_type(file.name)[0] or "application/octet-stream"
+            data_uri = (
+                f"data:{mime_type};base64,{base64.b64encode(file.content).decode()}"
+            )
+
+            result = await store_media_file(
+                file=data_uri,
+                execution_context=execution_context,
+                return_format="for_block_output",
+            )
+            # Result is workspace://... or data:... depending on context
+            if result.startswith("workspace://"):
+                workspace_ref = result
+        except Exception as e:
+            logger.debug(f"Failed to store file {file.name} to workspace: {e}")
+
+        outputs.append(
+            SandboxFileOutput(
+                path=file.path,
+                relative_path=file.relative_path,
+                name=file.name,
+                content=content_str,
+                workspace_ref=workspace_ref,
+            )
+        )
+
+    return outputs
+
+
+async def extract_and_store_sandbox_files(
+    sandbox: "BaseAsyncSandbox",
+    working_directory: str,
+    execution_context: "ExecutionContext",
+    since_timestamp: str | None = None,
+    text_only: bool = True,
+) -> list[SandboxFileOutput]:
+    """
+    Extract files from sandbox and store them in workspace.
+
+    This is the main entry point combining extraction and storage.
+
+    Args:
+        sandbox: The E2B sandbox instance
+        working_directory: Directory to search for files
+        execution_context: Execution context for workspace storage
+        since_timestamp: ISO timestamp - only return files modified after this time
+        text_only: If True, only extract text files
+
+    Returns:
+        List of SandboxFileOutput objects with content and optional workspace refs
+    """
+    extracted = await extract_sandbox_files(
+        sandbox=sandbox,
+        working_directory=working_directory,
+        since_timestamp=since_timestamp,
+        text_only=text_only,
+    )
+
+    return await store_sandbox_files(extracted, execution_context)