fix: add explicit ValueError guard for stat output parsing

style: fix formatting and sync docs
- Fix Black formatting for is_text/is_binary checks - Update llm.md to reflect binary file support in Claude Code block
2026-02-17 02:03:00 -05:00 · 2026-02-16 14:46:06 +00:00 · 2026-02-16 14:40:53 +00:00 · 2026-02-16 14:18:25 +00:00 · 2026-02-16 14:10:05 +00:00
15 changed files with 115 additions and 28 deletions
--- a/.github/workflows/classic-autogpt-ci.yml
+++ b/.github/workflows/classic-autogpt-ci.yml
@@ -61,7 +61,7 @@ jobs:
      #   See: https://github.com/actions/runner/issues/598#issuecomment-2011890429

      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: true
--- a/.github/workflows/classic-autogpt-docker-cache-clean.yml
+++ b/.github/workflows/classic-autogpt-docker-cache-clean.yml
@@ -16,7 +16,7 @@ jobs:
        build-type: [release, dev]
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
--- a/.github/workflows/classic-autogpt-docker-ci.yml
+++ b/.github/workflows/classic-autogpt-docker-ci.yml
@@ -35,7 +35,7 @@ jobs:
        build-type: [release, dev]
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
@@ -101,7 +101,7 @@ jobs:

    steps:
      - name: Check out repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4
        with:
          submodules: true

--- a/.github/workflows/classic-autogpt-docker-release.yml
+++ b/.github/workflows/classic-autogpt-docker-release.yml
@@ -20,7 +20,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4

      - name: Log in to Docker hub
        uses: docker/login-action@v3
--- a/.github/workflows/classic-autogpts-ci.yml
+++ b/.github/workflows/classic-autogpts-ci.yml
@@ -44,7 +44,7 @@ jobs:
      min-python-version: '3.10'
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: true
--- a/.github/workflows/classic-benchmark-ci.yml
+++ b/.github/workflows/classic-benchmark-ci.yml
@@ -42,7 +42,7 @@ jobs:
        working-directory: classic/benchmark
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: true
@@ -114,7 +114,7 @@ jobs:
    timeout-minutes: 20
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: true
--- a/.github/workflows/classic-benchmark_publish_package.yml
+++ b/.github/workflows/classic-benchmark_publish_package.yml
@@ -10,7 +10,7 @@ jobs:
      contents: write
    steps:
    - name: Checkout repository
-      uses: actions/checkout@v6
+      uses: actions/checkout@v4
      with:
        submodules: true
        fetch-depth: 0
--- a/.github/workflows/classic-forge-ci.yml
+++ b/.github/workflows/classic-forge-ci.yml
@@ -63,7 +63,7 @@ jobs:
      #   See: https://github.com/actions/runner/issues/598#issuecomment-2011890429

      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: true
--- a/.github/workflows/classic-frontend-ci.yml
+++ b/.github/workflows/classic-frontend-ci.yml
@@ -25,7 +25,7 @@ jobs:

    steps:
      - name: Checkout Repo
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4

      - name: Setup Flutter
        uses: subosito/flutter-action@v2
--- a/.github/workflows/classic-python-checks.yml
+++ b/.github/workflows/classic-python-checks.yml
@@ -33,7 +33,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4

      - id: changes-in
        name: Determine affected subprojects
@@ -68,7 +68,7 @@ jobs:

    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0

@@ -120,7 +120,7 @@ jobs:

    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0

--- a/.github/workflows/pr-overlap-check.yml
+++ b/.github/workflows/pr-overlap-check.yml
@@ -16,7 +16,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0  # Need full history for merge testing

--- a/autogpt_platform/backend/backend/blocks/claude_code.py
+++ b/autogpt_platform/backend/backend/blocks/claude_code.py
@@ -187,9 +187,11 @@ class ClaudeCodeBlock(Block):
        )
        files: list[SandboxFileOutput] = SchemaField(
            description=(
-                "List of text files created/modified by Claude Code during this execution. "
+                "List of files created/modified by Claude Code during this execution. "
+                "Includes text files and binary files (images, PDFs, etc.). "
                "Each file has 'path', 'relative_path', 'name', 'content', and 'workspace_ref' fields. "
-                "workspace_ref contains a workspace:// URI if the file was stored to workspace."
+                "workspace_ref contains a workspace:// URI for workspace storage. "
+                "For binary files, content contains a placeholder; use workspace_ref to access the file."
            )
        )
        conversation_history: str = SchemaField(
@@ -453,12 +455,14 @@ class ClaudeCodeBlock(Block):
                    new_conversation_history = turn_entry

            # Extract files created/modified during this run and store to workspace
+            # Include binary files (images, PDFs, etc.) - they'll be stored via
+            # store_media_file which handles virus scanning and workspace storage
            sandbox_files = await extract_and_store_sandbox_files(
                sandbox=sandbox,
                working_directory=working_directory,
                execution_context=execution_context,
                since_timestamp=start_timestamp,
-                text_only=True,
+                text_only=False,  # Extract both text and binary files
            )

            return (
--- a/autogpt_platform/backend/backend/util/sandbox_files.py
+++ b/autogpt_platform/backend/backend/util/sandbox_files.py
@@ -74,8 +74,51 @@ TEXT_EXTENSIONS = {
    ".tex",
    ".csv",
    ".log",
+    ".svg",  # SVG is XML-based text
 }

+# Binary file extensions we explicitly support extracting
+# These are common output formats that users expect to retrieve
+BINARY_EXTENSIONS = {
+    # Images
+    ".png",
+    ".jpg",
+    ".jpeg",
+    ".gif",
+    ".webp",
+    ".ico",
+    ".bmp",
+    ".tiff",
+    ".tif",
+    # Documents
+    ".pdf",
+    # Archives
+    ".zip",
+    ".tar",
+    ".gz",
+    ".7z",
+    # Audio
+    ".mp3",
+    ".wav",
+    ".ogg",
+    ".flac",
+    # Video
+    ".mp4",
+    ".webm",
+    ".mov",
+    ".avi",
+    # Fonts
+    ".woff",
+    ".woff2",
+    ".ttf",
+    ".otf",
+    ".eot",
+}
+
+# Maximum file size for binary extraction (50MB)
+# Prevents OOM from accidentally extracting huge files
+MAX_BINARY_FILE_SIZE = 50 * 1024 * 1024
+

 class SandboxFileOutput(BaseModel):
    """A file extracted from a sandbox and optionally stored in workspace."""
@@ -120,7 +163,8 @@ async def extract_sandbox_files(
        sandbox: The E2B sandbox instance
        working_directory: Directory to search for files
        since_timestamp: ISO timestamp - only return files modified after this time
-        text_only: If True, only extract text files (default). If False, extract all files.
+        text_only: If True, only extract text files. If False, also extract
+                   supported binary files (images, PDFs, etc.).

    Returns:
        List of ExtractedFile objects with path, content, and metadata
@@ -149,14 +193,53 @@ async def extract_sandbox_files(
            if not file_path:
                continue

-            # Check if it's a text file
-            is_text = any(file_path.endswith(ext) for ext in TEXT_EXTENSIONS)
+            # Check file type (case-insensitive for extensions)
+            file_path_lower = file_path.lower()
+            is_text = any(
+                file_path_lower.endswith(ext.lower()) for ext in TEXT_EXTENSIONS
+            )
+            is_binary = any(
+                file_path_lower.endswith(ext.lower()) for ext in BINARY_EXTENSIONS
+            )

-            # Skip non-text files if text_only mode
-            if text_only and not is_text:
-                continue
+            # Determine if we should extract this file
+            if text_only:
+                # Only extract text files
+                if not is_text:
+                    continue
+            else:
+                # Extract text files and supported binary files
+                if not is_text and not is_binary:
+                    continue

            try:
+                # For binary files, check size before reading to prevent OOM
+                if is_binary:
+                    stat_result = await sandbox.commands.run(
+                        f"stat -c %s {shlex.quote(file_path)} 2>/dev/null"
+                    )
+                    if stat_result.exit_code != 0 or not stat_result.stdout:
+                        logger.debug(
+                            f"Skipping {file_path}: could not determine file size"
+                        )
+                        continue
+
+                    try:
+                        file_size = int(stat_result.stdout.strip())
+                    except ValueError:
+                        logger.debug(
+                            f"Skipping {file_path}: unexpected stat output "
+                            f"{stat_result.stdout.strip()!r}"
+                        )
+                        continue
+
+                    if file_size > MAX_BINARY_FILE_SIZE:
+                        logger.info(
+                            f"Skipping {file_path}: size {file_size} bytes "
+                            f"exceeds limit {MAX_BINARY_FILE_SIZE}"
+                        )
+                        continue
+
                # Read file content as bytes
                content = await sandbox.files.read(file_path, format="bytes")
                if isinstance(content, str):
--- a/docs/integrations/block-integrations/claude_code.md
+++ b/docs/integrations/block-integrations/claude_code.md
@@ -16,7 +16,7 @@ When activated, the block:
   - Install dependencies (npm, pip, etc.)
   - Run terminal commands
   - Build and test applications
-5. Extracts all text files created/modified during execution
+5. Extracts all files created/modified during execution (text files and binary files like images, PDFs, etc.)
 6. Returns the response and files, optionally keeping the sandbox alive for follow-up tasks

 The block supports conversation continuation through three mechanisms:
@@ -42,7 +42,7 @@ The block supports conversation continuation through three mechanisms:
 | Output | Description |
 |--------|-------------|
 | Response | The output/response from Claude Code execution |
-| Files | List of text files created/modified during execution. Each file includes path, relative_path, name, and content fields |
+| Files | List of files (text and binary) created/modified during execution. Includes images, PDFs, and other supported formats. Each file has path, relative_path, name, content, and workspace_ref fields. Binary files are stored in workspace and accessible via workspace_ref |
 | Conversation History | Full conversation history including this turn. Use to restore context on a fresh sandbox |
 | Session ID | Session ID for this conversation. Pass back with sandbox_id to continue the conversation |
 | Sandbox ID | ID of the sandbox instance (null if disposed). Pass back with session_id to continue the conversation |
--- a/docs/integrations/block-integrations/llm.md
+++ b/docs/integrations/block-integrations/llm.md
@@ -535,7 +535,7 @@ When activated, the block:
 2. Installs the latest version of Claude Code in the sandbox
 3. Optionally runs setup commands to prepare the environment
 4. Executes your prompt using Claude Code, which can create/edit files, install dependencies, run terminal commands, and build applications
-5. Extracts all text files created/modified during execution
+5. Extracts all files created/modified during execution (text files and binary files like images, PDFs, etc.)
 6. Returns the response and files, optionally keeping the sandbox alive for follow-up tasks

 The block supports conversation continuation through three mechanisms:
@@ -563,7 +563,7 @@ The block supports conversation continuation through three mechanisms:
 |--------|-------------|------|
 | error | Error message if execution failed | str |
 | response | The output/response from Claude Code execution | str |
-| files | List of text files created/modified by Claude Code during this execution. Each file has 'path', 'relative_path', 'name', 'content', and 'workspace_ref' fields. workspace_ref contains a workspace:// URI if the file was stored to workspace. | List[SandboxFileOutput] |
+| files | List of files created/modified by Claude Code during this execution. Includes text files and binary files (images, PDFs, etc.). Each file has 'path', 'relative_path', 'name', 'content', and 'workspace_ref' fields. workspace_ref contains a workspace:// URI for workspace storage. For binary files, content contains a placeholder; use workspace_ref to access the file. | List[SandboxFileOutput] |
 | conversation_history | Full conversation history including this turn. Pass this to conversation_history input to continue on a fresh sandbox if the previous sandbox timed out. | str |
 | session_id | Session ID for this conversation. Pass this back along with sandbox_id to continue the conversation. | str |
 | sandbox_id | ID of the sandbox instance. Pass this back along with session_id to continue the conversation. This is None if dispose_sandbox was True (sandbox was disposed). | str |
Author	SHA1	Message	Date
Bentlybro	719c4ee1d1	fix: add explicit ValueError guard for stat output parsing	2026-02-16 14:46:06 +00:00
Bentlybro	411c399e03	style: fix formatting and sync docs - Fix Black formatting for is_text/is_binary checks - Update llm.md to reflect binary file support in Claude Code block	2026-02-16 14:40:53 +00:00
Bentlybro	6ac011e36c	fix: normalize extension case in sandbox file extraction Fixes bug where 'Dockerfile' in TEXT_EXTENSIONS wouldn't match after lowercasing file_path because the extension itself wasn't lowercased.	2026-02-16 14:18:25 +00:00
Bentlybro	5e554526e2	fix(backend): Extract binary files from ClaudeCodeBlock sandbox Enables binary file extraction (images, PDFs, etc.) for the Claude Code block by setting text_only=False in extract_and_store_sandbox_files. Changes: - sandbox_files.py: Add BINARY_EXTENSIONS set with supported formats - sandbox_files.py: Add MAX_BINARY_FILE_SIZE (50MB) limit to prevent OOM - sandbox_files.py: Add size check before reading binary files - sandbox_files.py: Add .svg to TEXT_EXTENSIONS (XML-based) - sandbox_files.py: Make extension matching case-insensitive - claude_code.py: Enable binary file extraction (text_only=False) - claude_code.py: Update output description to mention binary support - claude_code.md: Update docs to reflect binary file support Binary files are stored via store_media_file which handles: - Virus scanning via scan_content_safe() - Workspace storage (returns workspace:// URI in CoPilot) - Data URI fallback for graph execution Closes SECRT-1897	2026-02-16 14:10:05 +00:00