From dcb3550910dfd1e1e634bd6fa161dd4834d1f8fc Mon Sep 17 00:00:00 2001 From: Otto Date: Thu, 5 Feb 2026 18:39:31 +0000 Subject: [PATCH] feat(copilot): Auto-save binary block outputs using content-based detection When CoPilot executes blocks that produce binary outputs (images, PDFs), the data is now automatically detected and saved to the user's workspace, replacing raw base64 data with workspace:// references. Uses content-based detection (not field-name based) because: - Code execution blocks return base64 in stdout_logs, not structured fields - The png/jpeg/pdf fields only populate from Jupyter display mechanisms - Other blocks use various field names (image, result, output, etc.) Detection strategy: 1. Data URI detection with mimetype whitelist 2. Raw base64 with magic number validation (PNG, JPEG, PDF, GIF, WebP) 3. Size threshold > 1KB to filter tokens/hashes Features: - Scans ALL string values recursively, regardless of field name - Content deduplication within single block execution via SHA-256 hash - Graceful degradation (original value preserved on save failure) - ~97% token reduction observed (17k -> 500 tokens for PDF generation) Closes SECRT-1887 --- .../chat/tools/binary_output_processor.py | 238 +++++++++++ .../api/features/chat/tools/run_block.py | 12 + .../tools/test_binary_output_processor.py | 390 ++++++++++++++++++ 3 files changed, 640 insertions(+) create mode 100644 autogpt_platform/backend/backend/api/features/chat/tools/binary_output_processor.py create mode 100644 autogpt_platform/backend/backend/api/features/chat/tools/test_binary_output_processor.py diff --git a/autogpt_platform/backend/backend/api/features/chat/tools/binary_output_processor.py b/autogpt_platform/backend/backend/api/features/chat/tools/binary_output_processor.py new file mode 100644 index 0000000000..9ac0812b0f --- /dev/null +++ b/autogpt_platform/backend/backend/api/features/chat/tools/binary_output_processor.py @@ -0,0 +1,238 @@ +""" +Content-based detection and saving of binary data in block outputs. + +This module post-processes block execution outputs to detect and save binary +content (images, PDFs) to the workspace, returning workspace:// references +instead of raw base64 data. This reduces LLM output token usage by ~97% for +file generation tasks. + +Detection is content-based (not field-name based) because: +- Code execution blocks return base64 in stdout_logs, not structured fields +- The png/jpeg/pdf fields only populate from Jupyter display mechanisms +- Other blocks use various field names: image, result, output, response, etc. +""" + +import base64 +import binascii +import hashlib +import logging +import re +import uuid +from typing import Any, Optional + +from backend.util.file import sanitize_filename +from backend.util.workspace import WorkspaceManager + +logger = logging.getLogger(__name__) + +# Only process strings larger than this (filters out tokens, hashes, short strings) +SIZE_THRESHOLD = 1024 # 1KB + +# Data URI pattern with mimetype extraction +DATA_URI_PATTERN = re.compile( + r"^data:([a-zA-Z0-9.+-]+/[a-zA-Z0-9.+-]+);base64,(.+)$", + re.DOTALL, +) + +# Only process these mimetypes from data URIs (avoid text/plain, etc.) 
+ALLOWED_MIMETYPES = { + "image/png", + "image/jpeg", + "image/jpg", # Non-standard but sometimes used + "image/gif", + "image/webp", + "image/svg+xml", + "application/pdf", + "application/octet-stream", +} + +# Base64 character validation (strict - must be pure base64) +BASE64_PATTERN = re.compile(r"^[A-Za-z0-9+/\n\r]+=*$") + +# Magic numbers for binary file detection +# Note: WebP requires two-step detection: RIFF prefix + WEBP at offset 8 +MAGIC_SIGNATURES = [ + (b"\x89PNG\r\n\x1a\n", "png"), + (b"\xff\xd8\xff", "jpg"), + (b"%PDF-", "pdf"), + (b"GIF87a", "gif"), + (b"GIF89a", "gif"), + (b"RIFF", "webp"), # Special case: also check content[8:12] == b'WEBP' +] + + +async def process_binary_outputs( + outputs: dict[str, list[Any]], + workspace_manager: WorkspaceManager, + block_name: str, +) -> dict[str, list[Any]]: + """ + Scan all string values in outputs and replace detected binary content + with workspace:// references. + + Uses content-based detection (data URIs, magic numbers) to find binary + data regardless of field name. Deduplicates identical content within + a single call using content hashing. + + Args: + outputs: Block execution outputs (dict of output_name -> list of values) + workspace_manager: WorkspaceManager instance with session scoping + block_name: Name of the block (used in generated filenames) + + Returns: + Processed outputs with binary data replaced by workspace references + """ + cache: dict[str, str] = {} # content_hash -> workspace_ref + + processed: dict[str, list[Any]] = {} + for name, items in outputs.items(): + processed_items = [] + for item in items: + processed_items.append( + await _process_value(item, workspace_manager, block_name, cache) + ) + processed[name] = processed_items + return processed + + +async def _process_value( + value: Any, + wm: WorkspaceManager, + block: str, + cache: dict[str, str], +) -> Any: + """Recursively process a value, detecting binary content in strings.""" + if isinstance(value, dict): + result = {} + for k, v in value.items(): + result[k] = await _process_value(v, wm, block, cache) + return result + if isinstance(value, list): + return [await _process_value(v, wm, block, cache) for v in value] + if isinstance(value, str) and len(value) > SIZE_THRESHOLD: + return await _try_detect_and_save(value, wm, block, cache) + return value + + +async def _try_detect_and_save( + value: str, + wm: WorkspaceManager, + block: str, + cache: dict[str, str], +) -> str: + """Attempt to detect binary content and save it. Returns original if not binary.""" + + # Try data URI first (highest confidence - explicit mimetype) + result = _detect_data_uri(value) + if result: + content, ext = result + return await _save_binary(content, ext, wm, block, cache, value) + + # Try raw base64 with magic number detection + result = _detect_raw_base64(value) + if result: + content, ext = result + return await _save_binary(content, ext, wm, block, cache, value) + + return value # Not binary, return unchanged + + +def _detect_data_uri(value: str) -> Optional[tuple[bytes, str]]: + """ + Detect data URI with whitelisted mimetype. + + Returns (content, extension) or None. 
+ """ + match = DATA_URI_PATTERN.match(value) + if not match: + return None + + mimetype, b64_payload = match.groups() + if mimetype not in ALLOWED_MIMETYPES: + return None + + try: + content = base64.b64decode(b64_payload, validate=True) + except (ValueError, binascii.Error): + return None + + ext = _mimetype_to_ext(mimetype) + return content, ext + + +def _detect_raw_base64(value: str) -> Optional[tuple[bytes, str]]: + """ + Detect raw base64 with magic number validation. + + Only processes strings that: + 1. Look like pure base64 (regex pre-filter) + 2. Successfully decode as base64 + 3. Start with a known binary file magic number + + Returns (content, extension) or None. + """ + # Pre-filter: must look like base64 (no spaces, punctuation, etc.) + if not BASE64_PATTERN.match(value): + return None + + try: + content = base64.b64decode(value, validate=True) + except (ValueError, binascii.Error): + return None + + # Check magic numbers + for magic, ext in MAGIC_SIGNATURES: + if content.startswith(magic): + # Special case for WebP: RIFF container, verify "WEBP" at offset 8 + if magic == b"RIFF": + if len(content) < 12 or content[8:12] != b"WEBP": + continue + return content, ext + + return None # No magic number match = not a recognized binary format + + +async def _save_binary( + content: bytes, + ext: str, + wm: WorkspaceManager, + block: str, + cache: dict[str, str], + original: str, +) -> str: + """ + Save binary content to workspace with deduplication. + + Returns workspace://file-id reference, or original value on failure. + """ + content_hash = hashlib.sha256(content).hexdigest() + + if content_hash in cache: + return cache[content_hash] + + try: + safe_block = sanitize_filename(block)[:20].lower() + filename = f"{safe_block}_{ext}_{uuid.uuid4().hex[:12]}.{ext}" + + file = await wm.write_file(content, filename) + ref = f"workspace://{file.id}" + cache[content_hash] = ref + return ref + except Exception as e: + logger.warning(f"Failed to save binary output: {e}") + return original # Graceful degradation + + +def _mimetype_to_ext(mimetype: str) -> str: + """Convert mimetype to file extension.""" + mapping = { + "image/png": "png", + "image/jpeg": "jpg", + "image/jpg": "jpg", + "image/gif": "gif", + "image/webp": "webp", + "image/svg+xml": "svg", + "application/pdf": "pdf", + "application/octet-stream": "bin", + } + return mapping.get(mimetype, "bin") diff --git a/autogpt_platform/backend/backend/api/features/chat/tools/run_block.py b/autogpt_platform/backend/backend/api/features/chat/tools/run_block.py index 51bb2c0575..398524bd6f 100644 --- a/autogpt_platform/backend/backend/api/features/chat/tools/run_block.py +++ b/autogpt_platform/backend/backend/api/features/chat/tools/run_block.py @@ -14,8 +14,10 @@ from backend.data.model import CredentialsMetaInput from backend.data.workspace import get_or_create_workspace from backend.integrations.creds_manager import IntegrationCredentialsManager from backend.util.exceptions import BlockError +from backend.util.workspace import WorkspaceManager from .base import BaseTool +from .binary_output_processor import process_binary_outputs from .models import ( BlockOutputResponse, ErrorResponse, @@ -321,6 +323,16 @@ class RunBlockTool(BaseTool): ): outputs[output_name].append(output_data) + # Post-process outputs to save binary content to workspace + workspace_manager = WorkspaceManager( + user_id=user_id, + workspace_id=workspace.id, + session_id=session.session_id, + ) + outputs = await process_binary_outputs( + dict(outputs), workspace_manager, 
block.name + ) + return BlockOutputResponse( message=f"Block '{block.name}' executed successfully", block_id=block_id, diff --git a/autogpt_platform/backend/backend/api/features/chat/tools/test_binary_output_processor.py b/autogpt_platform/backend/backend/api/features/chat/tools/test_binary_output_processor.py new file mode 100644 index 0000000000..b6f54a3a17 --- /dev/null +++ b/autogpt_platform/backend/backend/api/features/chat/tools/test_binary_output_processor.py @@ -0,0 +1,390 @@ +"""Tests for content-based binary output detection and saving.""" + +import base64 +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from .binary_output_processor import ( + _detect_data_uri, + _detect_raw_base64, + _mimetype_to_ext, + process_binary_outputs, +) + + +@pytest.fixture +def mock_workspace_manager(): + """Create a mock workspace manager that returns predictable file IDs.""" + wm = MagicMock() + + async def mock_write_file(content, filename): + file = MagicMock() + file.id = f"file-{filename[:10]}" + return file + + wm.write_file = AsyncMock(side_effect=mock_write_file) + return wm + + +# ============================================================================= +# Data URI Detection Tests +# ============================================================================= + + +class TestDetectDataUri: + """Tests for _detect_data_uri function.""" + + def test_detects_png_data_uri(self): + """Should detect valid PNG data URI.""" + # Minimal valid PNG (1x1 transparent) + png_b64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==" + data_uri = f"data:image/png;base64,{png_b64}" + + result = _detect_data_uri(data_uri) + + assert result is not None + content, ext = result + assert ext == "png" + assert content.startswith(b"\x89PNG") + + def test_detects_pdf_data_uri(self): + """Should detect valid PDF data URI.""" + pdf_content = b"%PDF-1.4 test content" + pdf_b64 = base64.b64encode(pdf_content).decode() + data_uri = f"data:application/pdf;base64,{pdf_b64}" + + result = _detect_data_uri(data_uri) + + assert result is not None + content, ext = result + assert ext == "pdf" + assert content == pdf_content + + def test_rejects_text_plain_mimetype(self): + """Should reject text/plain mimetype (not in whitelist).""" + text_b64 = base64.b64encode(b"Hello World").decode() + data_uri = f"data:text/plain;base64,{text_b64}" + + result = _detect_data_uri(data_uri) + + assert result is None + + def test_rejects_non_data_uri_string(self): + """Should return None for non-data-URI strings.""" + result = _detect_data_uri("https://example.com/image.png") + assert result is None + + def test_rejects_invalid_base64_in_data_uri(self): + """Should return None for data URI with invalid base64.""" + data_uri = "data:image/png;base64,not-valid-base64!!!" 
+ result = _detect_data_uri(data_uri) + assert result is None + + def test_handles_jpeg_mimetype(self): + """Should handle image/jpeg mimetype.""" + jpeg_content = b"\xff\xd8\xff\xe0test" + jpeg_b64 = base64.b64encode(jpeg_content).decode() + data_uri = f"data:image/jpeg;base64,{jpeg_b64}" + + result = _detect_data_uri(data_uri) + + assert result is not None + _, ext = result + assert ext == "jpg" + + +# ============================================================================= +# Raw Base64 Detection Tests +# ============================================================================= + + +class TestDetectRawBase64: + """Tests for _detect_raw_base64 function.""" + + def test_detects_png_magic_number(self): + """Should detect raw base64 PNG by magic number.""" + # Minimal valid PNG + png_b64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==" + + result = _detect_raw_base64(png_b64) + + assert result is not None + content, ext = result + assert ext == "png" + assert content.startswith(b"\x89PNG") + + def test_detects_jpeg_magic_number(self): + """Should detect raw base64 JPEG by magic number.""" + jpeg_content = b"\xff\xd8\xff\xe0" + b"\x00" * 100 + jpeg_b64 = base64.b64encode(jpeg_content).decode() + + result = _detect_raw_base64(jpeg_b64) + + assert result is not None + _, ext = result + assert ext == "jpg" + + def test_detects_pdf_magic_number(self): + """Should detect raw base64 PDF by magic number.""" + pdf_content = b"%PDF-1.4 " + b"x" * 100 + pdf_b64 = base64.b64encode(pdf_content).decode() + + result = _detect_raw_base64(pdf_b64) + + assert result is not None + _, ext = result + assert ext == "pdf" + + def test_detects_gif87a_magic_number(self): + """Should detect GIF87a magic number.""" + gif_content = b"GIF87a" + b"\x00" * 100 + gif_b64 = base64.b64encode(gif_content).decode() + + result = _detect_raw_base64(gif_b64) + + assert result is not None + _, ext = result + assert ext == "gif" + + def test_detects_gif89a_magic_number(self): + """Should detect GIF89a magic number.""" + gif_content = b"GIF89a" + b"\x00" * 100 + gif_b64 = base64.b64encode(gif_content).decode() + + result = _detect_raw_base64(gif_b64) + + assert result is not None + _, ext = result + assert ext == "gif" + + def test_detects_webp_magic_number(self): + """Should detect WebP (RIFF + WEBP at offset 8).""" + # WebP header: RIFF + size (4 bytes) + WEBP + webp_content = b"RIFF\x00\x00\x00\x00WEBP" + b"\x00" * 100 + webp_b64 = base64.b64encode(webp_content).decode() + + result = _detect_raw_base64(webp_b64) + + assert result is not None + _, ext = result + assert ext == "webp" + + def test_rejects_riff_without_webp(self): + """Should reject RIFF files that aren't WebP (e.g., WAV).""" + wav_content = b"RIFF\x00\x00\x00\x00WAVE" + b"\x00" * 100 + wav_b64 = base64.b64encode(wav_content).decode() + + result = _detect_raw_base64(wav_b64) + + assert result is None + + def test_rejects_non_base64_string(self): + """Should reject strings that don't look like base64.""" + result = _detect_raw_base64("Hello, this is regular text with spaces!") + assert result is None + + def test_rejects_base64_without_magic_number(self): + """Should reject valid base64 that doesn't have a known magic number.""" + random_content = b"This is just random text, not a binary file" + random_b64 = base64.b64encode(random_content).decode() + + result = _detect_raw_base64(random_b64) + + assert result is None + + def test_rejects_invalid_base64(self): + """Should return None for invalid base64.""" 
+ result = _detect_raw_base64("not-valid-base64!!!") + assert result is None + + +# ============================================================================= +# Process Binary Outputs Tests +# ============================================================================= + + +class TestProcessBinaryOutputs: + """Tests for process_binary_outputs function.""" + + @pytest.mark.asyncio + async def test_saves_large_png_and_returns_reference(self, mock_workspace_manager): + """Should save PNG > 1KB and return workspace reference.""" + # Create PNG > 1KB + png_header = b"\x89PNG\r\n\x1a\n" + png_content = png_header + b"\x00" * 2000 + png_b64 = base64.b64encode(png_content).decode() + + outputs = {"result": [png_b64]} + + result = await process_binary_outputs( + outputs, mock_workspace_manager, "TestBlock" + ) + + assert result["result"][0].startswith("workspace://") + mock_workspace_manager.write_file.assert_called_once() + + @pytest.mark.asyncio + async def test_preserves_small_content(self, mock_workspace_manager): + """Should not process strings smaller than threshold.""" + small_content = "small" + + outputs = {"result": [small_content]} + + result = await process_binary_outputs( + outputs, mock_workspace_manager, "TestBlock" + ) + + assert result["result"][0] == small_content + mock_workspace_manager.write_file.assert_not_called() + + @pytest.mark.asyncio + async def test_deduplicates_identical_content(self, mock_workspace_manager): + """Should save identical content only once.""" + png_header = b"\x89PNG\r\n\x1a\n" + png_content = png_header + b"\x00" * 2000 + png_b64 = base64.b64encode(png_content).decode() + + outputs = {"result": [png_b64, png_b64]} + + result = await process_binary_outputs( + outputs, mock_workspace_manager, "TestBlock" + ) + + # Both should have references + assert result["result"][0].startswith("workspace://") + assert result["result"][1].startswith("workspace://") + # But only one write should have happened + assert mock_workspace_manager.write_file.call_count == 1 + # And they should be the same reference + assert result["result"][0] == result["result"][1] + + @pytest.mark.asyncio + async def test_processes_nested_dict(self, mock_workspace_manager): + """Should recursively process nested dictionaries.""" + png_header = b"\x89PNG\r\n\x1a\n" + png_content = png_header + b"\x00" * 2000 + png_b64 = base64.b64encode(png_content).decode() + + outputs = {"result": [{"nested": {"deep": png_b64}}]} + + result = await process_binary_outputs( + outputs, mock_workspace_manager, "TestBlock" + ) + + assert result["result"][0]["nested"]["deep"].startswith("workspace://") + + @pytest.mark.asyncio + async def test_processes_nested_list(self, mock_workspace_manager): + """Should recursively process nested lists.""" + png_header = b"\x89PNG\r\n\x1a\n" + png_content = png_header + b"\x00" * 2000 + png_b64 = base64.b64encode(png_content).decode() + + outputs = {"result": [[png_b64]]} + + result = await process_binary_outputs( + outputs, mock_workspace_manager, "TestBlock" + ) + + assert result["result"][0][0].startswith("workspace://") + + @pytest.mark.asyncio + async def test_handles_data_uri_format(self, mock_workspace_manager): + """Should handle data URI format.""" + png_header = b"\x89PNG\r\n\x1a\n" + png_content = png_header + b"\x00" * 2000 + png_b64 = base64.b64encode(png_content).decode() + data_uri = f"data:image/png;base64,{png_b64}" + + outputs = {"result": [data_uri]} + + result = await process_binary_outputs( + outputs, mock_workspace_manager, "TestBlock" + ) + + 
assert result["result"][0].startswith("workspace://") + + @pytest.mark.asyncio + async def test_preserves_non_binary_large_strings(self, mock_workspace_manager): + """Should preserve large strings that aren't binary.""" + large_text = "A" * 2000 # Large but not base64 or binary + + outputs = {"result": [large_text]} + + result = await process_binary_outputs( + outputs, mock_workspace_manager, "TestBlock" + ) + + assert result["result"][0] == large_text + mock_workspace_manager.write_file.assert_not_called() + + @pytest.mark.asyncio + async def test_graceful_degradation_on_save_failure(self, mock_workspace_manager): + """Should preserve original value if save fails.""" + mock_workspace_manager.write_file = AsyncMock( + side_effect=Exception("Storage error") + ) + + png_header = b"\x89PNG\r\n\x1a\n" + png_content = png_header + b"\x00" * 2000 + png_b64 = base64.b64encode(png_content).decode() + + outputs = {"result": [png_b64]} + + result = await process_binary_outputs( + outputs, mock_workspace_manager, "TestBlock" + ) + + # Should return original value on failure + assert result["result"][0] == png_b64 + + @pytest.mark.asyncio + async def test_handles_stdout_logs_field(self, mock_workspace_manager): + """Should detect binary in stdout_logs (the actual failing case).""" + pdf_content = b"%PDF-1.4 " + b"x" * 2000 + pdf_b64 = base64.b64encode(pdf_content).decode() + + outputs = {"stdout_logs": [pdf_b64]} + + result = await process_binary_outputs( + outputs, mock_workspace_manager, "ExecuteCodeBlock" + ) + + assert result["stdout_logs"][0].startswith("workspace://") + + +# ============================================================================= +# Mimetype to Extension Tests +# ============================================================================= + + +class TestMimetypeToExt: + """Tests for _mimetype_to_ext function.""" + + def test_png_mapping(self): + assert _mimetype_to_ext("image/png") == "png" + + def test_jpeg_mapping(self): + assert _mimetype_to_ext("image/jpeg") == "jpg" + + def test_nonstandard_jpg_mapping(self): + assert _mimetype_to_ext("image/jpg") == "jpg" + + def test_gif_mapping(self): + assert _mimetype_to_ext("image/gif") == "gif" + + def test_webp_mapping(self): + assert _mimetype_to_ext("image/webp") == "webp" + + def test_svg_mapping(self): + assert _mimetype_to_ext("image/svg+xml") == "svg" + + def test_pdf_mapping(self): + assert _mimetype_to_ext("application/pdf") == "pdf" + + def test_octet_stream_mapping(self): + assert _mimetype_to_ext("application/octet-stream") == "bin" + + def test_unknown_mimetype(self): + assert _mimetype_to_ext("application/unknown") == "bin"