feat(copilot): Auto-save binary block outputs using content-based detection

When CoPilot executes blocks that produce binary outputs (images, PDFs),
the data is now automatically detected and saved to the user's workspace,
replacing raw base64 data with workspace:// references.

Uses content-based detection (not field-name based) because:
- Code execution blocks return base64 in stdout_logs, not structured fields
- The png/jpeg/pdf fields only populate from Jupyter display mechanisms
- Other blocks use various field names (image, result, output, etc.)

Detection strategy:
1. Data URI detection with mimetype whitelist
2. Raw base64 with magic number validation (PNG, JPEG, PDF, GIF, WebP)
3. Size threshold > 1KB to filter tokens/hashes

Features:
- Scans ALL string values recursively, regardless of field name
- Content deduplication within single block execution via SHA-256 hash
- Graceful degradation (original value preserved on save failure)
- ~97% token reduction observed (17k -> 500 tokens for PDF generation)

Closes SECRT-1887
This commit is contained in:
Otto
2026-02-05 18:39:31 +00:00
parent 11256076d8
commit dcb3550910
3 changed files with 640 additions and 0 deletions

View File

@@ -0,0 +1,238 @@
"""
Content-based detection and saving of binary data in block outputs.
This module post-processes block execution outputs to detect and save binary
content (images, PDFs) to the workspace, returning workspace:// references
instead of raw base64 data. This reduces LLM output token usage by ~97% for
file generation tasks.
Detection is content-based (not field-name based) because:
- Code execution blocks return base64 in stdout_logs, not structured fields
- The png/jpeg/pdf fields only populate from Jupyter display mechanisms
- Other blocks use various field names: image, result, output, response, etc.
"""
import base64
import binascii
import hashlib
import logging
import re
import uuid
from typing import Any, Optional
from backend.util.file import sanitize_filename
from backend.util.workspace import WorkspaceManager
logger = logging.getLogger(__name__)
# Strings at or below this length are never inspected: short values are
# overwhelmingly tokens, hashes, or ordinary text rather than encoded files.
SIZE_THRESHOLD = 1 << 10  # 1KB

# "data:<type>/<subtype>;base64,<payload>" - group 1 is the mimetype, group 2
# the payload. DOTALL lets the payload group run across line breaks.
DATA_URI_PATTERN = re.compile(
    r"^data:([a-zA-Z0-9.+-]+/[a-zA-Z0-9.+-]+);base64,(.+)$",
    flags=re.DOTALL,
)

# Mimetypes accepted from data URIs; anything else (text/plain, ...) is left
# untouched in the output.
ALLOWED_MIMETYPES = {
    "application/pdf",
    "application/octet-stream",
    "image/png",
    "image/jpeg",
    "image/jpg",  # Non-standard spelling that still shows up in practice
    "image/gif",
    "image/webp",
    "image/svg+xml",
}

# Cheap pre-filter for raw base64: base64 alphabet characters (CR/LF are also
# admitted by the character class) followed by optional "=" padding.
BASE64_PATTERN = re.compile(r"^[A-Za-z0-9+/\n\r]+=*$")

# (leading bytes, file extension) pairs, checked in order against decoded
# content. RIFF is a generic container prefix, so a RIFF hit only counts as
# WebP when bytes 8..12 of the content are b"WEBP".
MAGIC_SIGNATURES = [
    (b"\x89PNG\r\n\x1a\n", "png"),
    (b"\xff\xd8\xff", "jpg"),
    (b"%PDF-", "pdf"),
    (b"GIF87a", "gif"),
    (b"GIF89a", "gif"),
    (b"RIFF", "webp"),  # Needs the extra content[8:12] == b"WEBP" check
]
async def process_binary_outputs(
    outputs: dict[str, list[Any]],
    workspace_manager: WorkspaceManager,
    block_name: str,
) -> dict[str, list[Any]]:
    """
    Replace binary payloads found anywhere in block outputs with
    workspace:// references.

    Every item of every output list is handed to ``_process_value``, which
    walks nested containers and inspects string values by content rather than
    by field name. One content-hash cache is shared across the whole call so
    identical content yields the same reference.

    Args:
        outputs: Block execution outputs (output name -> list of values).
        workspace_manager: WorkspaceManager used to persist detected files.
        block_name: Name of the block, forwarded for filename generation.

    Returns:
        A new dict with the same keys, each mapped to a new list of
        processed values. The input dict and its lists are not mutated.
    """
    seen: dict[str, str] = {}  # content hash -> workspace reference
    rewritten: dict[str, list[Any]] = {}
    for output_name in outputs:
        # Items are awaited one after another, in their original order.
        rewritten[output_name] = [
            await _process_value(entry, workspace_manager, block_name, seen)
            for entry in outputs[output_name]
        ]
    return rewritten
async def _process_value(
    value: Any,
    wm: WorkspaceManager,
    block: str,
    cache: dict[str, str],
) -> Any:
    """
    Walk a value and run binary detection on every sufficiently long string.

    Strings longer than ``SIZE_THRESHOLD`` go through
    ``_try_detect_and_save``; dicts and lists are rebuilt with their values
    processed recursively (dict keys are kept as-is); every other value is
    returned untouched.
    """
    if isinstance(value, str):
        if len(value) > SIZE_THRESHOLD:
            return await _try_detect_and_save(value, wm, block, cache)
        return value

    if isinstance(value, list):
        return [await _process_value(child, wm, block, cache) for child in value]

    if isinstance(value, dict):
        return {
            key: await _process_value(child, wm, block, cache)
            for key, child in value.items()
        }

    return value
async def _try_detect_and_save(
    value: str,
    wm: WorkspaceManager,
    block: str,
    cache: dict[str, str],
) -> str:
    """
    Run the detectors over a string and save the first hit to the workspace.

    Returns whatever ``_save_binary`` returns when a detector recognises the
    string, otherwise the string itself, unchanged.
    """
    # Order matters: a data URI carries an explicit mimetype, so it is tried
    # before the raw-base64 magic-number heuristic.
    for detect in (_detect_data_uri, _detect_raw_base64):
        found = detect(value)
        if found:
            payload, extension = found
            return await _save_binary(payload, extension, wm, block, cache, value)

    return value  # Nothing recognised as binary
def _detect_data_uri(value: str) -> Optional[tuple[bytes, str]]:
    """
    Detect a base64 data URI whose mimetype is whitelisted.

    The mimetype is compared case-insensitively. Whitespace surrounding the
    URI or embedded in the payload (line wrapping, a trailing newline from
    printed output) is removed before decoding: ``DATA_URI_PATTERN`` lets the
    payload span lines, but ``b64decode(validate=True)`` rejects every
    character outside the base64 alphabet, so such URIs would otherwise never
    be detected.

    Args:
        value: Candidate string; the whole string must be the data URI.

    Returns:
        ``(content, extension)`` for an accepted data URI, otherwise ``None``
        (no pattern match, mimetype not allowed, invalid base64, or a payload
        that decodes to nothing).
    """
    match = DATA_URI_PATTERN.match(value.strip())
    if not match:
        return None

    raw_mimetype, b64_payload = match.groups()
    mimetype = raw_mimetype.lower()
    if mimetype not in ALLOWED_MIMETYPES:
        return None

    # Strict decoding is kept so genuinely malformed payloads are still
    # rejected; only whitespace is forgiven.
    compact_payload = "".join(b64_payload.split())
    try:
        content = base64.b64decode(compact_payload, validate=True)
    except (ValueError, binascii.Error):
        return None

    if not content:
        # A whitespace-only payload decodes to b""; there is nothing to save.
        return None

    return content, _mimetype_to_ext(mimetype)
def _detect_raw_base64(value: str) -> Optional[tuple[bytes, str]]:
    """
    Detect raw base64 whose decoded bytes start with a known magic number.

    A string is accepted only if it:
    1. Looks like pure base64 after surrounding whitespace is trimmed
       (``BASE64_PATTERN`` pre-filter)
    2. Decodes under strict validation once CR/LF line breaks are removed
    3. Starts with one of ``MAGIC_SIGNATURES``

    Line breaks are removed explicitly because ``BASE64_PATTERN`` admits CR/LF
    while ``b64decode(validate=True)`` rejects them; without that step,
    line-wrapped base64 or printed output ending in a newline would pass the
    pre-filter and then always fail to decode.

    Args:
        value: Candidate string; the whole string must be the base64 payload.

    Returns:
        ``(content, extension)`` on a match, otherwise ``None``.
    """
    candidate = value.strip()
    # Pre-filter: must look like base64 (no spaces, punctuation, etc.)
    if not candidate or not BASE64_PATTERN.match(candidate):
        return None

    compact = candidate.replace("\r", "").replace("\n", "")
    try:
        content = base64.b64decode(compact, validate=True)
    except (ValueError, binascii.Error):
        return None

    for magic, ext in MAGIC_SIGNATURES:
        if not content.startswith(magic):
            continue
        # RIFF is a generic container: only treat it as WebP when the form
        # type at offset 8 says so. A short slice simply fails the comparison.
        if magic == b"RIFF" and content[8:12] != b"WEBP":
            continue
        return content, ext

    return None  # No magic number match = not a recognized binary format
async def _save_binary(
    content: bytes,
    ext: str,
    wm: WorkspaceManager,
    block: str,
    cache: dict[str, str],
    original: str,
) -> str:
    """
    Persist decoded content through the workspace manager, reusing earlier saves.

    The SHA-256 hex digest of ``content`` is the cache key: a hit returns the
    stored reference without writing again. On a miss the content is written
    under a generated filename and the new reference is cached.

    Returns:
        ``workspace://<file id>`` on success or cache hit; ``original`` if
        anything raises while building the filename or writing the file.
    """
    digest = hashlib.sha256(content).hexdigest()
    try:
        return cache[digest]
    except KeyError:
        pass  # First time this content is seen in the current cache

    try:
        # Filename: block-name prefix (sanitized, at most 20 chars, lowercased),
        # the extension, and 12 hex chars of a random UUID.
        prefix = sanitize_filename(block)[:20].lower()
        unique = uuid.uuid4().hex[:12]
        stored = await wm.write_file(content, f"{prefix}_{ext}_{unique}.{ext}")
        reference = f"workspace://{stored.id}"
        cache[digest] = reference
        return reference
    except Exception as e:
        # Best effort by design: a failed save must not fail the block run.
        logger.warning(f"Failed to save binary output: {e}")
        return original  # Graceful degradation
def _mimetype_to_ext(mimetype: str) -> str:
    """Return the file extension for a mimetype, or "bin" when it is not listed."""
    known_extensions = dict(
        [
            ("image/png", "png"),
            ("image/jpeg", "jpg"),
            ("image/jpg", "jpg"),
            ("image/gif", "gif"),
            ("image/webp", "webp"),
            ("image/svg+xml", "svg"),
            ("application/pdf", "pdf"),
            ("application/octet-stream", "bin"),
        ]
    )
    try:
        return known_extensions[mimetype]
    except KeyError:
        return "bin"

View File

@@ -14,8 +14,10 @@ from backend.data.model import CredentialsMetaInput
from backend.data.workspace import get_or_create_workspace
from backend.integrations.creds_manager import IntegrationCredentialsManager
from backend.util.exceptions import BlockError
from backend.util.workspace import WorkspaceManager
from .base import BaseTool
from .binary_output_processor import process_binary_outputs
from .models import (
BlockOutputResponse,
ErrorResponse,
@@ -321,6 +323,16 @@ class RunBlockTool(BaseTool):
):
outputs[output_name].append(output_data)
# Post-process outputs to save binary content to workspace
workspace_manager = WorkspaceManager(
user_id=user_id,
workspace_id=workspace.id,
session_id=session.session_id,
)
outputs = await process_binary_outputs(
dict(outputs), workspace_manager, block.name
)
return BlockOutputResponse(
message=f"Block '{block.name}' executed successfully",
block_id=block_id,

View File

@@ -0,0 +1,390 @@
"""Tests for content-based binary output detection and saving."""
import base64
from unittest.mock import AsyncMock, MagicMock
import pytest
from .binary_output_processor import (
_detect_data_uri,
_detect_raw_base64,
_mimetype_to_ext,
process_binary_outputs,
)
@pytest.fixture
def mock_workspace_manager():
    """
    Provide a MagicMock workspace manager with an awaitable ``write_file``.

    Each write returns a mock file whose ``id`` is "file-" followed by the
    first 10 characters of the requested filename, so IDs are predictable.
    """

    async def fake_write(content, filename):
        stored = MagicMock()
        stored.id = f"file-{filename[:10]}"
        return stored

    manager = MagicMock()
    # AsyncMock records the calls; side_effect supplies the return value.
    manager.write_file = AsyncMock(side_effect=fake_write)
    return manager
# =============================================================================
# Data URI Detection Tests
# =============================================================================
class TestDetectDataUri:
    """Tests for _detect_data_uri function."""

    def test_detects_png_data_uri(self):
        """Should detect valid PNG data URI."""
        # Minimal valid PNG (1x1 transparent)
        png_b64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
        data_uri = f"data:image/png;base64,{png_b64}"
        result = _detect_data_uri(data_uri)
        assert result is not None
        content, ext = result
        assert ext == "png"
        assert content.startswith(b"\x89PNG")

    def test_detects_pdf_data_uri(self):
        """Should detect valid PDF data URI."""
        pdf_content = b"%PDF-1.4 test content"
        pdf_b64 = base64.b64encode(pdf_content).decode()
        data_uri = f"data:application/pdf;base64,{pdf_b64}"
        result = _detect_data_uri(data_uri)
        assert result is not None
        content, ext = result
        assert ext == "pdf"
        assert content == pdf_content

    def test_rejects_text_plain_mimetype(self):
        """Should reject text/plain mimetype (not in whitelist)."""
        text_b64 = base64.b64encode(b"Hello World").decode()
        data_uri = f"data:text/plain;base64,{text_b64}"
        result = _detect_data_uri(data_uri)
        assert result is None

    def test_rejects_non_data_uri_string(self):
        """Should return None for non-data-URI strings."""
        result = _detect_data_uri("https://example.com/image.png")
        assert result is None

    def test_rejects_invalid_base64_in_data_uri(self):
        """Should return None for data URI with invalid base64."""
        # The prefix and mimetype are valid on purpose: the rejection has to
        # come from the base64 decode step, not from the URI pattern or the
        # mimetype whitelist.
        data_uri = "data:image/png;base64,not-valid-base64!!!"
        result = _detect_data_uri(data_uri)
        assert result is None

    def test_handles_jpeg_mimetype(self):
        """Should handle image/jpeg mimetype."""
        jpeg_content = b"\xff\xd8\xff\xe0test"
        jpeg_b64 = base64.b64encode(jpeg_content).decode()
        data_uri = f"data:image/jpeg;base64,{jpeg_b64}"
        result = _detect_data_uri(data_uri)
        assert result is not None
        _, ext = result
        assert ext == "jpg"
# =============================================================================
# Raw Base64 Detection Tests
# =============================================================================
class TestDetectRawBase64:
    """Exercise _detect_raw_base64 on each supported signature and on rejections."""

    @staticmethod
    def _encode(raw: bytes) -> str:
        """Base64-encode raw bytes into the text form the detector receives."""
        return base64.b64encode(raw).decode()

    def test_detects_png_magic_number(self):
        """A raw base64 PNG is recognised by its signature."""
        # Minimal valid PNG
        encoded = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
        detected = _detect_raw_base64(encoded)
        assert detected is not None
        payload, extension = detected
        assert extension == "png"
        assert payload.startswith(b"\x89PNG")

    def test_detects_jpeg_magic_number(self):
        """A raw base64 JPEG is recognised by its signature."""
        detected = _detect_raw_base64(self._encode(b"\xff\xd8\xff\xe0" + b"\x00" * 100))
        assert detected is not None
        assert detected[1] == "jpg"

    def test_detects_pdf_magic_number(self):
        """A raw base64 PDF is recognised by its signature."""
        detected = _detect_raw_base64(self._encode(b"%PDF-1.4 " + b"x" * 100))
        assert detected is not None
        assert detected[1] == "pdf"

    def test_detects_gif87a_magic_number(self):
        """The GIF87a signature maps to gif."""
        detected = _detect_raw_base64(self._encode(b"GIF87a" + b"\x00" * 100))
        assert detected is not None
        assert detected[1] == "gif"

    def test_detects_gif89a_magic_number(self):
        """The GIF89a signature maps to gif."""
        detected = _detect_raw_base64(self._encode(b"GIF89a" + b"\x00" * 100))
        assert detected is not None
        assert detected[1] == "gif"

    def test_detects_webp_magic_number(self):
        """RIFF with WEBP at offset 8 is recognised as WebP."""
        # Header layout: RIFF + 4 size bytes + WEBP
        riff_webp = b"RIFF\x00\x00\x00\x00WEBP" + b"\x00" * 100
        detected = _detect_raw_base64(self._encode(riff_webp))
        assert detected is not None
        assert detected[1] == "webp"

    def test_rejects_riff_without_webp(self):
        """RIFF containers that are not WebP (e.g. WAV) are rejected."""
        riff_wave = b"RIFF\x00\x00\x00\x00WAVE" + b"\x00" * 100
        assert _detect_raw_base64(self._encode(riff_wave)) is None

    def test_rejects_non_base64_string(self):
        """Ordinary prose does not look like base64 and is rejected."""
        plain = "Hello, this is regular text with spaces!"
        assert _detect_raw_base64(plain) is None

    def test_rejects_base64_without_magic_number(self):
        """Valid base64 whose bytes match no known signature is rejected."""
        encoded = self._encode(b"This is just random text, not a binary file")
        assert _detect_raw_base64(encoded) is None

    def test_rejects_invalid_base64(self):
        """A string containing characters outside the base64 alphabet is rejected."""
        malformed = "not-valid-base64!!!"
        assert _detect_raw_base64(malformed) is None
# =============================================================================
# Process Binary Outputs Tests
# =============================================================================
class TestProcessBinaryOutputs:
    """End-to-end checks for process_binary_outputs with a mocked workspace manager."""

    @staticmethod
    def _large_png_b64() -> str:
        """Return base64 text for a PNG signature plus 2000 zero bytes (> 1KB encoded)."""
        return base64.b64encode(b"\x89PNG\r\n\x1a\n" + b"\x00" * 2000).decode()

    @pytest.mark.asyncio
    async def test_saves_large_png_and_returns_reference(self, mock_workspace_manager):
        """A PNG above the size threshold is written once and replaced by a reference."""
        block_outputs = {"result": [self._large_png_b64()]}
        processed = await process_binary_outputs(
            block_outputs, mock_workspace_manager, "TestBlock"
        )
        assert processed["result"][0].startswith("workspace://")
        mock_workspace_manager.write_file.assert_called_once()

    @pytest.mark.asyncio
    async def test_preserves_small_content(self, mock_workspace_manager):
        """Strings below the size threshold pass through without any write."""
        tiny = "small"
        processed = await process_binary_outputs(
            {"result": [tiny]}, mock_workspace_manager, "TestBlock"
        )
        assert processed["result"][0] == tiny
        mock_workspace_manager.write_file.assert_not_called()

    @pytest.mark.asyncio
    async def test_deduplicates_identical_content(self, mock_workspace_manager):
        """Two identical payloads in one call share a single write and reference."""
        encoded = self._large_png_b64()
        processed = await process_binary_outputs(
            {"result": [encoded, encoded]}, mock_workspace_manager, "TestBlock"
        )
        first, second = processed["result"]
        # Both entries became references...
        assert first.startswith("workspace://")
        assert second.startswith("workspace://")
        # ...backed by exactly one write...
        assert mock_workspace_manager.write_file.call_count == 1
        # ...and therefore the very same reference.
        assert first == second

    @pytest.mark.asyncio
    async def test_processes_nested_dict(self, mock_workspace_manager):
        """Payloads buried inside nested dicts are found."""
        block_outputs = {"result": [{"nested": {"deep": self._large_png_b64()}}]}
        processed = await process_binary_outputs(
            block_outputs, mock_workspace_manager, "TestBlock"
        )
        assert processed["result"][0]["nested"]["deep"].startswith("workspace://")

    @pytest.mark.asyncio
    async def test_processes_nested_list(self, mock_workspace_manager):
        """Payloads buried inside nested lists are found."""
        block_outputs = {"result": [[self._large_png_b64()]]}
        processed = await process_binary_outputs(
            block_outputs, mock_workspace_manager, "TestBlock"
        )
        assert processed["result"][0][0].startswith("workspace://")

    @pytest.mark.asyncio
    async def test_handles_data_uri_format(self, mock_workspace_manager):
        """A data URI wrapping the payload is replaced as well."""
        data_uri = f"data:image/png;base64,{self._large_png_b64()}"
        processed = await process_binary_outputs(
            {"result": [data_uri]}, mock_workspace_manager, "TestBlock"
        )
        assert processed["result"][0].startswith("workspace://")

    @pytest.mark.asyncio
    async def test_preserves_non_binary_large_strings(self, mock_workspace_manager):
        """A long string that matches no binary signature is left unchanged."""
        filler = "A" * 2000  # Above the threshold, but no known file signature
        processed = await process_binary_outputs(
            {"result": [filler]}, mock_workspace_manager, "TestBlock"
        )
        assert processed["result"][0] == filler
        mock_workspace_manager.write_file.assert_not_called()

    @pytest.mark.asyncio
    async def test_graceful_degradation_on_save_failure(self, mock_workspace_manager):
        """When the write raises, the original string is returned instead."""
        mock_workspace_manager.write_file = AsyncMock(
            side_effect=Exception("Storage error")
        )
        encoded = self._large_png_b64()
        processed = await process_binary_outputs(
            {"result": [encoded]}, mock_workspace_manager, "TestBlock"
        )
        # The failed save must leave the value exactly as it came in.
        assert processed["result"][0] == encoded

    @pytest.mark.asyncio
    async def test_handles_stdout_logs_field(self, mock_workspace_manager):
        """Binary content under stdout_logs is detected (the actual failing case)."""
        encoded_pdf = base64.b64encode(b"%PDF-1.4 " + b"x" * 2000).decode()
        processed = await process_binary_outputs(
            {"stdout_logs": [encoded_pdf]}, mock_workspace_manager, "ExecuteCodeBlock"
        )
        assert processed["stdout_logs"][0].startswith("workspace://")
# =============================================================================
# Mimetype to Extension Tests
# =============================================================================
class TestMimetypeToExt:
    """Check each mimetype-to-extension mapping and the fallback value."""

    def test_png_mapping(self):
        """image/png maps to png."""
        extension = _mimetype_to_ext("image/png")
        assert extension == "png"

    def test_jpeg_mapping(self):
        """image/jpeg maps to jpg."""
        extension = _mimetype_to_ext("image/jpeg")
        assert extension == "jpg"

    def test_nonstandard_jpg_mapping(self):
        """The non-standard image/jpg spelling also maps to jpg."""
        extension = _mimetype_to_ext("image/jpg")
        assert extension == "jpg"

    def test_gif_mapping(self):
        """image/gif maps to gif."""
        extension = _mimetype_to_ext("image/gif")
        assert extension == "gif"

    def test_webp_mapping(self):
        """image/webp maps to webp."""
        extension = _mimetype_to_ext("image/webp")
        assert extension == "webp"

    def test_svg_mapping(self):
        """image/svg+xml maps to svg."""
        extension = _mimetype_to_ext("image/svg+xml")
        assert extension == "svg"

    def test_pdf_mapping(self):
        """application/pdf maps to pdf."""
        extension = _mimetype_to_ext("application/pdf")
        assert extension == "pdf"

    def test_octet_stream_mapping(self):
        """application/octet-stream maps to bin."""
        extension = _mimetype_to_ext("application/octet-stream")
        assert extension == "bin"

    def test_unknown_mimetype(self):
        """A mimetype with no mapping falls back to bin."""
        extension = _mimetype_to_ext("application/unknown")
        assert extension == "bin"