From 513ec16357c10967dfea4523ac47548b2c758e36 Mon Sep 17 00:00:00 2001 From: Otto Date: Thu, 5 Feb 2026 20:00:02 +0000 Subject: [PATCH] fix: Handle RFC 2045 line-wrapped base64 in embedded detection Sentry correctly identified that the regex and decode would fail on line-wrapped base64. Fixed by: - Updating the regex to allow whitespace in base64 chunks - Stripping whitespace before decoding with validate=True - Adding a test for line-wrapped base64 --- .../features/chat/tools/binary_output_processor.py | 10 +++++++--- .../chat/tools/test_binary_output_processor.py | 12 ++++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/autogpt_platform/backend/backend/api/features/chat/tools/binary_output_processor.py b/autogpt_platform/backend/backend/api/features/chat/tools/binary_output_processor.py index 00c6617d5e..fce98bf17c 100644 --- a/autogpt_platform/backend/backend/api/features/chat/tools/binary_output_processor.py +++ b/autogpt_platform/backend/backend/api/features/chat/tools/binary_output_processor.py @@ -28,8 +28,9 @@ logger = logging.getLogger(__name__) MIN_DECODED_SIZE = 1024 # 1KB # Pattern to find base64 chunks in text (at least 100 chars to be worth checking) -# Matches continuous base64 characters, optionally ending with = padding -EMBEDDED_BASE64_PATTERN = re.compile(r"[A-Za-z0-9+/]{100,}={0,2}") +# Matches continuous base64 characters (with optional whitespace for line wrapping), +# optionally ending with = padding +EMBEDDED_BASE64_PATTERN = re.compile(r"[A-Za-z0-9+/\s]{100,}={0,2}") # Magic numbers for binary file detection MAGIC_SIGNATURES = [ @@ -141,8 +142,11 @@ def _decode_and_validate(b64_str: str) -> Optional[tuple[bytes, str]]: Returns (content, extension) if valid binary, None otherwise. 
""" + # Strip whitespace for RFC 2045 line-wrapped base64 + normalized = re.sub(r"\s+", "", b64_str) + try: - content = base64.b64decode(b64_str, validate=True) + content = base64.b64decode(normalized, validate=True) except (ValueError, binascii.Error): return None diff --git a/autogpt_platform/backend/backend/api/features/chat/tools/test_binary_output_processor.py b/autogpt_platform/backend/backend/api/features/chat/tools/test_binary_output_processor.py index 6018d64789..5a26261842 100644 --- a/autogpt_platform/backend/backend/api/features/chat/tools/test_binary_output_processor.py +++ b/autogpt_platform/backend/backend/api/features/chat/tools/test_binary_output_processor.py @@ -116,6 +116,18 @@ class TestDecodeAndValidate: result = _decode_and_validate(wav_b64) assert result is None + def test_handles_line_wrapped_base64(self): + """Should handle RFC 2045 line-wrapped base64.""" + pdf_content = b"%PDF-1.4 " + b"x" * 2000 + pdf_b64 = base64.b64encode(pdf_content).decode() + # Simulate line wrapping at 76 chars + wrapped = "\n".join(pdf_b64[i : i + 76] for i in range(0, len(pdf_b64), 76)) + result = _decode_and_validate(wrapped) + assert result is not None + content, ext = result + assert ext == "pdf" + assert content == pdf_content + # ============================================================================= # Marker Expansion Tests