fix: Handle RFC 2045 line-wrapped base64 in embedded detection

Sentry correctly identified that both the regex and the decode step would
fail on line-wrapped base64. Fixed by:
- Updating the regex to allow whitespace inside base64 chunks
- Stripping whitespace before decoding with validate=True
- Adding a test for line-wrapped base64
This commit is contained in:
Otto
2026-02-05 20:00:02 +00:00
parent 8cbf07ce40
commit 513ec16357
2 changed files with 19 additions and 3 deletions

View File

@@ -28,8 +28,9 @@ logger = logging.getLogger(__name__)
MIN_DECODED_SIZE = 1024 # 1KB
# Pattern to find base64 chunks in text (at least 100 chars to be worth checking)
# Matches continuous base64 characters, optionally ending with = padding
EMBEDDED_BASE64_PATTERN = re.compile(r"[A-Za-z0-9+/]{100,}={0,2}")
# Matches base64 characters that may be separated by whitespace (RFC 2045 line
# wrapping), optionally ending with = padding. The 100-character minimum counts
# base64 characters only: every repetition must consume one real base64
# character, so runs of blank lines or indentation can never match on their own
# or pad a short chunk up to the threshold.
EMBEDDED_BASE64_PATTERN = re.compile(r"(?:[A-Za-z0-9+/]\s*){100,}={0,2}")
# Magic numbers for binary file detection
MAGIC_SIGNATURES = [
@@ -141,8 +142,11 @@ def _decode_and_validate(b64_str: str) -> Optional[tuple[bytes, str]]:
Returns (content, extension) if valid binary, None otherwise.
"""
# Strip whitespace for RFC 2045 line-wrapped base64
normalized = re.sub(r"\s+", "", b64_str)
try:
content = base64.b64decode(b64_str, validate=True)
content = base64.b64decode(normalized, validate=True)
except (ValueError, binascii.Error):
return None

View File

@@ -116,6 +116,18 @@ class TestDecodeAndValidate:
result = _decode_and_validate(wav_b64)
assert result is None
def test_handles_line_wrapped_base64(self):
    """A PDF payload wrapped at 76 columns (RFC 2045 style) must still decode."""
    line_width = 76
    payload = b"%PDF-1.4 " + b"x" * 2000
    encoded = base64.b64encode(payload).decode()

    # Break the encoded text into fixed-width rows, then join them with newlines
    rows = []
    for start in range(0, len(encoded), line_width):
        rows.append(encoded[start : start + line_width])

    outcome = _decode_and_validate("\n".join(rows))

    assert outcome is not None
    decoded, extension = outcome
    assert extension == "pdf"
    assert decoded == payload
# =============================================================================
# Marker Expansion Tests