fix: Handle RFC 2045 line-wrapped base64 in binary detection

- Strip whitespace before base64 decoding (encoders often add line breaks)
- Widen the regex to accept any whitespace (\s) and update its comment to explain the handling
- Add test case for line-wrapped base64

Addresses CodeRabbit review feedback.
This commit is contained in:
Otto
2026-02-05 19:11:56 +00:00
parent dcb3550910
commit c32c7fb959
2 changed files with 21 additions and 3 deletions

View File

@@ -47,7 +47,8 @@ ALLOWED_MIMETYPES = {
}
# Base64 character validation (pre-filter: base64 alphabet, optional whitespace, trailing "=" padding)
BASE64_PATTERN = re.compile(r"^[A-Za-z0-9+/\n\r]+=*$")
# Accepts the base64 alphabet plus any whitespace (RFC 2045 line wrapping); the
# whitespace is stripped before decoding. Whitespace is also allowed after the
# "=" padding so that a CRLF-terminated final line (e.g. "...==\r\n") still
# passes this pre-filter instead of being rejected before decoding is attempted.
BASE64_PATTERN = re.compile(r"^[A-Za-z0-9+/\s]+=*\s*$")
# Magic numbers for binary file detection
# Note: WebP requires two-step detection: RIFF prefix + WEBP at offset 8
@@ -171,12 +172,15 @@ def _detect_raw_base64(value: str) -> Optional[tuple[bytes, str]]:
Returns (content, extension) or None.
"""
# Pre-filter: must look like base64 (no spaces, punctuation, etc.)
# Pre-filter: must look like base64 (allows whitespace for RFC 2045 line wrapping)
if not BASE64_PATTERN.match(value):
return None
# Strip whitespace before decoding (RFC 2045 allows line breaks in base64)
normalized = re.sub(r"\s+", "", value)
try:
content = base64.b64decode(value, validate=True)
content = base64.b64decode(normalized, validate=True)
except (ValueError, binascii.Error):
return None

View File

@@ -198,6 +198,20 @@ class TestDetectRawBase64:
result = _detect_raw_base64("not-valid-base64!!!")
assert result is None
def test_detects_base64_with_line_breaks(self):
    """Line-wrapped (RFC 2045 style) raw base64 must still be detected as PNG."""
    # PNG signature followed by zero bytes, so the encoded text exceeds one 76-char line.
    expected_bytes = b"\x89PNG\r\n\x1a\n" + bytes(100)
    encoded = base64.b64encode(expected_bytes).decode()
    # Insert a single newline after the first 76 characters to mimic a wrapping encoder.
    head, tail = encoded[:76], encoded[76:]
    detected = _detect_raw_base64("\n".join((head, tail)))
    assert detected is not None
    payload, extension = detected
    assert extension == "png"
    assert payload == expected_bytes
# =============================================================================
# Process Binary Outputs Tests