fix(backend): use actual byte size for string size guard, narrow exception handling

- Size guard in _expand now computes len(content.encode("utf-8")) for
  strings instead of len(content), which returns the character count.
  This fixes a security issue where multi-byte UTF-8 strings (e.g. emoji)
  could pass up to 40MB through a 10MB byte limit.
- Narrow except Exception in _infer_format_from_workspace to only catch
  expected IO/lookup failures (ValueError, FileNotFoundError, OSError,
  PermissionError).
- Narrow except Exception in parse_file_content to only catch expected
  parse failures, letting programming bugs surface.
This commit is contained in:
Zamil Majdy
2026-03-13 19:40:20 +07:00
parent f1366ea139
commit b7551a1d18
2 changed files with 20 additions and 5 deletions

View File

@@ -286,7 +286,7 @@ async def _infer_format_from_workspace(
if fmt:
return fmt
return infer_format(info.name)
except Exception:
except (ValueError, FileNotFoundError, OSError, PermissionError):
logger.debug("workspace metadata lookup failed for %s", uri, exc_info=True)
return None
@@ -447,9 +447,14 @@ async def expand_file_refs_in_args(
raise FileRefExpansionError(str(exc)) from exc
# Guard against oversized content before parsing.
# For strings, len() returns character count which is a lower
# bound on UTF-8 byte size — sufficient for a safety guard.
content_size = len(content)
if isinstance(content, bytes):
content_size = len(content)
else:
# len() on str returns character count, but multi-byte
# UTF-8 chars (e.g. emoji) mean byte size can be up to
# 4x the character count. Use the actual encoded byte
# length for an accurate guard.
content_size = len(content.encode("utf-8"))
if content_size > _MAX_BARE_REF_BYTES:
raise FileRefExpansionError(
f"File too large for structured parsing "

View File

@@ -244,7 +244,17 @@ def parse_file_content(content: str | bytes, fmt: str, *, strict: bool = False)
content = content.decode("utf-8", errors="replace")
return parser(content)
except Exception:
except (
json.JSONDecodeError,
csv.Error,
yaml.YAMLError,
tomllib.TOMLDecodeError,
ValueError,
UnicodeDecodeError,
pd.errors.ParserError,
ImportError,
OSError,
):
if strict:
raise
logger.debug("Structured parsing failed for format=%s, falling back", fmt)