fix(backend): address PR review comments for file content parser

- Move local imports (yaml, pandas) to top-level
- Add size limit guard (_MAX_BARE_REF_BYTES=10MB) for bare ref expansion
- Ignore line ranges for binary formats (parquet/xlsx) instead of garbling
- Strip query strings in infer_format URI parsing
- Fix type annotations for parser dicts (Callable instead of Any)
- Replace NaN with None in parquet/xlsx output for JSON serializability
- Fix TOML empty dict fallback (always return parsed dict)
- Remove .xls/.vnd.ms-excel mappings (openpyxl doesn't support legacy .xls)
- Add posixpath.splitext usage comment
- Update prompt to mention first-sheet-only and .xls limitation
- Restore pyyaml cp38 entries in poetry.lock
This commit is contained in:
Zamil Majdy
2026-03-13 03:28:31 +07:00
parent 35f54549b5
commit 340b18d40b
5 changed files with 69 additions and 41 deletions

View File

@@ -55,10 +55,11 @@ value. Multiple references in one argument are all expanded.
**Structured data**: When the **entire** argument value is a single file
reference (no surrounding text), the platform automatically parses the file
content based on its extension or MIME type. Supported formats: JSON, JSONL,
CSV, TSV, YAML, TOML, Parquet, and Excel (.xlsx). For example, pass
`@@agptfile:workspace://<id>` where the file is a `.csv` and the rows will be
parsed into `list[list[str]]` automatically. If the format is unrecognised or
parsing fails, the content is returned as a plain string.
CSV, TSV, YAML, TOML, Parquet, and Excel (.xlsx — first sheet only).
For example, pass `@@agptfile:workspace://<id>` where the file is a `.csv` and
the rows will be parsed into `list[list[str]]` automatically. If the format is
unrecognised or parsing fails, the content is returned as a plain string.
Legacy `.xls` files are **not** supported — only the modern `.xlsx` format.
**Type coercion**: The platform also coerces expanded values to match the
block's expected input types. For example, if a block expects `list[list[str]]`

View File

@@ -47,6 +47,11 @@ from backend.copilot.context import (
from backend.copilot.model import ChatSession
from backend.copilot.tools.workspace_files import get_manager
from backend.util.file import parse_workspace_uri
from backend.util.file_content_parser import (
BINARY_FORMATS,
infer_format,
parse_file_content,
)
class FileRefExpansionError(Exception):
@@ -74,6 +79,8 @@ _FILE_REF_RE = re.compile(
_MAX_EXPAND_CHARS = 200_000
# Maximum total characters across all @@agptfile: expansions in one string.
_MAX_TOTAL_EXPAND_CHARS = 1_000_000
# Maximum raw byte size for bare ref structured parsing (10 MB).
_MAX_BARE_REF_BYTES = 10_000_000
@dataclass
@@ -276,12 +283,6 @@ async def expand_file_refs_in_args(
caller (the MCP tool wrapper) should convert this into an MCP error
response that lets the model correct the reference before retrying.
"""
from backend.util.file_content_parser import (
BINARY_FORMATS,
infer_format,
parse_file_content,
)
if not args:
return args
@@ -294,20 +295,32 @@ async def expand_file_refs_in_args(
try:
if fmt is not None and fmt in BINARY_FORMATS:
# Binary formats need raw bytes, not UTF-8 text.
# Line ranges are meaningless for binary formats
# (parquet/xlsx) — ignore them and parse full bytes.
raw = await read_file_bytes(ref.uri, user_id, session)
content: str | bytes = (
_apply_line_range(
raw.decode("utf-8", errors="replace"),
ref.start_line,
ref.end_line,
if len(raw) > _MAX_BARE_REF_BYTES:
raise FileRefExpansionError(
f"File too large for structured parsing "
f"({len(raw)} bytes, limit {_MAX_BARE_REF_BYTES})"
)
if ref.start_line or ref.end_line
else raw
)
content: str | bytes = raw
else:
content = await resolve_file_ref(ref, user_id, session)
except ValueError as exc:
raise FileRefExpansionError(str(exc)) from exc
# Guard against oversized content before parsing.
content_size = (
len(content.encode("utf-8"))
if isinstance(content, str)
else len(content)
)
if content_size > _MAX_BARE_REF_BYTES:
raise FileRefExpansionError(
f"File too large for structured parsing "
f"({content_size} bytes, limit {_MAX_BARE_REF_BYTES})"
)
if fmt is not None:
return parse_file_content(content, fmt)
return (

View File

@@ -15,7 +15,8 @@ Supported formats:
- **YAML** (``.yaml``, ``.yml``) — parsed via PyYAML; containers only
- **TOML** (``.toml``) — parsed via stdlib ``tomllib``
- **Parquet** (``.parquet``) — via pandas/pyarrow → ``list[list[Any]]`` with header row
- **Excel** (``.xlsx``, ``.xls``) — via pandas → ``list[list[Any]]`` with header row
- **Excel** (``.xlsx``) — via pandas/openpyxl → ``list[list[Any]]`` with header row
(legacy ``.xls`` is **not** supported — only the modern OOXML format)
All parsers follow the **fallback contract**: if parsing fails for *any* reason,
the original content is returned unchanged (string for text formats, bytes for
@@ -27,9 +28,16 @@ import io
import json
import logging
import tomllib
from collections.abc import Callable
# posixpath.splitext handles forward-slash URI paths correctly on all platforms,
# unlike os.path.splitext which uses platform-native separators.
from posixpath import splitext
from typing import Any
import pandas as pd
import yaml
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
@@ -47,7 +55,6 @@ _EXT_TO_FORMAT: dict[str, str] = {
".toml": "toml",
".parquet": "parquet",
".xlsx": "xlsx",
".xls": "xlsx",
}
_MIME_TO_FORMAT: dict[str, str] = {
@@ -61,7 +68,6 @@ _MIME_TO_FORMAT: dict[str, str] = {
"application/toml": "toml",
"application/vnd.apache.parquet": "parquet",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
"application/vnd.ms-excel": "xlsx",
}
# Formats that require raw bytes rather than decoded text.
@@ -83,7 +89,7 @@ def infer_format(uri: str) -> str | None:
# 2. Check file extension from the path portion.
# Strip the fragment and query string first so ".json#mime" or ".json?v=1" doesn't confuse splitext.
path = uri.split("#")[0]
path = uri.split("#")[0].split("?")[0]
_, ext = splitext(path)
return _EXT_TO_FORMAT.get(ext.lower())
@@ -126,8 +132,6 @@ def _parse_delimited(content: str, *, delimiter: str) -> Any:
def _parse_yaml(content: str) -> Any:
import yaml
parsed = yaml.safe_load(content)
if isinstance(parsed, (list, dict)):
return parsed
@@ -136,11 +140,11 @@ def _parse_yaml(content: str) -> Any:
def _parse_toml(content: str) -> Any:
parsed = tomllib.loads(content)
# tomllib.loads always returns a dict.
return parsed if parsed else content
# tomllib.loads always returns a dict — return it even if empty.
return parsed
_TEXT_PARSERS: dict[str, Any] = {
_TEXT_PARSERS: dict[str, Callable[[str], Any]] = {
"json": _parse_json,
"jsonl": _parse_jsonl,
"csv": _parse_csv,
@@ -154,26 +158,28 @@ _TEXT_PARSERS: dict[str, Any] = {
# ---------------------------------------------------------------------------
def _parse_parquet(content: bytes) -> Any:
import pandas as pd
def _df_to_rows(df: pd.DataFrame) -> list[list[Any]]:
"""Convert a DataFrame to ``list[list[Any]]`` with a header row.
df = pd.read_parquet(io.BytesIO(content))
# Return as list[list[Any]] with the first row being the header.
NaN values are replaced with ``None`` so the result is JSON-serializable.
"""
df = df.where(df.notna(), None)
header = df.columns.tolist()
rows = df.values.tolist()
return [header] + rows
def _parse_parquet(content: bytes) -> Any:
df = pd.read_parquet(io.BytesIO(content))
return _df_to_rows(df)
def _parse_xlsx(content: bytes) -> Any:
import pandas as pd
df = pd.read_excel(io.BytesIO(content))
header = df.columns.tolist()
rows = df.values.tolist()
return [header] + rows
return _df_to_rows(df)
_BINARY_PARSERS: dict[str, Any] = {
_BINARY_PARSERS: dict[str, Callable[[bytes], Any]] = {
"parquet": _parse_parquet,
"xlsx": _parse_xlsx,
}

View File

@@ -49,8 +49,9 @@ class TestInferFormat:
def test_xlsx_extension(self):
assert infer_format("/data/spreadsheet.xlsx") == "xlsx"
def test_xls_extension(self):
assert infer_format("/data/old_spreadsheet.xls") == "xlsx"
def test_xls_extension_not_supported(self):
# Legacy .xls needs xlrd (openpyxl only reads .xlsx), which we don't bundle.
assert infer_format("/data/old_spreadsheet.xls") is None
def test_case_insensitive(self):
assert infer_format("/data/FILE.JSON") == "json"
@@ -280,9 +281,9 @@ class TestParseToml:
result = parse_file_content(content, "toml")
assert result == {"name": "test", "count": 42}
def test_empty_table_fallback(self):
def test_empty_string_returns_empty_dict(self):
result = parse_file_content("", "toml")
assert result == ""
assert result == {}
def test_invalid_toml_fallback(self):
result = parse_file_content("not = [valid toml", "toml")

View File

@@ -6289,6 +6289,13 @@ optional = false
python-versions = ">=3.8"
groups = ["main", "dev"]
files = [
{file = "PyYAML-6.0.3-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:c2514fceb77bc5e7a2f7adfaa1feb2fb311607c9cb518dbc378688ec73d8292f"},
{file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c57bb8c96f6d1808c030b1687b9b5fb476abaa47f0db9c0101f5e9f394e97f4"},
{file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:efd7b85f94a6f21e4932043973a7ba2613b059c4a000551892ac9f1d11f5baf3"},
{file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:22ba7cfcad58ef3ecddc7ed1db3409af68d023b7f940da23c6c2a1890976eda6"},
{file = "PyYAML-6.0.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:6344df0d5755a2c9a276d4473ae6b90647e216ab4757f8426893b5dd2ac3f369"},
{file = "PyYAML-6.0.3-cp38-cp38-win32.whl", hash = "sha256:3ff07ec89bae51176c0549bc4c63aa6202991da2d9a6129d7aef7f1407d3f295"},
{file = "PyYAML-6.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:5cf4e27da7e3fbed4d6c3d8e797387aaad68102272f8f9752883bc32d61cb87b"},
{file = "pyyaml-6.0.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b"},
{file = "pyyaml-6.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02ea2dfa234451bbb8772601d7b8e426c2bfa197136796224e50e35a78777956"},
{file = "pyyaml-6.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b30236e45cf30d2b8e7b3e85881719e98507abed1011bf463a8fa23e9c3e98a8"},