fix(backend): address PR review comments for file content parser

- Move local imports (yaml, pandas) to top-level
- Add size limit guard (_MAX_BARE_REF_BYTES=10MB) for bare ref expansion
- Ignore line ranges for binary formats (parquet/xlsx) instead of garbling
- Strip query strings in infer_format URI parsing
- Fix type annotations for parser dicts (Callable instead of Any)
- Replace NaN with None in parquet/xlsx output for JSON serializability
- Fix TOML empty dict fallback (always return parsed dict)
- Remove .xls/.vnd.ms-excel mappings (openpyxl doesn't support legacy .xls)
- Add posixpath.splitext usage comment
- Update prompt to mention first-sheet-only and .xls limitation
- Restore pyyaml cp38 entries in poetry.lock
2026-04-08 03:00:28 -04:00 · 2026-03-13 03:28:31 +07:00
parent 35f54549b5
commit 340b18d40b
5 changed files with 69 additions and 41 deletions
--- a/autogpt_platform/backend/backend/copilot/prompting.py
+++ b/autogpt_platform/backend/backend/copilot/prompting.py
@@ -55,10 +55,11 @@ value.  Multiple references in one argument are all expanded.
 **Structured data**: When the **entire** argument value is a single file
 reference (no surrounding text), the platform automatically parses the file
 content based on its extension or MIME type.  Supported formats: JSON, JSONL,
-CSV, TSV, YAML, TOML, Parquet, and Excel (.xlsx).  For example, pass
-`@@agptfile:workspace://<id>` where the file is a `.csv` and the rows will be
-parsed into `list[list[str]]` automatically.  If the format is unrecognised or
-parsing fails, the content is returned as a plain string.
+CSV, TSV, YAML, TOML, Parquet, and Excel (.xlsx — first sheet only).
+For example, pass `@@agptfile:workspace://<id>` where the file is a `.csv` and
+the rows will be parsed into `list[list[str]]` automatically.  If the format is
+unrecognised or parsing fails, the content is returned as a plain string.
+Legacy `.xls` files are **not** supported — only the modern `.xlsx` format.

 **Type coercion**: The platform also coerces expanded values to match the
 block's expected input types.  For example, if a block expects `list[list[str]]`
--- a/autogpt_platform/backend/backend/copilot/sdk/file_ref.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/file_ref.py
@@ -47,6 +47,11 @@ from backend.copilot.context import (
 from backend.copilot.model import ChatSession
 from backend.copilot.tools.workspace_files import get_manager
 from backend.util.file import parse_workspace_uri
+from backend.util.file_content_parser import (
+    BINARY_FORMATS,
+    infer_format,
+    parse_file_content,
+)


 class FileRefExpansionError(Exception):
@@ -74,6 +79,8 @@ _FILE_REF_RE = re.compile(
 _MAX_EXPAND_CHARS = 200_000
 # Maximum total characters across all @@agptfile: expansions in one string.
 _MAX_TOTAL_EXPAND_CHARS = 1_000_000
+# Maximum raw byte size for bare ref structured parsing (10 MB).
+_MAX_BARE_REF_BYTES = 10_000_000


@dataclass
@@ -276,12 +283,6 @@ async def expand_file_refs_in_args(
    caller (the MCP tool wrapper) should convert this into an MCP error
    response that lets the model correct the reference before retrying.
    """
-    from backend.util.file_content_parser import (
-        BINARY_FORMATS,
-        infer_format,
-        parse_file_content,
-    )
-
    if not args:
        return args

@@ -294,20 +295,32 @@ async def expand_file_refs_in_args(
                try:
                    if fmt is not None and fmt in BINARY_FORMATS:
                        # Binary formats need raw bytes, not UTF-8 text.
+                        # Line ranges are meaningless for binary formats
+                        # (parquet/xlsx) — ignore them and parse full bytes.
                        raw = await read_file_bytes(ref.uri, user_id, session)
-                        content: str | bytes = (
-                            _apply_line_range(
-                                raw.decode("utf-8", errors="replace"),
-                                ref.start_line,
-                                ref.end_line,
+                        if len(raw) > _MAX_BARE_REF_BYTES:
+                            raise FileRefExpansionError(
+                                f"File too large for structured parsing "
+                                f"({len(raw)} bytes, limit {_MAX_BARE_REF_BYTES})"
                            )
-                            if ref.start_line or ref.end_line
-                            else raw
-                        )
+                        content: str | bytes = raw
                    else:
                        content = await resolve_file_ref(ref, user_id, session)
                except ValueError as exc:
                    raise FileRefExpansionError(str(exc)) from exc
+
+                # Guard against oversized content before parsing.
+                content_size = (
+                    len(content.encode("utf-8"))
+                    if isinstance(content, str)
+                    else len(content)
+                )
+                if content_size > _MAX_BARE_REF_BYTES:
+                    raise FileRefExpansionError(
+                        f"File too large for structured parsing "
+                        f"({content_size} bytes, limit {_MAX_BARE_REF_BYTES})"
+                    )
+
                if fmt is not None:
                    return parse_file_content(content, fmt)
                return (
--- a/autogpt_platform/backend/backend/util/file_content_parser.py
+++ b/autogpt_platform/backend/backend/util/file_content_parser.py
@@ -15,7 +15,8 @@ Supported formats:
 - **YAML** (``.yaml``, ``.yml``) — parsed via PyYAML; containers only
 - **TOML** (``.toml``) — parsed via stdlib ``tomllib``
 - **Parquet** (``.parquet``) — via pandas/pyarrow → ``list[list[Any]]`` with header row
- **Excel** (``.xlsx``, ``.xls``) — via pandas → ``list[list[Any]]`` with header row
+- **Excel** (``.xlsx``) — via pandas/openpyxl → ``list[list[Any]]`` with header row
+  (legacy ``.xls`` is **not** supported — only the modern OOXML format)

 All parsers follow the **fallback contract**: if parsing fails for *any* reason,
 the original content is returned unchanged (string for text formats, bytes for
@@ -27,9 +28,16 @@ import io
 import json
 import logging
 import tomllib
+from collections.abc import Callable
+
+# posixpath.splitext handles forward-slash URI paths correctly on all platforms,
+# unlike os.path.splitext which uses platform-native separators.
 from posixpath import splitext
 from typing import Any

+import pandas as pd
+import yaml
+
 logger = logging.getLogger(__name__)

 # ---------------------------------------------------------------------------
@@ -47,7 +55,6 @@ _EXT_TO_FORMAT: dict[str, str] = {
    ".toml": "toml",
    ".parquet": "parquet",
    ".xlsx": "xlsx",
-    ".xls": "xlsx",
 }

 _MIME_TO_FORMAT: dict[str, str] = {
@@ -61,7 +68,6 @@ _MIME_TO_FORMAT: dict[str, str] = {
    "application/toml": "toml",
    "application/vnd.apache.parquet": "parquet",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
-    "application/vnd.ms-excel": "xlsx",
 }

 # Formats that require raw bytes rather than decoded text.
@@ -83,7 +89,7 @@ def infer_format(uri: str) -> str | None:

    # 2. Check file extension from the path portion.
    #    Strip the fragment first so ".json#mime" doesn't confuse splitext.
-    path = uri.split("#")[0]
+    path = uri.split("#")[0].split("?")[0]
    _, ext = splitext(path)
    return _EXT_TO_FORMAT.get(ext.lower())

@@ -126,8 +132,6 @@ def _parse_delimited(content: str, *, delimiter: str) -> Any:


 def _parse_yaml(content: str) -> Any:
-    import yaml
-
    parsed = yaml.safe_load(content)
    if isinstance(parsed, (list, dict)):
        return parsed
@@ -136,11 +140,11 @@ def _parse_yaml(content: str) -> Any:

 def _parse_toml(content: str) -> Any:
    parsed = tomllib.loads(content)
-    # tomllib.loads always returns a dict.
-    return parsed if parsed else content
+    # tomllib.loads always returns a dict — return it even if empty.
+    return parsed


-_TEXT_PARSERS: dict[str, Any] = {
+_TEXT_PARSERS: dict[str, Callable[[str], Any]] = {
    "json": _parse_json,
    "jsonl": _parse_jsonl,
    "csv": _parse_csv,
@@ -154,26 +158,28 @@ _TEXT_PARSERS: dict[str, Any] = {
 # ---------------------------------------------------------------------------


-def _parse_parquet(content: bytes) -> Any:
-    import pandas as pd
+def _df_to_rows(df: pd.DataFrame) -> list[list[Any]]:
+    """Convert a DataFrame to ``list[list[Any]]`` with a header row.

-    df = pd.read_parquet(io.BytesIO(content))
-    # Return as list[list[Any]] with the first row being the header.
+    NaN becomes ``None`` (JSON-safe); cast to object so float columns keep it.
+    """
+    df = df.astype(object).where(df.notna(), None)
     header = df.columns.tolist()
     rows = df.values.tolist()
     return [header] + rows


+def _parse_parquet(content: bytes) -> Any:
+    df = pd.read_parquet(io.BytesIO(content))
+    return _df_to_rows(df)
+
+
 def _parse_xlsx(content: bytes) -> Any:
-    import pandas as pd
-
    df = pd.read_excel(io.BytesIO(content))
-    header = df.columns.tolist()
-    rows = df.values.tolist()
-    return [header] + rows
+    return _df_to_rows(df)


-_BINARY_PARSERS: dict[str, Any] = {
+_BINARY_PARSERS: dict[str, Callable[[bytes], Any]] = {
    "parquet": _parse_parquet,
    "xlsx": _parse_xlsx,
 }
--- a/autogpt_platform/backend/backend/util/file_content_parser_test.py
+++ b/autogpt_platform/backend/backend/util/file_content_parser_test.py
@@ -49,8 +49,9 @@ class TestInferFormat:
    def test_xlsx_extension(self):
        assert infer_format("/data/spreadsheet.xlsx") == "xlsx"

-    def test_xls_extension(self):
-        assert infer_format("/data/old_spreadsheet.xls") == "xlsx"
+    def test_xls_extension_not_supported(self):
+        # Legacy .xls requires xlrd which we don't bundle.
+        assert infer_format("/data/old_spreadsheet.xls") is None

    def test_case_insensitive(self):
        assert infer_format("/data/FILE.JSON") == "json"
@@ -280,9 +281,9 @@ class TestParseToml:
        result = parse_file_content(content, "toml")
        assert result == {"name": "test", "count": 42}

-    def test_empty_table_fallback(self):
+    def test_empty_string_returns_empty_dict(self):
        result = parse_file_content("", "toml")
-        assert result == ""
+        assert result == {}

    def test_invalid_toml_fallback(self):
        result = parse_file_content("not = [valid toml", "toml")
--- a/autogpt_platform/backend/poetry.lock
+++ b/autogpt_platform/backend/poetry.lock
@@ -6289,6 +6289,13 @@ optional = false
 python-versions = ">=3.8"
 groups = ["main", "dev"]
 files = [
+    {file = "PyYAML-6.0.3-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:c2514fceb77bc5e7a2f7adfaa1feb2fb311607c9cb518dbc378688ec73d8292f"},
+    {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c57bb8c96f6d1808c030b1687b9b5fb476abaa47f0db9c0101f5e9f394e97f4"},
+    {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:efd7b85f94a6f21e4932043973a7ba2613b059c4a000551892ac9f1d11f5baf3"},
+    {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:22ba7cfcad58ef3ecddc7ed1db3409af68d023b7f940da23c6c2a1890976eda6"},
+    {file = "PyYAML-6.0.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:6344df0d5755a2c9a276d4473ae6b90647e216ab4757f8426893b5dd2ac3f369"},
+    {file = "PyYAML-6.0.3-cp38-cp38-win32.whl", hash = "sha256:3ff07ec89bae51176c0549bc4c63aa6202991da2d9a6129d7aef7f1407d3f295"},
+    {file = "PyYAML-6.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:5cf4e27da7e3fbed4d6c3d8e797387aaad68102272f8f9752883bc32d61cb87b"},
    {file = "pyyaml-6.0.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b"},
    {file = "pyyaml-6.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02ea2dfa234451bbb8772601d7b8e426c2bfa197136796224e50e35a78777956"},
    {file = "pyyaml-6.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b30236e45cf30d2b8e7b3e85881719e98507abed1011bf463a8fa23e9c3e98a8"},