mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
fix(backend): address PR review comments for file content parser
- Move local imports (yaml, pandas) to top-level
- Add size limit guard (_MAX_BARE_REF_BYTES=10MB) for bare ref expansion
- Ignore line ranges for binary formats (parquet/xlsx) instead of garbling
- Strip query strings in infer_format URI parsing
- Fix type annotations for parser dicts (Callable instead of Any)
- Replace NaN with None in parquet/xlsx output for JSON serializability
- Fix TOML empty dict fallback (always return parsed dict)
- Remove .xls/.vnd.ms-excel mappings (openpyxl doesn't support legacy .xls)
- Add posixpath.splitext usage comment
- Update prompt to mention first-sheet-only and .xls limitation
- Restore pyyaml cp38 entries in poetry.lock
This commit is contained in:
@@ -55,10 +55,11 @@ value. Multiple references in one argument are all expanded.
|
||||
**Structured data**: When the **entire** argument value is a single file
|
||||
reference (no surrounding text), the platform automatically parses the file
|
||||
content based on its extension or MIME type. Supported formats: JSON, JSONL,
|
||||
CSV, TSV, YAML, TOML, Parquet, and Excel (.xlsx). For example, pass
|
||||
`@@agptfile:workspace://<id>` where the file is a `.csv` and the rows will be
|
||||
parsed into `list[list[str]]` automatically. If the format is unrecognised or
|
||||
parsing fails, the content is returned as a plain string.
|
||||
CSV, TSV, YAML, TOML, Parquet, and Excel (.xlsx — first sheet only).
|
||||
For example, pass `@@agptfile:workspace://<id>` where the file is a `.csv` and
|
||||
the rows will be parsed into `list[list[str]]` automatically. If the format is
|
||||
unrecognised or parsing fails, the content is returned as a plain string.
|
||||
Legacy `.xls` files are **not** supported — only the modern `.xlsx` format.
|
||||
|
||||
**Type coercion**: The platform also coerces expanded values to match the
|
||||
block's expected input types. For example, if a block expects `list[list[str]]`
|
||||
|
||||
@@ -47,6 +47,11 @@ from backend.copilot.context import (
|
||||
from backend.copilot.model import ChatSession
|
||||
from backend.copilot.tools.workspace_files import get_manager
|
||||
from backend.util.file import parse_workspace_uri
|
||||
from backend.util.file_content_parser import (
|
||||
BINARY_FORMATS,
|
||||
infer_format,
|
||||
parse_file_content,
|
||||
)
|
||||
|
||||
|
||||
class FileRefExpansionError(Exception):
|
||||
@@ -74,6 +79,8 @@ _FILE_REF_RE = re.compile(
|
||||
_MAX_EXPAND_CHARS = 200_000
|
||||
# Maximum total characters across all @@agptfile: expansions in one string.
|
||||
_MAX_TOTAL_EXPAND_CHARS = 1_000_000
|
||||
# Maximum raw byte size for bare ref structured parsing (10 MB).
|
||||
_MAX_BARE_REF_BYTES = 10_000_000
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -276,12 +283,6 @@ async def expand_file_refs_in_args(
|
||||
caller (the MCP tool wrapper) should convert this into an MCP error
|
||||
response that lets the model correct the reference before retrying.
|
||||
"""
|
||||
from backend.util.file_content_parser import (
|
||||
BINARY_FORMATS,
|
||||
infer_format,
|
||||
parse_file_content,
|
||||
)
|
||||
|
||||
if not args:
|
||||
return args
|
||||
|
||||
@@ -294,20 +295,32 @@ async def expand_file_refs_in_args(
|
||||
try:
|
||||
if fmt is not None and fmt in BINARY_FORMATS:
|
||||
# Binary formats need raw bytes, not UTF-8 text.
|
||||
# Line ranges are meaningless for binary formats
|
||||
# (parquet/xlsx) — ignore them and parse full bytes.
|
||||
raw = await read_file_bytes(ref.uri, user_id, session)
|
||||
content: str | bytes = (
|
||||
_apply_line_range(
|
||||
raw.decode("utf-8", errors="replace"),
|
||||
ref.start_line,
|
||||
ref.end_line,
|
||||
if len(raw) > _MAX_BARE_REF_BYTES:
|
||||
raise FileRefExpansionError(
|
||||
f"File too large for structured parsing "
|
||||
f"({len(raw)} bytes, limit {_MAX_BARE_REF_BYTES})"
|
||||
)
|
||||
if ref.start_line or ref.end_line
|
||||
else raw
|
||||
)
|
||||
content: str | bytes = raw
|
||||
else:
|
||||
content = await resolve_file_ref(ref, user_id, session)
|
||||
except ValueError as exc:
|
||||
raise FileRefExpansionError(str(exc)) from exc
|
||||
|
||||
# Guard against oversized content before parsing.
|
||||
content_size = (
|
||||
len(content.encode("utf-8"))
|
||||
if isinstance(content, str)
|
||||
else len(content)
|
||||
)
|
||||
if content_size > _MAX_BARE_REF_BYTES:
|
||||
raise FileRefExpansionError(
|
||||
f"File too large for structured parsing "
|
||||
f"({content_size} bytes, limit {_MAX_BARE_REF_BYTES})"
|
||||
)
|
||||
|
||||
if fmt is not None:
|
||||
return parse_file_content(content, fmt)
|
||||
return (
|
||||
|
||||
@@ -15,7 +15,8 @@ Supported formats:
|
||||
- **YAML** (``.yaml``, ``.yml``) — parsed via PyYAML; containers only
|
||||
- **TOML** (``.toml``) — parsed via stdlib ``tomllib``
|
||||
- **Parquet** (``.parquet``) — via pandas/pyarrow → ``list[list[Any]]`` with header row
|
||||
- **Excel** (``.xlsx``, ``.xls``) — via pandas → ``list[list[Any]]`` with header row
|
||||
- **Excel** (``.xlsx``) — via pandas/openpyxl → ``list[list[Any]]`` with header row
|
||||
(legacy ``.xls`` is **not** supported — only the modern OOXML format)
|
||||
|
||||
All parsers follow the **fallback contract**: if parsing fails for *any* reason,
|
||||
the original content is returned unchanged (string for text formats, bytes for
|
||||
@@ -27,9 +28,16 @@ import io
|
||||
import json
|
||||
import logging
|
||||
import tomllib
|
||||
from collections.abc import Callable
|
||||
|
||||
# posixpath.splitext handles forward-slash URI paths correctly on all platforms,
|
||||
# unlike os.path.splitext which uses platform-native separators.
|
||||
from posixpath import splitext
|
||||
from typing import Any
|
||||
|
||||
import pandas as pd
|
||||
import yaml
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -47,7 +55,6 @@ _EXT_TO_FORMAT: dict[str, str] = {
|
||||
".toml": "toml",
|
||||
".parquet": "parquet",
|
||||
".xlsx": "xlsx",
|
||||
".xls": "xlsx",
|
||||
}
|
||||
|
||||
_MIME_TO_FORMAT: dict[str, str] = {
|
||||
@@ -61,7 +68,6 @@ _MIME_TO_FORMAT: dict[str, str] = {
|
||||
"application/toml": "toml",
|
||||
"application/vnd.apache.parquet": "parquet",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
|
||||
"application/vnd.ms-excel": "xlsx",
|
||||
}
|
||||
|
||||
# Formats that require raw bytes rather than decoded text.
|
||||
@@ -83,7 +89,7 @@ def infer_format(uri: str) -> str | None:
|
||||
|
||||
# 2. Check file extension from the path portion.
|
||||
# Strip the fragment and any query string first so suffixes like ".json#mime" or ".json?v=2" don't confuse splitext.
|
||||
path = uri.split("#")[0]
|
||||
path = uri.split("#")[0].split("?")[0]
|
||||
_, ext = splitext(path)
|
||||
return _EXT_TO_FORMAT.get(ext.lower())
|
||||
|
||||
@@ -126,8 +132,6 @@ def _parse_delimited(content: str, *, delimiter: str) -> Any:
|
||||
|
||||
|
||||
def _parse_yaml(content: str) -> Any:
|
||||
import yaml
|
||||
|
||||
parsed = yaml.safe_load(content)
|
||||
if isinstance(parsed, (list, dict)):
|
||||
return parsed
|
||||
@@ -136,11 +140,11 @@ def _parse_yaml(content: str) -> Any:
|
||||
|
||||
def _parse_toml(content: str) -> Any:
|
||||
parsed = tomllib.loads(content)
|
||||
# tomllib.loads always returns a dict.
|
||||
return parsed if parsed else content
|
||||
# tomllib.loads always returns a dict — return it even if empty.
|
||||
return parsed
|
||||
|
||||
|
||||
_TEXT_PARSERS: dict[str, Any] = {
|
||||
_TEXT_PARSERS: dict[str, Callable[[str], Any]] = {
|
||||
"json": _parse_json,
|
||||
"jsonl": _parse_jsonl,
|
||||
"csv": _parse_csv,
|
||||
@@ -154,26 +158,28 @@ _TEXT_PARSERS: dict[str, Any] = {
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _parse_parquet(content: bytes) -> Any:
|
||||
import pandas as pd
|
||||
def _df_to_rows(df: pd.DataFrame) -> list[list[Any]]:
|
||||
"""Convert a DataFrame to ``list[list[Any]]`` with a header row.
|
||||
|
||||
df = pd.read_parquet(io.BytesIO(content))
|
||||
# Return as list[list[Any]] with the first row being the header.
|
||||
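NaN values are meant to be replaced with ``None`` so the result is JSON-serializable.
NOTE(review): on recent pandas, ``df.where(cond, None)`` can leave NaN in float64
columns unless the frame is cast to ``object`` first — verify with a float column.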
|
||||
"""
|
||||
df = df.where(df.notna(), None)
|
||||
header = df.columns.tolist()
|
||||
rows = df.values.tolist()
|
||||
return [header] + rows
|
||||
|
||||
|
||||
def _parse_parquet(content: bytes) -> Any:
|
||||
df = pd.read_parquet(io.BytesIO(content))
|
||||
return _df_to_rows(df)
|
||||
|
||||
|
||||
def _parse_xlsx(content: bytes) -> Any:
|
||||
import pandas as pd
|
||||
|
||||
df = pd.read_excel(io.BytesIO(content))
|
||||
header = df.columns.tolist()
|
||||
rows = df.values.tolist()
|
||||
return [header] + rows
|
||||
return _df_to_rows(df)
|
||||
|
||||
|
||||
_BINARY_PARSERS: dict[str, Any] = {
|
||||
_BINARY_PARSERS: dict[str, Callable[[bytes], Any]] = {
|
||||
"parquet": _parse_parquet,
|
||||
"xlsx": _parse_xlsx,
|
||||
}
|
||||
|
||||
@@ -49,8 +49,9 @@ class TestInferFormat:
|
||||
def test_xlsx_extension(self):
|
||||
assert infer_format("/data/spreadsheet.xlsx") == "xlsx"
|
||||
|
||||
def test_xls_extension(self):
|
||||
assert infer_format("/data/old_spreadsheet.xls") == "xlsx"
|
||||
def test_xls_extension_not_supported(self):
|
||||
# Legacy .xls requires xlrd which we don't bundle.
|
||||
assert infer_format("/data/old_spreadsheet.xls") is None
|
||||
|
||||
def test_case_insensitive(self):
|
||||
assert infer_format("/data/FILE.JSON") == "json"
|
||||
@@ -280,9 +281,9 @@ class TestParseToml:
|
||||
result = parse_file_content(content, "toml")
|
||||
assert result == {"name": "test", "count": 42}
|
||||
|
||||
def test_empty_table_fallback(self):
|
||||
def test_empty_string_returns_empty_dict(self):
|
||||
result = parse_file_content("", "toml")
|
||||
assert result == ""
|
||||
assert result == {}
|
||||
|
||||
def test_invalid_toml_fallback(self):
|
||||
result = parse_file_content("not = [valid toml", "toml")
|
||||
|
||||
7
autogpt_platform/backend/poetry.lock
generated
7
autogpt_platform/backend/poetry.lock
generated
@@ -6289,6 +6289,13 @@ optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main", "dev"]
|
||||
files = [
|
||||
{file = "PyYAML-6.0.3-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:c2514fceb77bc5e7a2f7adfaa1feb2fb311607c9cb518dbc378688ec73d8292f"},
|
||||
{file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c57bb8c96f6d1808c030b1687b9b5fb476abaa47f0db9c0101f5e9f394e97f4"},
|
||||
{file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:efd7b85f94a6f21e4932043973a7ba2613b059c4a000551892ac9f1d11f5baf3"},
|
||||
{file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:22ba7cfcad58ef3ecddc7ed1db3409af68d023b7f940da23c6c2a1890976eda6"},
|
||||
{file = "PyYAML-6.0.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:6344df0d5755a2c9a276d4473ae6b90647e216ab4757f8426893b5dd2ac3f369"},
|
||||
{file = "PyYAML-6.0.3-cp38-cp38-win32.whl", hash = "sha256:3ff07ec89bae51176c0549bc4c63aa6202991da2d9a6129d7aef7f1407d3f295"},
|
||||
{file = "PyYAML-6.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:5cf4e27da7e3fbed4d6c3d8e797387aaad68102272f8f9752883bc32d61cb87b"},
|
||||
{file = "pyyaml-6.0.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b"},
|
||||
{file = "pyyaml-6.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02ea2dfa234451bbb8772601d7b8e426c2bfa197136796224e50e35a78777956"},
|
||||
{file = "pyyaml-6.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b30236e45cf30d2b8e7b3e85881719e98507abed1011bf463a8fa23e9c3e98a8"},
|
||||
|
||||
Reference in New Issue
Block a user