fix(backend): use actual byte size for string size guard, narrow exception handling

- Size guard in _expand now computes len(content.encode("utf-8")) for strings instead of len(content), which returns the character count rather than the byte count. This fixes a security issue where multi-byte UTF-8 strings (e.g. emoji) could pass up to 40MB through a 10MB byte limit.
- Narrow except Exception in _infer_format_from_workspace to catch only the expected IO/lookup failures (ValueError, FileNotFoundError, OSError, PermissionError).
- Narrow except Exception in parse_file_content to catch only the expected parse failures, letting programming bugs surface.
2026-04-08 03:00:28 -04:00 · 2026-03-13 19:40:20 +07:00
parent f1366ea139
commit b7551a1d18
2 changed files with 20 additions and 5 deletions
--- a/autogpt_platform/backend/backend/copilot/sdk/file_ref.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/file_ref.py
@@ -286,7 +286,7 @@ async def _infer_format_from_workspace(
        if fmt:
            return fmt
        return infer_format(info.name)
-    except Exception:
+    except (ValueError, FileNotFoundError, OSError, PermissionError):
        logger.debug("workspace metadata lookup failed for %s", uri, exc_info=True)
        return None

@@ -447,9 +447,14 @@ async def expand_file_refs_in_args(
                    raise FileRefExpansionError(str(exc)) from exc

                # Guard against oversized content before parsing.
-                # For strings, len() returns character count which is a lower
-                # bound on UTF-8 byte size — sufficient for a safety guard.
-                content_size = len(content)
+                if isinstance(content, bytes):
+                    content_size = len(content)
+                else:
+                    # len() on str returns character count, but multi-byte
+                    # UTF-8 chars (e.g. emoji) mean byte size can be up to
+                    # 4x the character count.  Use the actual encoded byte
+                    # length for an accurate guard.
+                    content_size = len(content.encode("utf-8"))
                if content_size > _MAX_BARE_REF_BYTES:
                    raise FileRefExpansionError(
                        f"File too large for structured parsing "
--- a/autogpt_platform/backend/backend/util/file_content_parser.py
+++ b/autogpt_platform/backend/backend/util/file_content_parser.py
@@ -244,7 +244,17 @@ def parse_file_content(content: str | bytes, fmt: str, *, strict: bool = False)
            content = content.decode("utf-8", errors="replace")
        return parser(content)

-    except Exception:
+    except (
+        json.JSONDecodeError,
+        csv.Error,
+        yaml.YAMLError,
+        tomllib.TOMLDecodeError,
+        ValueError,
+        UnicodeDecodeError,
+        pd.errors.ParserError,
+        ImportError,
+        OSError,
+    ):
        if strict:
            raise
        logger.debug("Structured parsing failed for format=%s, falling back", fmt)