fix: add workspace/sandbox size limits, deduplicate parse exceptions, case-insensitive MIME

- Enforce 10MB size limit on workspace and sandbox file reads
- Extract shared PARSE_EXCEPTIONS constant to eliminate duplication
- Consolidate _create_blob/_create_binary_blob into single function
- Add globals() caching for lazy PEP 562 imports
- Fix case-insensitive MIME fragment matching per RFC 2045
This commit is contained in:
Zamil Majdy
2026-03-14 22:27:15 +07:00
parent f037e02bd6
commit c39e1a2701
4 changed files with 40 additions and 50 deletions

View File

@@ -278,19 +278,11 @@ class GithubMultiFileCommitBlock(Block):
base_tree_sha = commit_data["tree"]["sha"]
# 3. Build tree entries for each file operation (blobs created concurrently)
async def _create_blob(content: str) -> str:
async def _create_blob(content: str, encoding: str = "utf-8") -> str:
blob_url = repo_url + "/git/blobs"
blob_response = await api.post(
blob_url,
json={"content": content, "encoding": "utf-8"},
)
return blob_response.json()["sha"]
async def _create_binary_blob(b64_content: str) -> str:
blob_url = repo_url + "/git/blobs"
blob_response = await api.post(
blob_url,
json={"content": b64_content, "encoding": "base64"},
json={"content": content, "encoding": encoding},
)
return blob_response.json()["sha"]
@@ -312,7 +304,7 @@ class GithubMultiFileCommitBlock(Block):
else:
upsert_files.append((path, file_op.get("content", "")))
# Create all blobs concurrently. Data URIs (from store_media_file)
# Create all blobs concurrently. Data URIs (from store_media_file)
# are sent as base64 blobs to preserve binary content.
if upsert_files:
@@ -320,7 +312,7 @@ class GithubMultiFileCommitBlock(Block):
parsed = parse_data_uri(content)
if parsed is not None:
_, b64_payload = parsed
return await _create_binary_blob(b64_payload)
return await _create_blob(b64_payload, encoding="base64")
return await _create_blob(content)
blob_shas = await asyncio.gather(

View File

@@ -20,9 +20,11 @@ def __getattr__(name: str) -> Any:
if name == "stream_chat_completion_sdk":
from .service import stream_chat_completion_sdk
globals()["stream_chat_completion_sdk"] = stream_chat_completion_sdk
return stream_chat_completion_sdk
if name == "create_copilot_mcp_server":
from .tool_adapter import create_copilot_mcp_server
globals()["create_copilot_mcp_server"] = create_copilot_mcp_server
return create_copilot_mcp_server
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

View File

@@ -31,20 +31,13 @@ Examples
@@agptfile:/home/user/script.sh
"""
import csv
import itertools
import json
import logging
import os
import re
import tomllib
import zipfile
from dataclasses import dataclass
from typing import Any
import yaml
from openpyxl.utils.exceptions import InvalidFileException as OpenpyxlInvalidFile
from backend.copilot.context import (
get_current_sandbox,
get_sdk_cwd,
@@ -57,6 +50,7 @@ from backend.util.file import parse_workspace_uri
from backend.util.file_content_parser import (
BINARY_FORMATS,
MIME_TO_FORMAT,
PARSE_EXCEPTIONS,
infer_format,
parse_file_content,
)
@@ -157,7 +151,7 @@ async def read_file_bytes(
manager = await get_workspace_manager(user_id, session.session_id)
ws = parse_workspace_uri(plain)
try:
return await (
data = await (
manager.read_file(ws.file_ref)
if ws.is_path
else manager.read_file_by_id(ws.file_ref)
@@ -166,6 +160,11 @@ async def read_file_bytes(
raise ValueError(f"File not found: {plain}")
except (PermissionError, OSError) as exc:
raise ValueError(f"Failed to read {plain}: {exc}") from exc
if len(data) > _MAX_BARE_REF_BYTES:
raise ValueError(
f"File too large ({len(data)} bytes, limit {_MAX_BARE_REF_BYTES})"
)
return data
if is_allowed_local_path(plain, get_sdk_cwd()):
resolved = os.path.realpath(os.path.expanduser(plain))
@@ -191,9 +190,14 @@ async def read_file_bytes(
f"Path is not allowed (not in workspace, sdk_cwd, or sandbox): {plain}"
) from exc
try:
return bytes(await sandbox.files.read(remote, format="bytes"))
data = bytes(await sandbox.files.read(remote, format="bytes"))
except Exception as exc:
raise ValueError(f"Failed to read from sandbox: {plain}: {exc}") from exc
if len(data) > _MAX_BARE_REF_BYTES:
raise ValueError(
f"File too large ({len(data)} bytes, limit {_MAX_BARE_REF_BYTES})"
)
return data
raise ValueError(
f"Path is not allowed (not in workspace, sdk_cwd, or sandbox): {plain}"
@@ -483,20 +487,7 @@ async def _expand_bare_ref(
strict = fmt in BINARY_FORMATS
try:
parsed = parse_file_content(content, fmt, strict=strict)
except (
json.JSONDecodeError,
csv.Error,
yaml.YAMLError,
tomllib.TOMLDecodeError,
ValueError,
UnicodeDecodeError,
ImportError,
OSError,
KeyError,
TypeError,
zipfile.BadZipFile,
OpenpyxlInvalidFile,
) as exc:
except PARSE_EXCEPTIONS as exc:
raise FileRefExpansionError(f"Failed to parse {fmt} file: {exc}") from exc
# Normalize bytes fallback to str so tools never
# receive raw bytes when parsing fails.

View File

@@ -80,6 +80,24 @@ MIME_TO_FORMAT: dict[str, str] = {
# Formats that require raw bytes rather than decoded text.
BINARY_FORMATS: frozenset[str] = frozenset({"parquet", "xlsx"})
# Exception types that can be raised during file content parsing.
# Shared between ``parse_file_content`` (which catches them in non-strict mode)
# and ``file_ref._expand_bare_ref`` (which re-raises them as FileRefExpansionError).
# NOTE(review): several entries are ``ValueError`` subclasses
# (``json.JSONDecodeError``, ``UnicodeDecodeError``, ``tomllib.TOMLDecodeError``),
# so they are redundant for catching purposes — kept explicit so the tuple
# documents exactly which parser failures are expected.
PARSE_EXCEPTIONS: tuple[type[BaseException], ...] = (
    json.JSONDecodeError,
    csv.Error,
    yaml.YAMLError,
    tomllib.TOMLDecodeError,
    ValueError,
    UnicodeDecodeError,
    ImportError,
    OSError,
    KeyError,
    TypeError,
    zipfile.BadZipFile,
    OpenpyxlInvalidFile,
)
def infer_format(uri: str) -> str | None:
"""Return a format label based on URI extension or MIME fragment.
@@ -90,7 +108,7 @@ def infer_format(uri: str) -> str | None:
# 1. Check MIME fragment (workspace://abc123#application/json)
if "#" in uri:
_, fragment = uri.rsplit("#", 1)
fmt = MIME_TO_FORMAT.get(fragment)
fmt = MIME_TO_FORMAT.get(fragment.lower())
if fmt:
return fmt
@@ -270,20 +288,7 @@ def parse_file_content(content: str | bytes, fmt: str, *, strict: bool = False)
content = content.decode("utf-8", errors="replace")
return parser(content)
except (
json.JSONDecodeError,
csv.Error,
yaml.YAMLError,
tomllib.TOMLDecodeError,
ValueError,
UnicodeDecodeError,
ImportError,
OSError,
KeyError,
TypeError,
zipfile.BadZipFile,
OpenpyxlInvalidFile,
):
except PARSE_EXCEPTIONS:
if strict:
raise
logger.debug("Structured parsing failed for format=%s, falling back", fmt)