mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
The `@@agptfile:` expansion system previously used content-sniffing (trying `json.loads` then `csv.Sniffer`) to decide whether to parse file content as structured data. This was fragile — a file containing just `"42"` would be parsed as an integer, and the heuristics could misfire on ambiguous content.

This PR replaces content-sniffing with **extension/MIME-based format detection**. When the file has a well-known extension (`.json`, `.csv`, etc.) or MIME type fragment (`workspace://id#application/json`), the content is parsed accordingly. Unknown formats or parse failures always fall back to plain string — no surprises.

> [!NOTE]
> This PR builds on the `@@agptfile:` file reference protocol introduced in #12332 and the structured data auto-parsing added in #12390.
>
> **What is `@@agptfile:`?**
> It is a special URI prefix (e.g. `@@agptfile:workspace:///report.csv`) that the CoPilot SDK expands inline before sending tool arguments to blocks. This lets the AI reference workspace files by name, and the SDK automatically reads and injects the file content. See #12332 for the full design.
### Changes 🏗️

**New utility: `backend/util/file_content_parser.py`**
- `infer_format(uri)` — determines format from file extension or MIME fragment
- `parse_file_content(content, fmt)` — parses content, never raises
- Supported text formats: JSON, JSONL/NDJSON, CSV, TSV, YAML, TOML
- Supported binary formats: Parquet (via pyarrow), Excel/XLSX (via openpyxl)
- JSON scalars (strings, numbers, booleans, null) stay as strings — only containers (arrays, objects) are promoted
- CSV/TSV require ≥1 row and ≥2 columns to qualify as tabular data
- Added `openpyxl` dependency for Excel reading via pandas
- Case-insensitive MIME fragment matching per RFC 2045
- Shared `PARSE_EXCEPTIONS` constant to avoid duplication between modules

**Updated `expand_file_refs_in_args` in `file_ref.py`**
- Bare refs now use `infer_format` + `parse_file_content` instead of the old `_try_parse_structured` content-sniffing function
- Binary formats (parquet, xlsx) read raw bytes via `read_file_bytes`
- Embedded refs (text around `@@agptfile:`) still produce plain strings
- **Size guards**: Workspace and sandbox file reads now enforce a 10 MB limit (matching the existing local file limit) to prevent OOM on large files

**Updated `blocks/github/commits.py`**
- Consolidated `_create_blob` and `_create_binary_blob` into a single function with an `encoding` parameter

**Updated copilot system prompt**
- Documents the extension-based structured data parsing and supported formats

**66 new tests** in `file_content_parser_test.py` covering:
- Format inference (extension, MIME, case-insensitive, precedence)
- All 8 format parsers (happy path + edge cases + fallbacks)
- Binary format handling (string input fallback, invalid bytes fallback)
- Unknown format passthrough

### Checklist 📋

#### For code changes:
- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan:
  - [x] All 66 file_content_parser_test.py tests pass
  - [x] All 31 file_ref_test.py tests pass
  - [x] All 13 file_ref_integration_test.py tests pass
  - [x] `poetry run format` passes clean (including pyright)
511 lines
19 KiB
Python
511 lines
19 KiB
Python
"""
|
|
Tests for cloud storage integration in file utilities.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from backend.data.execution import ExecutionContext
|
|
from backend.util.file import (
|
|
is_media_file_ref,
|
|
parse_data_uri,
|
|
resolve_media_content,
|
|
store_media_file,
|
|
)
|
|
from backend.util.type import MediaFileType
|
|
|
|
|
|
def make_test_context(
    graph_exec_id: str = "test-exec-123",
    user_id: str = "test-user-123",
) -> ExecutionContext:
    """Build an ExecutionContext for tests from the given (or default) IDs."""
    context = ExecutionContext(
        graph_exec_id=graph_exec_id,
        user_id=user_id,
    )
    return context
|
|
|
|
|
|
class TestFileCloudIntegration:
    """Test cases for cloud storage integration in file utilities.

    Every test patches collaborators inside ``backend.util.file`` (cloud
    storage handler getter, virus scanner, ``Path``) and then drives
    ``store_media_file`` with a cloud path, a data URI, or a local file name.
    """

    @staticmethod
    def _mock_path_chain():
        """Create the ``Path`` mock scaffolding shared by several tests.

        Returns:
            A ``(base, resolved)`` tuple of ``MagicMock`` objects wired so that
            ``(base / anything).resolve()`` yields ``resolved`` and
            ``resolved.is_relative_to(...)`` returns ``True``. Each test adds
            its own extra behaviour (``write_bytes``, ``read_bytes``, ...) on
            top of the returned mocks.
        """
        base = MagicMock()
        target = MagicMock()
        resolved = MagicMock()

        base.mkdir = MagicMock()
        base.__truediv__ = MagicMock(return_value=target)
        target.resolve.return_value = resolved
        resolved.is_relative_to.return_value = True
        return base, resolved

    @pytest.mark.asyncio
    async def test_store_media_file_cloud_path(self):
        """Test storing a file from cloud storage path."""
        graph_exec_id = "test-exec-123"
        cloud_path = "gcs://test-bucket/uploads/456/source.txt"
        cloud_content = b"cloud file content"

        with patch(
            "backend.util.file.get_cloud_storage_handler"
        ) as mock_handler_getter, patch(
            "backend.util.file.scan_content_safe"
        ) as mock_scan, patch(
            "backend.util.file.Path"
        ) as mock_path_class:

            # Mock cloud storage handler
            mock_handler = MagicMock()
            mock_handler.is_cloud_path.return_value = True
            mock_handler.parse_cloud_path.return_value = (
                "gcs",
                "test-bucket/uploads/456/source.txt",
            )
            mock_handler.retrieve_file = AsyncMock(return_value=cloud_content)
            mock_handler_getter.return_value = mock_handler

            # Mock virus scanner
            mock_scan.return_value = None

            # Mock file system operations
            mock_base_path, mock_resolved_path = self._mock_path_chain()
            mock_resolved_path.write_bytes = MagicMock()
            mock_resolved_path.relative_to.return_value = Path("source.txt")

            # Path(<something containing "source.txt">) is used for filename
            # extraction and must expose .name; every other Path(...) call
            # gets the base-directory mock.
            mock_path_for_filename = MagicMock()
            mock_path_for_filename.name = "source.txt"

            def path_constructor(*args, **kwargs):
                if len(args) == 1 and "source.txt" in str(args[0]):
                    return mock_path_for_filename
                return mock_base_path

            # side_effect always returns a concrete mock, so no separate
            # return_value needs to be configured on the patched class.
            mock_path_class.side_effect = path_constructor

            result = await store_media_file(
                file=MediaFileType(cloud_path),
                execution_context=make_test_context(graph_exec_id=graph_exec_id),
                return_format="for_local_processing",
            )

            # Verify cloud storage operations
            mock_handler.is_cloud_path.assert_called_once_with(cloud_path)
            mock_handler.parse_cloud_path.assert_called_once_with(cloud_path)
            mock_handler.retrieve_file.assert_called_once_with(
                cloud_path, user_id="test-user-123", graph_exec_id=graph_exec_id
            )

            # Verify virus scan
            mock_scan.assert_called_once_with(cloud_content, filename="source.txt")

            # Verify file operations
            mock_resolved_path.write_bytes.assert_called_once_with(cloud_content)

            # Result should be the relative path
            assert str(result) == "source.txt"

    @pytest.mark.asyncio
    async def test_store_media_file_cloud_path_return_content(self):
        """Test storing a file from cloud storage and returning content."""
        graph_exec_id = "test-exec-123"
        cloud_path = "gcs://test-bucket/uploads/456/image.png"
        cloud_content = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR"  # PNG header

        # Every Path(...) call made inside backend.util.file returns this one
        # mock; only .name is pinned, everything else is auto-mocked.
        mock_path_obj = MagicMock()
        mock_path_obj.name = "image.png"

        with patch(
            "backend.util.file.get_cloud_storage_handler"
        ) as mock_handler_getter, patch(
            "backend.util.file.scan_content_safe"
        ) as mock_scan, patch(
            "backend.util.file.get_mime_type"
        ) as mock_mime, patch(
            "backend.util.file.base64.b64encode"
        ) as mock_b64, patch(
            "backend.util.file.Path", return_value=mock_path_obj
        ):

            # Mock cloud storage handler
            mock_handler = MagicMock()
            mock_handler.is_cloud_path.return_value = True
            mock_handler.parse_cloud_path.return_value = (
                "gcs",
                "test-bucket/uploads/456/image.png",
            )
            mock_handler.retrieve_file = AsyncMock(return_value=cloud_content)
            mock_handler_getter.return_value = mock_handler

            # Mock other operations
            mock_scan.return_value = None
            mock_mime.return_value = "image/png"
            mock_b64.return_value.decode.return_value = "iVBORw0KGgoAAAANSUhEUgA="

            result = await store_media_file(
                file=MediaFileType(cloud_path),
                execution_context=make_test_context(graph_exec_id=graph_exec_id),
                return_format="for_external_api",
            )

            # Verify result is a data URI
            assert str(result).startswith("data:image/png;base64,")

    @pytest.mark.asyncio
    async def test_store_media_file_non_cloud_path(self):
        """Test that non-cloud paths are handled normally."""
        graph_exec_id = "test-exec-123"
        data_uri = "data:text/plain;base64,SGVsbG8gd29ybGQ="

        with patch(
            "backend.util.file.get_cloud_storage_handler"
        ) as mock_handler_getter, patch(
            "backend.util.file.scan_content_safe"
        ) as mock_scan, patch(
            "backend.util.file.base64.b64decode"
        ) as mock_b64decode, patch(
            "backend.util.file.uuid.uuid4"
        ) as mock_uuid, patch(
            "backend.util.file.Path"
        ) as mock_path_class:

            # Mock cloud storage handler
            mock_handler = MagicMock()
            mock_handler.is_cloud_path.return_value = False
            # Present only so assert_not_called() below has a mock to inspect.
            mock_handler.retrieve_file = AsyncMock()
            mock_handler_getter.return_value = mock_handler

            # Mock other operations
            mock_scan.return_value = None
            mock_b64decode.return_value = b"Hello world"
            mock_uuid.return_value = "test-uuid-789"

            # Mock file system operations
            mock_base_path, mock_resolved_path = self._mock_path_chain()
            mock_path_class.return_value = mock_base_path
            mock_resolved_path.write_bytes = MagicMock()
            mock_resolved_path.relative_to.return_value = Path("test-uuid-789.txt")

            await store_media_file(
                file=MediaFileType(data_uri),
                execution_context=make_test_context(graph_exec_id=graph_exec_id),
                return_format="for_local_processing",
            )

            # Verify cloud handler was checked but not used for retrieval
            mock_handler.is_cloud_path.assert_called_once_with(data_uri)
            mock_handler.retrieve_file.assert_not_called()

            # Verify normal data URI processing occurred
            mock_b64decode.assert_called_once()
            mock_resolved_path.write_bytes.assert_called_once_with(b"Hello world")

    @pytest.mark.asyncio
    async def test_store_media_file_cloud_retrieval_error(self):
        """Test error handling when cloud retrieval fails."""
        graph_exec_id = "test-exec-123"
        cloud_path = "gcs://test-bucket/nonexistent.txt"

        with patch(
            "backend.util.file.get_cloud_storage_handler"
        ) as mock_handler_getter:

            # Mock cloud storage handler to raise error.
            # The handler itself is a MagicMock (as in the sibling tests) so
            # that is_cloud_path() returns a real True; on an AsyncMock handler
            # every method call would return a never-awaited coroutine that is
            # only truthy by accident. Only retrieve_file is awaitable.
            mock_handler = MagicMock()
            mock_handler.is_cloud_path.return_value = True
            mock_handler.retrieve_file = AsyncMock(
                side_effect=FileNotFoundError("File not found in cloud storage")
            )
            mock_handler_getter.return_value = mock_handler

            with pytest.raises(
                FileNotFoundError, match="File not found in cloud storage"
            ):
                await store_media_file(
                    file=MediaFileType(cloud_path),
                    execution_context=make_test_context(graph_exec_id=graph_exec_id),
                    return_format="for_local_processing",
                )

    @pytest.mark.asyncio
    async def test_store_media_file_local_path_scanned(self):
        """Test that local file paths are scanned for viruses."""
        graph_exec_id = "test-exec-123"
        local_file = "test_video.mp4"
        file_content = b"fake video content"

        with patch(
            "backend.util.file.get_cloud_storage_handler"
        ) as mock_handler_getter, patch(
            "backend.util.file.scan_content_safe"
        ) as mock_scan, patch(
            "backend.util.file.Path"
        ) as mock_path_class:

            # Mock cloud storage handler - not a cloud path
            mock_handler = MagicMock()
            mock_handler.is_cloud_path.return_value = False
            mock_handler_getter.return_value = mock_handler

            # Mock virus scanner
            mock_scan.return_value = None

            # Mock file system operations
            mock_base_path, mock_resolved_path = self._mock_path_chain()
            mock_path_class.return_value = mock_base_path
            mock_resolved_path.is_file.return_value = True
            mock_resolved_path.read_bytes.return_value = file_content
            mock_resolved_path.relative_to.return_value = Path(local_file)
            mock_resolved_path.name = local_file

            result = await store_media_file(
                file=MediaFileType(local_file),
                execution_context=make_test_context(graph_exec_id=graph_exec_id),
                return_format="for_local_processing",
            )

            # Verify virus scan was called for local file
            mock_scan.assert_called_once_with(file_content, filename=local_file)

            # Result should be the relative path
            assert str(result) == local_file

    @pytest.mark.asyncio
    async def test_store_media_file_local_path_virus_detected(self):
        """Test that infected local files raise VirusDetectedError."""
        from backend.api.features.store.exceptions import VirusDetectedError

        graph_exec_id = "test-exec-123"
        local_file = "infected.exe"
        file_content = b"malicious content"

        with patch(
            "backend.util.file.get_cloud_storage_handler"
        ) as mock_handler_getter, patch(
            "backend.util.file.scan_content_safe"
        ) as mock_scan, patch(
            "backend.util.file.Path"
        ) as mock_path_class:

            # Mock cloud storage handler - not a cloud path
            mock_handler = MagicMock()
            mock_handler.is_cloud_path.return_value = False
            mock_handler_getter.return_value = mock_handler

            # Mock virus scanner to detect virus
            mock_scan.side_effect = VirusDetectedError(
                "EICAR-Test-File", "File rejected due to virus detection"
            )

            # Mock file system operations
            mock_base_path, mock_resolved_path = self._mock_path_chain()
            mock_path_class.return_value = mock_base_path
            mock_resolved_path.is_file.return_value = True
            mock_resolved_path.read_bytes.return_value = file_content

            with pytest.raises(VirusDetectedError):
                await store_media_file(
                    file=MediaFileType(local_file),
                    execution_context=make_test_context(graph_exec_id=graph_exec_id),
                    return_format="for_local_processing",
                )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# is_media_file_ref
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestIsMediaFileRef:
    """Checks which string shapes is_media_file_ref() reports as media refs."""

    def test_data_uri(self):
        """A base64 data URI is a media reference."""
        candidate = "data:image/png;base64,iVBORw0KGg=="
        assert is_media_file_ref(candidate) is True

    def test_workspace_uri(self):
        """A bare workspace:// URI is a media reference."""
        candidate = "workspace://abc123"
        assert is_media_file_ref(candidate) is True

    def test_workspace_uri_with_mime(self):
        """A workspace:// URI carrying a #mime fragment is a media reference."""
        candidate = "workspace://abc123#image/png"
        assert is_media_file_ref(candidate) is True

    def test_http_url(self):
        """An http:// URL is a media reference."""
        candidate = "http://example.com/image.png"
        assert is_media_file_ref(candidate) is True

    def test_https_url(self):
        """An https:// URL is a media reference."""
        candidate = "https://example.com/image.png"
        assert is_media_file_ref(candidate) is True

    def test_plain_text(self):
        """Arbitrary text (here a code snippet) is not a media reference."""
        candidate = "print('hello')"
        assert is_media_file_ref(candidate) is False

    def test_local_path(self):
        """An absolute local path is not a media reference."""
        candidate = "/tmp/file.txt"
        assert is_media_file_ref(candidate) is False

    def test_empty_string(self):
        """The empty string is not a media reference."""
        candidate = ""
        assert is_media_file_ref(candidate) is False

    def test_filename(self):
        """A bare file name is not a media reference."""
        candidate = "image.png"
        assert is_media_file_ref(candidate) is False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# parse_data_uri
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestParseDataUri:
    """Checks parse_data_uri() on well-formed and malformed inputs."""

    def test_valid_png(self):
        """A PNG data URI splits into its MIME type and base64 payload."""
        parsed = parse_data_uri("data:image/png;base64,iVBORw0KGg==")
        assert parsed is not None
        mime, payload = parsed
        assert (mime, payload) == ("image/png", "iVBORw0KGg==")

    def test_valid_text(self):
        """A text/plain data URI splits into its MIME type and base64 payload."""
        parsed = parse_data_uri("data:text/plain;base64,SGVsbG8=")
        assert parsed is not None
        assert (parsed[0], parsed[1]) == ("text/plain", "SGVsbG8=")

    def test_mime_case_normalized(self):
        """An upper-case MIME type comes back lower-cased."""
        parsed = parse_data_uri("data:IMAGE/PNG;base64,abc")
        assert parsed is not None
        mime = parsed[0]
        assert mime == "image/png"

    def test_not_data_uri(self):
        """A workspace:// URI is rejected with None."""
        parsed = parse_data_uri("workspace://abc123")
        assert parsed is None

    def test_plain_text(self):
        """Plain text is rejected with None."""
        parsed = parse_data_uri("hello world")
        assert parsed is None

    def test_missing_base64(self):
        """A data URI without the ;base64 marker is rejected with None."""
        parsed = parse_data_uri("data:image/png;utf-8,abc")
        assert parsed is None

    def test_empty_payload(self):
        """An empty payload after the comma is accepted as an empty string."""
        parsed = parse_data_uri("data:image/png;base64,")
        assert parsed is not None
        payload = parsed[1]
        assert payload == ""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# resolve_media_content
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestResolveMediaContent:
    """Checks when resolve_media_content() passes input through vs. delegates."""

    @pytest.mark.asyncio
    async def test_plain_text_passthrough(self):
        """Plain text content (not a media ref) passes through unchanged."""
        context = make_test_context()
        content = MediaFileType("print('hello')")

        resolved = await resolve_media_content(
            content, context, return_format="for_external_api"
        )

        assert resolved == "print('hello')"

    @pytest.mark.asyncio
    async def test_empty_string_passthrough(self):
        """Empty string passes through unchanged."""
        context = make_test_context()
        content = MediaFileType("")

        resolved = await resolve_media_content(
            content, context, return_format="for_external_api"
        )

        assert resolved == ""

    @pytest.mark.asyncio
    async def test_media_ref_delegates_to_store(self):
        """Media references are resolved via store_media_file."""
        context = make_test_context()
        store_stub = AsyncMock(return_value=MediaFileType("data:image/png;base64,abc"))

        with patch("backend.util.file.store_media_file", new=store_stub) as mock_store:
            resolved = await resolve_media_content(
                MediaFileType("workspace://img123"),
                context,
                return_format="for_external_api",
            )

            assert resolved == "data:image/png;base64,abc"
            mock_store.assert_called_once_with(
                MediaFileType("workspace://img123"),
                context,
                return_format="for_external_api",
            )

    @pytest.mark.asyncio
    async def test_data_uri_delegates_to_store(self):
        """Data URIs are also resolved via store_media_file."""
        context = make_test_context()
        data_uri = "data:image/png;base64,iVBORw0KGg=="
        store_stub = AsyncMock(return_value=MediaFileType(data_uri))

        with patch("backend.util.file.store_media_file", new=store_stub) as mock_store:
            resolved = await resolve_media_content(
                MediaFileType(data_uri),
                context,
                return_format="for_external_api",
            )

            assert resolved == data_uri
            mock_store.assert_called_once()

    @pytest.mark.asyncio
    async def test_https_url_delegates_to_store(self):
        """HTTPS URLs are resolved via store_media_file."""
        context = make_test_context()
        store_stub = AsyncMock(return_value=MediaFileType("data:image/png;base64,abc"))

        with patch("backend.util.file.store_media_file", new=store_stub) as mock_store:
            resolved = await resolve_media_content(
                MediaFileType("https://example.com/image.png"),
                context,
                return_format="for_local_processing",
            )

            assert resolved == "data:image/png;base64,abc"
            mock_store.assert_called_once_with(
                MediaFileType("https://example.com/image.png"),
                context,
                return_format="for_local_processing",
            )
|