Files
AutoGPT/autogpt_platform/backend/backend/util/file_test.py
Zamil Majdy 9a41312769 feat(backend/copilot): parse @@agptfile bare refs by file extension (#12392)
The `@@agptfile:` expansion system previously used content-sniffing
(trying
`json.loads` then `csv.Sniffer`) to decide whether to parse file content
as
structured data. This was fragile — a file containing just `"42"` would
be
parsed as an integer, and the heuristics could misfire on ambiguous
content.

This PR replaces content-sniffing with **extension/MIME-based format
detection**.
When the file has a well-known extension (`.json`, `.csv`, etc.) or MIME
type
fragment (`workspace://id#application/json`), the content is parsed
accordingly.
Unknown formats or parse failures always fall back to plain string — no
surprises.

> [!NOTE]
> This PR builds on the `@@agptfile:` file reference protocol introduced
in #12332 and the structured data auto-parsing added in #12390.
>
> **What is `@@agptfile:`?**
> It is a special URI prefix (e.g. `@@agptfile:workspace:///report.csv`)
that the CoPilot SDK expands inline before sending tool arguments to
blocks. This lets the AI reference workspace files by name, and the SDK
automatically reads and injects the file content. See #12332 for the
full design.

### Changes 🏗️

**New utility: `backend/util/file_content_parser.py`**
- `infer_format(uri)` — determines format from file extension or MIME
fragment
- `parse_file_content(content, fmt)` — parses content, never raises
- Supported text formats: JSON, JSONL/NDJSON, CSV, TSV, YAML, TOML
- Supported binary formats: Parquet (via pyarrow), Excel/XLSX (via
openpyxl)
- JSON scalars (strings, numbers, booleans, null) stay as strings — only
  containers (arrays, objects) are promoted
- CSV/TSV require ≥1 row and ≥2 columns to qualify as tabular data
- Added `openpyxl` dependency for Excel reading via pandas
- Case-insensitive MIME fragment matching per RFC 2045
- Shared `PARSE_EXCEPTIONS` constant to avoid duplication between
modules

**Updated `expand_file_refs_in_args` in `file_ref.py`**
- Bare refs now use `infer_format` + `parse_file_content` instead of the
  old `_try_parse_structured` content-sniffing function
- Binary formats (parquet, xlsx) read raw bytes via `read_file_bytes`
- Embedded refs (text around `@@agptfile:`) still produce plain strings
- **Size guards**: Workspace and sandbox file reads now enforce a 10 MB
limit
  (matching the existing local file limit) to prevent OOM on large files

**Updated `blocks/github/commits.py`**
- Consolidated `_create_blob` and `_create_binary_blob` into a single
function
  with an `encoding` parameter

**Updated copilot system prompt**
- Documents the extension-based structured data parsing and supported
formats

**66 new tests** in `file_content_parser_test.py` covering:
- Format inference (extension, MIME, case-insensitive, precedence)
- All 8 format parsers (happy path + edge cases + fallbacks)
- Binary format handling (string input fallback, invalid bytes fallback)
- Unknown format passthrough

### Checklist 📋

#### For code changes:
- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan:
  - [x] All 66 file_content_parser_test.py tests pass
  - [x] All 31 file_ref_test.py tests pass
  - [x] All 13 file_ref_integration_test.py tests pass
  - [x] `poetry run format` passes clean (including pyright)
2026-03-16 22:31:21 +00:00

511 lines
19 KiB
Python

"""
Tests for cloud storage integration in file utilities.
"""
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from backend.data.execution import ExecutionContext
from backend.util.file import (
is_media_file_ref,
parse_data_uri,
resolve_media_content,
store_media_file,
)
from backend.util.type import MediaFileType
def make_test_context(
graph_exec_id: str = "test-exec-123",
user_id: str = "test-user-123",
) -> ExecutionContext:
"""Helper to create test ExecutionContext."""
return ExecutionContext(
user_id=user_id,
graph_exec_id=graph_exec_id,
)
class TestFileCloudIntegration:
"""Test cases for cloud storage integration in file utilities."""
@pytest.mark.asyncio
async def test_store_media_file_cloud_path(self):
"""Test storing a file from cloud storage path."""
graph_exec_id = "test-exec-123"
cloud_path = "gcs://test-bucket/uploads/456/source.txt"
cloud_content = b"cloud file content"
with patch(
"backend.util.file.get_cloud_storage_handler"
) as mock_handler_getter, patch(
"backend.util.file.scan_content_safe"
) as mock_scan, patch(
"backend.util.file.Path"
) as mock_path_class:
# Mock cloud storage handler
mock_handler = MagicMock()
mock_handler.is_cloud_path.return_value = True
mock_handler.parse_cloud_path.return_value = (
"gcs",
"test-bucket/uploads/456/source.txt",
)
mock_handler.retrieve_file = AsyncMock(return_value=cloud_content)
mock_handler_getter.return_value = mock_handler
# Mock virus scanner
mock_scan.return_value = None
# Mock file system operations
mock_base_path = MagicMock()
mock_target_path = MagicMock()
mock_resolved_path = MagicMock()
mock_path_class.return_value = mock_base_path
mock_base_path.mkdir = MagicMock()
mock_base_path.__truediv__ = MagicMock(return_value=mock_target_path)
mock_target_path.resolve.return_value = mock_resolved_path
mock_resolved_path.is_relative_to.return_value = True
mock_resolved_path.write_bytes = MagicMock()
mock_resolved_path.relative_to.return_value = Path("source.txt")
# Configure the main Path mock to handle filename extraction
# When Path(path_part) is called, it should return a mock with .name = "source.txt"
mock_path_for_filename = MagicMock()
mock_path_for_filename.name = "source.txt"
# The Path constructor should return different mocks for different calls
def path_constructor(*args, **kwargs):
if len(args) == 1 and "source.txt" in str(args[0]):
return mock_path_for_filename
else:
return mock_base_path
mock_path_class.side_effect = path_constructor
result = await store_media_file(
file=MediaFileType(cloud_path),
execution_context=make_test_context(graph_exec_id=graph_exec_id),
return_format="for_local_processing",
)
# Verify cloud storage operations
mock_handler.is_cloud_path.assert_called_once_with(cloud_path)
mock_handler.parse_cloud_path.assert_called_once_with(cloud_path)
mock_handler.retrieve_file.assert_called_once_with(
cloud_path, user_id="test-user-123", graph_exec_id=graph_exec_id
)
# Verify virus scan
mock_scan.assert_called_once_with(cloud_content, filename="source.txt")
# Verify file operations
mock_resolved_path.write_bytes.assert_called_once_with(cloud_content)
# Result should be the relative path
assert str(result) == "source.txt"
@pytest.mark.asyncio
async def test_store_media_file_cloud_path_return_content(self):
"""Test storing a file from cloud storage and returning content."""
graph_exec_id = "test-exec-123"
cloud_path = "gcs://test-bucket/uploads/456/image.png"
cloud_content = b"\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIHDR" # PNG header
with patch(
"backend.util.file.get_cloud_storage_handler"
) as mock_handler_getter, patch(
"backend.util.file.scan_content_safe"
) as mock_scan, patch(
"backend.util.file.get_mime_type"
) as mock_mime, patch(
"backend.util.file.base64.b64encode"
) as mock_b64, patch(
"backend.util.file.Path"
) as mock_path_class:
# Mock cloud storage handler
mock_handler = MagicMock()
mock_handler.is_cloud_path.return_value = True
mock_handler.parse_cloud_path.return_value = (
"gcs",
"test-bucket/uploads/456/image.png",
)
mock_handler.retrieve_file = AsyncMock(return_value=cloud_content)
mock_handler_getter.return_value = mock_handler
# Mock other operations
mock_scan.return_value = None
mock_mime.return_value = "image/png"
mock_b64.return_value.decode.return_value = "iVBORw0KGgoAAAANSUhEUgA="
# Mock file system operations
mock_base_path = MagicMock()
mock_target_path = MagicMock()
mock_resolved_path = MagicMock()
mock_path_class.return_value = mock_base_path
mock_base_path.mkdir = MagicMock()
mock_base_path.__truediv__ = MagicMock(return_value=mock_target_path)
mock_target_path.resolve.return_value = mock_resolved_path
mock_resolved_path.is_relative_to.return_value = True
mock_resolved_path.write_bytes = MagicMock()
mock_resolved_path.read_bytes.return_value = cloud_content
# Mock Path constructor for filename extraction
mock_path_obj = MagicMock()
mock_path_obj.name = "image.png"
with patch("backend.util.file.Path", return_value=mock_path_obj):
result = await store_media_file(
file=MediaFileType(cloud_path),
execution_context=make_test_context(graph_exec_id=graph_exec_id),
return_format="for_external_api",
)
# Verify result is a data URI
assert str(result).startswith("data:image/png;base64,")
@pytest.mark.asyncio
async def test_store_media_file_non_cloud_path(self):
"""Test that non-cloud paths are handled normally."""
graph_exec_id = "test-exec-123"
data_uri = "data:text/plain;base64,SGVsbG8gd29ybGQ="
with patch(
"backend.util.file.get_cloud_storage_handler"
) as mock_handler_getter, patch(
"backend.util.file.scan_content_safe"
) as mock_scan, patch(
"backend.util.file.base64.b64decode"
) as mock_b64decode, patch(
"backend.util.file.uuid.uuid4"
) as mock_uuid, patch(
"backend.util.file.Path"
) as mock_path_class:
# Mock cloud storage handler
mock_handler = MagicMock()
mock_handler.is_cloud_path.return_value = False
mock_handler.retrieve_file = (
AsyncMock()
) # Add this even though it won't be called
mock_handler_getter.return_value = mock_handler
# Mock other operations
mock_scan.return_value = None
mock_b64decode.return_value = b"Hello world"
mock_uuid.return_value = "test-uuid-789"
# Mock file system operations
mock_base_path = MagicMock()
mock_target_path = MagicMock()
mock_resolved_path = MagicMock()
mock_path_class.return_value = mock_base_path
mock_base_path.mkdir = MagicMock()
mock_base_path.__truediv__ = MagicMock(return_value=mock_target_path)
mock_target_path.resolve.return_value = mock_resolved_path
mock_resolved_path.is_relative_to.return_value = True
mock_resolved_path.write_bytes = MagicMock()
mock_resolved_path.relative_to.return_value = Path("test-uuid-789.txt")
await store_media_file(
file=MediaFileType(data_uri),
execution_context=make_test_context(graph_exec_id=graph_exec_id),
return_format="for_local_processing",
)
# Verify cloud handler was checked but not used for retrieval
mock_handler.is_cloud_path.assert_called_once_with(data_uri)
mock_handler.retrieve_file.assert_not_called()
# Verify normal data URI processing occurred
mock_b64decode.assert_called_once()
mock_resolved_path.write_bytes.assert_called_once_with(b"Hello world")
@pytest.mark.asyncio
async def test_store_media_file_cloud_retrieval_error(self):
"""Test error handling when cloud retrieval fails."""
graph_exec_id = "test-exec-123"
cloud_path = "gcs://test-bucket/nonexistent.txt"
with patch(
"backend.util.file.get_cloud_storage_handler"
) as mock_handler_getter:
# Mock cloud storage handler to raise error
mock_handler = AsyncMock()
mock_handler.is_cloud_path.return_value = True
mock_handler.retrieve_file.side_effect = FileNotFoundError(
"File not found in cloud storage"
)
mock_handler_getter.return_value = mock_handler
with pytest.raises(
FileNotFoundError, match="File not found in cloud storage"
):
await store_media_file(
file=MediaFileType(cloud_path),
execution_context=make_test_context(graph_exec_id=graph_exec_id),
return_format="for_local_processing",
)
@pytest.mark.asyncio
async def test_store_media_file_local_path_scanned(self):
"""Test that local file paths are scanned for viruses."""
graph_exec_id = "test-exec-123"
local_file = "test_video.mp4"
file_content = b"fake video content"
with patch(
"backend.util.file.get_cloud_storage_handler"
) as mock_handler_getter, patch(
"backend.util.file.scan_content_safe"
) as mock_scan, patch(
"backend.util.file.Path"
) as mock_path_class:
# Mock cloud storage handler - not a cloud path
mock_handler = MagicMock()
mock_handler.is_cloud_path.return_value = False
mock_handler_getter.return_value = mock_handler
# Mock virus scanner
mock_scan.return_value = None
# Mock file system operations
mock_base_path = MagicMock()
mock_target_path = MagicMock()
mock_resolved_path = MagicMock()
mock_path_class.return_value = mock_base_path
mock_base_path.mkdir = MagicMock()
mock_base_path.__truediv__ = MagicMock(return_value=mock_target_path)
mock_target_path.resolve.return_value = mock_resolved_path
mock_resolved_path.is_relative_to.return_value = True
mock_resolved_path.is_file.return_value = True
mock_resolved_path.read_bytes.return_value = file_content
mock_resolved_path.relative_to.return_value = Path(local_file)
mock_resolved_path.name = local_file
result = await store_media_file(
file=MediaFileType(local_file),
execution_context=make_test_context(graph_exec_id=graph_exec_id),
return_format="for_local_processing",
)
# Verify virus scan was called for local file
mock_scan.assert_called_once_with(file_content, filename=local_file)
# Result should be the relative path
assert str(result) == local_file
@pytest.mark.asyncio
async def test_store_media_file_local_path_virus_detected(self):
"""Test that infected local files raise VirusDetectedError."""
from backend.api.features.store.exceptions import VirusDetectedError
graph_exec_id = "test-exec-123"
local_file = "infected.exe"
file_content = b"malicious content"
with patch(
"backend.util.file.get_cloud_storage_handler"
) as mock_handler_getter, patch(
"backend.util.file.scan_content_safe"
) as mock_scan, patch(
"backend.util.file.Path"
) as mock_path_class:
# Mock cloud storage handler - not a cloud path
mock_handler = MagicMock()
mock_handler.is_cloud_path.return_value = False
mock_handler_getter.return_value = mock_handler
# Mock virus scanner to detect virus
mock_scan.side_effect = VirusDetectedError(
"EICAR-Test-File", "File rejected due to virus detection"
)
# Mock file system operations
mock_base_path = MagicMock()
mock_target_path = MagicMock()
mock_resolved_path = MagicMock()
mock_path_class.return_value = mock_base_path
mock_base_path.mkdir = MagicMock()
mock_base_path.__truediv__ = MagicMock(return_value=mock_target_path)
mock_target_path.resolve.return_value = mock_resolved_path
mock_resolved_path.is_relative_to.return_value = True
mock_resolved_path.is_file.return_value = True
mock_resolved_path.read_bytes.return_value = file_content
with pytest.raises(VirusDetectedError):
await store_media_file(
file=MediaFileType(local_file),
execution_context=make_test_context(graph_exec_id=graph_exec_id),
return_format="for_local_processing",
)
# ---------------------------------------------------------------------------
# is_media_file_ref
# ---------------------------------------------------------------------------
class TestIsMediaFileRef:
def test_data_uri(self):
assert is_media_file_ref("data:image/png;base64,iVBORw0KGg==") is True
def test_workspace_uri(self):
assert is_media_file_ref("workspace://abc123") is True
def test_workspace_uri_with_mime(self):
assert is_media_file_ref("workspace://abc123#image/png") is True
def test_http_url(self):
assert is_media_file_ref("http://example.com/image.png") is True
def test_https_url(self):
assert is_media_file_ref("https://example.com/image.png") is True
def test_plain_text(self):
assert is_media_file_ref("print('hello')") is False
def test_local_path(self):
assert is_media_file_ref("/tmp/file.txt") is False
def test_empty_string(self):
assert is_media_file_ref("") is False
def test_filename(self):
assert is_media_file_ref("image.png") is False
# ---------------------------------------------------------------------------
# parse_data_uri
# ---------------------------------------------------------------------------
class TestParseDataUri:
def test_valid_png(self):
result = parse_data_uri("data:image/png;base64,iVBORw0KGg==")
assert result is not None
mime, payload = result
assert mime == "image/png"
assert payload == "iVBORw0KGg=="
def test_valid_text(self):
result = parse_data_uri("data:text/plain;base64,SGVsbG8=")
assert result is not None
assert result[0] == "text/plain"
assert result[1] == "SGVsbG8="
def test_mime_case_normalized(self):
result = parse_data_uri("data:IMAGE/PNG;base64,abc")
assert result is not None
assert result[0] == "image/png"
def test_not_data_uri(self):
assert parse_data_uri("workspace://abc123") is None
def test_plain_text(self):
assert parse_data_uri("hello world") is None
def test_missing_base64(self):
assert parse_data_uri("data:image/png;utf-8,abc") is None
def test_empty_payload(self):
result = parse_data_uri("data:image/png;base64,")
assert result is not None
assert result[1] == ""
# ---------------------------------------------------------------------------
# resolve_media_content
# ---------------------------------------------------------------------------
class TestResolveMediaContent:
@pytest.mark.asyncio
async def test_plain_text_passthrough(self):
"""Plain text content (not a media ref) passes through unchanged."""
ctx = make_test_context()
result = await resolve_media_content(
MediaFileType("print('hello')"),
ctx,
return_format="for_external_api",
)
assert result == "print('hello')"
@pytest.mark.asyncio
async def test_empty_string_passthrough(self):
"""Empty string passes through unchanged."""
ctx = make_test_context()
result = await resolve_media_content(
MediaFileType(""),
ctx,
return_format="for_external_api",
)
assert result == ""
@pytest.mark.asyncio
async def test_media_ref_delegates_to_store(self):
"""Media references are resolved via store_media_file."""
ctx = make_test_context()
with patch(
"backend.util.file.store_media_file",
new=AsyncMock(return_value=MediaFileType("data:image/png;base64,abc")),
) as mock_store:
result = await resolve_media_content(
MediaFileType("workspace://img123"),
ctx,
return_format="for_external_api",
)
assert result == "data:image/png;base64,abc"
mock_store.assert_called_once_with(
MediaFileType("workspace://img123"),
ctx,
return_format="for_external_api",
)
@pytest.mark.asyncio
async def test_data_uri_delegates_to_store(self):
"""Data URIs are also resolved via store_media_file."""
ctx = make_test_context()
data_uri = "data:image/png;base64,iVBORw0KGg=="
with patch(
"backend.util.file.store_media_file",
new=AsyncMock(return_value=MediaFileType(data_uri)),
) as mock_store:
result = await resolve_media_content(
MediaFileType(data_uri),
ctx,
return_format="for_external_api",
)
assert result == data_uri
mock_store.assert_called_once()
@pytest.mark.asyncio
async def test_https_url_delegates_to_store(self):
"""HTTPS URLs are resolved via store_media_file."""
ctx = make_test_context()
with patch(
"backend.util.file.store_media_file",
new=AsyncMock(return_value=MediaFileType("data:image/png;base64,abc")),
) as mock_store:
result = await resolve_media_content(
MediaFileType("https://example.com/image.png"),
ctx,
return_format="for_local_processing",
)
assert result == "data:image/png;base64,abc"
mock_store.assert_called_once_with(
MediaFileType("https://example.com/image.png"),
ctx,
return_format="for_local_processing",
)