mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
The `@@agptfile:` expansion system previously used content-sniffing (trying `json.loads` then `csv.Sniffer`) to decide whether to parse file content as structured data. This was fragile — a file containing just `"42"` would be parsed as an integer, and the heuristics could misfire on ambiguous content. This PR replaces content-sniffing with **extension/MIME-based format detection**. When the file has a well-known extension (`.json`, `.csv`, etc.) or MIME type fragment (`workspace://id#application/json`), the content is parsed accordingly. Unknown formats or parse failures always fall back to plain string — no surprises. > [!NOTE] > This PR builds on the `@@agptfile:` file reference protocol introduced in #12332 and the structured data auto-parsing added in #12390. > > **What is `@@agptfile:`?** > It is a special URI prefix (e.g. `@@agptfile:workspace:///report.csv`) that the CoPilot SDK expands inline before sending tool arguments to blocks. This lets the AI reference workspace files by name, and the SDK automatically reads and injects the file content. See #12332 for the full design. 
### Changes 🏗️ **New utility: `backend/util/file_content_parser.py`** - `infer_format(uri)` — determines format from file extension or MIME fragment - `parse_file_content(content, fmt)` — parses content, never raises - Supported text formats: JSON, JSONL/NDJSON, CSV, TSV, YAML, TOML - Supported binary formats: Parquet (via pyarrow), Excel/XLSX (via openpyxl) - JSON scalars (strings, numbers, booleans, null) stay as strings — only containers (arrays, objects) are promoted - CSV/TSV require ≥1 row and ≥2 columns to qualify as tabular data - Added `openpyxl` dependency for Excel reading via pandas - Case-insensitive MIME fragment matching per RFC 2045 - Shared `PARSE_EXCEPTIONS` constant to avoid duplication between modules **Updated `expand_file_refs_in_args` in `file_ref.py`** - Bare refs now use `infer_format` + `parse_file_content` instead of the old `_try_parse_structured` content-sniffing function - Binary formats (parquet, xlsx) read raw bytes via `read_file_bytes` - Embedded refs (text around `@@agptfile:`) still produce plain strings - **Size guards**: Workspace and sandbox file reads now enforce a 10 MB limit (matching the existing local file limit) to prevent OOM on large files **Updated `blocks/github/commits.py`** - Consolidated `_create_blob` and `_create_binary_blob` into a single function with an `encoding` parameter **Updated copilot system prompt** - Documents the extension-based structured data parsing and supported formats **66 new tests** in `file_content_parser_test.py` covering: - Format inference (extension, MIME, case-insensitive, precedence) - All 8 format parsers (happy path + edge cases + fallbacks) - Binary format handling (string input fallback, invalid bytes fallback) - Unknown format passthrough ### Checklist 📋 #### For code changes: - [x] I have clearly listed my changes in the PR description - [x] I have made a test plan - [x] I have tested my changes according to the test plan: - [x] All 66 file_content_parser_test.py tests pass - 
[x] All 31 file_ref_test.py tests pass - [x] All 13 file_ref_integration_test.py tests pass - [x] `poetry run format` passes clean (including pyright)
129 lines
4.5 KiB
Python
129 lines
4.5 KiB
Python
"""Shared execution context for copilot SDK tool handlers.
|
|
|
|
All context variables and their accessors live here so that
|
|
``tool_adapter``, ``file_ref``, and ``e2b_file_tools`` can import them
|
|
without creating circular dependencies.
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
from contextvars import ContextVar
|
|
from typing import TYPE_CHECKING
|
|
|
|
from backend.copilot.model import ChatSession
|
|
from backend.data.db_accessors import workspace_db
|
|
from backend.util.workspace import WorkspaceManager
|
|
|
|
if TYPE_CHECKING:
|
|
from e2b import AsyncSandbox
|
|
|
|
# Allowed base directory for the Read tool.
_SDK_PROJECTS_DIR = os.path.realpath(os.path.expanduser("~/.claude/projects"))

# Encoded project-directory name for the current session (e.g.
# "-private-tmp-copilot-<uuid>"). Set by set_execution_context() so path
# validation can scope tool-results reads to the current session.
_current_project_dir: ContextVar[str] = ContextVar("_current_project_dir", default="")

# Per-request identity of the acting user; None when no user is bound.
# NOTE(review): ContextVar debug names are inconsistently underscored
# ("current_user_id" here vs "_current_sandbox" below) — cosmetic only,
# but worth normalising in a follow-up since the strings are runtime values.
_current_user_id: ContextVar[str | None] = ContextVar("current_user_id", default=None)
# Active chat session for the current turn, set by set_execution_context().
_current_session: ContextVar[ChatSession | None] = ContextVar(
    "current_session", default=None
)
# E2B sandbox handle for the current session; None when no sandbox is active.
_current_sandbox: ContextVar["AsyncSandbox | None"] = ContextVar(
    "_current_sandbox", default=None
)
# SDK working directory for the current session; empty string when unset.
_current_sdk_cwd: ContextVar[str] = ContextVar("_current_sdk_cwd", default="")
|
|
|
|
|
|
def _encode_cwd_for_cli(cwd: str) -> str:
|
|
"""Encode a working directory path the same way the Claude CLI does."""
|
|
return re.sub(r"[^a-zA-Z0-9]", "-", os.path.realpath(cwd))
|
|
|
|
|
|
def set_execution_context(
    user_id: str | None,
    session: ChatSession,
    sandbox: "AsyncSandbox | None" = None,
    sdk_cwd: str | None = None,
) -> None:
    """Bind the per-turn context variables used by file-resolution tool handlers.

    Also derives and stores the CLI-encoded project-directory name from
    *sdk_cwd* so path validation can scope reads to the current session.
    """
    cwd = sdk_cwd or ""
    _current_user_id.set(user_id)
    _current_session.set(session)
    _current_sandbox.set(sandbox)
    _current_sdk_cwd.set(cwd)
    # An empty cwd means "no project directory" rather than encoding "".
    _current_project_dir.set(_encode_cwd_for_cli(cwd) if cwd else "")
|
|
|
|
|
|
def get_execution_context() -> tuple[str | None, ChatSession | None]:
    """Return the current (user_id, session) pair for the active request."""
    user_id = _current_user_id.get()
    session = _current_session.get()
    return user_id, session
|
|
|
|
|
|
def get_current_sandbox() -> "AsyncSandbox | None":
    """Return the E2B sandbox bound to this session, or None when inactive."""
    sandbox = _current_sandbox.get()
    return sandbox
|
|
|
|
|
|
def get_sdk_cwd() -> str:
    """Return the SDK working directory for the current session.

    Yields the empty string when no working directory has been bound.
    """
    cwd = _current_sdk_cwd.get()
    return cwd
|
|
|
|
|
|
E2B_WORKDIR = "/home/user"


def resolve_sandbox_path(path: str) -> str:
    """Normalise *path* to an absolute sandbox path under ``/home/user``.

    Relative paths are anchored at :data:`E2B_WORKDIR`; the joined path is
    collapsed with :func:`os.path.normpath` so ``..`` segments cannot be
    used to climb out of the sandbox root.

    Raises :class:`ValueError` if the resolved path escapes the sandbox.
    """
    if os.path.isabs(path):
        absolute = path
    else:
        absolute = os.path.join(E2B_WORKDIR, path)
    collapsed = os.path.normpath(absolute)
    # Accept the workdir itself or any strict descendant; the "/" suffix
    # check rejects siblings such as "/home/userx".
    inside = collapsed == E2B_WORKDIR or collapsed.startswith(E2B_WORKDIR + "/")
    if not inside:
        raise ValueError(f"Path must be within {E2B_WORKDIR}: {path}")
    return collapsed
|
|
|
|
|
|
async def get_workspace_manager(user_id: str, session_id: str) -> WorkspaceManager:
    """Build a session-scoped :class:`WorkspaceManager`.

    Lives here (rather than in ``tools/workspace_files``) so that modules
    such as ``sdk/file_ref`` can import it without pulling in the heavy
    ``tools/__init__`` import chain.
    """
    accessor = workspace_db()
    workspace = await accessor.get_or_create_workspace(user_id)
    return WorkspaceManager(user_id, workspace.id, session_id)
|
|
|
|
|
|
def is_allowed_local_path(path: str, sdk_cwd: str | None = None) -> bool:
    """Return True if *path* points inside an allowed host-filesystem location.

    Allowed locations:
      - files under *sdk_cwd* (``/tmp/copilot-<session>/``)
      - files under ``~/.claude/projects/<encoded-cwd>/tool-results/``
        (SDK tool-results for the current session)
    """
    if not path:
        return False

    # Canonicalise first. Branch order matters: "~" expansion wins over
    # sdk_cwd-relative resolution, which wins over plain realpath.
    if path.startswith("~"):
        resolved = os.path.realpath(os.path.expanduser(path))
    elif sdk_cwd and not os.path.isabs(path):
        resolved = os.path.realpath(os.path.join(sdk_cwd, path))
    else:
        resolved = os.path.realpath(path)

    def _within(base: str) -> bool:
        # Exact match, or a strict descendant of *base* (the os.sep suffix
        # rejects sibling directories with a shared prefix).
        return resolved == base or resolved.startswith(base + os.sep)

    if sdk_cwd and _within(os.path.realpath(sdk_cwd)):
        return True

    encoded = _current_project_dir.get("")
    if encoded and _within(
        os.path.join(_SDK_PROJECTS_DIR, encoded, "tool-results")
    ):
        return True

    return False
|