mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-02-18 02:32:04 -05:00
Uncouple Copilot task execution from the REST API server. This should help performance and scalability, and allows task execution to continue regardless of the state of the user's connection. - Resolves #12023 ### Changes 🏗️ - Add `backend.copilot.executor`->`CoPilotExecutor` (setup similar to `backend.executor`->`ExecutionManager`). This executor service uses RabbitMQ-based task distribution, and sticks with the existing Redis Streams setup for task output. It uses a cluster lock mechanism to ensure a task is only executed by one pod, and the `DatabaseManager` for pooled DB access. - Add `backend.data.db_accessors` for automatic choice of direct/proxied DB access Chat requests now flow: API → RabbitMQ → CoPilot Executor → Redis Streams → SSE Client. This enables horizontal scaling of chat processing and isolates long-running LLM operations from the API service. - Move non-API Copilot stuff into `backend.copilot` (from `backend.api.features.chat`) - Updated import paths for all usages - Move `backend.executor.database` to `backend.data.db_manager` and add methods for copilot executor - Updated import paths for all usages - Make `backend.copilot.db` RPC-compatible (-> DB ops return ~~Prisma~~ Pydantic models) - Make `backend.data.workspace` RPC-compatible - Make `backend.data.graphs.get_store_listed_graphs` RPC-compatible DX: - Add `copilot_executor` service to Docker setup Config: - Add `Config.num_copilot_workers` (default 5) and `Config.copilot_executor_port` (default 8008) - Remove unused `Config.agent_server_port` > [!WARNING] > **This change adds a new microservice to the system, with entrypoint `backend.copilot.executor`.** > The `docker compose` setup has been updated, but if you run the Platform on something else, you'll have to update your deployment config to include this new service. > > When running locally, the `CoPilotExecutor` uses port 8008 by default. ### Checklist 📋 #### For code changes: - [x] I have clearly listed my changes in the PR description - [x] I have made a test plan - [x] I have tested my changes according to the test plan: - [x] Copilot works - [x] Processes messages when triggered - [x] Can use its tools #### For configuration changes: - [x] `.env.default` is updated or already compatible with my changes - [x] `docker-compose.yml` is updated or already compatible with my changes - [x] I have included a list of my configuration changes in the PR description (under **Changes**) --------- Co-authored-by: Zamil Majdy <zamil.majdy@agpt.co>
434 lines
17 KiB
Python
434 lines
17 KiB
Python
import base64
|
|
import mimetypes
|
|
import re
|
|
import shutil
|
|
import tempfile
|
|
import uuid
|
|
from pathlib import Path
|
|
from typing import TYPE_CHECKING, Literal
|
|
from urllib.parse import urlparse
|
|
|
|
from pydantic import BaseModel
|
|
|
|
from backend.util.cloud_storage import get_cloud_storage_handler
|
|
from backend.util.request import Requests
|
|
from backend.util.settings import Config
|
|
from backend.util.type import MediaFileType
|
|
from backend.util.virus_scanner import scan_content_safe
|
|
|
|
if TYPE_CHECKING:
|
|
from backend.data.execution import ExecutionContext
|
|
|
|
|
|
class WorkspaceUri(BaseModel):
|
|
"""Parsed workspace:// URI."""
|
|
|
|
file_ref: str # File ID or path (e.g. "abc123" or "/path/to/file.txt")
|
|
mime_type: str | None = None # MIME type from fragment (e.g. "video/mp4")
|
|
is_path: bool = False # True if file_ref is a path (starts with "/")
|
|
|
|
|
|
def parse_workspace_uri(uri: str) -> WorkspaceUri:
|
|
"""Parse a workspace:// URI into its components.
|
|
|
|
Examples:
|
|
"workspace://abc123" → WorkspaceUri(file_ref="abc123", mime_type=None, is_path=False)
|
|
"workspace://abc123#video/mp4" → WorkspaceUri(file_ref="abc123", mime_type="video/mp4", is_path=False)
|
|
"workspace:///path/to/file.txt" → WorkspaceUri(file_ref="/path/to/file.txt", mime_type=None, is_path=True)
|
|
"""
|
|
raw = uri.removeprefix("workspace://")
|
|
mime_type: str | None = None
|
|
if "#" in raw:
|
|
raw, fragment = raw.split("#", 1)
|
|
mime_type = fragment or None
|
|
return WorkspaceUri(
|
|
file_ref=raw,
|
|
mime_type=mime_type,
|
|
is_path=raw.startswith("/"),
|
|
)
|
|
|
|
|
|
# Return format options for store_media_file
|
|
# - "for_local_processing": Returns local file path - use with ffmpeg, MoviePy, PIL, etc.
|
|
# - "for_external_api": Returns data URI (base64) - use when sending content to external APIs
|
|
# - "for_block_output": Returns best format for output - workspace:// in CoPilot, data URI in graphs
|
|
MediaReturnFormat = Literal[
|
|
"for_local_processing", "for_external_api", "for_block_output"
|
|
]
|
|
|
|
TEMP_DIR = Path(tempfile.gettempdir()).resolve()
|
|
|
|
# Maximum filename length (conservative limit for most filesystems)
|
|
MAX_FILENAME_LENGTH = 200
|
|
|
|
|
|
def sanitize_filename(filename: str) -> str:
|
|
"""
|
|
Sanitize and truncate filename to prevent filesystem errors.
|
|
"""
|
|
# Remove or replace invalid characters
|
|
sanitized = re.sub(r'[<>:"/\\|?*\n\r\t]', "_", filename)
|
|
|
|
# Truncate if too long
|
|
if len(sanitized) > MAX_FILENAME_LENGTH:
|
|
# Keep the extension if possible
|
|
if "." in sanitized:
|
|
name, ext = sanitized.rsplit(".", 1)
|
|
max_name_length = MAX_FILENAME_LENGTH - len(ext) - 1
|
|
sanitized = name[:max_name_length] + "." + ext
|
|
else:
|
|
sanitized = sanitized[:MAX_FILENAME_LENGTH]
|
|
|
|
# Ensure it's not empty or just dots
|
|
if not sanitized or sanitized.strip(".") == "":
|
|
sanitized = f"file_{uuid.uuid4().hex[:8]}"
|
|
|
|
return sanitized
|
|
|
|
|
|
def get_exec_file_path(graph_exec_id: str, path: str) -> str:
|
|
"""
|
|
Utility to build an absolute path in the {temp}/exec_file/{exec_id}/... folder.
|
|
"""
|
|
try:
|
|
full_path = TEMP_DIR / "exec_file" / graph_exec_id / path
|
|
return str(full_path)
|
|
except OSError as e:
|
|
if "File name too long" in str(e):
|
|
raise ValueError(
|
|
f"File path too long: {len(path)} characters. Maximum path length exceeded."
|
|
) from e
|
|
raise ValueError(f"Invalid file path: {e}") from e
|
|
|
|
|
|
def clean_exec_files(graph_exec_id: str, file: str = "") -> None:
|
|
"""
|
|
Utility to remove the {temp}/exec_file/{exec_id} folder and its contents.
|
|
"""
|
|
exec_path = Path(get_exec_file_path(graph_exec_id, file))
|
|
if exec_path.exists() and exec_path.is_dir():
|
|
shutil.rmtree(exec_path)
|
|
|
|
|
|
async def store_media_file(
|
|
file: MediaFileType,
|
|
execution_context: "ExecutionContext",
|
|
*,
|
|
return_format: MediaReturnFormat,
|
|
) -> MediaFileType:
|
|
"""
|
|
Safely handle 'file' (a data URI, a URL, a workspace:// reference, or a local path
|
|
relative to {temp}/exec_file/{exec_id}), placing or verifying it under:
|
|
{tempdir}/exec_file/{exec_id}/...
|
|
|
|
For each MediaFileType input:
|
|
- Data URI: decode and store locally
|
|
- URL: download and store locally
|
|
- workspace:// reference: read from workspace, store locally
|
|
- Local path: verify it exists in exec_file directory
|
|
|
|
Return format options:
|
|
- "for_local_processing": Returns local file path - use with ffmpeg, MoviePy, PIL, etc.
|
|
- "for_external_api": Returns data URI (base64) - use when sending to external APIs
|
|
- "for_block_output": Returns best format for output - workspace:// in CoPilot, data URI in graphs
|
|
|
|
:param file: Data URI, URL, workspace://, or local (relative) path.
|
|
:param execution_context: ExecutionContext with user_id, graph_exec_id, workspace_id.
|
|
:param return_format: What to return: "for_local_processing", "for_external_api", or "for_block_output".
|
|
:return: The requested result based on return_format.
|
|
"""
|
|
# Extract values from execution_context
|
|
graph_exec_id = execution_context.graph_exec_id
|
|
user_id = execution_context.user_id
|
|
|
|
if not graph_exec_id:
|
|
raise ValueError("execution_context.graph_exec_id is required")
|
|
if not user_id:
|
|
raise ValueError("execution_context.user_id is required")
|
|
|
|
# Create workspace_manager if we have workspace_id (with session scoping)
|
|
# Import here to avoid circular import (file.py → workspace.py → data → blocks → file.py)
|
|
from backend.util.workspace import WorkspaceManager
|
|
|
|
workspace_manager: WorkspaceManager | None = None
|
|
if execution_context.workspace_id:
|
|
workspace_manager = WorkspaceManager(
|
|
user_id, execution_context.workspace_id, execution_context.session_id
|
|
)
|
|
# Build base path
|
|
base_path = Path(get_exec_file_path(graph_exec_id, ""))
|
|
base_path.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Security fix: Add disk space limits to prevent DoS
|
|
MAX_FILE_SIZE_BYTES = Config().max_file_size_mb * 1024 * 1024
|
|
MAX_TOTAL_DISK_USAGE = 1024 * 1024 * 1024 # 1GB total per execution directory
|
|
|
|
# Check total disk usage in base_path
|
|
if base_path.exists():
|
|
current_usage = get_dir_size(base_path)
|
|
if current_usage > MAX_TOTAL_DISK_USAGE:
|
|
raise ValueError(
|
|
f"Disk usage limit exceeded: {current_usage} bytes > {MAX_TOTAL_DISK_USAGE} bytes"
|
|
)
|
|
|
|
# Helper functions
|
|
def _extension_from_mime(mime: str) -> str:
|
|
ext = mimetypes.guess_extension(mime, strict=False)
|
|
return ext if ext else ".bin"
|
|
|
|
def _file_to_data_uri(path: Path) -> str:
|
|
mime_type = get_mime_type(str(path))
|
|
b64 = base64.b64encode(path.read_bytes()).decode("utf-8")
|
|
return f"data:{mime_type};base64,{b64}"
|
|
|
|
def _ensure_inside_base(path_candidate: Path, base: Path) -> Path:
|
|
"""
|
|
Resolve symlinks via resolve() and ensure the result is still under base.
|
|
"""
|
|
real_candidate = path_candidate.resolve()
|
|
real_base = base.resolve()
|
|
|
|
if not real_candidate.is_relative_to(real_base):
|
|
raise ValueError(
|
|
"Local file path is outside the temp_base directory. Access denied."
|
|
)
|
|
return real_candidate
|
|
|
|
def _strip_base_prefix(absolute_path: Path, base: Path) -> str:
|
|
"""
|
|
Strip base prefix and normalize path.
|
|
"""
|
|
return str(absolute_path.relative_to(base))
|
|
|
|
# Get cloud storage handler for checking cloud paths
|
|
cloud_storage = await get_cloud_storage_handler()
|
|
|
|
# Track if the input came from workspace (don't re-save it)
|
|
is_from_workspace = file.startswith("workspace://")
|
|
|
|
# Check if this is a workspace file reference
|
|
if is_from_workspace:
|
|
if workspace_manager is None:
|
|
raise ValueError(
|
|
"Workspace file reference requires workspace context. "
|
|
"This file type is only available in CoPilot sessions."
|
|
)
|
|
|
|
# Parse workspace reference (strips #mimeType fragment from file ID)
|
|
ws = parse_workspace_uri(file)
|
|
|
|
if ws.is_path:
|
|
# Path reference: workspace:///path/to/file.txt
|
|
workspace_content = await workspace_manager.read_file(ws.file_ref)
|
|
file_info = await workspace_manager.get_file_info_by_path(ws.file_ref)
|
|
filename = sanitize_filename(
|
|
file_info.name if file_info else f"{uuid.uuid4()}.bin"
|
|
)
|
|
else:
|
|
# ID reference: workspace://abc123 or workspace://abc123#video/mp4
|
|
workspace_content = await workspace_manager.read_file_by_id(ws.file_ref)
|
|
file_info = await workspace_manager.get_file_info(ws.file_ref)
|
|
filename = sanitize_filename(
|
|
file_info.name if file_info else f"{uuid.uuid4()}.bin"
|
|
)
|
|
|
|
try:
|
|
target_path = _ensure_inside_base(base_path / filename, base_path)
|
|
except OSError as e:
|
|
raise ValueError(f"Invalid file path '{filename}': {e}") from e
|
|
|
|
# Check file size limit
|
|
if len(workspace_content) > MAX_FILE_SIZE_BYTES:
|
|
raise ValueError(
|
|
f"File too large: {len(workspace_content)} bytes > {MAX_FILE_SIZE_BYTES} bytes"
|
|
)
|
|
|
|
# Virus scan the workspace content before writing locally
|
|
await scan_content_safe(workspace_content, filename=filename)
|
|
target_path.write_bytes(workspace_content)
|
|
|
|
# Check if this is a cloud storage path
|
|
elif cloud_storage.is_cloud_path(file):
|
|
# Download from cloud storage and store locally
|
|
cloud_content = await cloud_storage.retrieve_file(
|
|
file, user_id=user_id, graph_exec_id=graph_exec_id
|
|
)
|
|
|
|
# Generate filename from cloud path
|
|
_, path_part = cloud_storage.parse_cloud_path(file)
|
|
filename = sanitize_filename(Path(path_part).name or f"{uuid.uuid4()}.bin")
|
|
try:
|
|
target_path = _ensure_inside_base(base_path / filename, base_path)
|
|
except OSError as e:
|
|
raise ValueError(f"Invalid file path '{filename}': {e}") from e
|
|
|
|
# Check file size limit
|
|
if len(cloud_content) > MAX_FILE_SIZE_BYTES:
|
|
raise ValueError(
|
|
f"File too large: {len(cloud_content)} bytes > {MAX_FILE_SIZE_BYTES} bytes"
|
|
)
|
|
|
|
# Virus scan the cloud content before writing locally
|
|
await scan_content_safe(cloud_content, filename=filename)
|
|
target_path.write_bytes(cloud_content)
|
|
|
|
# Process file
|
|
elif file.startswith("data:"):
|
|
# Data URI
|
|
match = re.match(r"^data:([^;]+);base64,(.*)$", file, re.DOTALL)
|
|
if not match:
|
|
raise ValueError(
|
|
"Invalid data URI format. Expected data:<mime>;base64,<data>"
|
|
)
|
|
mime_type = match.group(1).strip().lower()
|
|
b64_content = match.group(2).strip()
|
|
|
|
# Generate filename and decode
|
|
extension = _extension_from_mime(mime_type)
|
|
filename = f"{uuid.uuid4()}{extension}"
|
|
try:
|
|
target_path = _ensure_inside_base(base_path / filename, base_path)
|
|
except OSError as e:
|
|
raise ValueError(f"Invalid file path '{filename}': {e}") from e
|
|
content = base64.b64decode(b64_content)
|
|
|
|
# Check file size limit
|
|
if len(content) > MAX_FILE_SIZE_BYTES:
|
|
raise ValueError(
|
|
f"File too large: {len(content)} bytes > {MAX_FILE_SIZE_BYTES} bytes"
|
|
)
|
|
|
|
# Virus scan the base64 content before writing
|
|
await scan_content_safe(content, filename=filename)
|
|
target_path.write_bytes(content)
|
|
|
|
elif file.startswith(("http://", "https://")):
|
|
# URL - download first to get Content-Type header
|
|
resp = await Requests().get(file)
|
|
|
|
# Check file size limit
|
|
if len(resp.content) > MAX_FILE_SIZE_BYTES:
|
|
raise ValueError(
|
|
f"File too large: {len(resp.content)} bytes > {MAX_FILE_SIZE_BYTES} bytes"
|
|
)
|
|
|
|
# Extract filename from URL path
|
|
parsed_url = urlparse(file)
|
|
filename = sanitize_filename(Path(parsed_url.path).name or f"{uuid.uuid4()}")
|
|
|
|
# If filename lacks extension, add one from Content-Type header
|
|
if "." not in filename:
|
|
content_type = resp.headers.get("Content-Type", "").split(";")[0].strip()
|
|
if content_type:
|
|
ext = _extension_from_mime(content_type)
|
|
filename = f"{filename}{ext}"
|
|
|
|
try:
|
|
target_path = _ensure_inside_base(base_path / filename, base_path)
|
|
except OSError as e:
|
|
raise ValueError(f"Invalid file path '{filename}': {e}") from e
|
|
|
|
# Virus scan the downloaded content before writing
|
|
await scan_content_safe(resp.content, filename=filename)
|
|
target_path.write_bytes(resp.content)
|
|
|
|
else:
|
|
# Local path - sanitize the filename part to prevent long filename errors
|
|
sanitized_file = sanitize_filename(file)
|
|
try:
|
|
target_path = _ensure_inside_base(base_path / sanitized_file, base_path)
|
|
except OSError as e:
|
|
raise ValueError(f"Invalid file path '{sanitized_file}': {e}") from e
|
|
if not target_path.is_file():
|
|
raise ValueError(f"Local file does not exist: {target_path}")
|
|
|
|
# Virus scan the local file before any further processing
|
|
local_content = target_path.read_bytes()
|
|
if len(local_content) > MAX_FILE_SIZE_BYTES:
|
|
raise ValueError(
|
|
f"File too large: {len(local_content)} bytes > {MAX_FILE_SIZE_BYTES} bytes"
|
|
)
|
|
await scan_content_safe(local_content, filename=sanitized_file)
|
|
|
|
# Return based on requested format
|
|
if return_format == "for_local_processing":
|
|
# Use when processing files locally with tools like ffmpeg, MoviePy, PIL
|
|
# Returns: relative path in exec_file directory (e.g., "image.png")
|
|
return MediaFileType(_strip_base_prefix(target_path, base_path))
|
|
|
|
elif return_format == "for_external_api":
|
|
# Use when sending content to external APIs that need base64
|
|
# Returns: data URI (e.g., "...")
|
|
return MediaFileType(_file_to_data_uri(target_path))
|
|
|
|
elif return_format == "for_block_output":
|
|
# Use when returning output from a block to user/next block
|
|
# Returns: workspace:// ref (CoPilot) or data URI (graph execution)
|
|
if workspace_manager is None:
|
|
# No workspace available (graph execution without CoPilot)
|
|
# Fallback to data URI so the content can still be used/displayed
|
|
return MediaFileType(_file_to_data_uri(target_path))
|
|
|
|
# Don't re-save if input was already from workspace
|
|
if is_from_workspace:
|
|
# Return original workspace reference, ensuring MIME type fragment
|
|
ws = parse_workspace_uri(file)
|
|
if not ws.mime_type:
|
|
# Add MIME type fragment if missing (older refs without it)
|
|
try:
|
|
if ws.is_path:
|
|
info = await workspace_manager.get_file_info_by_path(
|
|
ws.file_ref
|
|
)
|
|
else:
|
|
info = await workspace_manager.get_file_info(ws.file_ref)
|
|
if info:
|
|
return MediaFileType(f"{file}#{info.mime_type}")
|
|
except Exception:
|
|
pass
|
|
return MediaFileType(file)
|
|
|
|
# Save new content to workspace
|
|
content = target_path.read_bytes()
|
|
filename = target_path.name
|
|
|
|
file_record = await workspace_manager.write_file(
|
|
content=content,
|
|
filename=filename,
|
|
overwrite=True,
|
|
)
|
|
return MediaFileType(f"workspace://{file_record.id}#{file_record.mime_type}")
|
|
|
|
else:
|
|
raise ValueError(f"Invalid return_format: {return_format}")
|
|
|
|
|
|
def get_dir_size(path: Path) -> int:
|
|
"""Get total size of directory."""
|
|
total = 0
|
|
try:
|
|
for entry in path.glob("**/*"):
|
|
if entry.is_file():
|
|
total += entry.stat().st_size
|
|
except Exception:
|
|
pass
|
|
return total
|
|
|
|
|
|
def get_mime_type(file: str) -> str:
|
|
"""
|
|
Get the MIME type of a file, whether it's a data URI, URL, or local path.
|
|
"""
|
|
if file.startswith("data:"):
|
|
match = re.match(r"^data:([^;]+);base64,", file)
|
|
return match.group(1) if match else "application/octet-stream"
|
|
|
|
elif file.startswith(("http://", "https://")):
|
|
parsed_url = urlparse(file)
|
|
mime_type, _ = mimetypes.guess_type(parsed_url.path)
|
|
return mime_type or "application/octet-stream"
|
|
|
|
else:
|
|
mime_type, _ = mimetypes.guess_type(file)
|
|
return mime_type or "application/octet-stream"
|