docs: Add workspace and media file architecture documentation (#11989)

### Changes 🏗️ - Added comprehensive architecture documentation at `docs/platform/workspace-media-architecture.md` covering: - Database models (`UserWorkspace`, `UserWorkspaceFile`) - `WorkspaceManager` API with session scoping - `store_media_file()` media normalization pipeline (input types, return formats) - Virus scanning responsibility boundaries - Decision tree for choosing `WorkspaceManager` vs `store_media_file()` - Configuration reference including `clamav_max_concurrency` and `clamav_mark_failed_scans_as_clean` - Common patterns with error handling examples - Updated `autogpt_platform/backend/CLAUDE.md` with a "Workspace & Media Files" section referencing the new docs - Removed duplicate `scan_content_safe()` call from `WriteWorkspaceFileTool` — `WorkspaceManager.write_file()` already scans internally, so the tool was double-scanning every file - Replaced removed comment in `workspace.py` with explicit ownership comment clarifying that `WorkspaceManager` is the single scanning boundary ### Checklist 📋 #### For code changes: - [x] I have clearly listed my changes in the PR description - [x] I have made a test plan - [x] I have tested my changes according to the test plan: - [x] Verified `scan_content_safe()` is called inside `WorkspaceManager.write_file()` (workspace.py:186) - [x] Verified `store_media_file()` scans all input branches including local paths (file.py:351) - [x] Verified documentation accuracy against current source code after merge with dev - [x] CI checks all passing  --- > [!NOTE] > **Low Risk** > Mostly adds documentation and internal developer guidance; the only code change is a comment clarifying `WorkspaceManager.write_file()` as the single virus-scanning boundary, with no behavior change. > > **Overview** > Adds a new `docs/platform/workspace-media-architecture.md` describing the Workspace storage layer vs the `store_media_file()` media pipeline, including session scoping and virus-scanning/persistence responsibility boundaries. 
> > Updates backend `CLAUDE.md` to point contributors to the new doc when working on CoPilot uploads/downloads or `WorkspaceManager`/`store_media_file()`, and clarifies in `WorkspaceManager.write_file()` (comment-only) that callers should not duplicate virus scanning. > > <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit 18fcfa03f8. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup>  --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Merge branch 'master' of github.com:Significant-Gravitas/AutoGPT into dev
2026-03-17 03:00:27 -04:00 · 2026-03-17 06:12:26 +00:00 · 2026-03-17 13:13:42 +07:00 · 2026-03-13 23:54:54 +07:00
6 changed files with 431 additions and 1 deletions
--- a/autogpt_platform/backend/CLAUDE.md
+++ b/autogpt_platform/backend/CLAUDE.md
@@ -178,6 +178,16 @@ yield "image_url", result_url
 3. Write tests alongside the route file
 4. Run `poetry run test` to verify

+## Workspace & Media Files
+
+**Read [Workspace & Media Architecture](../../docs/platform/workspace-media-architecture.md) when:**
+- Working on CoPilot file upload/download features
+- Building blocks that handle `MediaFileType` inputs/outputs
+- Modifying `WorkspaceManager` or `store_media_file()`
+- Debugging file persistence or virus scanning issues
+
+Covers: `WorkspaceManager` (persistent storage with session scoping), `store_media_file()` (media normalization pipeline), and responsibility boundaries for virus scanning and persistence.
+
 ## Security Implementation

 ### Cache Protection Middleware
--- a/autogpt_platform/backend/backend/util/workspace.py
+++ b/autogpt_platform/backend/backend/util/workspace.py
@@ -183,7 +183,8 @@ class WorkspaceManager:
                f"{Config().max_file_size_mb}MB limit"
            )

-        # Virus scan content before persisting (defense in depth)
+        # Scan here — callers must NOT duplicate this scan.
+        # WorkspaceManager owns virus scanning for all persisted files.
        await scan_content_safe(content, filename=filename)

        # Determine path with session scoping
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/components/ChatInput/useChatInput.ts
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/components/ChatInput/useChatInput.ts
@@ -1,3 +1,4 @@
+import { useCopilotUIStore } from "@/app/(platform)/copilot/store";
 import { ChangeEvent, FormEvent, useEffect, useState } from "react";

 interface Args {
@@ -16,6 +17,16 @@ export function useChatInput({
 }: Args) {
  const [value, setValue] = useState("");
  const [isSending, setIsSending] = useState(false);
+  const { initialPrompt, setInitialPrompt } = useCopilotUIStore();
+
+  useEffect(
+    function consumeInitialPrompt() {
+      if (!initialPrompt) return;
+      setValue((prev) => (prev.length === 0 ? initialPrompt : prev));
+      setInitialPrompt(null);
+    },
+    [initialPrompt, setInitialPrompt],
+  );

  useEffect(
    function focusOnMount() {
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/store.ts
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/store.ts
@@ -7,6 +7,10 @@ export interface DeleteTarget {
 }

 interface CopilotUIState {
+  /** Prompt extracted from URL hash (e.g. /copilot#prompt=...) for input prefill. */
+  initialPrompt: string | null;
+  setInitialPrompt: (prompt: string | null) => void;
+
  sessionToDelete: DeleteTarget | null;
  setSessionToDelete: (target: DeleteTarget | null) => void;

@@ -31,6 +35,9 @@ interface CopilotUIState {
 }

 export const useCopilotUIStore = create<CopilotUIState>((set) => ({
+  initialPrompt: null,
+  setInitialPrompt: (prompt) => set({ initialPrompt: prompt }),
+
  sessionToDelete: null,
  setSessionToDelete: (target) => set({ sessionToDelete: target }),

--- a/autogpt_platform/frontend/src/app/(platform)/copilot/useCopilotPage.ts
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/useCopilotPage.ts
@@ -19,6 +19,42 @@ import { useCopilotStream } from "./useCopilotStream";
 const TITLE_POLL_INTERVAL_MS = 2_000;
 const TITLE_POLL_MAX_ATTEMPTS = 5;

+/**
+ * Extract a prompt from the URL hash fragment (supports /copilot#prompt=URL-encoded-text).
+ * Only reports the flag: `autosubmit` is true when the query string has ?autosubmit=true; submitting is up to the caller.
+ * Side effect: when a prompt is found, the hash and the autosubmit param are removed from the URL via history.replaceState.
+ * Returns null when `window` is undefined, the hash is empty, or the prompt is missing or whitespace-only.
+ */
+function extractPromptFromUrl(): {
+  prompt: string;
+  autosubmit: boolean;
+} | null {
+  if (typeof window === "undefined") return null; // no window object to read a URL from
+
+  const hash = window.location.hash;
+  if (!hash) return null;
+
+  const hashParams = new URLSearchParams(hash.slice(1)); // slice(1) drops the leading "#"
+  const prompt = hashParams.get("prompt");
+
+  if (!prompt || !prompt.trim()) return null; // missing or whitespace-only prompt
+
+  const searchParams = new URLSearchParams(window.location.search);
+  const autosubmit = searchParams.get("autosubmit") === "true"; // only the exact string "true" enables it
+
+  // Clean up hash + autosubmit param only (preserve other query params)
+  const cleanURL = new URL(window.location.href);
+  cleanURL.hash = "";
+  cleanURL.searchParams.delete("autosubmit");
+  window.history.replaceState(
+    null,
+    "",
+    `${cleanURL.pathname}${cleanURL.search}`,
+  );
+
+  return { prompt: prompt.trim(), autosubmit };
+}
+
 interface UploadedFile {
  file_id: string;
  name: string;
@@ -127,6 +163,28 @@ export function useCopilotPage() {
    }
  }, [sessionId, pendingMessage, sendMessage]);

+  // --- Extract prompt from URL hash on mount (e.g. /copilot#prompt=Hello) ---
+  const { setInitialPrompt } = useCopilotUIStore();
+  const hasProcessedUrlPrompt = useRef(false);
+  useEffect(() => {
+    if (hasProcessedUrlPrompt.current) return;
+
+    const urlPrompt = extractPromptFromUrl();
+    if (!urlPrompt) return;
+
+    hasProcessedUrlPrompt.current = true;
+
+    if (urlPrompt.autosubmit) {
+      setPendingMessage(urlPrompt.prompt);
+      void createSession().catch(() => {
+        setPendingMessage(null);
+        setInitialPrompt(urlPrompt.prompt);
+      });
+    } else {
+      setInitialPrompt(urlPrompt.prompt);
+    }
+  }, [createSession, setInitialPrompt]);
+
  async function uploadFiles(
    files: File[],
    sid: string,
--- a/docs/platform/workspace-media-architecture.md
+++ b/docs/platform/workspace-media-architecture.md
@@ -0,0 +1,343 @@
+# Workspace & Media File Architecture
+
+This document describes the architecture for handling user files in AutoGPT Platform, covering persistent user storage (Workspace) and ephemeral media processing pipelines.
+
+## Overview
+
+The platform has two distinct file-handling layers:
+
+| Layer | Purpose | Persistence | Scope |
+|-------|---------|-------------|-------|
+| **Workspace** | Long-term user file storage | Persistent (DB + GCS/local) | Per-user, session-scoped access |
+| **Media Pipeline** | Ephemeral file processing for blocks | Temporary (local disk) | Per-execution |
+
+## Database Models
+
+### UserWorkspace
+
+Represents a user's file storage space. Created on-demand (one per user).
+
+```prisma
+model UserWorkspace {
+  id        String   @id @default(uuid())
+  createdAt DateTime @default(now())
+  updatedAt DateTime @updatedAt
+  userId    String   @unique
+  Files     UserWorkspaceFile[]
+}
+```
+
+**Key points:**
+- One workspace per user (enforced by `@unique` on `userId`)
+- Created lazily via `get_or_create_workspace()` 
+- Uses upsert to handle race conditions
+
+### UserWorkspaceFile
+
+Represents a file stored in a user's workspace.
+
+```prisma
+model UserWorkspaceFile {
+  id          String    @id @default(uuid())
+  workspaceId String
+  name        String    // User-visible filename
+  path        String    // Virtual path (e.g., "/sessions/abc123/image.png")
+  storagePath String    // Actual storage path (gcs://... or local://...)
+  mimeType    String
+  sizeBytes   BigInt
+  checksum    String?   // SHA256 for integrity
+  isDeleted   Boolean   @default(false)
+  deletedAt   DateTime?
+  metadata    Json      @default("{}")
+
+  @@unique([workspaceId, path])  // Enforce unique paths within workspace
+}
+```
+
+**Key points:**
+- `path` is a virtual path for organizing files (not actual filesystem path)
+- `storagePath` contains the actual GCS or local storage location
+- Soft-delete pattern: `isDeleted` flag with `deletedAt` timestamp
+- Path is modified on delete to free up the virtual path for reuse
+
+---
+
+## WorkspaceManager
+
+**Location:** `backend/util/workspace.py`
+
+High-level API for workspace file operations. Combines storage backend operations with database record management.
+
+### Initialization
+
+```python
+from backend.util.workspace import WorkspaceManager
+
+# Basic usage
+manager = WorkspaceManager(user_id="user-123", workspace_id="ws-456")
+
+# With session scoping (CoPilot sessions)
+manager = WorkspaceManager(
+    user_id="user-123",
+    workspace_id="ws-456", 
+    session_id="session-789"
+)
+```
+
+### Session Scoping
+
+When `session_id` is provided, files are isolated to `/sessions/{session_id}/`:
+
+```python
+# With session_id="abc123":
+await manager.write_file(content, "image.png")
+# → stored at /sessions/abc123/image.png
+
+# Cross-session access is explicit:
+manager.read_file("/sessions/other-session/file.txt")  # Works
+```
+
+**Why session scoping?**
+- CoPilot conversations need file isolation
+- Prevents file collisions between concurrent sessions
+- Allows session cleanup without affecting other sessions
+
+### Core Methods
+
+| Method | Description |
+|--------|-------------|
+| `write_file(content, filename, path?, mime_type?, overwrite?)` | Write file to workspace |
+| `read_file(path)` | Read file by virtual path |
+| `read_file_by_id(file_id)` | Read file by ID |
+| `list_files(path?, limit?, offset?, include_all_sessions?)` | List files |
+| `delete_file(file_id)` | Soft-delete a file |
+| `get_download_url(file_id, expires_in?)` | Get signed download URL |
+| `get_file_info(file_id)` | Get file metadata |
+| `get_file_info_by_path(path)` | Get file metadata by path |
+| `get_file_count(path?, include_all_sessions?)` | Count files |
+
+### Storage Backends
+
+WorkspaceManager delegates to `WorkspaceStorageBackend`:
+
+| Backend | When Used | Storage Path Format |
+|---------|-----------|---------------------|
+| `GCSWorkspaceStorage` | `media_gcs_bucket_name` is configured | `gcs://bucket/workspaces/{ws_id}/{file_id}/{filename}` |
+| `LocalWorkspaceStorage` | No GCS bucket configured | `local://{ws_id}/{file_id}/{filename}` |
+
+---
+
+## store_media_file()
+
+**Location:** `backend/util/file.py`
+
+The media normalization pipeline. Handles various input types and normalizes them for processing or output.
+
+### Purpose
+
+Blocks receive files in many formats (URLs, data URIs, workspace references, local paths). `store_media_file()` normalizes these to a consistent format based on what the block needs.
+
+### Input Types Handled
+
+| Input Format | Example | How It's Processed |
+|--------------|---------|-------------------|
+| Data URI | `data:image/png;base64,iVBOR...` | Decoded, virus scanned, written locally |
+| HTTP(S) URL | `https://example.com/image.png` | Downloaded, virus scanned, written locally |
+| Workspace URI | `workspace://abc123` or `workspace:///path/to/file` | Read from workspace, virus scanned, written locally |
+| Cloud path | `gcs://bucket/path` | Downloaded, virus scanned, written locally |
+| Local path | `image.png` | Verified to exist in exec_file directory, virus scanned |
+
+### Return Formats
+
+The `return_format` parameter determines what you get back:
+
+```python
+from backend.util.file import store_media_file
+
+# For local processing (ffmpeg, MoviePy, PIL)
+local_path = await store_media_file(
+    file=input_file,
+    execution_context=ctx,
+    return_format="for_local_processing"
+)
+# Returns: "image.png" (relative path in exec_file dir)
+
+# For external APIs (Replicate, OpenAI, etc.)
+data_uri = await store_media_file(
+    file=input_file,
+    execution_context=ctx,
+    return_format="for_external_api"
+)
+# Returns: "data:image/png;base64,iVBOR..."
+
+# For block output (adapts to execution context)
+output = await store_media_file(
+    file=input_file,
+    execution_context=ctx,
+    return_format="for_block_output"
+)
+# In CoPilot: Returns "workspace://file-id#image/png"
+# In graphs:  Returns "data:image/png;base64,..."
+```
+
+### Execution Context
+
+`store_media_file()` requires an `ExecutionContext` with:
+- `graph_exec_id` - Required for temp file location
+- `user_id` - Required for workspace access
+- `workspace_id` - Optional; enables workspace features
+- `session_id` - Optional; for session scoping in CoPilot
+
+---
+
+## Responsibility Boundaries
+
+### Virus Scanning
+
+| Component | Scans? | Notes |
+|-----------|--------|-------|
+| `store_media_file()` | ✅ Yes | Scans **all** content before writing to local disk |
+| `WorkspaceManager.write_file()` | ✅ Yes | Scans content before persisting |
+
+**Scanning happens at:**
+1. `store_media_file()` — scans everything it downloads/decodes
+2. `WorkspaceManager.write_file()` — scans before persistence
+
+Tools like `WriteWorkspaceFileTool` must not scan content themselves: `WorkspaceManager.write_file()` already scans every file it persists, so a second scan in the caller only duplicates the work.
+
+### Persistence
+
+| Component | Persists To | Lifecycle |
+|-----------|-------------|-----------|
+| `store_media_file()` | Temp dir (`/tmp/exec_file/{exec_id}/`) | Cleaned after execution |
+| `WorkspaceManager` | GCS or local storage + DB | Persistent until deleted |
+
+**Automatic cleanup:** `clean_exec_files(graph_exec_id)` removes temp files after execution completes.
+
+---
+
+## Decision Tree: WorkspaceManager vs store_media_file
+
+```text
+┌─────────────────────────────────────────────────────┐
+│ What do you need to do with the file?               │
+└─────────────────────────────────────────────────────┘
+                         │
+           ┌─────────────┴─────────────┐
+           ▼                           ▼
+    Process in a block          Store for user access
+    (ffmpeg, PIL, etc.)         (CoPilot files, uploads)
+           │                           │
+           ▼                           ▼
+    store_media_file()           WorkspaceManager
+    with appropriate             
+    return_format                
+           │                           
+           │                           
+    ┌──────┴──────┐                    
+    ▼             ▼                    
+ "for_local_   "for_block_
+ processing"   output"
+    │             │
+    ▼             ▼
+ Get local    Auto-saves to
+ path for     workspace in
+ tools        CoPilot context
+
+Store for user access
+    │
+    ├── write_file() ─── Upload + persist (scans internally)
+    ├── read_file() / get_download_url() ─── Retrieve
+    └── list_files() / delete_file() ─── Manage
+```
+
+### Quick Reference
+
+| Scenario | Use |
+|----------|-----|
+| Block needs to process a file with ffmpeg | `store_media_file(..., return_format="for_local_processing")` |
+| Block needs to send file to external API | `store_media_file(..., return_format="for_external_api")` |
+| Block returning a generated file | `store_media_file(..., return_format="for_block_output")` |
+| API endpoint handling file upload | `WorkspaceManager.write_file()` (handles virus scanning internally) |
+| API endpoint serving file download | `WorkspaceManager.get_download_url()` |
+| Listing user's files | `WorkspaceManager.list_files()` |
+
+---
+
+## Key Files Reference
+
+| File | Purpose |
+|------|---------|
+| `backend/data/workspace.py` | Database CRUD operations for UserWorkspace and UserWorkspaceFile |
+| `backend/util/workspace.py` | `WorkspaceManager` class - high-level workspace API |
+| `backend/util/workspace_storage.py` | Storage backends (GCS, local) and `WorkspaceStorageBackend` interface |
+| `backend/util/file.py` | `store_media_file()` and media processing utilities |
+| `backend/util/virus_scanner.py` | `VirusScannerService` and `scan_content_safe()` |
+| `schema.prisma` | Database model definitions |
+
+---
+
+## Common Patterns
+
+### Block Processing a User's File
+
+```python
+async def run(self, input_data, *, execution_context, **kwargs):
+    # Normalize input to local path
+    local_path = await store_media_file(
+        file=input_data.video,
+        execution_context=execution_context,
+        return_format="for_local_processing",
+    )
+    
+    # Process with local tools
+    output_path = process_video(local_path)
+    
+    # Return (auto-saves to workspace in CoPilot)
+    result = await store_media_file(
+        file=output_path,
+        execution_context=execution_context,
+        return_format="for_block_output",
+    )
+    yield "output", result
+```
+
+### API Upload Endpoint
+
+```python
+from backend.util.virus_scanner import VirusDetectedError, VirusScanError
+
+async def upload_file(file: UploadFile, user_id: str, workspace_id: str):
+    content = await file.read()
+
+    # write_file handles virus scanning internally
+    manager = WorkspaceManager(user_id, workspace_id)
+    try:
+        workspace_file = await manager.write_file(
+            content=content,
+            filename=file.filename,
+        )
+    except VirusDetectedError:
+        raise HTTPException(status_code=400, detail="File rejected: virus detected")
+    except VirusScanError:
+        raise HTTPException(status_code=503, detail="Virus scanning unavailable")
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+
+    return {"file_id": workspace_file.id}
+```
+
+---
+
+## Configuration
+
+| Setting | Purpose | Default |
+|---------|---------|---------|
+| `media_gcs_bucket_name` | GCS bucket for workspace storage | None (uses local) |
+| `workspace_storage_dir` | Local storage directory | `{app_data}/workspaces` |
+| `max_file_size_mb` | Maximum file size in MB | 100 |
+| `clamav_service_enabled` | Enable virus scanning | true |
+| `clamav_service_host` | ClamAV daemon host | localhost |
+| `clamav_service_port` | ClamAV daemon port | 3310 |
+| `clamav_max_concurrency` | Max concurrent scans to ClamAV daemon | 5 |
+| `clamav_mark_failed_scans_as_clean` | If true, scan failures pass content through instead of rejecting (⚠️ security risk if ClamAV is unreachable) | false |