mirror of https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-02-06 21:05:13 -05:00

Compare commits
8 Commits: ci/cla-lab ... docs/works
| Author | SHA1 | Date |
|---|---|---|
| | 0e48c9653a | |
| | a22bf17e98 | |
| | 2a4a7f9375 | |
| | cd08e84016 | |
| | 6f6343db4e | |
| | d2c7118527 | |
| | 3820f2fdbc | |
| | df5dcc2f67 | |
.github/workflows/cla-label-sync.yml (vendored, 412 lines)

@@ -1,412 +0,0 @@
name: CLA Label Sync

on:
  # Real-time: when CLA status changes (CLA-assistant uses Status API)
  status:

  # When PRs are opened or updated
  pull_request_target:
    types: [opened, synchronize, reopened]

  # Scheduled sweep - check stale PRs daily
  schedule:
    - cron: '0 9 * * *'  # 9 AM UTC daily

  # Manual trigger for testing
  workflow_dispatch:
    inputs:
      pr_number:
        description: 'Specific PR number to check (optional)'
        required: false

permissions:
  pull-requests: write
  contents: read
  statuses: read
  checks: read

env:
  CLA_CHECK_NAME: 'license/cla'
  LABEL_PENDING: 'cla: pending'
  LABEL_SIGNED: 'cla: signed'
  # Timing configuration (all independently configurable)
  REMINDER_DAYS: 3        # Days before first reminder
  CLOSE_WARNING_DAYS: 7   # Days before "closing soon" warning
  CLOSE_DAYS: 10          # Days before auto-close

jobs:
  sync-labels:
    runs-on: ubuntu-latest
    # Only run on status events if it's the CLA check
    if: github.event_name != 'status' || github.event.context == 'license/cla'

    steps:
      - name: Ensure CLA labels exist
        uses: actions/github-script@v7
        with:
          script: |
            const labels = [
              { name: 'cla: pending', color: 'fbca04', description: 'CLA not yet signed by all contributors' },
              { name: 'cla: signed', color: '0e8a16', description: 'CLA signed by all contributors' }
            ];

            for (const label of labels) {
              try {
                await github.rest.issues.getLabel({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  name: label.name
                });
              } catch (e) {
                if (e.status === 404) {
                  await github.rest.issues.createLabel({
                    owner: context.repo.owner,
                    repo: context.repo.repo,
                    name: label.name,
                    color: label.color,
                    description: label.description
                  });
                  console.log(`Created label: ${label.name}`);
                }
              }
            }

      - name: Sync CLA labels and handle stale PRs
        uses: actions/github-script@v7
        with:
          script: |
            const CLA_CHECK_NAME = process.env.CLA_CHECK_NAME;
            const LABEL_PENDING = process.env.LABEL_PENDING;
            const LABEL_SIGNED = process.env.LABEL_SIGNED;
            const REMINDER_DAYS = parseInt(process.env.REMINDER_DAYS);
            const CLOSE_WARNING_DAYS = parseInt(process.env.CLOSE_WARNING_DAYS);
            const CLOSE_DAYS = parseInt(process.env.CLOSE_DAYS);

            // Validate timing configuration
            if ([REMINDER_DAYS, CLOSE_WARNING_DAYS, CLOSE_DAYS].some(Number.isNaN)) {
              core.setFailed('Invalid timing configuration — REMINDER_DAYS, CLOSE_WARNING_DAYS, and CLOSE_DAYS must be numeric.');
              return;
            }
            if (!(REMINDER_DAYS < CLOSE_WARNING_DAYS && CLOSE_WARNING_DAYS < CLOSE_DAYS)) {
              core.warning(`Timing order looks odd: REMINDER(${REMINDER_DAYS}) < WARNING(${CLOSE_WARNING_DAYS}) < CLOSE(${CLOSE_DAYS}) expected.`);
            }

            const CLA_SIGN_URL = `https://cla-assistant.io/${context.repo.owner}/${context.repo.repo}`;

            // Helper: Get CLA status for a commit
            async function getClaStatus(headSha) {
              // CLA-assistant uses the commit status API (not checks API)
              const { data: statuses } = await github.rest.repos.getCombinedStatusForRef({
                owner: context.repo.owner,
                repo: context.repo.repo,
                ref: headSha
              });

              const claStatus = statuses.statuses.find(
                s => s.context === CLA_CHECK_NAME
              );

              if (claStatus) {
                return {
                  found: true,
                  passed: claStatus.state === 'success',
                  state: claStatus.state,
                  description: claStatus.description
                };
              }

              // Fallback: check the Checks API too
              const { data: checkRuns } = await github.rest.checks.listForRef({
                owner: context.repo.owner,
                repo: context.repo.repo,
                ref: headSha
              });

              const claCheck = checkRuns.check_runs.find(
                check => check.name === CLA_CHECK_NAME
              );

              if (claCheck) {
                return {
                  found: true,
                  passed: claCheck.conclusion === 'success',
                  state: claCheck.conclusion,
                  description: claCheck.output?.summary || ''
                };
              }

              return { found: false, passed: false, state: 'unknown' };
            }

            // Helper: Check if bot already commented with a specific marker (paginated)
            async function hasCommentWithMarker(prNumber, marker) {
              // Use paginate to fetch ALL comments, not just first 100
              const comments = await github.paginate(
                github.rest.issues.listComments,
                {
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  issue_number: prNumber,
                  per_page: 100
                }
              );

              return comments.some(c =>
                c.user?.type === 'Bot' &&
                c.body?.includes(marker)
              );
            }

            // Helper: Days since a date
            function daysSince(dateString) {
              const date = new Date(dateString);
              const now = new Date();
              return Math.floor((now - date) / (1000 * 60 * 60 * 24));
            }

            // Determine which PRs to check
            let prsToCheck = [];

            if (context.eventName === 'status') {
              // Status event from CLA-assistant - find PRs with this commit
              const sha = context.payload.sha;
              console.log(`Status event for SHA: ${sha}, context: ${context.payload.context}`);

              // Search for open PRs with this head SHA
              const { data: prs } = await github.rest.pulls.list({
                owner: context.repo.owner,
                repo: context.repo.repo,
                state: 'open',
                per_page: 100
              });
              prsToCheck = prs.filter(pr => pr.head.sha === sha).map(pr => pr.number);

              if (prsToCheck.length === 0) {
                console.log('No open PRs found with this SHA');
                return;
              }

            } else if (context.eventName === 'pull_request_target') {
              prsToCheck = [context.payload.pull_request.number];

            } else if (context.eventName === 'workflow_dispatch' && context.payload.inputs?.pr_number) {
              prsToCheck = [parseInt(context.payload.inputs.pr_number)];

            } else {
              // Scheduled run: check all open PRs (paginated to handle >100 PRs)
              const openPRs = await github.paginate(
                github.rest.pulls.list,
                {
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  state: 'open',
                  per_page: 100
                }
              );
              prsToCheck = openPRs.map(pr => pr.number);
            }

            console.log(`Checking ${prsToCheck.length} PR(s): ${prsToCheck.join(', ')}`);

            for (const prNumber of prsToCheck) {
              try {
                // Get PR details
                const { data: pr } = await github.rest.pulls.get({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  pull_number: prNumber
                });

                // Skip if PR is from a bot
                if (pr.user.type === 'Bot') {
                  console.log(`PR #${prNumber}: Skipping bot PR`);
                  continue;
                }

                // Skip if PR is not open (closed/merged)
                if (pr.state !== 'open') {
                  console.log(`PR #${prNumber}: Skipping non-open PR (state=${pr.state})`);
                  continue;
                }

                // Skip if PR doesn't touch platform code (CLA automation only for autogpt_platform/)
                const PLATFORM_PATH = 'autogpt_platform/';
                const { data: files } = await github.rest.pulls.listFiles({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  pull_number: prNumber,
                  per_page: 100
                });
                const touchesPlatform = files.some(f => f.filename.startsWith(PLATFORM_PATH));
                if (!touchesPlatform) {
                  console.log(`PR #${prNumber}: Skipping - doesn't touch ${PLATFORM_PATH}`);
                  continue;
                }

                const claStatus = await getClaStatus(pr.head.sha);
                const currentLabels = pr.labels.map(l => l.name);
                const hasPending = currentLabels.includes(LABEL_PENDING);
                const hasSigned = currentLabels.includes(LABEL_SIGNED);
                const prAgeDays = daysSince(pr.created_at);

                console.log(`PR #${prNumber}: CLA ${claStatus.passed ? 'passed' : 'pending'} (${claStatus.state}), age: ${prAgeDays} days`);

                if (claStatus.passed) {
                  // ✅ CLA signed - add signed label, remove pending
                  if (!hasSigned) {
                    await github.rest.issues.addLabels({
                      owner: context.repo.owner,
                      repo: context.repo.repo,
                      issue_number: prNumber,
                      labels: [LABEL_SIGNED]
                    });
                    console.log(`Added '${LABEL_SIGNED}' to PR #${prNumber}`);
                  }
                  if (hasPending) {
                    await github.rest.issues.removeLabel({
                      owner: context.repo.owner,
                      repo: context.repo.repo,
                      issue_number: prNumber,
                      name: LABEL_PENDING
                    });
                    console.log(`Removed '${LABEL_PENDING}' from PR #${prNumber}`);
                  }

                } else {
                  // ⏳ CLA pending

                  // Add pending label if not present
                  if (!hasPending) {
                    await github.rest.issues.addLabels({
                      owner: context.repo.owner,
                      repo: context.repo.repo,
                      issue_number: prNumber,
                      labels: [LABEL_PENDING]
                    });
                    console.log(`Added '${LABEL_PENDING}' to PR #${prNumber}`);
                  }
                  if (hasSigned) {
                    await github.rest.issues.removeLabel({
                      owner: context.repo.owner,
                      repo: context.repo.repo,
                      issue_number: prNumber,
                      name: LABEL_SIGNED
                    });
                    console.log(`Removed '${LABEL_SIGNED}' from PR #${prNumber}`);
                  }

                  // Check if we need to send reminder or close
                  const REMINDER_MARKER = '<!-- cla-reminder -->';
                  const CLOSE_WARNING_MARKER = '<!-- cla-close-warning -->';

                  // 📢 Reminder after REMINDER_DAYS (but before warning window)
                  if (prAgeDays >= REMINDER_DAYS && prAgeDays < CLOSE_WARNING_DAYS) {
                    const hasReminder = await hasCommentWithMarker(prNumber, REMINDER_MARKER);

                    if (!hasReminder) {
                      await github.rest.issues.createComment({
                        owner: context.repo.owner,
                        repo: context.repo.repo,
                        issue_number: prNumber,
                        body: `${REMINDER_MARKER}

            👋 **Friendly reminder:** This PR is waiting on a signed CLA.

            All contributors need to sign our Contributor License Agreement before we can merge this PR.

            **➡️ [Sign the CLA here](${CLA_SIGN_URL}?pullRequest=${prNumber})**

            <details>
            <summary>Why do we need a CLA?</summary>

            The CLA protects both you and the project by clarifying the terms under which your contribution is made. It's a one-time process — once signed, it covers all your future contributions.

            </details>

            <details>
            <summary>Common issues</summary>

            - **Email mismatch:** Make sure your Git commit email matches your GitHub account email
            - **Merge commits:** If you merged \`dev\` into your branch, try rebasing instead: \`git rebase origin/dev && git push --force-with-lease\`
            - **Multiple authors:** All commit authors need to sign, not just the PR author

            </details>

            If you have questions, just ask! 🙂`
                      });
                      console.log(`Posted reminder on PR #${prNumber}`);
                    }
                  }

                  // ⚠️ Close warning at CLOSE_WARNING_DAYS
                  if (prAgeDays >= CLOSE_WARNING_DAYS && prAgeDays < CLOSE_DAYS) {
                    const hasCloseWarning = await hasCommentWithMarker(prNumber, CLOSE_WARNING_MARKER);

                    if (!hasCloseWarning) {
                      const daysRemaining = CLOSE_DAYS - prAgeDays;
                      await github.rest.issues.createComment({
                        owner: context.repo.owner,
                        repo: context.repo.repo,
                        issue_number: prNumber,
                        body: `${CLOSE_WARNING_MARKER}

            ⚠️ **This PR will be automatically closed in ${daysRemaining} day${daysRemaining === 1 ? '' : 's'}** if the CLA is not signed.

            We haven't received a signed CLA from all contributors yet. Please sign it to keep this PR open:

            **➡️ [Sign the CLA here](${CLA_SIGN_URL}?pullRequest=${prNumber})**

            If you're unable to sign or have questions, please let us know — we're happy to help!`
                      });
                      console.log(`Posted close warning on PR #${prNumber}`);
                    }
                  }

                  // 🚪 Auto-close after CLOSE_DAYS
                  if (prAgeDays >= CLOSE_DAYS) {
                    const CLOSE_MARKER = '<!-- cla-auto-closed -->';
                    const OVERRIDE_LABEL = 'cla: override';

                    // Check for override label (maintainer wants to keep PR open)
                    if (currentLabels.includes(OVERRIDE_LABEL)) {
                      console.log(`PR #${prNumber}: Skipping close due to '${OVERRIDE_LABEL}' label`);
                    } else {
                      // Check if we already posted a close comment
                      const hasCloseComment = await hasCommentWithMarker(prNumber, CLOSE_MARKER);

                      if (!hasCloseComment) {
                        await github.rest.issues.createComment({
                          owner: context.repo.owner,
                          repo: context.repo.repo,
                          issue_number: prNumber,
                          body: `${CLOSE_MARKER}

            👋 Closing this PR due to unsigned CLA after ${CLOSE_DAYS} days.

            Thank you for your contribution! If you'd still like to contribute:

            1. [Sign the CLA](${CLA_SIGN_URL})
            2. Re-open this PR or create a new one

            We appreciate your interest in AutoGPT and hope to see you back! 🚀`
                        });
                      }

                      await github.rest.pulls.update({
                        owner: context.repo.owner,
                        repo: context.repo.repo,
                        pull_number: prNumber,
                        state: 'closed'
                      });

                      console.log(`Closed PR #${prNumber} due to unsigned CLA`);
                    }
                  }
                }

              } catch (error) {
                console.error(`Error processing PR #${prNumber}: ${error.message}`);
              }
            }

            console.log('CLA label sync complete!');

@@ -157,6 +157,16 @@ yield "image_url", result_url

3. Write tests alongside the route file
4. Run `poetry run test` to verify

## Workspace & Media Files

**Read [Workspace & Media Architecture](../../docs/platform/workspace-media-architecture.md) when:**
- Working on CoPilot file upload/download features
- Building blocks that handle `MediaFileType` inputs/outputs
- Modifying `WorkspaceManager` or `store_media_file()`
- Debugging file persistence or virus scanning issues

Covers: `WorkspaceManager` (persistent storage with session scoping), `store_media_file()` (media normalization pipeline), and responsibility boundaries for virus scanning and persistence.

## Security Implementation

### Cache Protection Middleware

@@ -9,7 +9,6 @@ from pydantic import BaseModel
from backend.api.features.chat.model import ChatSession
from backend.data.workspace import get_or_create_workspace
from backend.util.settings import Config
from backend.util.virus_scanner import scan_content_safe
from backend.util.workspace import WorkspaceManager

from .base import BaseTool

@@ -475,9 +474,6 @@ class WriteWorkspaceFileTool(BaseTool):
        )

        try:
            # Virus scan
            await scan_content_safe(content, filename=filename)

            workspace = await get_or_create_workspace(user_id)
            # Pass session_id for session-scoped file access
            manager = WorkspaceManager(user_id, workspace.id, session_id)

@@ -188,7 +188,6 @@ class WorkspaceManager:
                f"{Config().max_file_size_mb}MB limit"
            )

        # Virus scan content before persisting (defense in depth)
        await scan_content_safe(content, filename=filename)

        # Determine path with session scoping

docs/platform/workspace-media-architecture.md (new file, 325 lines)

@@ -0,0 +1,325 @@

# Workspace & Media File Architecture

This document describes the architecture for handling user files in AutoGPT Platform, covering persistent user storage (Workspace) and ephemeral media processing pipelines.

## Overview

The platform has two distinct file-handling layers:

| Layer | Purpose | Persistence | Scope |
|-------|---------|-------------|-------|
| **Workspace** | Long-term user file storage | Persistent (DB + GCS/local) | Per-user, session-scoped access |
| **Media Pipeline** | Ephemeral file processing for blocks | Temporary (local disk) | Per-execution |

## Database Models

### UserWorkspace

Represents a user's file storage space. Created on-demand (one per user).

```prisma
model UserWorkspace {
  id        String              @id @default(uuid())
  createdAt DateTime            @default(now())
  updatedAt DateTime            @updatedAt
  userId    String              @unique
  Files     UserWorkspaceFile[]
}
```

**Key points:**
- One workspace per user (enforced by `@unique` on `userId`)
- Created lazily via `get_or_create_workspace()`
- Uses upsert to handle race conditions
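
For illustration, the lazy-creation pattern can be sketched as below. This is a minimal sketch assuming the Prisma Python client; the actual `get_or_create_workspace()` in `backend/data/workspace.py` may differ in detail.

```python
# Minimal sketch of lazy, race-safe workspace creation (assumes prisma-client-py).
# Upserting on the unique userId means two concurrent callers both get the
# same row instead of one failing on the unique constraint.
from prisma.models import UserWorkspace

async def get_or_create_workspace(user_id: str) -> UserWorkspace:
    return await UserWorkspace.prisma().upsert(
        where={"userId": user_id},
        data={
            "create": {"userId": user_id},
            "update": {},  # nothing to change if the workspace already exists
        },
    )
```
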
### UserWorkspaceFile

Represents a file stored in a user's workspace.

```prisma
model UserWorkspaceFile {
  id          String    @id @default(uuid())
  workspaceId String
  name        String    // User-visible filename
  path        String    // Virtual path (e.g., "/sessions/abc123/image.png")
  storagePath String    // Actual storage path (gcs://... or local://...)
  mimeType    String
  sizeBytes   BigInt
  checksum    String?   // SHA256 for integrity
  isDeleted   Boolean   @default(false)
  deletedAt   DateTime?
  metadata    Json      @default("{}")

  @@unique([workspaceId, path]) // Enforce unique paths within workspace
}
```

**Key points:**
- `path` is a virtual path for organizing files (not an actual filesystem path)
- `storagePath` contains the actual GCS or local storage location
- Soft-delete pattern: `isDeleted` flag with `deletedAt` timestamp
- Path is modified on delete to free up the virtual path for reuse
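
A hypothetical sketch of that soft-delete pattern (not the real `delete_file()` implementation; the path-rename scheme here is an assumption):

```python
# Hypothetical soft-delete sketch; flags the row instead of removing it.
from datetime import datetime, timezone

from prisma.models import UserWorkspaceFile

async def soft_delete_file(file_id: str) -> None:
    await UserWorkspaceFile.prisma().update(
        where={"id": file_id},
        data={
            "isDeleted": True,
            "deletedAt": datetime.now(timezone.utc),
            # Rewrite the virtual path so the original one is reusable;
            # @@unique([workspaceId, path]) would otherwise block a new
            # upload to the same path. The "/.deleted/" prefix is made up.
            "path": f"/.deleted/{file_id}",
        },
    )
```
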
---

## WorkspaceManager

**Location:** `backend/util/workspace.py`

High-level API for workspace file operations. Combines storage backend operations with database record management.

### Initialization

```python
from backend.util.workspace import WorkspaceManager

# Basic usage
manager = WorkspaceManager(user_id="user-123", workspace_id="ws-456")

# With session scoping (CoPilot sessions)
manager = WorkspaceManager(
    user_id="user-123",
    workspace_id="ws-456",
    session_id="session-789"
)
```

### Session Scoping

When `session_id` is provided, files are isolated to `/sessions/{session_id}/`:

```python
# With session_id="abc123":
await manager.write_file(content, "image.png")
# → stored at /sessions/abc123/image.png

# Cross-session access is explicit:
await manager.read_file("/sessions/other-session/file.txt")  # Works
```

**Why session scoping?**
- CoPilot conversations need file isolation
- Prevents file collisions between concurrent sessions
- Allows session cleanup without affecting other sessions

### Core Methods

| Method | Description |
|--------|-------------|
| `write_file(content, filename, path?, mime_type?, overwrite?)` | Write file to workspace |
| `read_file(path)` | Read file by virtual path |
| `read_file_by_id(file_id)` | Read file by ID |
| `list_files(path?, limit?, offset?, include_all_sessions?)` | List files |
| `delete_file(file_id)` | Soft-delete a file |
| `get_download_url(file_id, expires_in?)` | Get signed download URL |
| `get_file_info(file_id)` | Get file metadata |
| `get_file_count(path?, include_all_sessions?)` | Count files |
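
A hedged round-trip through these methods; async usage and return shapes are inferred from the other examples on this page, not confirmed signatures:

```python
# Hedged usage sketch of the core methods listed above.
manager = WorkspaceManager(user_id="user-123", workspace_id="ws-456")

wf = await manager.write_file(b"hello", "notes.txt", mime_type="text/plain")

data = await manager.read_file(wf.path)      # by virtual path
same = await manager.read_file_by_id(wf.id)  # by file ID

files = await manager.list_files(limit=50)
url = await manager.get_download_url(wf.id, expires_in=3600)

await manager.delete_file(wf.id)  # soft delete (see UserWorkspaceFile above)
```
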

### Storage Backends

WorkspaceManager delegates to `WorkspaceStorageBackend`:

| Backend | When Used | Storage Path Format |
|---------|-----------|---------------------|
| `GCSWorkspaceStorage` | `media_gcs_bucket_name` is configured | `gcs://bucket/workspaces/{ws_id}/{file_id}/{filename}` |
| `LocalWorkspaceStorage` | No GCS bucket configured | `local://{ws_id}/{file_id}/{filename}` |
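
The selection rule amounts to a small factory. A sketch under stated assumptions (the constructor arguments and the `Config` attribute name are guesses; the real logic lives in `backend/util/workspace_storage.py`):

```python
# Illustrative backend selection; see backend/util/workspace_storage.py.
from backend.util.settings import Config
from backend.util.workspace_storage import (
    GCSWorkspaceStorage,
    LocalWorkspaceStorage,
    WorkspaceStorageBackend,
)

def get_storage_backend() -> WorkspaceStorageBackend:
    bucket = Config().media_gcs_bucket_name  # assumed attribute name
    if bucket:
        return GCSWorkspaceStorage(bucket)  # gcs://bucket/workspaces/...
    return LocalWorkspaceStorage()          # local://{ws_id}/...
```
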
---

## store_media_file()

**Location:** `backend/util/file.py`

The media normalization pipeline. Handles various input types and normalizes them for processing or output.

### Purpose

Blocks receive files in many formats (URLs, data URIs, workspace references, local paths). `store_media_file()` normalizes these to a consistent format based on what the block needs.

### Input Types Handled

| Input Format | Example | How It's Processed |
|--------------|---------|-------------------|
| Data URI | `data:image/png;base64,iVBOR...` | Decoded, virus scanned, written locally |
| HTTP(S) URL | `https://example.com/image.png` | Downloaded, virus scanned, written locally |
| Workspace URI | `workspace://abc123` or `workspace:///path/to/file` | Read from workspace, virus scanned, written locally |
| Cloud path | `gcs://bucket/path` | Downloaded, virus scanned, written locally |
| Local path | `image.png` | Verified to exist in exec_file directory |
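
The dispatch is essentially prefix matching on the input string. A hypothetical sketch (the real implementation handles more cases and error paths):

```python
# Hypothetical prefix-based dispatch mirroring the table above.
def classify_media_input(file: str) -> str:
    if file.startswith("data:"):
        return "data_uri"   # decoded, scanned, written locally
    if file.startswith(("http://", "https://")):
        return "url"        # downloaded, scanned, written locally
    if file.startswith("workspace://"):
        return "workspace"  # read from workspace, scanned, written locally
    if file.startswith("gcs://"):
        return "cloud"      # downloaded, scanned, written locally
    return "local"          # must already exist in the exec_file directory
```
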
### Return Formats

The `return_format` parameter determines what you get back:

```python
from backend.util.file import store_media_file

# For local processing (ffmpeg, MoviePy, PIL)
local_path = await store_media_file(
    file=input_file,
    execution_context=ctx,
    return_format="for_local_processing"
)
# Returns: "image.png" (relative path in exec_file dir)

# For external APIs (Replicate, OpenAI, etc.)
data_uri = await store_media_file(
    file=input_file,
    execution_context=ctx,
    return_format="for_external_api"
)
# Returns: "data:image/png;base64,iVBOR..."

# For block output (adapts to execution context)
output = await store_media_file(
    file=input_file,
    execution_context=ctx,
    return_format="for_block_output"
)
# In CoPilot: Returns "workspace://file-id#image/png"
# In graphs: Returns "data:image/png;base64,..."
```

### Execution Context

`store_media_file()` requires an `ExecutionContext` with:
- `graph_exec_id` - Required for temp file location
- `user_id` - Required for workspace access
- `workspace_id` - Optional; enables workspace features
- `session_id` - Optional; for session scoping in CoPilot
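
Wiring that up for a standalone call might look like the sketch below; the field names come from the list above, but the `ExecutionContext` import path and keyword-constructor form are assumptions.

```python
from backend.util.file import store_media_file
# Note: the ExecutionContext import path is not given on this page.

ctx = ExecutionContext(
    graph_exec_id="exec-123",  # required: anchors the temp file location
    user_id="user-123",        # required: workspace access
    workspace_id="ws-456",     # optional: enables workspace features
    session_id="session-789",  # optional: CoPilot session scoping
)

local_path = await store_media_file(
    file="https://example.com/image.png",
    execution_context=ctx,
    return_format="for_local_processing",
)
```
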
---

## Responsibility Boundaries

### Virus Scanning

| Component | Scans? | Notes |
|-----------|--------|-------|
| `store_media_file()` | ✅ Yes | Scans **all** content before writing to local disk |
| `WorkspaceManager.write_file()` | ✅ Yes | Scans content before persisting |

**Scanning happens at:**
1. `store_media_file()` — scans everything it downloads/decodes
2. `WorkspaceManager.write_file()` — scans before persistence

Tools like `WriteWorkspaceFileTool` don't need to scan because `WorkspaceManager.write_file()` handles it.
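
In code, the boundary looks roughly like this (a sketch assuming `scan_content_safe` raises on malicious content; the exception type isn't shown on this page):

```python
from backend.util.virus_scanner import scan_content_safe

async def write_to_local_disk(content: bytes, filename: str, dest: str) -> None:
    # Writing raw bytes to disk yourself: scan first, as
    # store_media_file() does for everything it downloads/decodes.
    await scan_content_safe(content, filename=filename)
    with open(dest, "wb") as f:
        f.write(content)

async def write_to_workspace(manager, content: bytes, filename: str):
    # Going through WorkspaceManager: write_file() scans internally,
    # which is why WriteWorkspaceFileTool needs no explicit scan.
    return await manager.write_file(content=content, filename=filename)
```
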
### Persistence

| Component | Persists To | Lifecycle |
|-----------|-------------|-----------|
| `store_media_file()` | Temp dir (`/tmp/exec_file/{exec_id}/`) | Cleaned after execution |
| `WorkspaceManager` | GCS or local storage + DB | Persistent until deleted |

**Automatic cleanup:** `clean_exec_files(graph_exec_id)` removes temp files after execution completes.
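
A hedged lifecycle sketch tying the two rows together (`ctx` is the `ExecutionContext` from earlier; the `clean_exec_files` import location is an assumption):

```python
from backend.util.file import store_media_file, clean_exec_files  # location assumed

# Ephemeral side: bytes land under /tmp/exec_file/{graph_exec_id}/ ...
local_name = await store_media_file(
    file="https://example.com/video.mp4",
    execution_context=ctx,
    return_format="for_local_processing",
)

# ... and vanish when the platform cleans up after the run:
clean_exec_files(ctx.graph_exec_id)

# Anything that must outlive the execution goes through WorkspaceManager instead.
```
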
---

## Decision Tree: WorkspaceManager vs store_media_file

```
┌─────────────────────────────────────────────────────┐
│        What do you need to do with the file?        │
└─────────────────────────────────────────────────────┘
                          │
            ┌─────────────┴─────────────┐
            ▼                           ▼
    Process in a block         Store for user access
    (ffmpeg, PIL, etc.)       (CoPilot files, uploads)
            │                           │
            ▼                           ▼
    store_media_file()          WorkspaceManager
    with appropriate
    return_format
            │
            │
     ┌──────┴──────┐
     ▼             ▼
 "for_local_   "for_block_
  processing"   output"
     │             │
     ▼             ▼
 Get local     Auto-saves to
 path for      workspace in
 tools         CoPilot context
```

### Quick Reference

| Scenario | Use |
|----------|-----|
| Block needs to process a file with ffmpeg | `store_media_file(..., return_format="for_local_processing")` |
| Block needs to send file to external API | `store_media_file(..., return_format="for_external_api")` |
| Block returning a generated file | `store_media_file(..., return_format="for_block_output")` |
| API endpoint handling file upload | `WorkspaceManager.write_file()` (scans content internally) |
| API endpoint serving file download | `WorkspaceManager.get_download_url()` |
| Listing user's files | `WorkspaceManager.list_files()` |
---

## Key Files Reference

| File | Purpose |
|------|---------|
| `backend/data/workspace.py` | Database CRUD operations for UserWorkspace and UserWorkspaceFile |
| `backend/util/workspace.py` | `WorkspaceManager` class - high-level workspace API |
| `backend/util/workspace_storage.py` | Storage backends (GCS, local) and `WorkspaceStorageBackend` interface |
| `backend/util/file.py` | `store_media_file()` and media processing utilities |
| `backend/util/virus_scanner.py` | `VirusScannerService` and `scan_content_safe()` |
| `schema.prisma` | Database model definitions |
---

## Common Patterns

### Block Processing a User's File

```python
async def run(self, input_data, *, execution_context, **kwargs):
    # Normalize input to local path
    local_path = await store_media_file(
        file=input_data.video,
        execution_context=execution_context,
        return_format="for_local_processing",
    )

    # Process with local tools
    output_path = process_video(local_path)

    # Return (auto-saves to workspace in CoPilot)
    result = await store_media_file(
        file=output_path,
        execution_context=execution_context,
        return_format="for_block_output",
    )
    yield "output", result
```

### API Upload Endpoint

```python
async def upload_file(file: UploadFile, user_id: str, workspace_id: str):
    content = await file.read()

    # write_file handles virus scanning
    manager = WorkspaceManager(user_id, workspace_id)
    workspace_file = await manager.write_file(
        content=content,
        filename=file.filename,
    )

    return {"file_id": workspace_file.id}
```
---

## Configuration

| Setting | Purpose | Default |
|---------|---------|---------|
| `media_gcs_bucket_name` | GCS bucket for workspace storage | None (uses local) |
| `workspace_storage_dir` | Local storage directory | `{app_data}/workspaces` |
| `max_file_size_mb` | Maximum file size in MB | 100 |
| `clamav_service_enabled` | Enable virus scanning | true |
| `clamav_service_host` | ClamAV daemon host | localhost |
| `clamav_service_port` | ClamAV daemon port | 3310 |
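
These settings appear to be read via `backend.util.settings.Config` (the diff above shows `Config().max_file_size_mb`); attribute access for the other settings is an assumption mirroring the table:

```python
# Hedged sketch; only max_file_size_mb is confirmed by the diffs on this page.
from backend.util.settings import Config

config = Config()

backend = "GCS" if config.media_gcs_bucket_name else "local"
print(f"Workspace storage backend: {backend}")
print(f"Max file size: {config.max_file_size_mb} MB")
print(f"Virus scanning enabled: {config.clamav_service_enabled}")
```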