diff --git a/autogpt_platform/backend/backend/api/features/chat/tools/IDEAS.md b/autogpt_platform/backend/backend/api/features/chat/tools/IDEAS.md new file mode 100644 index 0000000000..656aac61c4 --- /dev/null +++ b/autogpt_platform/backend/backend/api/features/chat/tools/IDEAS.md @@ -0,0 +1,79 @@ +# CoPilot Tools - Future Ideas + +## Multimodal Image Support for CoPilot + +**Problem:** CoPilot uses a vision-capable model but can't "see" workspace images. When a block generates an image and returns `workspace://abc123`, CoPilot can't evaluate it (e.g., checking blog thumbnail quality). + +**Backend Solution:** +When preparing messages for the LLM, detect `workspace://` image references and convert them to proper image content blocks: + +```python +# Before sending to LLM, scan for workspace image references +# and inject them as image content parts + +# Example message transformation: +# FROM: {"role": "assistant", "content": "Generated image: workspace://abc123"} +# TO: {"role": "assistant", "content": [ +# {"type": "text", "text": "Generated image: workspace://abc123"}, +# {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}} +# ]} +``` + +**Where to implement:** +- In the chat stream handler before calling the LLM +- Or in a message preprocessing step +- Need to fetch image from workspace, convert to base64, add as image content + +**Considerations:** +- Only do this for image MIME types (image/png, image/jpeg, etc.) +- May want a size limit (don't pass 10MB images) +- Track which images were "shown" to the AI for frontend indicator +- Cost implications - vision API calls are more expensive + +**Frontend Solution:** +Show visual indicator on workspace files in chat: +- If AI saw the image: normal display +- If AI didn't see it: overlay icon saying "AI can't see this image" + +Requires response metadata indicating which `workspace://` refs were passed to the model. 
+ +--- + +## Output Post-Processing Layer for run_block + +**Problem:** Many blocks produce large outputs that: +- Consume massive context (a 100KB image becomes ~133KB of base64 text — tens of thousands of tokens) +- Can't fit in conversation +- Break things and cause high LLM costs + +**Proposed Solution:** Instead of modifying individual blocks or `store_media_file()`, implement a centralized output processor in `run_block.py` that handles outputs before they're returned to CoPilot. + +**Benefits:** +1. **Centralized** - one place to handle all output processing +2. **Future-proof** - new blocks automatically get output processing +3. **Keeps blocks pure** - they don't need to know about context constraints +4. **Handles all large outputs** - not just images + +**Processing Rules:** +- Detect base64 data URIs → save to workspace, return `workspace://` reference +- Truncate very long strings (>N chars) with truncation note +- Summarize large arrays/lists (e.g., "Array with 1000 items, first 5: [...]") +- Handle nested large outputs in dicts recursively +- Cap total output size + +**Implementation Location:** `run_block.py` after block execution, before returning `BlockOutputResponse` + +**Example:** +```python +def _process_outputs_for_context( + outputs: dict[str, list[Any]], + workspace_manager: WorkspaceManager, + max_string_length: int = 10000, + max_array_preview: int = 5, +) -> dict[str, list[Any]]: + """Process block outputs to prevent context bloat.""" + processed = {} + for name, values in outputs.items(): + processed[name] = [_process_value(v, workspace_manager) for v in values] + return processed +``` diff --git a/autogpt_platform/backend/backend/blocks/ai_image_customizer.py b/autogpt_platform/backend/backend/blocks/ai_image_customizer.py index 90f7f4f99d..aaee3bea05 100644 --- a/autogpt_platform/backend/backend/blocks/ai_image_customizer.py +++ b/autogpt_platform/backend/backend/blocks/ai_image_customizer.py @@ -143,8 +143,7 @@ class AIImageCustomizerBlock(Block): store_media_file( 
file=img, execution_context=execution_context, - return_content=True, - save_to_workspace=False, # Just get content for API, don't save input + return_format="data_uri", # Get content for external API ) for img in input_data.images ) @@ -163,7 +162,7 @@ class AIImageCustomizerBlock(Block): stored_url = await store_media_file( file=result, execution_context=execution_context, - return_content=True, + return_format="workspace_ref", ) yield "image_url", stored_url except Exception as e: diff --git a/autogpt_platform/backend/backend/blocks/ai_image_generator_block.py b/autogpt_platform/backend/backend/blocks/ai_image_generator_block.py index e77168857b..296c86ddbe 100644 --- a/autogpt_platform/backend/backend/blocks/ai_image_generator_block.py +++ b/autogpt_platform/backend/backend/blocks/ai_image_generator_block.py @@ -338,7 +338,7 @@ class AIImageGeneratorBlock(Block): stored_url = await store_media_file( file=MediaFileType(url), execution_context=execution_context, - return_content=True, # Return as data URI for persistence + return_format="workspace_ref", ) yield "image_url", stored_url else: diff --git a/autogpt_platform/backend/backend/blocks/ai_shortform_video_block.py b/autogpt_platform/backend/backend/blocks/ai_shortform_video_block.py index f4036cf92f..df906e0208 100644 --- a/autogpt_platform/backend/backend/blocks/ai_shortform_video_block.py +++ b/autogpt_platform/backend/backend/blocks/ai_shortform_video_block.py @@ -352,7 +352,7 @@ class AIShortformVideoCreatorBlock(Block): stored_url = await store_media_file( file=MediaFileType(video_url), execution_context=execution_context, - return_content=True, + return_format="workspace_ref", ) yield "video_url", stored_url @@ -556,7 +556,7 @@ class AIAdMakerVideoCreatorBlock(Block): stored_url = await store_media_file( file=MediaFileType(video_url), execution_context=execution_context, - return_content=True, + return_format="workspace_ref", ) yield "video_url", stored_url @@ -748,6 +748,6 @@ class 
AIScreenshotToVideoAdBlock(Block): stored_url = await store_media_file( file=MediaFileType(video_url), execution_context=execution_context, - return_content=True, + return_format="workspace_ref", ) yield "video_url", stored_url diff --git a/autogpt_platform/backend/backend/blocks/bannerbear/text_overlay.py b/autogpt_platform/backend/backend/blocks/bannerbear/text_overlay.py index 37da0857d8..d09d114706 100644 --- a/autogpt_platform/backend/backend/blocks/bannerbear/text_overlay.py +++ b/autogpt_platform/backend/backend/blocks/bannerbear/text_overlay.py @@ -249,7 +249,7 @@ class BannerbearTextOverlayBlock(Block): stored_url = await store_media_file( file=MediaFileType(image_url), execution_context=execution_context, - return_content=True, + return_format="workspace_ref", ) yield "image_url", stored_url else: diff --git a/autogpt_platform/backend/backend/blocks/basic.py b/autogpt_platform/backend/backend/blocks/basic.py index f6cc00e8aa..3312c117db 100644 --- a/autogpt_platform/backend/backend/blocks/basic.py +++ b/autogpt_platform/backend/backend/blocks/basic.py @@ -49,10 +49,18 @@ class FileStoreBlock(Block): execution_context: ExecutionContext, **kwargs, ) -> BlockOutput: + # Determine return format based on context and user preference + if execution_context.workspace_id and input_data.base_64: + return_format = "workspace_ref" + elif input_data.base_64: + return_format = "data_uri" + else: + return_format = "local_path" + yield "file_out", await store_media_file( file=input_data.file_in, execution_context=execution_context, - return_content=input_data.base_64, + return_format=return_format, ) diff --git a/autogpt_platform/backend/backend/blocks/discord/bot_blocks.py b/autogpt_platform/backend/backend/blocks/discord/bot_blocks.py index f0540d5e65..6bb5da7dd9 100644 --- a/autogpt_platform/backend/backend/blocks/discord/bot_blocks.py +++ b/autogpt_platform/backend/backend/blocks/discord/bot_blocks.py @@ -733,8 +733,7 @@ class SendDiscordFileBlock(Block): stored_file 
= await store_media_file( file=file, execution_context=execution_context, - return_content=True, # Get as data URI - save_to_workspace=False, # Just get content to send, don't save input + return_format="data_uri", # Get content to send to Discord ) # Now process as data URI header, encoded = stored_file.split(",", 1) diff --git a/autogpt_platform/backend/backend/blocks/fal/ai_video_generator.py b/autogpt_platform/backend/backend/blocks/fal/ai_video_generator.py index 7cf7b75368..7d289cec7f 100644 --- a/autogpt_platform/backend/backend/blocks/fal/ai_video_generator.py +++ b/autogpt_platform/backend/backend/blocks/fal/ai_video_generator.py @@ -224,7 +224,7 @@ class AIVideoGeneratorBlock(Block): stored_url = await store_media_file( file=MediaFileType(video_url), execution_context=execution_context, - return_content=True, + return_format="workspace_ref", ) yield "video_url", stored_url except Exception as e: diff --git a/autogpt_platform/backend/backend/blocks/flux_kontext.py b/autogpt_platform/backend/backend/blocks/flux_kontext.py index b02972beb0..ac63cf89cf 100644 --- a/autogpt_platform/backend/backend/blocks/flux_kontext.py +++ b/autogpt_platform/backend/backend/blocks/flux_kontext.py @@ -146,8 +146,7 @@ class AIImageEditorBlock(Block): await store_media_file( file=input_data.input_image, execution_context=execution_context, - return_content=True, - save_to_workspace=False, # Just get content for API, don't save input + return_format="data_uri", # Get content for external API ) if input_data.input_image else None @@ -161,7 +160,7 @@ class AIImageEditorBlock(Block): stored_url = await store_media_file( file=result, execution_context=execution_context, - return_content=True, + return_format="workspace_ref", ) yield "output_image", stored_url diff --git a/autogpt_platform/backend/backend/blocks/google/gmail.py b/autogpt_platform/backend/backend/blocks/google/gmail.py index 09c4a236a9..bafacb4b78 100644 --- a/autogpt_platform/backend/backend/blocks/google/gmail.py 
+++ b/autogpt_platform/backend/backend/blocks/google/gmail.py @@ -119,7 +119,7 @@ async def create_mime_message( local_path = await store_media_file( file=attach, execution_context=execution_context, - return_content=False, + return_format="local_path", ) abs_path = get_exec_file_path( execution_context.graph_exec_id or "", local_path @@ -1189,7 +1189,7 @@ async def _build_reply_message( local_path = await store_media_file( file=attach, execution_context=execution_context, - return_content=False, + return_format="local_path", ) abs_path = get_exec_file_path(execution_context.graph_exec_id or "", local_path) part = MIMEBase("application", "octet-stream") @@ -1719,7 +1719,7 @@ To: {original_to} local_path = await store_media_file( file=attach, execution_context=execution_context, - return_content=False, + return_format="local_path", ) abs_path = get_exec_file_path( execution_context.graph_exec_id or "", local_path diff --git a/autogpt_platform/backend/backend/blocks/http.py b/autogpt_platform/backend/backend/blocks/http.py index 36501a40e2..fe5f57254b 100644 --- a/autogpt_platform/backend/backend/blocks/http.py +++ b/autogpt_platform/backend/backend/blocks/http.py @@ -135,7 +135,7 @@ class SendWebRequestBlock(Block): rel_path = await store_media_file( file=media, execution_context=execution_context, - return_content=False, + return_format="local_path", ) abs_path = get_exec_file_path(graph_exec_id, rel_path) async with aiofiles.open(abs_path, "rb") as f: diff --git a/autogpt_platform/backend/backend/blocks/io.py b/autogpt_platform/backend/backend/blocks/io.py index 3398bea81d..0617e8a321 100644 --- a/autogpt_platform/backend/backend/blocks/io.py +++ b/autogpt_platform/backend/backend/blocks/io.py @@ -469,10 +469,18 @@ class AgentFileInputBlock(AgentInputBlock): if not input_data.value: return + # Determine return format based on context and user preference + if execution_context.workspace_id and input_data.base_64: + return_format = "workspace_ref" + elif 
input_data.base_64: + return_format = "data_uri" + else: + return_format = "local_path" + yield "result", await store_media_file( file=input_data.value, execution_context=execution_context, - return_content=input_data.base_64, + return_format=return_format, ) diff --git a/autogpt_platform/backend/backend/blocks/media.py b/autogpt_platform/backend/backend/blocks/media.py index 4164f53a8a..3499cfb99e 100644 --- a/autogpt_platform/backend/backend/blocks/media.py +++ b/autogpt_platform/backend/backend/blocks/media.py @@ -54,7 +54,7 @@ class MediaDurationBlock(Block): local_media_path = await store_media_file( file=input_data.media_in, execution_context=execution_context, - return_content=False, + return_format="local_path", ) assert execution_context.graph_exec_id is not None media_abspath = get_exec_file_path( @@ -125,7 +125,7 @@ class LoopVideoBlock(Block): local_video_path = await store_media_file( file=input_data.video_in, execution_context=execution_context, - return_content=False, + return_format="local_path", ) input_abspath = get_exec_file_path(graph_exec_id, local_video_path) @@ -153,11 +153,13 @@ class LoopVideoBlock(Block): looped_clip = looped_clip.with_audio(clip.audio) looped_clip.write_videofile(output_abspath, codec="libx264", audio_codec="aac") - # Return as data URI + # Return output - use workspace_ref for persistence, fallback to data_uri video_out = await store_media_file( file=output_filename, execution_context=execution_context, - return_content=input_data.output_return_type == "data_uri", + return_format="workspace_ref" if execution_context.workspace_id else ( + "data_uri" if input_data.output_return_type == "data_uri" else "local_path" + ), ) yield "video_out", video_out @@ -215,12 +217,12 @@ class AddAudioToVideoBlock(Block): local_video_path = await store_media_file( file=input_data.video_in, execution_context=execution_context, - return_content=False, + return_format="local_path", ) local_audio_path = await store_media_file( 
file=input_data.audio_in, execution_context=execution_context, - return_content=False, + return_format="local_path", ) abs_temp_dir = os.path.join(tempfile.gettempdir(), "exec_file", graph_exec_id) @@ -244,11 +246,13 @@ class AddAudioToVideoBlock(Block): output_abspath = os.path.join(abs_temp_dir, output_filename) final_clip.write_videofile(output_abspath, codec="libx264", audio_codec="aac") - # 5) Return either path or data URI + # 5) Return output - use workspace_ref for persistence, fallback to data_uri video_out = await store_media_file( file=output_filename, execution_context=execution_context, - return_content=input_data.output_return_type == "data_uri", + return_format="workspace_ref" if execution_context.workspace_id else ( + "data_uri" if input_data.output_return_type == "data_uri" else "local_path" + ), ) yield "video_out", video_out diff --git a/autogpt_platform/backend/backend/blocks/screenshotone.py b/autogpt_platform/backend/backend/blocks/screenshotone.py index 172e3b7f07..aedc09d9dd 100644 --- a/autogpt_platform/backend/backend/blocks/screenshotone.py +++ b/autogpt_platform/backend/backend/blocks/screenshotone.py @@ -159,7 +159,7 @@ class ScreenshotWebPageBlock(Block): f"data:image/{format.value};base64,{b64encode(content).decode('utf-8')}" ), execution_context=execution_context, - return_content=True, + return_format="workspace_ref", ) } diff --git a/autogpt_platform/backend/backend/blocks/spreadsheet.py b/autogpt_platform/backend/backend/blocks/spreadsheet.py index 4de4474ac6..affaf7b220 100644 --- a/autogpt_platform/backend/backend/blocks/spreadsheet.py +++ b/autogpt_platform/backend/backend/blocks/spreadsheet.py @@ -109,7 +109,7 @@ class ReadSpreadsheetBlock(Block): stored_file_path = await store_media_file( file=input_data.file_input, execution_context=execution_context, - return_content=False, + return_format="local_path", ) # Get full file path diff --git a/autogpt_platform/backend/backend/blocks/talking_head.py 
b/autogpt_platform/backend/backend/blocks/talking_head.py index bca95e686e..1a7fc864af 100644 --- a/autogpt_platform/backend/backend/blocks/talking_head.py +++ b/autogpt_platform/backend/backend/blocks/talking_head.py @@ -178,7 +178,7 @@ class CreateTalkingAvatarVideoBlock(Block): stored_url = await store_media_file( file=MediaFileType(video_url), execution_context=execution_context, - return_content=True, + return_format="workspace_ref", ) yield "video_url", stored_url return diff --git a/autogpt_platform/backend/backend/blocks/text.py b/autogpt_platform/backend/backend/blocks/text.py index cee48f87f2..1b5770b242 100644 --- a/autogpt_platform/backend/backend/blocks/text.py +++ b/autogpt_platform/backend/backend/blocks/text.py @@ -451,7 +451,7 @@ class FileReadBlock(Block): stored_file_path = await store_media_file( file=input_data.file_input, execution_context=execution_context, - return_content=False, + return_format="local_path", ) # Get full file path diff --git a/autogpt_platform/backend/backend/util/file.py b/autogpt_platform/backend/backend/util/file.py index 5ef0bbbccb..d6b310fa49 100644 --- a/autogpt_platform/backend/backend/util/file.py +++ b/autogpt_platform/backend/backend/util/file.py @@ -4,10 +4,14 @@ import re import shutil import tempfile import uuid +import warnings from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal from urllib.parse import urlparse +# Return format options for store_media_file +MediaReturnFormat = Literal["local_path", "data_uri", "workspace_ref"] + from prisma.enums import WorkspaceFileSource from backend.util.cloud_storage import get_cloud_storage_handler @@ -77,44 +81,53 @@ async def store_media_file( file: MediaFileType, execution_context: "ExecutionContext", *, - return_content: bool = False, - save_to_workspace: bool = True, + return_format: MediaReturnFormat | None = None, + # Deprecated parameters - use return_format instead + return_content: bool | None = None, + 
save_to_workspace: bool | None = None, ) -> MediaFileType: """ Safely handle 'file' (a data URI, a URL, a workspace:// reference, or a local path relative to {temp}/exec_file/{exec_id}), placing or verifying it under: {tempdir}/exec_file/{exec_id}/... - If 'return_content=True', return a data URI (data:;base64,). - Otherwise, returns the file media path relative to the exec_id folder. - - When execution_context has a workspace_id, files are also saved to the user's - persistent workspace (for CoPilot sessions). - - For each MediaFileType type: - - Data URI: - -> decode and store in a new random file in that folder - - URL: - -> download and store in that folder - - workspace:// reference: - -> read from user's workspace (requires workspace context) - workspace://abc123 - by file ID - workspace:///path/to/file.txt - by virtual path - - Local path: - -> interpret as relative to that folder; verify it exists - (no copying, as it's presumably already there). - We realpath-check so no symlink or '..' can escape the folder. + For each MediaFileType input: + - Data URI: decode and store locally + - URL: download and store locally + - workspace:// reference: read from workspace, store locally + - Local path: verify it exists in exec_file directory + Return format options: + - "local_path": Return relative path in exec_file dir (for local processing) + - "data_uri": Return base64 data URI (for external APIs) + - "workspace_ref": Save to workspace, return workspace://id (for CoPilot outputs) :param file: Data URI, URL, workspace://, or local (relative) path. :param execution_context: ExecutionContext with user_id, graph_exec_id, workspace_id. - :param return_content: If True, return content (data URI or workspace ref). - If False, return the *relative* path inside the exec_id folder. - :param save_to_workspace: If True (default), save new content to workspace and return ref. - If False, don't save to workspace, return data URI directly. 
- Use False when getting content for external APIs. - :return: The requested result: data URI, relative path, or workspace ref. + :param return_format: What to return: "local_path", "data_uri", or "workspace_ref". + :param return_content: DEPRECATED. Use return_format instead. + :param save_to_workspace: DEPRECATED. Use return_format instead. + :return: The requested result based on return_format. """ + # Handle deprecated parameters + if return_format is None: + if return_content is not None or save_to_workspace is not None: + warnings.warn( + "return_content and save_to_workspace are deprecated. " + "Use return_format='local_path', 'data_uri', or 'workspace_ref' instead.", + DeprecationWarning, + stacklevel=2, + ) + # Map old parameters to new return_format + if return_content is False or (return_content is None and save_to_workspace is None): + # Default or explicit return_content=False -> local_path + return_format = "local_path" + elif save_to_workspace is False: + # return_content=True, save_to_workspace=False -> data_uri + return_format = "data_uri" + else: + # return_content=True, save_to_workspace=True (or default) -> workspace_ref + return_format = "workspace_ref" # Extract values from execution_context graph_exec_id = execution_context.graph_exec_id user_id = execution_context.user_id @@ -311,16 +324,29 @@ async def store_media_file( if not target_path.is_file(): raise ValueError(f"Local file does not exist: {target_path}") - # Handle workspace saving and return value based on parameters: - # - save_to_workspace=True + return_content=True: save to workspace, return ref - # - save_to_workspace=False + return_content=True: don't save, return data URI - # - return_content=False: return local path (for file processing) - if workspace_manager is not None and return_content and save_to_workspace: + # Return based on requested format + if return_format == "local_path": + # For local file processing (MoviePy, ffmpeg, etc.) 
+ return MediaFileType(_strip_base_prefix(target_path, base_path)) + + elif return_format == "data_uri": + # For external APIs that need base64 content + return MediaFileType(_file_to_data_uri(target_path)) + + elif return_format == "workspace_ref": + # For persisting outputs to workspace (CoPilot) + if workspace_manager is None: + raise ValueError( + "return_format='workspace_ref' requires workspace context. " + "Ensure execution_context has workspace_id set." + ) + # Don't re-save if input was already from workspace if is_from_workspace: - return MediaFileType(_file_to_data_uri(target_path)) + # Return original workspace reference + return MediaFileType(file) - # New content to persist - save to workspace and return ref + # Save new content to workspace content = target_path.read_bytes() filename = target_path.name @@ -328,19 +354,12 @@ async def store_media_file( content=content, filename=filename, source=WorkspaceFileSource.COPILOT, - overwrite=True, # Allow overwriting if file already exists + overwrite=True, ) - # Return workspace reference instead of base64 data URI - # This prevents context bloat from large base64 data URIs - # (100KB file = ~133KB tokens as base64) return MediaFileType(f"workspace://{file_record.id}") - # When return_content=False, return local relative path - # Blocks that need to process files locally (MoviePy, ffmpeg, etc.) 
need this - if return_content: - return MediaFileType(_file_to_data_uri(target_path)) - - return MediaFileType(_strip_base_prefix(target_path, base_path)) + else: + raise ValueError(f"Invalid return_format: {return_format}") def get_dir_size(path: Path) -> int: diff --git a/autogpt_platform/backend/backend/util/file_test.py b/autogpt_platform/backend/backend/util/file_test.py index 0ea1529c7e..2565734308 100644 --- a/autogpt_platform/backend/backend/util/file_test.py +++ b/autogpt_platform/backend/backend/util/file_test.py @@ -84,7 +84,7 @@ class TestFileCloudIntegration: result = await store_media_file( file=MediaFileType(cloud_path), execution_context=make_test_context(graph_exec_id=graph_exec_id), - return_content=False, + return_format="local_path", ) # Verify cloud storage operations @@ -157,7 +157,7 @@ class TestFileCloudIntegration: result = await store_media_file( file=MediaFileType(cloud_path), execution_context=make_test_context(graph_exec_id=graph_exec_id), - return_content=True, + return_format="data_uri", ) # Verify result is a data URI @@ -210,7 +210,7 @@ class TestFileCloudIntegration: await store_media_file( file=MediaFileType(data_uri), execution_context=make_test_context(graph_exec_id=graph_exec_id), - return_content=False, + return_format="local_path", ) # Verify cloud handler was checked but not used for retrieval