feat(backend/copilot): attach uploaded images and PDFs as multimodal vision blocks (#12273)

Requested by @majdyz.

When users upload images or PDFs to CoPilot, the AI cannot see the content: the CLI's Zod validator rejects large base64 payloads in MCP tool results, and even small images are misidentified (the CLI silently drops or corrupts image content blocks in tool results).

## Approach

Embed uploaded images directly as **vision content blocks** in the user message via `client._transport.write()`. The SDK's `client.query()` only accepts string content, so we bypass it for multimodal messages and write a properly structured user message with `[...image_blocks, {"type": "text", "text": query}]` directly to the transport. This ensures the CLI binary receives images as native vision blocks, matching how the Anthropic API handles multimodal input.

For binary files accessed via workspace tools at runtime, we save them to the SDK's ephemeral working directory (`sdk_cwd`) and return a file path so that the CLI's built-in `Read` tool can handle them natively.

## Changes

### Vision content blocks for attached files — `service.py`

- `_prepare_file_attachments` downloads workspace files before the query and converts images to base64 vision blocks (`{"type": "image", "source": {"type": "base64", ...}}`)
- When vision blocks are present, the multimodal user message is written directly to `client._transport` instead of going through `client.query()`
- Non-image files (PDFs, text) are saved to `sdk_cwd` with a hint to use the Read tool

### File-path based access for workspace tools — `workspace_files.py`

- `read_workspace_file` saves binary files to `sdk_cwd` instead of returning base64, and returns a path for the Read tool

### SDK context for ephemeral directory — `tool_adapter.py`

- Added `sdk_cwd` context variable so workspace tools can access the ephemeral directory
- Removed inline base64 multimodal block machinery (`_extract_content_block`, `_strip_base64_from_text`, `_BLOCK_BUILDERS`, etc.)
### Frontend — rendering improvements

- `MessageAttachments.tsx` — uses the `OutputRenderers` system (`globalRegistry` + `OutputItem`) for image/video preview rendering instead of custom components
- `GenericTool.tsx` — uses the `OutputRenderers` system for inline image rendering of base64 content
- `routes.py` — returns 409 for duplicate workspace filenames

### Tests

- `tool_adapter_test.py` — removed multimodal extraction/stripping tests, added `get_sdk_cwd` tests
- `service_test.py` — rewritten for `_prepare_file_attachments` with file-on-disk assertions

Closes OPEN-3022

---------

Co-authored-by: Zamil Majdy <zamil.majdy@agpt.co>
2026-04-08 03:00:28 -04:00 · 2026-03-05 09:09:59 +00:00
parent 5474f7c495
commit 3d0ede9f34
10 changed files with 464 additions and 76 deletions
--- a/autogpt_platform/backend/backend/api/features/workspace/routes.py
+++ b/autogpt_platform/backend/backend/api/features/workspace/routes.py
@@ -247,7 +247,10 @@ async def upload_file(

    # Write file via WorkspaceManager
    manager = WorkspaceManager(user_id, workspace.id, session_id)
-    workspace_file = await manager.write_file(content, filename)
+    try:
+        workspace_file = await manager.write_file(content, filename)
+    except ValueError as e:
+        raise fastapi.HTTPException(status_code=409, detail=str(e)) from e

    # Post-write storage check — eliminates TOCTOU race on the quota.
    # If a concurrent upload pushed us over the limit, undo this write.
--- a/autogpt_platform/backend/backend/copilot/executor/processor.py
+++ b/autogpt_platform/backend/backend/copilot/executor/processor.py
@@ -263,6 +263,8 @@ class CoPilotProcessor:
                message=entry.message if entry.message else None,
                is_user_message=entry.is_user_message,
                user_id=entry.user_id,
+                context=entry.context,
+                file_ids=entry.file_ids,
            ):
                if cancel.is_set():
                    log.info("Cancel requested, breaking stream")
--- a/autogpt_platform/backend/backend/copilot/sdk/dummy.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/dummy.py
@@ -10,6 +10,7 @@ import asyncio
 import logging
 import uuid
 from collections.abc import AsyncGenerator
+from typing import Any

 from ..model import ChatSession
 from ..response_model import StreamBaseResponse, StreamStart, StreamTextDelta
@@ -26,6 +27,7 @@ async def stream_chat_completion_dummy(
    retry_count: int = 0,
    session: ChatSession | None = None,
    context: dict[str, str] | None = None,
+    **_kwargs: Any,
 ) -> AsyncGenerator[StreamBaseResponse, None]:
    """Stream dummy chat completion for testing.

--- a/autogpt_platform/backend/backend/copilot/sdk/service.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/service.py
@@ -5,6 +5,7 @@ import base64
 import json
 import logging
 import os
+import re
 import shutil
 import sys
 import uuid
@@ -22,6 +23,7 @@ from claude_agent_sdk import (
 )
 from langfuse import propagate_attributes
 from langsmith.integrations.claude_agent_sdk import configure_claude_agent_sdk
+from pydantic import BaseModel

 from backend.data.redis_client import get_redis_async
 from backend.executor.cluster_lock import AsyncClusterLock
@@ -55,6 +57,7 @@ from ..service import (
 )
 from ..tools.e2b_sandbox import get_or_create_sandbox
 from ..tools.sandbox import WORKSPACE_PREFIX, make_session_path
+from ..tools.workspace_files import get_manager
 from ..tracking import track_user_message
 from .compaction import CompactionTracker, filter_compaction_messages
 from .response_adapter import SDKResponseAdapter
@@ -575,15 +578,142 @@ async def _build_query_message(
    return current_message, False


+# Claude API vision-supported image types.
+_VISION_MIME_TYPES = frozenset({"image/png", "image/jpeg", "image/gif", "image/webp"})
+
+# Max size for embedding images directly in the user message (20 MiB raw).
+_MAX_INLINE_IMAGE_BYTES = 20 * 1024 * 1024
+
+# Matches characters unsafe for filenames.
+_UNSAFE_FILENAME = re.compile(r"[^\w.\-]")
+
+
+def _save_to_sdk_cwd(sdk_cwd: str, filename: str, content: bytes) -> str:
+    """Write file content to the SDK ephemeral directory.
+
+    Returns the absolute path.  Adds a numeric suffix on name collisions.
+    """
+    safe = _UNSAFE_FILENAME.sub("_", filename) or "file"
+    candidate = os.path.join(sdk_cwd, safe)
+    if os.path.exists(candidate):
+        stem, ext = os.path.splitext(safe)
+        idx = 1
+        while os.path.exists(candidate):
+            candidate = os.path.join(sdk_cwd, f"{stem}_{idx}{ext}")
+            idx += 1
+    with open(candidate, "wb") as f:
+        f.write(content)
+    return candidate
+
+
+class PreparedAttachments(BaseModel):
+    """Result of preparing file attachments for a query."""
+
+    hint: str = ""
+    """Text hint describing the files (appended to the user message)."""
+
+    image_blocks: list[dict[str, Any]] = []
+    """Claude API image content blocks to embed in the user message."""
+
+
+async def _prepare_file_attachments(
+    file_ids: list[str],
+    user_id: str,
+    session_id: str,
+    sdk_cwd: str,
+) -> PreparedAttachments:
+    """Download workspace files and prepare them for Claude.
+
+    Images (PNG/JPEG/GIF/WebP) are embedded directly as vision content blocks
+    in the user message so Claude can see them without tool calls.
+
+    Non-image files (PDFs, text, etc.) are saved to *sdk_cwd* so the CLI's
+    built-in Read tool can access them.
+
+    Returns a :class:`PreparedAttachments` with a text hint and any image
+    content blocks.
+    """
+    empty = PreparedAttachments(hint="", image_blocks=[])
+    if not file_ids or not user_id:
+        return empty
+
+    try:
+        manager = await get_manager(user_id, session_id)
+    except Exception:
+        logger.warning(
+            "Failed to create workspace manager for file attachments",
+            exc_info=True,
+        )
+        return empty
+
+    image_blocks: list[dict[str, Any]] = []
+    file_descriptions: list[str] = []
+
+    for fid in file_ids:
+        try:
+            file_info = await manager.get_file_info(fid)
+            if file_info is None:
+                continue
+            content = await manager.read_file_by_id(fid)
+            mime = (file_info.mime_type or "").split(";")[0].strip().lower()
+
+            # Images: embed directly in the user message as vision blocks
+            if mime in _VISION_MIME_TYPES and len(content) <= _MAX_INLINE_IMAGE_BYTES:
+                b64 = base64.b64encode(content).decode("ascii")
+                image_blocks.append(
+                    {
+                        "type": "image",
+                        "source": {
+                            "type": "base64",
+                            "media_type": mime,
+                            "data": b64,
+                        },
+                    }
+                )
+                file_descriptions.append(
+                    f"- {file_info.name} ({mime}, "
+                    f"{file_info.size_bytes:,} bytes) [embedded as image]"
+                )
+            else:
+                # Non-image files: save to sdk_cwd for Read tool access
+                local_path = _save_to_sdk_cwd(sdk_cwd, file_info.name, content)
+                file_descriptions.append(
+                    f"- {file_info.name} ({mime}, "
+                    f"{file_info.size_bytes:,} bytes) saved to {local_path}"
+                )
+        except Exception:
+            logger.warning("Failed to prepare file %s", fid[:12], exc_info=True)
+
+    if not file_descriptions:
+        return empty
+
+    noun = "file" if len(file_descriptions) == 1 else "files"
+    has_non_images = len(file_descriptions) > len(image_blocks)
+    read_hint = " Use the Read tool to view non-image files." if has_non_images else ""
+    hint = (
+        f"[The user attached {len(file_descriptions)} {noun}.{read_hint}\n"
+        + "\n".join(file_descriptions)
+        + "]"
+    )
+    return PreparedAttachments(hint=hint, image_blocks=image_blocks)
+
+
 async def stream_chat_completion_sdk(
    session_id: str,
    message: str | None = None,
    is_user_message: bool = True,
    user_id: str | None = None,
    session: ChatSession | None = None,
+    file_ids: list[str] | None = None,
    **_kwargs: Any,
 ) -> AsyncGenerator[StreamBaseResponse, None]:
-    """Stream chat completion using Claude Agent SDK."""
+    """Stream chat completion using Claude Agent SDK.
+
+    Args:
+        file_ids: Optional workspace file IDs attached to the user's message.
+            Images are embedded as vision content blocks; other files are
+            saved to the SDK working directory for the Read tool.
+    """

    if session is None:
        session = await get_chat_session(session_id, user_id)
@@ -883,19 +1013,48 @@ async def stream_chat_completion_sdk(
                transcript_msg_count,
                session_id,
            )
+            # If files are attached, prepare them: images become vision
+            # content blocks in the user message, other files go to sdk_cwd.
+            attachments = await _prepare_file_attachments(
+                file_ids or [], user_id or "", session_id, sdk_cwd
+            )
+            if attachments.hint:
+                query_message = f"{query_message}\n\n{attachments.hint}"
+
            logger.info(
-                "[SDK] [%s] Sending query — resume=%s, total_msgs=%d, query_len=%d",
+                "[SDK] [%s] Sending query — resume=%s, total_msgs=%d, "
+                "query_len=%d, attached_files=%d, image_blocks=%d",
                session_id[:12],
                use_resume,
                len(session.messages),
                len(query_message),
+                len(file_ids) if file_ids else 0,
+                len(attachments.image_blocks),
            )

            compaction.reset_for_query()
            if was_compacted:
                for ev in compaction.emit_pre_query(session):
                    yield ev
-            await client.query(query_message, session_id=session_id)
+
+            if attachments.image_blocks:
+                # Build multimodal content: image blocks + text
+                content_blocks: list[dict[str, Any]] = [
+                    *attachments.image_blocks,
+                    {"type": "text", "text": query_message},
+                ]
+                user_msg = {
+                    "type": "user",
+                    "message": {"role": "user", "content": content_blocks},
+                    "parent_tool_use_id": None,
+                    "session_id": session_id,
+                }
+                assert client._transport is not None  # noqa: SLF001
+                await client._transport.write(  # noqa: SLF001
+                    json.dumps(user_msg) + "\n"
+                )
+            else:
+                await client.query(query_message, session_id=session_id)

            assistant_response = ChatMessage(role="assistant", content="")
            accumulated_tool_calls: list[dict[str, Any]] = []
--- a/autogpt_platform/backend/backend/copilot/sdk/service_test.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/service_test.py
@@ -0,0 +1,147 @@
+"""Tests for SDK service helpers."""
+
+import base64
+import os
+from dataclasses import dataclass
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+from .service import _prepare_file_attachments
+
+
+@dataclass
+class _FakeFileInfo:
+    id: str
+    name: str
+    path: str
+    mime_type: str
+    size_bytes: int
+
+
+_PATCH_TARGET = "backend.copilot.sdk.service.get_manager"
+
+
+class TestPrepareFileAttachments:
+    @pytest.mark.asyncio
+    async def test_empty_list_returns_empty(self, tmp_path):
+        result = await _prepare_file_attachments([], "u", "s", str(tmp_path))
+        assert result.hint == ""
+        assert result.image_blocks == []
+
+    @pytest.mark.asyncio
+    async def test_image_embedded_as_vision_block(self, tmp_path):
+        """JPEG images should become vision content blocks, not files on disk."""
+        raw = b"\xff\xd8\xff\xe0fake-jpeg"
+        info = _FakeFileInfo(
+            id="abc",
+            name="photo.jpg",
+            path="/photo.jpg",
+            mime_type="image/jpeg",
+            size_bytes=len(raw),
+        )
+        mgr = AsyncMock()
+        mgr.get_file_info.return_value = info
+        mgr.read_file_by_id.return_value = raw
+
+        with patch(_PATCH_TARGET, new_callable=AsyncMock, return_value=mgr):
+            result = await _prepare_file_attachments(
+                ["abc"], "user1", "sess1", str(tmp_path)
+            )
+
+        assert "1 file" in result.hint
+        assert "photo.jpg" in result.hint
+        assert "embedded as image" in result.hint
+        assert len(result.image_blocks) == 1
+        block = result.image_blocks[0]
+        assert block["type"] == "image"
+        assert block["source"]["media_type"] == "image/jpeg"
+        assert block["source"]["data"] == base64.b64encode(raw).decode("ascii")
+        # Image should NOT be written to disk (embedded instead)
+        assert not os.path.exists(os.path.join(tmp_path, "photo.jpg"))
+
+    @pytest.mark.asyncio
+    async def test_pdf_saved_to_disk(self, tmp_path):
+        """PDFs should be saved to disk for Read tool access, not embedded."""
+        info = _FakeFileInfo("f1", "doc.pdf", "/doc.pdf", "application/pdf", 50)
+        mgr = AsyncMock()
+        mgr.get_file_info.return_value = info
+        mgr.read_file_by_id.return_value = b"%PDF-1.4 fake"
+
+        with patch(_PATCH_TARGET, new_callable=AsyncMock, return_value=mgr):
+            result = await _prepare_file_attachments(["f1"], "u", "s", str(tmp_path))
+
+        assert result.image_blocks == []
+        saved = tmp_path / "doc.pdf"
+        assert saved.exists()
+        assert saved.read_bytes() == b"%PDF-1.4 fake"
+        assert str(saved) in result.hint
+
+    @pytest.mark.asyncio
+    async def test_mixed_images_and_files(self, tmp_path):
+        """Images become blocks, non-images go to disk."""
+        infos = {
+            "id1": _FakeFileInfo("id1", "a.png", "/a.png", "image/png", 4),
+            "id2": _FakeFileInfo("id2", "b.pdf", "/b.pdf", "application/pdf", 4),
+            "id3": _FakeFileInfo("id3", "c.txt", "/c.txt", "text/plain", 4),
+        }
+        mgr = AsyncMock()
+        mgr.get_file_info.side_effect = lambda fid: infos[fid]
+        mgr.read_file_by_id.return_value = b"data"
+
+        with patch(_PATCH_TARGET, new_callable=AsyncMock, return_value=mgr):
+            result = await _prepare_file_attachments(
+                ["id1", "id2", "id3"], "u", "s", str(tmp_path)
+            )
+
+        assert "3 files" in result.hint
+        assert "a.png" in result.hint
+        assert "b.pdf" in result.hint
+        assert "c.txt" in result.hint
+        # Only the image should be a vision block
+        assert len(result.image_blocks) == 1
+        assert result.image_blocks[0]["source"]["media_type"] == "image/png"
+        # Non-image files should be on disk
+        assert (tmp_path / "b.pdf").exists()
+        assert (tmp_path / "c.txt").exists()
+        # Read tool hint should appear (has non-image files)
+        assert "Read tool" in result.hint
+
+    @pytest.mark.asyncio
+    async def test_singular_noun(self, tmp_path):
+        info = _FakeFileInfo("x", "only.txt", "/only.txt", "text/plain", 2)
+        mgr = AsyncMock()
+        mgr.get_file_info.return_value = info
+        mgr.read_file_by_id.return_value = b"hi"
+
+        with patch(_PATCH_TARGET, new_callable=AsyncMock, return_value=mgr):
+            result = await _prepare_file_attachments(["x"], "u", "s", str(tmp_path))
+
+        assert "1 file." in result.hint
+
+    @pytest.mark.asyncio
+    async def test_missing_file_skipped(self, tmp_path):
+        mgr = AsyncMock()
+        mgr.get_file_info.return_value = None
+
+        with patch(_PATCH_TARGET, new_callable=AsyncMock, return_value=mgr):
+            result = await _prepare_file_attachments(
+                ["missing-id"], "u", "s", str(tmp_path)
+            )
+
+        assert result.hint == ""
+        assert result.image_blocks == []
+
+    @pytest.mark.asyncio
+    async def test_image_only_no_read_hint(self, tmp_path):
+        """When all files are images, no Read tool hint should appear."""
+        info = _FakeFileInfo("i1", "cat.png", "/cat.png", "image/png", 4)
+        mgr = AsyncMock()
+        mgr.get_file_info.return_value = info
+        mgr.read_file_by_id.return_value = b"data"
+
+        with patch(_PATCH_TARGET, new_callable=AsyncMock, return_value=mgr):
+            result = await _prepare_file_attachments(["i1"], "u", "s", str(tmp_path))
+
+        assert "Read tool" not in result.hint
+        assert len(result.image_blocks) == 1
--- a/autogpt_platform/backend/backend/copilot/sdk/tool_adapter.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/tool_adapter.py
@@ -102,6 +102,9 @@ _current_session: ContextVar[ChatSession | None] = ContextVar(
 _current_sandbox: ContextVar["AsyncSandbox | None"] = ContextVar(
    "_current_sandbox", default=None
 )
+# Raw SDK working directory path (e.g. /tmp/copilot-<session_id>).
+# Used by workspace tools to save binary files for the CLI's built-in Read.
+_current_sdk_cwd: ContextVar[str] = ContextVar("_current_sdk_cwd", default="")

 # Stash for MCP tool outputs before the SDK potentially truncates them.
 # Keyed by tool_name → full output string. Consumed (popped) by the
@@ -140,6 +143,7 @@ def set_execution_context(
    _current_user_id.set(user_id)
    _current_session.set(session)
    _current_sandbox.set(sandbox)
+    _current_sdk_cwd.set(sdk_cwd or "")
    _current_project_dir.set(_encode_cwd_for_cli(sdk_cwd) if sdk_cwd else "")
    _pending_tool_outputs.set({})
    _stash_event.set(asyncio.Event())
@@ -150,6 +154,11 @@ def get_current_sandbox() -> "AsyncSandbox | None":
    return _current_sandbox.get()


+def get_sdk_cwd() -> str:
+    """Return the SDK ephemeral working directory for the current turn."""
+    return _current_sdk_cwd.get()
+
+
 def get_execution_context() -> tuple[str | None, ChatSession | None]:
    """Get the current execution context."""
    return (
@@ -263,61 +272,12 @@ async def _execute_tool_sync(
        result.output if isinstance(result.output, str) else json.dumps(result.output)
    )

-    content_blocks: list[dict[str, str]] = [{"type": "text", "text": text}]
-
-    # If the tool result contains inline image data, add an MCP image block
-    # so Claude can "see" the image (e.g. read_workspace_file on a small PNG).
-    image_block = _extract_image_block(text)
-    if image_block:
-        content_blocks.append(image_block)
-
    return {
-        "content": content_blocks,
+        "content": [{"type": "text", "text": text}],
        "isError": not result.success,
    }


-# MIME types that Claude can process as image content blocks.
-_SUPPORTED_IMAGE_TYPES = frozenset(
-    {"image/png", "image/jpeg", "image/gif", "image/webp"}
-)
-
-
-def _extract_image_block(text: str) -> dict[str, str] | None:
-    """Extract an MCP image content block from a tool result JSON string.
-
-    Detects workspace file responses with ``content_base64`` and an image
-    MIME type, returning an MCP-format image block that allows Claude to
-    "see" the image.  Returns ``None`` if the result is not an inline image.
-    """
-    try:
-        data = json.loads(text)
-    except (json.JSONDecodeError, TypeError):
-        return None
-
-    if not isinstance(data, dict):
-        return None
-
-    mime_type = data.get("mime_type", "")
-    base64_content = data.get("content_base64", "")
-
-    # Only inline small images — large ones would exceed Claude's limits.
-    # 32 KB raw ≈ ~43 KB base64.
-    _MAX_IMAGE_BASE64_BYTES = 43_000
-    if (
-        mime_type in _SUPPORTED_IMAGE_TYPES
-        and base64_content
-        and len(base64_content) <= _MAX_IMAGE_BASE64_BYTES
-    ):
-        return {
-            "type": "image",
-            "data": base64_content,
-            "mimeType": mime_type,
-        }
-
-    return None
-
-
 def _mcp_error(message: str) -> dict[str, Any]:
    return {
        "content": [
@@ -423,18 +383,21 @@ _READ_TOOL_SCHEMA = {
 }


-# Create the MCP server configuration
+# ---------------------------------------------------------------------------
+# MCP result helpers
+# ---------------------------------------------------------------------------
+
+
 def _text_from_mcp_result(result: dict[str, Any]) -> str:
    """Extract concatenated text from an MCP response's content blocks."""
    content = result.get("content", [])
-    if isinstance(content, list):
-        parts = [
-            b.get("text", "")
-            for b in content
-            if isinstance(b, dict) and b.get("type") == "text"
-        ]
-        return "".join(parts)
-    return ""
+    if not isinstance(content, list):
+        return ""
+    return "".join(
+        b.get("text", "")
+        for b in content
+        if isinstance(b, dict) and b.get("type") == "text"
+    )


 def create_copilot_mcp_server(*, use_e2b: bool = False):
--- a/autogpt_platform/backend/backend/copilot/sdk/tool_adapter_test.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/tool_adapter_test.py
@@ -1,4 +1,4 @@
-"""Tests for tool_adapter helpers: _text_from_mcp_result, truncation stash."""
+"""Tests for tool_adapter helpers: truncation, stash, context vars."""

 import pytest

@@ -7,6 +7,7 @@ from backend.util.truncate import truncate
 from .tool_adapter import (
    _MCP_MAX_CHARS,
    _text_from_mcp_result,
+    get_sdk_cwd,
    pop_pending_tool_output,
    set_execution_context,
    stash_pending_tool_output,
@@ -54,6 +55,30 @@ class TestTextFromMcpResult:
        assert _text_from_mcp_result(result) == ""


+# ---------------------------------------------------------------------------
+# get_sdk_cwd
+# ---------------------------------------------------------------------------
+
+
+class TestGetSdkCwd:
+    def test_returns_empty_string_by_default(self):
+        set_execution_context(
+            user_id="test",
+            session=None,  # type: ignore[arg-type]
+            sandbox=None,
+        )
+        assert get_sdk_cwd() == ""
+
+    def test_returns_set_value(self):
+        set_execution_context(
+            user_id="test",
+            session=None,  # type: ignore[arg-type]
+            sandbox=None,
+            sdk_cwd="/tmp/copilot-test-123",
+        )
+        assert get_sdk_cwd() == "/tmp/copilot-test-123"
+
+
 # ---------------------------------------------------------------------------
 # stash / pop round-trip (the mechanism _truncating relies on)
 # ---------------------------------------------------------------------------
--- a/autogpt_platform/backend/backend/copilot/tools/workspace_files.py
+++ b/autogpt_platform/backend/backend/copilot/tools/workspace_files.py
@@ -432,7 +432,7 @@ class ListWorkspaceFilesTool(BaseTool):
 class ReadWorkspaceFileTool(BaseTool):
    """Tool for reading file content from workspace."""

-    MAX_INLINE_SIZE_BYTES = 32 * 1024  # 32KB
+    MAX_INLINE_SIZE_BYTES = 32 * 1024  # 32KB for text/image files
    PREVIEW_SIZE = 500

    @property
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/components/ChatMessagesContainer/components/MessageAttachments.tsx
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/components/ChatMessagesContainer/components/MessageAttachments.tsx
@@ -3,6 +3,11 @@ import {
  DownloadSimple as DownloadIcon,
 } from "@phosphor-icons/react";
 import type { FileUIPart } from "ai";
+import {
+  globalRegistry,
+  OutputItem,
+} from "@/components/contextual/OutputRenderers";
+import type { OutputMetadata } from "@/components/contextual/OutputRenderers";
 import {
  ContentCard,
  ContentCardHeader,
@@ -15,13 +20,60 @@ interface Props {
  isUser?: boolean;
 }

+function renderFileContent(file: FileUIPart): React.ReactNode | null {
+  if (!file.url) return null;
+  const metadata: OutputMetadata = {
+    mimeType: file.mediaType,
+    filename: file.filename,
+    type: file.mediaType?.startsWith("image/")
+      ? "image"
+      : file.mediaType?.startsWith("video/")
+        ? "video"
+        : undefined,
+  };
+  const renderer = globalRegistry.getRenderer(file.url, metadata);
+  if (!renderer) return null;
+  return (
+    <OutputItem value={file.url} metadata={metadata} renderer={renderer} />
+  );
+}
+
 export function MessageAttachments({ files, isUser }: Props) {
  if (files.length === 0) return null;

  return (
    <div className="mt-2 flex flex-col gap-2">
-      {files.map((file, i) =>
-        isUser ? (
+      {files.map((file, i) => {
+        const rendered = renderFileContent(file);
+        return rendered ? (
+          <div
+            key={`${file.filename}-${i}`}
+            className={`inline-block rounded-lg border p-1.5 ${
+              isUser
+                ? "border-purple-300 bg-purple-50"
+                : "border-neutral-200 bg-neutral-50"
+            }`}
+          >
+            {rendered}
+            <div
+              className={`mt-1 flex items-center gap-1 px-0.5 text-xs ${
+                isUser ? "text-zinc-600" : "text-neutral-500"
+              }`}
+            >
+              <span className="truncate">{file.filename || "file"}</span>
+              {file.url && (
+                <a
+                  href={file.url}
+                  download
+                  aria-label="Download file"
+                  className="ml-auto shrink-0 opacity-50 hover:opacity-100"
+                >
+                  <DownloadIcon className="h-3.5 w-3.5" />
+                </a>
+              )}
+            </div>
+          </div>
+        ) : isUser ? (
          <div
            key={`${file.filename}-${i}`}
            className="min-w-0 rounded-lg border border-purple-300 bg-purple-100 p-3"
@@ -77,8 +129,8 @@ export function MessageAttachments({ files, isUser }: Props) {
              </div>
            </ContentCardHeader>
          </ContentCard>
-        ),
-      )}
+        );
+      })}
    </div>
  );
 }
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/GenericTool.tsx
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/tools/GenericTool/GenericTool.tsx
@@ -26,11 +26,28 @@ import {
  ContentMessage,
 } from "../../components/ToolAccordion/AccordionContent";
 import { OrbitLoader } from "../../components/OrbitLoader/OrbitLoader";
+import {
+  globalRegistry,
+  OutputItem,
+} from "@/components/contextual/OutputRenderers";
+import type { OutputMetadata } from "@/components/contextual/OutputRenderers";

 interface Props {
  part: ToolUIPart;
 }

+function RenderMedia({
+  value,
+  metadata,
+}: {
+  value: string;
+  metadata: OutputMetadata;
+}) {
+  const renderer = globalRegistry.getRenderer(value, metadata);
+  if (!renderer) return null;
+  return <OutputItem value={value} metadata={metadata} renderer={renderer} />;
+}
+
 /* ------------------------------------------------------------------ */
 /*  Tool name helpers                                                  */
 /* ------------------------------------------------------------------ */
@@ -612,14 +629,21 @@ function getFileAccordionData(

  // Handle base64 content from workspace files
  let displayContent = content;
+  const mimeType = getStringField(output, "mime_type");
+  const isImage = mimeType?.startsWith("image/");
  if (output.content_base64 && typeof output.content_base64 === "string") {
-    try {
-      const bytes = Uint8Array.from(atob(output.content_base64), (c) =>
-        c.charCodeAt(0),
-      );
-      displayContent = new TextDecoder().decode(bytes);
-    } catch {
-      displayContent = "[Binary content]";
+    if (isImage) {
+      // Render image inline — handled below in the JSX
+      displayContent = null;
+    } else {
+      try {
+        const bytes = Uint8Array.from(atob(output.content_base64), (c) =>
+          c.charCodeAt(0),
+        );
+        displayContent = new TextDecoder().decode(bytes);
+      } catch {
+        displayContent = "[Binary content]";
+      }
    }
  }

@@ -697,6 +721,17 @@ function getFileAccordionData(
          </>
        ) : writtenContent ? (
          <ContentCodeBlock>{writtenContent}</ContentCodeBlock>
+        ) : isImage &&
+          output.content_base64 &&
+          typeof output.content_base64 === "string" ? (
+          <RenderMedia
+            value={`data:${mimeType};base64,${output.content_base64}`}
+            metadata={{
+              type: "image",
+              mimeType: mimeType ?? undefined,
+              filename: filePath ?? undefined,
+            }}
+          />
        ) : displayContent ? (
          <ContentCodeBlock>{displayContent}</ContentCodeBlock>
        ) : null}