feat(frontend): render video/audio workspace refs using MIME fragment

Use the #mimeType fragment on workspace:// URIs to determine media category (video/image/audio) instead of relying solely on keyword matching. Adds video rendering support in MarkdownContent, broader format support in render.tsx, and enhanced output handling in the builder's DataTable and NodeOutputs.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 20:05:11 -05:00 · 2026-02-04 18:25:55 -06:00
parent ed26d81c2b
commit e871edd387
5 changed files with 243 additions and 72 deletions
--- a/autogpt_platform/frontend/src/app/(platform)/build/components/legacy-builder/DataTable.tsx
+++ b/autogpt_platform/frontend/src/app/(platform)/build/components/legacy-builder/DataTable.tsx
@@ -1,6 +1,6 @@
 import { beautifyString } from "@/lib/utils";
 import { Clipboard, Maximize2 } from "lucide-react";
-import React, { useState } from "react";
+import React, { useMemo, useState } from "react";
 import { Button } from "../../../../../components/__legacy__/ui/button";
 import { ContentRenderer } from "../../../../../components/__legacy__/ui/render";
 import {
@@ -11,6 +11,12 @@ import {
  TableHeader,
  TableRow,
 } from "../../../../../components/__legacy__/ui/table";
+import type { OutputMetadata } from "@/components/contextual/OutputRenderers";
+import {
+  globalRegistry,
+  OutputItem,
+} from "@/components/contextual/OutputRenderers";
+import { Flag, useGetFlag } from "@/services/feature-flags/use-get-flag";
 import { useToast } from "../../../../../components/molecules/Toast/use-toast";
 import ExpandableOutputDialog from "./ExpandableOutputDialog";

@@ -26,6 +32,9 @@ export default function DataTable({
  data,
 }: DataTableProps) {
  const { toast } = useToast();
+  const enableEnhancedOutputHandling = useGetFlag(
+    Flag.ENABLE_ENHANCED_OUTPUT_HANDLING,
+  );
  const [expandedDialog, setExpandedDialog] = useState<{
    isOpen: boolean;
    execId: string;
@@ -33,6 +42,15 @@ export default function DataTable({
    data: any[];
  } | null>(null);

+  // Prepare renderers for each item when enhanced mode is enabled
+  const getItemRenderer = useMemo(() => {
+    if (!enableEnhancedOutputHandling) return null;
+    return (item: unknown) => {
+      const metadata: OutputMetadata = {};
+      return globalRegistry.getRenderer(item, metadata);
+    };
+  }, [enableEnhancedOutputHandling]);
+
  const copyData = (pin: string, data: string) => {
    navigator.clipboard.writeText(data).then(() => {
      toast({
@@ -102,15 +120,31 @@ export default function DataTable({
                      <Clipboard size={18} />
                    </Button>
                  </div>
-                  {value.map((item, index) => (
-                    <React.Fragment key={index}>
-                      <ContentRenderer
-                        value={item}
-                        truncateLongData={truncateLongData}
-                      />
-                      {index < value.length - 1 && ", "}
-                    </React.Fragment>
-                  ))}
+                  {value.map((item, index) => {
+                    const renderer = getItemRenderer?.(item);
+                    if (enableEnhancedOutputHandling && renderer) {
+                      const metadata: OutputMetadata = {};
+                      return (
+                        <React.Fragment key={index}>
+                          <OutputItem
+                            value={item}
+                            metadata={metadata}
+                            renderer={renderer}
+                          />
+                          {index < value.length - 1 && ", "}
+                        </React.Fragment>
+                      );
+                    }
+                    return (
+                      <React.Fragment key={index}>
+                        <ContentRenderer
+                          value={item}
+                          truncateLongData={truncateLongData}
+                        />
+                        {index < value.length - 1 && ", "}
+                      </React.Fragment>
+                    );
+                  })}
                </div>
              </TableCell>
            </TableRow>
--- a/autogpt_platform/frontend/src/app/(platform)/build/components/legacy-builder/NodeOutputs.tsx
+++ b/autogpt_platform/frontend/src/app/(platform)/build/components/legacy-builder/NodeOutputs.tsx
@@ -1,8 +1,14 @@
-import React, { useContext, useState } from "react";
+import React, { useContext, useMemo, useState } from "react";
 import { Button } from "@/components/__legacy__/ui/button";
 import { Maximize2 } from "lucide-react";
 import * as Separator from "@radix-ui/react-separator";
 import { ContentRenderer } from "@/components/__legacy__/ui/render";
+import type { OutputMetadata } from "@/components/contextual/OutputRenderers";
+import {
+  globalRegistry,
+  OutputItem,
+} from "@/components/contextual/OutputRenderers";
+import { Flag, useGetFlag } from "@/services/feature-flags/use-get-flag";

 import { beautifyString } from "@/lib/utils";

@@ -21,6 +27,9 @@ export default function NodeOutputs({
  data,
 }: NodeOutputsProps) {
  const builderContext = useContext(BuilderContext);
+  const enableEnhancedOutputHandling = useGetFlag(
+    Flag.ENABLE_ENHANCED_OUTPUT_HANDLING,
+  );

  const [expandedDialog, setExpandedDialog] = useState<{
    isOpen: boolean;
@@ -37,6 +46,15 @@ export default function NodeOutputs({

  const { getNodeTitle } = builderContext;

+  // Prepare renderers for each item when enhanced mode is enabled
+  const getItemRenderer = useMemo(() => {
+    if (!enableEnhancedOutputHandling) return null;
+    return (item: unknown) => {
+      const metadata: OutputMetadata = {};
+      return globalRegistry.getRenderer(item, metadata);
+    };
+  }, [enableEnhancedOutputHandling]);
+
  const getBeautifiedPinName = (pin: string) => {
    if (!pin.startsWith("tools_^_")) {
      return beautifyString(pin);
@@ -87,15 +105,31 @@ export default function NodeOutputs({
          <div className="mt-2">
            <strong className="mr-2">Data:</strong>
            <div className="mt-1">
-              {dataArray.slice(0, 10).map((item, index) => (
-                <React.Fragment key={index}>
-                  <ContentRenderer
-                    value={item}
-                    truncateLongData={truncateLongData}
-                  />
-                  {index < Math.min(dataArray.length, 10) - 1 && ", "}
-                </React.Fragment>
-              ))}
+              {dataArray.slice(0, 10).map((item, index) => {
+                const renderer = getItemRenderer?.(item);
+                if (enableEnhancedOutputHandling && renderer) {
+                  const metadata: OutputMetadata = {};
+                  return (
+                    <React.Fragment key={index}>
+                      <OutputItem
+                        value={item}
+                        metadata={metadata}
+                        renderer={renderer}
+                      />
+                      {index < Math.min(dataArray.length, 10) - 1 && ", "}
+                    </React.Fragment>
+                  );
+                }
+                return (
+                  <React.Fragment key={index}>
+                    <ContentRenderer
+                      value={item}
+                      truncateLongData={truncateLongData}
+                    />
+                    {index < Math.min(dataArray.length, 10) - 1 && ", "}
+                  </React.Fragment>
+                );
+              })}
              {dataArray.length > 10 && (
                <span style={{ color: "#888" }}>
                  <br />
--- a/autogpt_platform/frontend/src/components/__legacy__/ui/render.tsx
+++ b/autogpt_platform/frontend/src/components/__legacy__/ui/render.tsx
@@ -22,7 +22,7 @@ const isValidVideoUrl = (url: string): boolean => {
  if (url.startsWith("data:video")) {
    return true;
  }
-  const videoExtensions = /\.(mp4|webm|ogg)$/i;
+  const videoExtensions = /\.(mp4|webm|ogg|mov|avi|mkv|m4v)$/i;
  const youtubeRegex = /^(https?:\/\/)?(www\.)?(youtube\.com|youtu\.?be)\/.+$/;
  const cleanedUrl = url.split("?")[0];
  return (
@@ -44,11 +44,29 @@ const isValidAudioUrl = (url: string): boolean => {
  if (url.startsWith("data:audio")) {
    return true;
  }
-  const audioExtensions = /\.(mp3|wav)$/i;
+  const audioExtensions = /\.(mp3|wav|ogg|m4a|aac|flac)$/i;
  const cleanedUrl = url.split("?")[0];
  return isValidMediaUri(url) && audioExtensions.test(cleanedUrl);
 };

+/** MIME types for the file extensions this renderer recognises as video. */
+const VIDEO_MIME_BY_EXTENSION = new Map<string, string>([
+  ["mp4", "video/mp4"],
+  ["webm", "video/webm"],
+  ["ogg", "video/ogg"],
+  ["mov", "video/quicktime"],
+  ["avi", "video/x-msvideo"],
+  ["mkv", "video/x-matroska"],
+  ["m4v", "video/mp4"],
+]);
+
+/**
+ * Derive the MIME type to put on a `<source>` element from a video URL.
+ *
+ * - `data:video/...` URIs: the media type is read from the URI itself. It ends
+ *   at the first `;` (a parameter such as `;base64`) or `,` (start of the
+ *   payload), so `data:video/webm,...` yields `video/webm`.
+ * - Any other URL: the text after the last `.` (query string and fragment
+ *   removed, case-insensitive) is looked up in the extension table.
+ *
+ * @param url - The video URL or data URI.
+ * @returns The detected MIME type, or `video/mp4` when it cannot be determined.
+ */
+const getVideoMimeType = (url: string): string => {
+  const fallback = "video/mp4";
+  if (url.startsWith("data:video/")) {
+    const match = url.match(/^data:(video\/[^;,]+)/);
+    return match?.[1] || fallback;
+  }
+  // Drop "?query" and "#fragment" so "clip.webm#t=10" still resolves to webm
+  const path = url.split(/[?#]/)[0];
+  const extension = path.split(".").pop()?.toLowerCase() ?? "";
+  // A Map lookup cannot hit inherited members the way obj["constructor"] can
+  return VIDEO_MIME_BY_EXTENSION.get(extension) ?? fallback;
+};
+
 const VideoRenderer: React.FC<{ videoUrl: string }> = ({ videoUrl }) => {
  const videoId = getYouTubeVideoId(videoUrl);
  return (
@@ -63,7 +81,7 @@ const VideoRenderer: React.FC<{ videoUrl: string }> = ({ videoUrl }) => {
        ></iframe>
      ) : (
        <video controls width="100%" height="315">
-          <source src={videoUrl} type="video/mp4" />
+          <source src={videoUrl} type={getVideoMimeType(videoUrl)} />
          Your browser does not support the video tag.
        </video>
      )}
--- a/autogpt_platform/frontend/src/components/contextual/Chat/components/MarkdownContent/MarkdownContent.tsx
+++ b/autogpt_platform/frontend/src/components/contextual/Chat/components/MarkdownContent/MarkdownContent.tsx
@@ -3,7 +3,7 @@
 import { getGetWorkspaceDownloadFileByIdUrl } from "@/app/api/__generated__/endpoints/workspace/workspace";
 import { cn } from "@/lib/utils";
 import { EyeSlash } from "@phosphor-icons/react";
-import React from "react";
+import React, { useState } from "react";
 import ReactMarkdown from "react-markdown";
 import remarkGfm from "remark-gfm";

@@ -48,7 +48,9 @@ interface InputProps extends React.InputHTMLAttributes<HTMLInputElement> {
 */
 function resolveWorkspaceUrl(src: string): string {
  if (src.startsWith("workspace://")) {
-    const fileId = src.replace("workspace://", "");
+    // Strip MIME type fragment if present (e.g., workspace://abc123#video/mp4 → abc123)
+    const withoutPrefix = src.replace("workspace://", "");
+    const fileId = withoutPrefix.split("#")[0];
    // Use the generated API URL helper to get the correct path
    const apiPath = getGetWorkspaceDownloadFileByIdUrl(fileId);
    // Route through the Next.js proxy (same pattern as customMutator for client-side)
@@ -65,13 +67,49 @@ function isWorkspaceImage(src: string | undefined): boolean {
  return src?.includes("/workspace/files/") ?? false;
 }

+/**
+ * Renders a workspace video with native playback controls and an optional
+ * "AI cannot see this video" badge overlaid in the bottom-right corner.
+ *
+ * @param src - URL the `<source>` element loads the video from.
+ * @param aiCannotSee - When true, the badge is rendered over the video.
+ */
+function WorkspaceVideo({
+  src,
+  aiCannotSee,
+}: {
+  src: string;
+  aiCannotSee: boolean;
+}) {
+  return (
+    <span className="relative my-2 inline-block">
+      {/* Keyed by src: browsers ignore src changes on a <source> that is
+          already inside a <video>, so remount the element to load a new file. */}
+      <video
+        key={src}
+        controls
+        className="h-auto max-w-full rounded-md border border-zinc-200"
+        preload="metadata"
+      >
+        <source src={src} />
+        Your browser does not support the video tag.
+      </video>
+      {aiCannotSee && (
+        <span
+          className="absolute bottom-2 right-2 flex items-center gap-1 rounded bg-black/70 px-2 py-1 text-xs text-white"
+          title="The AI cannot see this video"
+        >
+          <EyeSlash size={14} />
+          <span>AI cannot see this video</span>
+        </span>
+      )}
+    </span>
+  );
+}
+
 /**
 * Custom image component that shows an indicator when the AI cannot see the image.
+ * Also handles the "video:" alt-text prefix convention to render <video> elements.
+ * For workspace files with unknown types, falls back to <video> if <img> fails.
 * Note: src is already transformed by urlTransform, so workspace:// is now /api/workspace/...
 */
 function MarkdownImage(props: Record<string, unknown>) {
  const src = props.src as string | undefined;
  const alt = props.alt as string | undefined;
+  const [imgFailed, setImgFailed] = useState(false);

  const aiCannotSee = isWorkspaceImage(src);

@@ -84,6 +122,18 @@ function MarkdownImage(props: Record<string, unknown>) {
    );
  }

+  // Detect video: prefix in alt text (set by formatOutputValue in helpers.ts)
+  if (alt?.startsWith("video:")) {
+    return <WorkspaceVideo src={src} aiCannotSee={aiCannotSee} />;
+  }
+
+  // If the <img> failed to load and this is a workspace file, try as video.
+  // This handles generic output keys like "file_out" where the MIME type
+  // isn't known from the key name alone.
+  if (imgFailed && aiCannotSee) {
+    return <WorkspaceVideo src={src} aiCannotSee={aiCannotSee} />;
+  }
+
  return (
    <span className="relative my-2 inline-block">
      {/* eslint-disable-next-line @next/next/no-img-element */}
@@ -92,6 +142,9 @@ function MarkdownImage(props: Record<string, unknown>) {
        alt={alt || "Image"}
        className="h-auto max-w-full rounded-md border border-zinc-200"
        loading="lazy"
+        onError={() => {
+          if (aiCannotSee) setImgFailed(true);
+        }}
      />
      {aiCannotSee && (
        <span
--- a/autogpt_platform/frontend/src/components/contextual/Chat/components/ToolResponseMessage/helpers.ts
+++ b/autogpt_platform/frontend/src/components/contextual/Chat/components/ToolResponseMessage/helpers.ts
@@ -39,69 +39,101 @@ export function getErrorMessage(result: unknown): string {

 /**
 * Check if a value is a workspace file reference.
+ * Format: workspace://{fileId} or workspace://{fileId}#{mimeType}
 */
 function isWorkspaceRef(value: unknown): value is string {
  return typeof value === "string" && value.startsWith("workspace://");
 }

 /**
- * Check if a workspace reference appears to be an image based on common patterns.
- * Since workspace refs don't have extensions, we check the context or assume image
- * for certain block types.
- *
- * TODO: Replace keyword matching with MIME type encoded in workspace ref.
- * e.g., workspace://abc123#image/png or workspace://abc123#video/mp4
- * This would let frontend render correctly without fragile keyword matching.
+ * Extract MIME type from a workspace reference fragment.
+ * e.g., "workspace://abc123#video/mp4" → "video/mp4"
+ * Returns undefined if no fragment is present.
 */
-function isLikelyImageRef(value: string, outputKey?: string): boolean {
-  if (!isWorkspaceRef(value)) return false;
-
-  // Check output key name for video-related hints (these are NOT images)
-  const videoKeywords = ["video", "mp4", "mov", "avi", "webm", "movie", "clip"];
-  if (outputKey) {
-    const lowerKey = outputKey.toLowerCase();
-    if (videoKeywords.some((kw) => lowerKey.includes(kw))) {
-      return false;
-    }
-  }
-
-  // Check output key name for image-related hints
-  const imageKeywords = [
-    "image",
-    "img",
-    "photo",
-    "picture",
-    "thumbnail",
-    "avatar",
-    "icon",
-    "screenshot",
-  ];
-  if (outputKey) {
-    const lowerKey = outputKey.toLowerCase();
-    if (imageKeywords.some((kw) => lowerKey.includes(kw))) {
-      return true;
-    }
-  }
-
-  // Default to treating workspace refs as potential images
-  // since that's the most common case for generated content
-  return true;
+function getWorkspaceMimeType(value: string): string | undefined {
+  // Everything after the first "#" is the fragment; later "#" characters
+  // remain part of it.
+  const [, ...afterFirstHash] = value.split("#");
+  const fragment = afterFirstHash.join("#");
+  // A missing fragment and an empty one ("workspace://abc123#") both mean
+  // that no MIME type is available.
+  return fragment === "" ? undefined : fragment;
+}

 /**
- * Format a single output value, converting workspace refs to markdown images.
+ * Determine the media category of a workspace ref or data URI.
+ * Uses the MIME type fragment on workspace refs when available,
+ * falls back to output key keyword matching for older refs without it.
 */
-function formatOutputValue(value: unknown, outputKey?: string): string {
-  if (isWorkspaceRef(value) && isLikelyImageRef(value, outputKey)) {
-    // Format as markdown image
-    return `![${outputKey || "Generated image"}](${value})`;
+function getMediaCategory(
+  value: string,
+  outputKey?: string,
+): "video" | "image" | "audio" | "unknown" {
+  // Data URIs announce their media type immediately after the scheme
+  const dataUriPrefixes = [
+    ["data:video/", "video"],
+    ["data:image/", "image"],
+    ["data:audio/", "audio"],
+  ] as const;
+  for (const [prefix, category] of dataUriPrefixes) {
+    if (value.startsWith(prefix)) return category;
+  }
+
+  if (isWorkspaceRef(value)) {
+    // Preferred signal: the MIME type carried in the ref's "#" fragment
+    const mime = getWorkspaceMimeType(value);
+    if (mime) {
+      // Compare the top-level type including its "/" so that a bare
+      // "video" (no subtype) is still reported as unknown
+      switch (mime.slice(0, mime.indexOf("/") + 1)) {
+        case "video/":
+          return "video";
+        case "image/":
+          return "image";
+        case "audio/":
+          return "audio";
+        default:
+          return "unknown";
+      }
+    }
+
+    // Older refs have no fragment: look for hints in the output key instead.
+    // An absent or empty key yields "" and therefore matches no keyword.
+    const keyHints = outputKey ? outputKey.toLowerCase() : "";
+    const mentionsAny = (keywords: readonly string[]): boolean =>
+      keywords.some((kw) => keyHints.includes(kw));
+
+    // Video hints are checked first, so they win over image hints
+    if (mentionsAny(["video", "mp4", "mov", "avi", "webm", "movie", "clip"])) {
+      return "video";
+    }
+    if (
+      mentionsAny([
+        "image",
+        "img",
+        "photo",
+        "picture",
+        "thumbnail",
+        "avatar",
+        "icon",
+        "screenshot",
+      ])
+    ) {
+      return "image";
+    }
+
+    // Default to image for backward compatibility
+    return "image";
  }

+  return "unknown";
+}
+
+/**
+ * Format a single output value, converting workspace refs to markdown images/videos.
+ * Videos use a "video:" alt-text prefix so the MarkdownContent renderer can
+ * distinguish them from images and render a <video> element.
+ */
+function formatOutputValue(value: unknown, outputKey?: string): string {
  if (typeof value === "string") {
-    // Check for data URIs (images)
-    if (value.startsWith("data:image/")) {
+    const category = getMediaCategory(value, outputKey);
+
+    if (category === "video") {
+      // Format with "video:" prefix so MarkdownContent renders <video>
+      return `![video:${outputKey || "Video"}](${value})`;
+    }
+
+    if (category === "image") {
      return `![${outputKey || "Generated image"}](${value})`;
    }
+
+    // For audio, unknown workspace refs, data URIs, etc. - return as-is
    return value;
  }