diff --git a/autogpt_platform/backend/.env.default b/autogpt_platform/backend/.env.default index b393f13017..fa52ba812a 100644 --- a/autogpt_platform/backend/.env.default +++ b/autogpt_platform/backend/.env.default @@ -152,6 +152,7 @@ REPLICATE_API_KEY= REVID_API_KEY= SCREENSHOTONE_API_KEY= UNREAL_SPEECH_API_KEY= +ELEVENLABS_API_KEY= # Data & Search Services E2B_API_KEY= diff --git a/autogpt_platform/backend/Dockerfile b/autogpt_platform/backend/Dockerfile index 103226d079..9bd455e490 100644 --- a/autogpt_platform/backend/Dockerfile +++ b/autogpt_platform/backend/Dockerfile @@ -62,10 +62,12 @@ ENV POETRY_HOME=/opt/poetry \ DEBIAN_FRONTEND=noninteractive ENV PATH=/opt/poetry/bin:$PATH -# Install Python without upgrading system-managed packages +# Install Python, FFmpeg, and ImageMagick (required for video processing blocks) RUN apt-get update && apt-get install -y \ python3.13 \ python3-pip \ + ffmpeg \ + imagemagick \ && rm -rf /var/lib/apt/lists/* # Copy only necessary files from builder diff --git a/autogpt_platform/backend/backend/blocks/elevenlabs/_auth.py b/autogpt_platform/backend/backend/blocks/elevenlabs/_auth.py new file mode 100644 index 0000000000..b823627b43 --- /dev/null +++ b/autogpt_platform/backend/backend/blocks/elevenlabs/_auth.py @@ -0,0 +1,28 @@ +"""ElevenLabs integration blocks - test credentials and shared utilities.""" + +from typing import Literal + +from pydantic import SecretStr + +from backend.data.model import APIKeyCredentials, CredentialsMetaInput +from backend.integrations.providers import ProviderName + +TEST_CREDENTIALS = APIKeyCredentials( + id="01234567-89ab-cdef-0123-456789abcdef", + provider="elevenlabs", + api_key=SecretStr("mock-elevenlabs-api-key"), + title="Mock ElevenLabs API key", + expires_at=None, +) + +TEST_CREDENTIALS_INPUT = { + "provider": TEST_CREDENTIALS.provider, + "id": TEST_CREDENTIALS.id, + "type": TEST_CREDENTIALS.type, + "title": TEST_CREDENTIALS.title, +} + +ElevenLabsCredentials = APIKeyCredentials +ElevenLabsCredentialsInput = CredentialsMetaInput[ + Literal[ProviderName.ELEVENLABS], Literal["api_key"] +] diff --git a/autogpt_platform/backend/backend/blocks/media.py b/autogpt_platform/backend/backend/blocks/media.py deleted file mode 100644 index a8d145bc64..0000000000 --- a/autogpt_platform/backend/backend/blocks/media.py +++ /dev/null @@ -1,246 +0,0 @@ -import os -import tempfile -from typing import Optional - -from moviepy.audio.io.AudioFileClip import AudioFileClip -from moviepy.video.fx.Loop import Loop -from moviepy.video.io.VideoFileClip import VideoFileClip - -from backend.data.block import ( - Block, - BlockCategory, - BlockOutput, - BlockSchemaInput, - BlockSchemaOutput, -) -from backend.data.execution import ExecutionContext -from backend.data.model import SchemaField -from backend.util.file import MediaFileType, get_exec_file_path, store_media_file - - -class MediaDurationBlock(Block): - - class Input(BlockSchemaInput): - media_in: MediaFileType = SchemaField( - description="Media input (URL, data URI, or local path)." - ) - is_video: bool = SchemaField( - description="Whether the media is a video (True) or audio (False).", - default=True, - ) - - class Output(BlockSchemaOutput): - duration: float = SchemaField( - description="Duration of the media file (in seconds)." 
- ) - - def __init__(self): - super().__init__( - id="d8b91fd4-da26-42d4-8ecb-8b196c6d84b6", - description="Block to get the duration of a media file.", - categories={BlockCategory.MULTIMEDIA}, - input_schema=MediaDurationBlock.Input, - output_schema=MediaDurationBlock.Output, - ) - - async def run( - self, - input_data: Input, - *, - execution_context: ExecutionContext, - **kwargs, - ) -> BlockOutput: - # 1) Store the input media locally - local_media_path = await store_media_file( - file=input_data.media_in, - execution_context=execution_context, - return_format="for_local_processing", - ) - assert execution_context.graph_exec_id is not None - media_abspath = get_exec_file_path( - execution_context.graph_exec_id, local_media_path - ) - - # 2) Load the clip - if input_data.is_video: - clip = VideoFileClip(media_abspath) - else: - clip = AudioFileClip(media_abspath) - - yield "duration", clip.duration - - -class LoopVideoBlock(Block): - """ - Block for looping (repeating) a video clip until a given duration or number of loops. - """ - - class Input(BlockSchemaInput): - video_in: MediaFileType = SchemaField( - description="The input video (can be a URL, data URI, or local path)." - ) - # Provide EITHER a `duration` or `n_loops` or both. We'll demonstrate `duration`. - duration: Optional[float] = SchemaField( - description="Target duration (in seconds) to loop the video to. If omitted, defaults to no looping.", - default=None, - ge=0.0, - ) - n_loops: Optional[int] = SchemaField( - description="Number of times to repeat the video. If omitted, defaults to 1 (no repeat).", - default=None, - ge=1, - ) - - class Output(BlockSchemaOutput): - video_out: str = SchemaField( - description="Looped video returned either as a relative path or a data URI." - ) - - def __init__(self): - super().__init__( - id="8bf9eef6-5451-4213-b265-25306446e94b", - description="Block to loop a video to a given duration or number of repeats.", - categories={BlockCategory.MULTIMEDIA}, - input_schema=LoopVideoBlock.Input, - output_schema=LoopVideoBlock.Output, - ) - - async def run( - self, - input_data: Input, - *, - execution_context: ExecutionContext, - **kwargs, - ) -> BlockOutput: - assert execution_context.graph_exec_id is not None - assert execution_context.node_exec_id is not None - graph_exec_id = execution_context.graph_exec_id - node_exec_id = execution_context.node_exec_id - - # 1) Store the input video locally - local_video_path = await store_media_file( - file=input_data.video_in, - execution_context=execution_context, - return_format="for_local_processing", - ) - input_abspath = get_exec_file_path(graph_exec_id, local_video_path) - - # 2) Load the clip - clip = VideoFileClip(input_abspath) - - # 3) Apply the loop effect - looped_clip = clip - if input_data.duration: - # Loop until we reach the specified duration - looped_clip = looped_clip.with_effects([Loop(duration=input_data.duration)]) - elif input_data.n_loops: - looped_clip = looped_clip.with_effects([Loop(n=input_data.n_loops)]) - else: - raise ValueError("Either 'duration' or 'n_loops' must be provided.") - - assert isinstance(looped_clip, VideoFileClip) - - # 4) Save the looped output - output_filename = MediaFileType( - f"{node_exec_id}_looped_{os.path.basename(local_video_path)}" - ) - output_abspath = get_exec_file_path(graph_exec_id, output_filename) - - looped_clip = looped_clip.with_audio(clip.audio) - looped_clip.write_videofile(output_abspath, codec="libx264", audio_codec="aac") - - # Return output - for_block_output returns workspace:// if 
available, else data URI - video_out = await store_media_file( - file=output_filename, - execution_context=execution_context, - return_format="for_block_output", - ) - - yield "video_out", video_out - - -class AddAudioToVideoBlock(Block): - """ - Block that adds (attaches) an audio track to an existing video. - Optionally scale the volume of the new track. - """ - - class Input(BlockSchemaInput): - video_in: MediaFileType = SchemaField( - description="Video input (URL, data URI, or local path)." - ) - audio_in: MediaFileType = SchemaField( - description="Audio input (URL, data URI, or local path)." - ) - volume: float = SchemaField( - description="Volume scale for the newly attached audio track (1.0 = original).", - default=1.0, - ) - - class Output(BlockSchemaOutput): - video_out: MediaFileType = SchemaField( - description="Final video (with attached audio), as a path or data URI." - ) - - def __init__(self): - super().__init__( - id="3503748d-62b6-4425-91d6-725b064af509", - description="Block to attach an audio file to a video file using moviepy.", - categories={BlockCategory.MULTIMEDIA}, - input_schema=AddAudioToVideoBlock.Input, - output_schema=AddAudioToVideoBlock.Output, - ) - - async def run( - self, - input_data: Input, - *, - execution_context: ExecutionContext, - **kwargs, - ) -> BlockOutput: - assert execution_context.graph_exec_id is not None - assert execution_context.node_exec_id is not None - graph_exec_id = execution_context.graph_exec_id - node_exec_id = execution_context.node_exec_id - - # 1) Store the inputs locally - local_video_path = await store_media_file( - file=input_data.video_in, - execution_context=execution_context, - return_format="for_local_processing", - ) - local_audio_path = await store_media_file( - file=input_data.audio_in, - execution_context=execution_context, - return_format="for_local_processing", - ) - - abs_temp_dir = os.path.join(tempfile.gettempdir(), "exec_file", graph_exec_id) - video_abspath = os.path.join(abs_temp_dir, local_video_path) - audio_abspath = os.path.join(abs_temp_dir, local_audio_path) - - # 2) Load video + audio with moviepy - video_clip = VideoFileClip(video_abspath) - audio_clip = AudioFileClip(audio_abspath) - # Optionally scale volume - if input_data.volume != 1.0: - audio_clip = audio_clip.with_volume_scaled(input_data.volume) - - # 3) Attach the new audio track - final_clip = video_clip.with_audio(audio_clip) - - # 4) Write to output file - output_filename = MediaFileType( - f"{node_exec_id}_audio_attached_{os.path.basename(local_video_path)}" - ) - output_abspath = os.path.join(abs_temp_dir, output_filename) - final_clip.write_videofile(output_abspath, codec="libx264", audio_codec="aac") - - # 5) Return output - for_block_output returns workspace:// if available, else data URI - video_out = await store_media_file( - file=output_filename, - execution_context=execution_context, - return_format="for_block_output", - ) - - yield "video_out", video_out diff --git a/autogpt_platform/backend/backend/blocks/video/__init__.py b/autogpt_platform/backend/backend/blocks/video/__init__.py new file mode 100644 index 0000000000..4974ae8a87 --- /dev/null +++ b/autogpt_platform/backend/backend/blocks/video/__init__.py @@ -0,0 +1,37 @@ +"""Video editing blocks for AutoGPT Platform. 
+ +This module provides blocks for: +- Downloading videos from URLs (YouTube, Vimeo, news sites, direct links) +- Clipping/trimming video segments +- Concatenating multiple videos +- Adding text overlays +- Adding AI-generated narration +- Getting media duration +- Looping videos +- Adding audio to videos + +Dependencies: +- yt-dlp: For video downloading +- moviepy: For video editing operations +- elevenlabs: For AI narration (optional) +""" + +from backend.blocks.video.add_audio import AddAudioToVideoBlock +from backend.blocks.video.clip import VideoClipBlock +from backend.blocks.video.concat import VideoConcatBlock +from backend.blocks.video.download import VideoDownloadBlock +from backend.blocks.video.duration import MediaDurationBlock +from backend.blocks.video.loop import LoopVideoBlock +from backend.blocks.video.narration import VideoNarrationBlock +from backend.blocks.video.text_overlay import VideoTextOverlayBlock + +__all__ = [ + "AddAudioToVideoBlock", + "LoopVideoBlock", + "MediaDurationBlock", + "VideoClipBlock", + "VideoConcatBlock", + "VideoDownloadBlock", + "VideoNarrationBlock", + "VideoTextOverlayBlock", +] diff --git a/autogpt_platform/backend/backend/blocks/video/_utils.py b/autogpt_platform/backend/backend/blocks/video/_utils.py new file mode 100644 index 0000000000..9ebf195078 --- /dev/null +++ b/autogpt_platform/backend/backend/blocks/video/_utils.py @@ -0,0 +1,131 @@ +"""Shared utilities for video blocks.""" + +from __future__ import annotations + +import logging +import os +import re +import subprocess +from pathlib import Path + +logger = logging.getLogger(__name__) + +# Known operation tags added by video blocks +_VIDEO_OPS = ( + r"(?:clip|overlay|narrated|looped|concat|audio_attached|with_audio|narration)" +) + +# Matches: {node_exec_id}_{operation}_ where node_exec_id contains a UUID +_BLOCK_PREFIX_RE = re.compile( + r"^[a-zA-Z0-9_-]*" + r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}" + r"[a-zA-Z0-9_-]*" + r"_" + _VIDEO_OPS + r"_" +) + +# Matches: a lone {node_exec_id}_ prefix (no operation keyword, e.g. download output) +_UUID_PREFIX_RE = re.compile( + r"^[a-zA-Z0-9_-]*" + r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}" + r"[a-zA-Z0-9_-]*_" +) + + +def extract_source_name(input_path: str, max_length: int = 50) -> str: + """Extract the original source filename by stripping block-generated prefixes. + + Iteratively removes {node_exec_id}_{operation}_ prefixes that accumulate + when chaining video blocks, recovering the original human-readable name. + + Safe for plain filenames (no UUID -> no stripping). + Falls back to "video" if everything is stripped. + """ + stem = Path(input_path).stem + + # Pass 1: strip {node_exec_id}_{operation}_ prefixes iteratively + while _BLOCK_PREFIX_RE.match(stem): + stem = _BLOCK_PREFIX_RE.sub("", stem, count=1) + + # Pass 2: strip a lone {node_exec_id}_ prefix (e.g. from download block) + if _UUID_PREFIX_RE.match(stem): + stem = _UUID_PREFIX_RE.sub("", stem, count=1) + + if not stem: + return "video" + + return stem[:max_length] + + +def get_video_codecs(output_path: str) -> tuple[str, str]: + """Get appropriate video and audio codecs based on output file extension. 
+ + Args: + output_path: Path to the output file (used to determine extension) + + Returns: + Tuple of (video_codec, audio_codec) + + Codec mappings: + - .mp4: H.264 + AAC (universal compatibility) + - .webm: VP8 + Vorbis (web streaming) + - .mkv: H.264 + AAC (container supports many codecs) + - .mov: H.264 + AAC (Apple QuickTime, widely compatible) + - .m4v: H.264 + AAC (Apple iTunes/devices) + - .avi: MPEG-4 + MP3 (legacy Windows) + """ + ext = os.path.splitext(output_path)[1].lower() + + codec_map: dict[str, tuple[str, str]] = { + ".mp4": ("libx264", "aac"), + ".webm": ("libvpx", "libvorbis"), + ".mkv": ("libx264", "aac"), + ".mov": ("libx264", "aac"), + ".m4v": ("libx264", "aac"), + ".avi": ("mpeg4", "libmp3lame"), + } + + return codec_map.get(ext, ("libx264", "aac")) + + +def strip_chapters_inplace(video_path: str) -> None: + """Strip chapter metadata from a media file in-place using ffmpeg. + + MoviePy 2.x crashes with IndexError when parsing files with embedded + chapter metadata (https://github.com/Zulko/moviepy/issues/2419). + This strips chapters without re-encoding. + + Args: + video_path: Absolute path to the media file to strip chapters from. + """ + base, ext = os.path.splitext(video_path) + tmp_path = base + ".tmp" + ext + try: + result = subprocess.run( + [ + "ffmpeg", + "-y", + "-i", + video_path, + "-map_chapters", + "-1", + "-codec", + "copy", + tmp_path, + ], + capture_output=True, + text=True, + timeout=300, + ) + if result.returncode != 0: + logger.warning( + "ffmpeg chapter strip failed (rc=%d): %s", + result.returncode, + result.stderr, + ) + return + os.replace(tmp_path, video_path) + except FileNotFoundError: + logger.warning("ffmpeg not found; skipping chapter strip") + finally: + if os.path.exists(tmp_path): + os.unlink(tmp_path) diff --git a/autogpt_platform/backend/backend/blocks/video/add_audio.py b/autogpt_platform/backend/backend/blocks/video/add_audio.py new file mode 100644 index 0000000000..ebd4ab94f2 --- /dev/null +++ b/autogpt_platform/backend/backend/blocks/video/add_audio.py @@ -0,0 +1,113 @@ +"""AddAudioToVideoBlock - Attach an audio track to a video file.""" + +from moviepy.audio.io.AudioFileClip import AudioFileClip +from moviepy.video.io.VideoFileClip import VideoFileClip + +from backend.blocks.video._utils import extract_source_name, strip_chapters_inplace +from backend.data.block import ( + Block, + BlockCategory, + BlockOutput, + BlockSchemaInput, + BlockSchemaOutput, +) +from backend.data.execution import ExecutionContext +from backend.data.model import SchemaField +from backend.util.file import MediaFileType, get_exec_file_path, store_media_file + + +class AddAudioToVideoBlock(Block): + """Add (attach) an audio track to an existing video.""" + + class Input(BlockSchemaInput): + video_in: MediaFileType = SchemaField( + description="Video input (URL, data URI, or local path)." + ) + audio_in: MediaFileType = SchemaField( + description="Audio input (URL, data URI, or local path)." + ) + volume: float = SchemaField( + description="Volume scale for the newly attached audio track (1.0 = original).", + default=1.0, + ) + + class Output(BlockSchemaOutput): + video_out: MediaFileType = SchemaField( + description="Final video (with attached audio), as a path or data URI." 
+ ) + + def __init__(self): + super().__init__( + id="3503748d-62b6-4425-91d6-725b064af509", + description="Block to attach an audio file to a video file using moviepy.", + categories={BlockCategory.MULTIMEDIA}, + input_schema=AddAudioToVideoBlock.Input, + output_schema=AddAudioToVideoBlock.Output, + ) + + async def run( + self, + input_data: Input, + *, + execution_context: ExecutionContext, + **kwargs, + ) -> BlockOutput: + assert execution_context.graph_exec_id is not None + assert execution_context.node_exec_id is not None + graph_exec_id = execution_context.graph_exec_id + node_exec_id = execution_context.node_exec_id + + # 1) Store the inputs locally + local_video_path = await store_media_file( + file=input_data.video_in, + execution_context=execution_context, + return_format="for_local_processing", + ) + local_audio_path = await store_media_file( + file=input_data.audio_in, + execution_context=execution_context, + return_format="for_local_processing", + ) + + video_abspath = get_exec_file_path(graph_exec_id, local_video_path) + audio_abspath = get_exec_file_path(graph_exec_id, local_audio_path) + + # 2) Load video + audio with moviepy + strip_chapters_inplace(video_abspath) + strip_chapters_inplace(audio_abspath) + video_clip = None + audio_clip = None + final_clip = None + try: + video_clip = VideoFileClip(video_abspath) + audio_clip = AudioFileClip(audio_abspath) + # Optionally scale volume + if input_data.volume != 1.0: + audio_clip = audio_clip.with_volume_scaled(input_data.volume) + + # 3) Attach the new audio track + final_clip = video_clip.with_audio(audio_clip) + + # 4) Write to output file + source = extract_source_name(local_video_path) + output_filename = MediaFileType(f"{node_exec_id}_with_audio_{source}.mp4") + output_abspath = get_exec_file_path(graph_exec_id, output_filename) + final_clip.write_videofile( + output_abspath, codec="libx264", audio_codec="aac" + ) + finally: + if final_clip: + final_clip.close() + if audio_clip: + audio_clip.close() + if video_clip: + video_clip.close() + + # 5) Return output - for_block_output returns workspace:// if available, else data URI + video_out = await store_media_file( + file=output_filename, + execution_context=execution_context, + return_format="for_block_output", + ) + + yield "video_out", video_out diff --git a/autogpt_platform/backend/backend/blocks/video/clip.py b/autogpt_platform/backend/backend/blocks/video/clip.py new file mode 100644 index 0000000000..05deea6530 --- /dev/null +++ b/autogpt_platform/backend/backend/blocks/video/clip.py @@ -0,0 +1,167 @@ +"""VideoClipBlock - Extract a segment from a video file.""" + +from typing import Literal + +from moviepy.video.io.VideoFileClip import VideoFileClip + +from backend.blocks.video._utils import ( + extract_source_name, + get_video_codecs, + strip_chapters_inplace, +) +from backend.data.block import ( + Block, + BlockCategory, + BlockOutput, + BlockSchemaInput, + BlockSchemaOutput, +) +from backend.data.execution import ExecutionContext +from backend.data.model import SchemaField +from backend.util.exceptions import BlockExecutionError +from backend.util.file import MediaFileType, get_exec_file_path, store_media_file + + +class VideoClipBlock(Block): + """Extract a time segment from a video.""" + + class Input(BlockSchemaInput): + video_in: MediaFileType = SchemaField( + description="Input video (URL, data URI, or local path)" + ) + start_time: float = SchemaField(description="Start time in seconds", ge=0.0) + end_time: float = SchemaField(description="End time in 
seconds", ge=0.0) + output_format: Literal["mp4", "webm", "mkv", "mov"] = SchemaField( + description="Output format", default="mp4", advanced=True + ) + + class Output(BlockSchemaOutput): + video_out: MediaFileType = SchemaField( + description="Clipped video file (path or data URI)" + ) + duration: float = SchemaField(description="Clip duration in seconds") + + def __init__(self): + super().__init__( + id="8f539119-e580-4d86-ad41-86fbcb22abb1", + description="Extract a time segment from a video", + categories={BlockCategory.MULTIMEDIA}, + input_schema=self.Input, + output_schema=self.Output, + test_input={ + "video_in": "/tmp/test.mp4", + "start_time": 0.0, + "end_time": 10.0, + }, + test_output=[("video_out", str), ("duration", float)], + test_mock={ + "_clip_video": lambda *args: 10.0, + "_store_input_video": lambda *args, **kwargs: "test.mp4", + "_store_output_video": lambda *args, **kwargs: "clip_test.mp4", + }, + ) + + async def _store_input_video( + self, execution_context: ExecutionContext, file: MediaFileType + ) -> MediaFileType: + """Store input video. Extracted for testability.""" + return await store_media_file( + file=file, + execution_context=execution_context, + return_format="for_local_processing", + ) + + async def _store_output_video( + self, execution_context: ExecutionContext, file: MediaFileType + ) -> MediaFileType: + """Store output video. Extracted for testability.""" + return await store_media_file( + file=file, + execution_context=execution_context, + return_format="for_block_output", + ) + + def _clip_video( + self, + video_abspath: str, + output_abspath: str, + start_time: float, + end_time: float, + ) -> float: + """Extract a clip from a video. Extracted for testability.""" + clip = None + subclip = None + try: + strip_chapters_inplace(video_abspath) + clip = VideoFileClip(video_abspath) + subclip = clip.subclipped(start_time, end_time) + video_codec, audio_codec = get_video_codecs(output_abspath) + subclip.write_videofile( + output_abspath, codec=video_codec, audio_codec=audio_codec + ) + return subclip.duration + finally: + if subclip: + subclip.close() + if clip: + clip.close() + + async def run( + self, + input_data: Input, + *, + execution_context: ExecutionContext, + node_exec_id: str, + **kwargs, + ) -> BlockOutput: + # Validate time range + if input_data.end_time <= input_data.start_time: + raise BlockExecutionError( + message=f"end_time ({input_data.end_time}) must be greater than start_time ({input_data.start_time})", + block_name=self.name, + block_id=str(self.id), + ) + + try: + assert execution_context.graph_exec_id is not None + + # Store the input video locally + local_video_path = await self._store_input_video( + execution_context, input_data.video_in + ) + video_abspath = get_exec_file_path( + execution_context.graph_exec_id, local_video_path + ) + + # Build output path + source = extract_source_name(local_video_path) + output_filename = MediaFileType( + f"{node_exec_id}_clip_{source}.{input_data.output_format}" + ) + output_abspath = get_exec_file_path( + execution_context.graph_exec_id, output_filename + ) + + duration = self._clip_video( + video_abspath, + output_abspath, + input_data.start_time, + input_data.end_time, + ) + + # Return as workspace path or data URI based on context + video_out = await self._store_output_video( + execution_context, output_filename + ) + + yield "video_out", video_out + yield "duration", duration + + except BlockExecutionError: + raise + except Exception as e: + raise BlockExecutionError( + message=f"Failed to 
clip video: {e}", + block_name=self.name, + block_id=str(self.id), + ) from e diff --git a/autogpt_platform/backend/backend/blocks/video/concat.py b/autogpt_platform/backend/backend/blocks/video/concat.py new file mode 100644 index 0000000000..b49854fb40 --- /dev/null +++ b/autogpt_platform/backend/backend/blocks/video/concat.py @@ -0,0 +1,227 @@ +"""VideoConcatBlock - Concatenate multiple video clips into one.""" + +from typing import Literal + +from moviepy import concatenate_videoclips +from moviepy.video.fx import CrossFadeIn, CrossFadeOut, FadeIn, FadeOut +from moviepy.video.io.VideoFileClip import VideoFileClip + +from backend.blocks.video._utils import ( + extract_source_name, + get_video_codecs, + strip_chapters_inplace, +) +from backend.data.block import ( + Block, + BlockCategory, + BlockOutput, + BlockSchemaInput, + BlockSchemaOutput, +) +from backend.data.execution import ExecutionContext +from backend.data.model import SchemaField +from backend.util.exceptions import BlockExecutionError +from backend.util.file import MediaFileType, get_exec_file_path, store_media_file + + +class VideoConcatBlock(Block): + """Merge multiple video clips into one continuous video.""" + + class Input(BlockSchemaInput): + videos: list[MediaFileType] = SchemaField( + description="List of video files to concatenate (in order)" + ) + transition: Literal["none", "crossfade", "fade_black"] = SchemaField( + description="Transition between clips", default="none" + ) + transition_duration: int = SchemaField( + description="Transition duration in seconds", + default=1, + ge=0, + advanced=True, + ) + output_format: Literal["mp4", "webm", "mkv", "mov"] = SchemaField( + description="Output format", default="mp4", advanced=True + ) + + class Output(BlockSchemaOutput): + video_out: MediaFileType = SchemaField( + description="Concatenated video file (path or data URI)" + ) + total_duration: float = SchemaField(description="Total duration in seconds") + + def __init__(self): + super().__init__( + id="9b0f531a-1118-487f-aeec-3fa63ea8900a", + description="Merge multiple video clips into one continuous video", + categories={BlockCategory.MULTIMEDIA}, + input_schema=self.Input, + output_schema=self.Output, + test_input={ + "videos": ["/tmp/a.mp4", "/tmp/b.mp4"], + }, + test_output=[ + ("video_out", str), + ("total_duration", float), + ], + test_mock={ + "_concat_videos": lambda *args: 20.0, + "_store_input_video": lambda *args, **kwargs: "test.mp4", + "_store_output_video": lambda *args, **kwargs: "concat_test.mp4", + }, + ) + + async def _store_input_video( + self, execution_context: ExecutionContext, file: MediaFileType + ) -> MediaFileType: + """Store input video. Extracted for testability.""" + return await store_media_file( + file=file, + execution_context=execution_context, + return_format="for_local_processing", + ) + + async def _store_output_video( + self, execution_context: ExecutionContext, file: MediaFileType + ) -> MediaFileType: + """Store output video. Extracted for testability.""" + return await store_media_file( + file=file, + execution_context=execution_context, + return_format="for_block_output", + ) + + def _concat_videos( + self, + video_abspaths: list[str], + output_abspath: str, + transition: str, + transition_duration: int, + ) -> float: + """Concatenate videos. Extracted for testability. + + Returns: + Total duration of the concatenated video. 
+ """ + clips = [] + faded_clips = [] + final = None + try: + # Load clips + for v in video_abspaths: + strip_chapters_inplace(v) + clips.append(VideoFileClip(v)) + + # Validate transition_duration against shortest clip + if transition in {"crossfade", "fade_black"} and transition_duration > 0: + min_duration = min(c.duration for c in clips) + if transition_duration >= min_duration: + raise BlockExecutionError( + message=( + f"transition_duration ({transition_duration}s) must be " + f"shorter than the shortest clip ({min_duration:.2f}s)" + ), + block_name=self.name, + block_id=str(self.id), + ) + + if transition == "crossfade": + for i, clip in enumerate(clips): + effects = [] + if i > 0: + effects.append(CrossFadeIn(transition_duration)) + if i < len(clips) - 1: + effects.append(CrossFadeOut(transition_duration)) + if effects: + clip = clip.with_effects(effects) + faded_clips.append(clip) + final = concatenate_videoclips( + faded_clips, + method="compose", + padding=-transition_duration, + ) + elif transition == "fade_black": + for clip in clips: + faded = clip.with_effects( + [FadeIn(transition_duration), FadeOut(transition_duration)] + ) + faded_clips.append(faded) + final = concatenate_videoclips(faded_clips) + else: + final = concatenate_videoclips(clips) + + video_codec, audio_codec = get_video_codecs(output_abspath) + final.write_videofile( + output_abspath, codec=video_codec, audio_codec=audio_codec + ) + + return final.duration + finally: + if final: + final.close() + for clip in faded_clips: + clip.close() + for clip in clips: + clip.close() + + async def run( + self, + input_data: Input, + *, + execution_context: ExecutionContext, + node_exec_id: str, + **kwargs, + ) -> BlockOutput: + # Validate minimum clips + if len(input_data.videos) < 2: + raise BlockExecutionError( + message="At least 2 videos are required for concatenation", + block_name=self.name, + block_id=str(self.id), + ) + + try: + assert execution_context.graph_exec_id is not None + + # Store all input videos locally + video_abspaths = [] + for video in input_data.videos: + local_path = await self._store_input_video(execution_context, video) + video_abspaths.append( + get_exec_file_path(execution_context.graph_exec_id, local_path) + ) + + # Build output path + source = ( + extract_source_name(video_abspaths[0]) if video_abspaths else "video" + ) + output_filename = MediaFileType( + f"{node_exec_id}_concat_{source}.{input_data.output_format}" + ) + output_abspath = get_exec_file_path( + execution_context.graph_exec_id, output_filename + ) + + total_duration = self._concat_videos( + video_abspaths, + output_abspath, + input_data.transition, + input_data.transition_duration, + ) + + # Return as workspace path or data URI based on context + video_out = await self._store_output_video( + execution_context, output_filename + ) + + yield "video_out", video_out + yield "total_duration", total_duration + + except BlockExecutionError: + raise + except Exception as e: + raise BlockExecutionError( + message=f"Failed to concatenate videos: {e}", + block_name=self.name, + block_id=str(self.id), + ) from e diff --git a/autogpt_platform/backend/backend/blocks/video/download.py b/autogpt_platform/backend/backend/blocks/video/download.py new file mode 100644 index 0000000000..4046d5df42 --- /dev/null +++ b/autogpt_platform/backend/backend/blocks/video/download.py @@ -0,0 +1,172 @@ +"""VideoDownloadBlock - Download video from URL (YouTube, Vimeo, news sites, direct links).""" + +import os +import typing +from typing import Literal + 
+import yt_dlp + +if typing.TYPE_CHECKING: + from yt_dlp import _Params + +from backend.data.block import ( + Block, + BlockCategory, + BlockOutput, + BlockSchemaInput, + BlockSchemaOutput, +) +from backend.data.execution import ExecutionContext +from backend.data.model import SchemaField +from backend.util.exceptions import BlockExecutionError +from backend.util.file import MediaFileType, get_exec_file_path, store_media_file + + +class VideoDownloadBlock(Block): + """Download video from URL using yt-dlp.""" + + class Input(BlockSchemaInput): + url: str = SchemaField( + description="URL of the video to download (YouTube, Vimeo, direct link, etc.)", + placeholder="https://www.youtube.com/watch?v=...", + ) + quality: Literal["best", "1080p", "720p", "480p", "audio_only"] = SchemaField( + description="Video quality preference", default="720p" + ) + output_format: Literal["mp4", "webm", "mkv"] = SchemaField( + description="Output video format", default="mp4", advanced=True + ) + + class Output(BlockSchemaOutput): + video_file: MediaFileType = SchemaField( + description="Downloaded video (path or data URI)" + ) + duration: float = SchemaField(description="Video duration in seconds") + title: str = SchemaField(description="Video title from source") + source_url: str = SchemaField(description="Original source URL") + + def __init__(self): + super().__init__( + id="c35daabb-cd60-493b-b9ad-51f1fe4b50c4", + description="Download video from URL (YouTube, Vimeo, news sites, direct links)", + categories={BlockCategory.MULTIMEDIA}, + input_schema=self.Input, + output_schema=self.Output, + disabled=True, # Disable until we can sandbox yt-dlp and handle security implications + test_input={ + "url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", + "quality": "480p", + }, + test_output=[ + ("video_file", str), + ("duration", float), + ("title", str), + ("source_url", str), + ], + test_mock={ + "_download_video": lambda *args: ( + "video.mp4", + 212.0, + "Test Video", + ), + "_store_output_video": lambda *args, **kwargs: "video.mp4", + }, + ) + + async def _store_output_video( + self, execution_context: ExecutionContext, file: MediaFileType + ) -> MediaFileType: + """Store output video. Extracted for testability.""" + return await store_media_file( + file=file, + execution_context=execution_context, + return_format="for_block_output", + ) + + def _get_format_string(self, quality: str) -> str: + formats = { + "best": "bestvideo+bestaudio/best", + "1080p": "bestvideo[height<=1080]+bestaudio/best[height<=1080]", + "720p": "bestvideo[height<=720]+bestaudio/best[height<=720]", + "480p": "bestvideo[height<=480]+bestaudio/best[height<=480]", + "audio_only": "bestaudio/best", + } + return formats.get(quality, formats["720p"]) + + def _download_video( + self, + url: str, + quality: str, + output_format: str, + output_dir: str, + node_exec_id: str, + ) -> tuple[str, float, str]: + """Download video. 
Extracted for testability.""" + output_template = os.path.join( + output_dir, f"{node_exec_id}_%(title).50s.%(ext)s" + ) + + ydl_opts: "_Params" = { + "format": f"{self._get_format_string(quality)}/best", + "outtmpl": output_template, + "merge_output_format": output_format, + "quiet": True, + "no_warnings": True, + } + + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + info = ydl.extract_info(url, download=True) + video_path = ydl.prepare_filename(info) + + # Handle format conversion in filename + if not video_path.endswith(f".{output_format}"): + video_path = video_path.rsplit(".", 1)[0] + f".{output_format}" + + # Return just the filename, not the full path + filename = os.path.basename(video_path) + + return ( + filename, + info.get("duration") or 0.0, + info.get("title") or "Unknown", + ) + + async def run( + self, + input_data: Input, + *, + execution_context: ExecutionContext, + node_exec_id: str, + **kwargs, + ) -> BlockOutput: + try: + assert execution_context.graph_exec_id is not None + + # Get the exec file directory + output_dir = get_exec_file_path(execution_context.graph_exec_id, "") + os.makedirs(output_dir, exist_ok=True) + + filename, duration, title = self._download_video( + input_data.url, + input_data.quality, + input_data.output_format, + output_dir, + node_exec_id, + ) + + # Return as workspace path or data URI based on context + video_out = await self._store_output_video( + execution_context, MediaFileType(filename) + ) + + yield "video_file", video_out + yield "duration", duration + yield "title", title + yield "source_url", input_data.url + + except Exception as e: + raise BlockExecutionError( + message=f"Failed to download video: {e}", + block_name=self.name, + block_id=str(self.id), + ) from e diff --git a/autogpt_platform/backend/backend/blocks/video/duration.py b/autogpt_platform/backend/backend/blocks/video/duration.py new file mode 100644 index 0000000000..9e05d35b00 --- /dev/null +++ b/autogpt_platform/backend/backend/blocks/video/duration.py @@ -0,0 +1,77 @@ +"""MediaDurationBlock - Get the duration of a media file.""" + +from moviepy.audio.io.AudioFileClip import AudioFileClip +from moviepy.video.io.VideoFileClip import VideoFileClip + +from backend.blocks.video._utils import strip_chapters_inplace +from backend.data.block import ( + Block, + BlockCategory, + BlockOutput, + BlockSchemaInput, + BlockSchemaOutput, +) +from backend.data.execution import ExecutionContext +from backend.data.model import SchemaField +from backend.util.file import MediaFileType, get_exec_file_path, store_media_file + + +class MediaDurationBlock(Block): + """Get the duration of a media file (video or audio).""" + + class Input(BlockSchemaInput): + media_in: MediaFileType = SchemaField( + description="Media input (URL, data URI, or local path)." + ) + is_video: bool = SchemaField( + description="Whether the media is a video (True) or audio (False).", + default=True, + ) + + class Output(BlockSchemaOutput): + duration: float = SchemaField( + description="Duration of the media file (in seconds)." 
+ ) + + def __init__(self): + super().__init__( + id="d8b91fd4-da26-42d4-8ecb-8b196c6d84b6", + description="Block to get the duration of a media file.", + categories={BlockCategory.MULTIMEDIA}, + input_schema=MediaDurationBlock.Input, + output_schema=MediaDurationBlock.Output, + ) + + async def run( + self, + input_data: Input, + *, + execution_context: ExecutionContext, + **kwargs, + ) -> BlockOutput: + # 1) Store the input media locally + local_media_path = await store_media_file( + file=input_data.media_in, + execution_context=execution_context, + return_format="for_local_processing", + ) + assert execution_context.graph_exec_id is not None + media_abspath = get_exec_file_path( + execution_context.graph_exec_id, local_media_path + ) + + # 2) Strip chapters to avoid MoviePy crash, then load the clip + strip_chapters_inplace(media_abspath) + clip = None + try: + if input_data.is_video: + clip = VideoFileClip(media_abspath) + else: + clip = AudioFileClip(media_abspath) + + duration = clip.duration + finally: + if clip: + clip.close() + + yield "duration", duration diff --git a/autogpt_platform/backend/backend/blocks/video/loop.py b/autogpt_platform/backend/backend/blocks/video/loop.py new file mode 100644 index 0000000000..461610f713 --- /dev/null +++ b/autogpt_platform/backend/backend/blocks/video/loop.py @@ -0,0 +1,115 @@ +"""LoopVideoBlock - Loop a video to a given duration or number of repeats.""" + +from typing import Optional + +from moviepy.video.fx.Loop import Loop +from moviepy.video.io.VideoFileClip import VideoFileClip + +from backend.blocks.video._utils import extract_source_name, strip_chapters_inplace +from backend.data.block import ( + Block, + BlockCategory, + BlockOutput, + BlockSchemaInput, + BlockSchemaOutput, +) +from backend.data.execution import ExecutionContext +from backend.data.model import SchemaField +from backend.util.file import MediaFileType, get_exec_file_path, store_media_file + + +class LoopVideoBlock(Block): + """Loop (repeat) a video clip until a given duration or number of loops.""" + + class Input(BlockSchemaInput): + video_in: MediaFileType = SchemaField( + description="The input video (can be a URL, data URI, or local path)." + ) + duration: Optional[float] = SchemaField( + description="Target duration (in seconds) to loop the video to. Either duration or n_loops must be provided.", + default=None, + ge=0.0, + le=3600.0, # Max 1 hour to prevent disk exhaustion + ) + n_loops: Optional[int] = SchemaField( + description="Number of times to repeat the video. Either n_loops or duration must be provided.", + default=None, + ge=1, + le=10, # Max 10 loops to prevent disk exhaustion + ) + + class Output(BlockSchemaOutput): + video_out: MediaFileType = SchemaField( + description="Looped video returned either as a relative path or a data URI." 
+ ) + + def __init__(self): + super().__init__( + id="8bf9eef6-5451-4213-b265-25306446e94b", + description="Block to loop a video to a given duration or number of repeats.", + categories={BlockCategory.MULTIMEDIA}, + input_schema=LoopVideoBlock.Input, + output_schema=LoopVideoBlock.Output, + ) + + async def run( + self, + input_data: Input, + *, + execution_context: ExecutionContext, + **kwargs, + ) -> BlockOutput: + assert execution_context.graph_exec_id is not None + assert execution_context.node_exec_id is not None + graph_exec_id = execution_context.graph_exec_id + node_exec_id = execution_context.node_exec_id + + # 1) Store the input video locally + local_video_path = await store_media_file( + file=input_data.video_in, + execution_context=execution_context, + return_format="for_local_processing", + ) + input_abspath = get_exec_file_path(graph_exec_id, local_video_path) + + # 2) Load the clip + strip_chapters_inplace(input_abspath) + clip = None + looped_clip = None + try: + clip = VideoFileClip(input_abspath) + + # 3) Apply the loop effect + if input_data.duration: + # Loop until we reach the specified duration + looped_clip = clip.with_effects([Loop(duration=input_data.duration)]) + elif input_data.n_loops: + looped_clip = clip.with_effects([Loop(n=input_data.n_loops)]) + else: + raise ValueError("Either 'duration' or 'n_loops' must be provided.") + + assert isinstance(looped_clip, VideoFileClip) + + # 4) Save the looped output + source = extract_source_name(local_video_path) + output_filename = MediaFileType(f"{node_exec_id}_looped_{source}.mp4") + output_abspath = get_exec_file_path(graph_exec_id, output_filename) + + looped_clip = looped_clip.with_audio(clip.audio) + looped_clip.write_videofile( + output_abspath, codec="libx264", audio_codec="aac" + ) + finally: + if looped_clip: + looped_clip.close() + if clip: + clip.close() + + # Return output - for_block_output returns workspace:// if available, else data URI + video_out = await store_media_file( + file=output_filename, + execution_context=execution_context, + return_format="for_block_output", + ) + + yield "video_out", video_out diff --git a/autogpt_platform/backend/backend/blocks/video/narration.py b/autogpt_platform/backend/backend/blocks/video/narration.py new file mode 100644 index 0000000000..adf41753c8 --- /dev/null +++ b/autogpt_platform/backend/backend/blocks/video/narration.py @@ -0,0 +1,267 @@ +"""VideoNarrationBlock - Generate AI voice narration and add to video.""" + +import os +from typing import Literal + +from elevenlabs import ElevenLabs +from moviepy import CompositeAudioClip +from moviepy.audio.io.AudioFileClip import AudioFileClip +from moviepy.video.io.VideoFileClip import VideoFileClip + +from backend.blocks.elevenlabs._auth import ( + TEST_CREDENTIALS, + TEST_CREDENTIALS_INPUT, + ElevenLabsCredentials, + ElevenLabsCredentialsInput, +) +from backend.blocks.video._utils import ( + extract_source_name, + get_video_codecs, + strip_chapters_inplace, +) +from backend.data.block import ( + Block, + BlockCategory, + BlockOutput, + BlockSchemaInput, + BlockSchemaOutput, +) +from backend.data.execution import ExecutionContext +from backend.data.model import CredentialsField, SchemaField +from backend.util.exceptions import BlockExecutionError +from backend.util.file import MediaFileType, get_exec_file_path, store_media_file + + +class VideoNarrationBlock(Block): + """Generate AI narration and add to video.""" + + class Input(BlockSchemaInput): + credentials: ElevenLabsCredentialsInput = CredentialsField( + 
description="ElevenLabs API key for voice synthesis" + ) + video_in: MediaFileType = SchemaField( + description="Input video (URL, data URI, or local path)" + ) + script: str = SchemaField(description="Narration script text") + voice_id: str = SchemaField( + description="ElevenLabs voice ID", default="21m00Tcm4TlvDq8ikWAM" # Rachel + ) + model_id: Literal[ + "eleven_multilingual_v2", + "eleven_flash_v2_5", + "eleven_turbo_v2_5", + "eleven_turbo_v2", + ] = SchemaField( + description="ElevenLabs TTS model", + default="eleven_multilingual_v2", + ) + mix_mode: Literal["replace", "mix", "ducking"] = SchemaField( + description="How to combine with original audio. 'ducking' applies stronger attenuation than 'mix'.", + default="ducking", + ) + narration_volume: float = SchemaField( + description="Narration volume (0.0 to 2.0)", + default=1.0, + ge=0.0, + le=2.0, + advanced=True, + ) + original_volume: float = SchemaField( + description="Original audio volume when mixing (0.0 to 1.0)", + default=0.3, + ge=0.0, + le=1.0, + advanced=True, + ) + + class Output(BlockSchemaOutput): + video_out: MediaFileType = SchemaField( + description="Video with narration (path or data URI)" + ) + audio_file: MediaFileType = SchemaField( + description="Generated audio file (path or data URI)" + ) + + def __init__(self): + super().__init__( + id="3d036b53-859c-4b17-9826-ca340f736e0e", + description="Generate AI narration and add to video", + categories={BlockCategory.MULTIMEDIA, BlockCategory.AI}, + input_schema=self.Input, + output_schema=self.Output, + test_input={ + "video_in": "/tmp/test.mp4", + "script": "Hello world", + "credentials": TEST_CREDENTIALS_INPUT, + }, + test_credentials=TEST_CREDENTIALS, + test_output=[("video_out", str), ("audio_file", str)], + test_mock={ + "_generate_narration_audio": lambda *args: b"mock audio content", + "_add_narration_to_video": lambda *args: None, + "_store_input_video": lambda *args, **kwargs: "test.mp4", + "_store_output_video": lambda *args, **kwargs: "narrated_test.mp4", + }, + ) + + async def _store_input_video( + self, execution_context: ExecutionContext, file: MediaFileType + ) -> MediaFileType: + """Store input video. Extracted for testability.""" + return await store_media_file( + file=file, + execution_context=execution_context, + return_format="for_local_processing", + ) + + async def _store_output_video( + self, execution_context: ExecutionContext, file: MediaFileType + ) -> MediaFileType: + """Store output video. Extracted for testability.""" + return await store_media_file( + file=file, + execution_context=execution_context, + return_format="for_block_output", + ) + + def _generate_narration_audio( + self, api_key: str, script: str, voice_id: str, model_id: str + ) -> bytes: + """Generate narration audio via ElevenLabs API.""" + client = ElevenLabs(api_key=api_key) + audio_generator = client.text_to_speech.convert( + voice_id=voice_id, + text=script, + model_id=model_id, + ) + # The SDK returns a generator, collect all chunks + return b"".join(audio_generator) + + def _add_narration_to_video( + self, + video_abspath: str, + audio_abspath: str, + output_abspath: str, + mix_mode: str, + narration_volume: float, + original_volume: float, + ) -> None: + """Add narration audio to video. 
Extracted for testability.""" + video = None + final = None + narration_original = None + narration_scaled = None + original = None + + try: + strip_chapters_inplace(video_abspath) + video = VideoFileClip(video_abspath) + narration_original = AudioFileClip(audio_abspath) + narration_scaled = narration_original.with_volume_scaled(narration_volume) + narration = narration_scaled + + if mix_mode == "replace": + final_audio = narration + elif mix_mode == "mix": + if video.audio: + original = video.audio.with_volume_scaled(original_volume) + final_audio = CompositeAudioClip([original, narration]) + else: + final_audio = narration + else: # ducking - apply stronger attenuation + if video.audio: + # Ducking uses a much lower volume for original audio + ducking_volume = original_volume * 0.3 + original = video.audio.with_volume_scaled(ducking_volume) + final_audio = CompositeAudioClip([original, narration]) + else: + final_audio = narration + + final = video.with_audio(final_audio) + video_codec, audio_codec = get_video_codecs(output_abspath) + final.write_videofile( + output_abspath, codec=video_codec, audio_codec=audio_codec + ) + + finally: + if original: + original.close() + if narration_scaled: + narration_scaled.close() + if narration_original: + narration_original.close() + if final: + final.close() + if video: + video.close() + + async def run( + self, + input_data: Input, + *, + credentials: ElevenLabsCredentials, + execution_context: ExecutionContext, + node_exec_id: str, + **kwargs, + ) -> BlockOutput: + try: + assert execution_context.graph_exec_id is not None + + # Store the input video locally + local_video_path = await self._store_input_video( + execution_context, input_data.video_in + ) + video_abspath = get_exec_file_path( + execution_context.graph_exec_id, local_video_path + ) + + # Generate narration audio via ElevenLabs + audio_content = self._generate_narration_audio( + credentials.api_key.get_secret_value(), + input_data.script, + input_data.voice_id, + input_data.model_id, + ) + + # Save audio to exec file path + audio_filename = MediaFileType(f"{node_exec_id}_narration.mp3") + audio_abspath = get_exec_file_path( + execution_context.graph_exec_id, audio_filename + ) + os.makedirs(os.path.dirname(audio_abspath), exist_ok=True) + with open(audio_abspath, "wb") as f: + f.write(audio_content) + + # Add narration to video + source = extract_source_name(local_video_path) + output_filename = MediaFileType(f"{node_exec_id}_narrated_{source}.mp4") + output_abspath = get_exec_file_path( + execution_context.graph_exec_id, output_filename + ) + + self._add_narration_to_video( + video_abspath, + audio_abspath, + output_abspath, + input_data.mix_mode, + input_data.narration_volume, + input_data.original_volume, + ) + + # Return as workspace path or data URI based on context + video_out = await self._store_output_video( + execution_context, output_filename + ) + audio_out = await self._store_output_video( + execution_context, audio_filename + ) + + yield "video_out", video_out + yield "audio_file", audio_out + + except Exception as e: + raise BlockExecutionError( + message=f"Failed to add narration: {e}", + block_name=self.name, + block_id=str(self.id), + ) from e diff --git a/autogpt_platform/backend/backend/blocks/video/text_overlay.py b/autogpt_platform/backend/backend/blocks/video/text_overlay.py new file mode 100644 index 0000000000..cb7cfe0420 --- /dev/null +++ b/autogpt_platform/backend/backend/blocks/video/text_overlay.py @@ -0,0 +1,231 @@ +"""VideoTextOverlayBlock - Add text 
overlay to video.""" + +from typing import Literal + +from moviepy import CompositeVideoClip, TextClip +from moviepy.video.io.VideoFileClip import VideoFileClip + +from backend.blocks.video._utils import ( + extract_source_name, + get_video_codecs, + strip_chapters_inplace, +) +from backend.data.block import ( + Block, + BlockCategory, + BlockOutput, + BlockSchemaInput, + BlockSchemaOutput, +) +from backend.data.execution import ExecutionContext +from backend.data.model import SchemaField +from backend.util.exceptions import BlockExecutionError +from backend.util.file import MediaFileType, get_exec_file_path, store_media_file + + +class VideoTextOverlayBlock(Block): + """Add text overlay/caption to video.""" + + class Input(BlockSchemaInput): + video_in: MediaFileType = SchemaField( + description="Input video (URL, data URI, or local path)" + ) + text: str = SchemaField(description="Text to overlay on video") + position: Literal[ + "top", + "center", + "bottom", + "top-left", + "top-right", + "bottom-left", + "bottom-right", + ] = SchemaField(description="Position of text on screen", default="bottom") + start_time: float | None = SchemaField( + description="When to show text (seconds). None = entire video", + default=None, + advanced=True, + ) + end_time: float | None = SchemaField( + description="When to hide text (seconds). None = until end", + default=None, + advanced=True, + ) + font_size: int = SchemaField( + description="Font size", default=48, ge=12, le=200, advanced=True + ) + font_color: str = SchemaField( + description="Font color (hex or name)", default="white", advanced=True + ) + bg_color: str | None = SchemaField( + description="Background color behind text (None for transparent)", + default=None, + advanced=True, + ) + + class Output(BlockSchemaOutput): + video_out: MediaFileType = SchemaField( + description="Video with text overlay (path or data URI)" + ) + + def __init__(self): + super().__init__( + id="8ef14de6-cc90-430a-8cfa-3a003be92454", + description="Add text overlay/caption to video", + categories={BlockCategory.MULTIMEDIA}, + input_schema=self.Input, + output_schema=self.Output, + disabled=True, # Disable until we can lockdown imagemagick security policy + test_input={"video_in": "/tmp/test.mp4", "text": "Hello World"}, + test_output=[("video_out", str)], + test_mock={ + "_add_text_overlay": lambda *args: None, + "_store_input_video": lambda *args, **kwargs: "test.mp4", + "_store_output_video": lambda *args, **kwargs: "overlay_test.mp4", + }, + ) + + async def _store_input_video( + self, execution_context: ExecutionContext, file: MediaFileType + ) -> MediaFileType: + """Store input video. Extracted for testability.""" + return await store_media_file( + file=file, + execution_context=execution_context, + return_format="for_local_processing", + ) + + async def _store_output_video( + self, execution_context: ExecutionContext, file: MediaFileType + ) -> MediaFileType: + """Store output video. Extracted for testability.""" + return await store_media_file( + file=file, + execution_context=execution_context, + return_format="for_block_output", + ) + + def _add_text_overlay( + self, + video_abspath: str, + output_abspath: str, + text: str, + position: str, + start_time: float | None, + end_time: float | None, + font_size: int, + font_color: str, + bg_color: str | None, + ) -> None: + """Add text overlay to video. 
Extracted for testability.""" + video = None + final = None + txt_clip = None + try: + strip_chapters_inplace(video_abspath) + video = VideoFileClip(video_abspath) + + txt_clip = TextClip( + text=text, + font_size=font_size, + color=font_color, + bg_color=bg_color, + ) + + # Position mapping + pos_map = { + "top": ("center", "top"), + "center": ("center", "center"), + "bottom": ("center", "bottom"), + "top-left": ("left", "top"), + "top-right": ("right", "top"), + "bottom-left": ("left", "bottom"), + "bottom-right": ("right", "bottom"), + } + + txt_clip = txt_clip.with_position(pos_map[position]) + + # Set timing + start = start_time or 0 + end = end_time or video.duration + duration = max(0, end - start) + txt_clip = txt_clip.with_start(start).with_end(end).with_duration(duration) + + final = CompositeVideoClip([video, txt_clip]) + video_codec, audio_codec = get_video_codecs(output_abspath) + final.write_videofile( + output_abspath, codec=video_codec, audio_codec=audio_codec + ) + + finally: + if txt_clip: + txt_clip.close() + if final: + final.close() + if video: + video.close() + + async def run( + self, + input_data: Input, + *, + execution_context: ExecutionContext, + node_exec_id: str, + **kwargs, + ) -> BlockOutput: + # Validate time range if both are provided + if ( + input_data.start_time is not None + and input_data.end_time is not None + and input_data.end_time <= input_data.start_time + ): + raise BlockExecutionError( + message=f"end_time ({input_data.end_time}) must be greater than start_time ({input_data.start_time})", + block_name=self.name, + block_id=str(self.id), + ) + + try: + assert execution_context.graph_exec_id is not None + + # Store the input video locally + local_video_path = await self._store_input_video( + execution_context, input_data.video_in + ) + video_abspath = get_exec_file_path( + execution_context.graph_exec_id, local_video_path + ) + + # Build output path + source = extract_source_name(local_video_path) + output_filename = MediaFileType(f"{node_exec_id}_overlay_{source}.mp4") + output_abspath = get_exec_file_path( + execution_context.graph_exec_id, output_filename + ) + + self._add_text_overlay( + video_abspath, + output_abspath, + input_data.text, + input_data.position, + input_data.start_time, + input_data.end_time, + input_data.font_size, + input_data.font_color, + input_data.bg_color, + ) + + # Return as workspace path or data URI based on context + video_out = await self._store_output_video( + execution_context, output_filename + ) + + yield "video_out", video_out + + except BlockExecutionError: + raise + except Exception as e: + raise BlockExecutionError( + message=f"Failed to add text overlay: {e}", + block_name=self.name, + block_id=str(self.id), + ) from e diff --git a/autogpt_platform/backend/backend/data/block_cost_config.py b/autogpt_platform/backend/backend/data/block_cost_config.py index 590f09cb41..ec35afa401 100644 --- a/autogpt_platform/backend/backend/data/block_cost_config.py +++ b/autogpt_platform/backend/backend/data/block_cost_config.py @@ -36,12 +36,14 @@ from backend.blocks.replicate.replicate_block import ReplicateModelBlock from backend.blocks.smart_decision_maker import SmartDecisionMakerBlock from backend.blocks.talking_head import CreateTalkingAvatarVideoBlock from backend.blocks.text_to_speech_block import UnrealTextToSpeechBlock +from backend.blocks.video.narration import VideoNarrationBlock from backend.data.block import Block, BlockCost, BlockCostType from backend.integrations.credentials_store import ( 
aiml_api_credentials, anthropic_credentials, apollo_credentials, did_credentials, + elevenlabs_credentials, enrichlayer_credentials, groq_credentials, ideogram_credentials, @@ -640,4 +642,16 @@ BLOCK_COSTS: dict[Type[Block], list[BlockCost]] = { }, ), ], + VideoNarrationBlock: [ + BlockCost( + cost_amount=5, # ElevenLabs TTS cost + cost_filter={ + "credentials": { + "id": elevenlabs_credentials.id, + "provider": elevenlabs_credentials.provider, + "type": elevenlabs_credentials.type, + } + }, + ) + ], } diff --git a/autogpt_platform/backend/backend/integrations/credentials_store.py b/autogpt_platform/backend/backend/integrations/credentials_store.py index 40a6f7269c..384405b0c7 100644 --- a/autogpt_platform/backend/backend/integrations/credentials_store.py +++ b/autogpt_platform/backend/backend/integrations/credentials_store.py @@ -224,6 +224,14 @@ openweathermap_credentials = APIKeyCredentials( expires_at=None, ) +elevenlabs_credentials = APIKeyCredentials( + id="f4a8b6c2-3d1e-4f5a-9b8c-7d6e5f4a3b2c", + provider="elevenlabs", + api_key=SecretStr(settings.secrets.elevenlabs_api_key), + title="Use Credits for ElevenLabs", + expires_at=None, +) + DEFAULT_CREDENTIALS = [ ollama_credentials, revid_credentials, @@ -252,6 +260,7 @@ DEFAULT_CREDENTIALS = [ v0_credentials, webshare_proxy_credentials, openweathermap_credentials, + elevenlabs_credentials, ] SYSTEM_CREDENTIAL_IDS = {cred.id for cred in DEFAULT_CREDENTIALS} @@ -366,6 +375,8 @@ class IntegrationCredentialsStore: all_credentials.append(webshare_proxy_credentials) if settings.secrets.openweathermap_api_key: all_credentials.append(openweathermap_credentials) + if settings.secrets.elevenlabs_api_key: + all_credentials.append(elevenlabs_credentials) return all_credentials async def get_creds_by_id( diff --git a/autogpt_platform/backend/backend/integrations/providers.py b/autogpt_platform/backend/backend/integrations/providers.py index 3af5006ca4..8a0d6fd183 100644 --- a/autogpt_platform/backend/backend/integrations/providers.py +++ b/autogpt_platform/backend/backend/integrations/providers.py @@ -18,6 +18,7 @@ class ProviderName(str, Enum): DISCORD = "discord" D_ID = "d_id" E2B = "e2b" + ELEVENLABS = "elevenlabs" FAL = "fal" GITHUB = "github" GOOGLE = "google" diff --git a/autogpt_platform/backend/backend/util/file.py b/autogpt_platform/backend/backend/util/file.py index baa9225629..1b8dbdea82 100644 --- a/autogpt_platform/backend/backend/util/file.py +++ b/autogpt_platform/backend/backend/util/file.py @@ -8,6 +8,8 @@ from pathlib import Path from typing import TYPE_CHECKING, Literal from urllib.parse import urlparse +from pydantic import BaseModel + from backend.util.cloud_storage import get_cloud_storage_handler from backend.util.request import Requests from backend.util.settings import Config @@ -17,6 +19,35 @@ from backend.util.virus_scanner import scan_content_safe if TYPE_CHECKING: from backend.data.execution import ExecutionContext + +class WorkspaceUri(BaseModel): + """Parsed workspace:// URI.""" + + file_ref: str # File ID or path (e.g. "abc123" or "/path/to/file.txt") + mime_type: str | None = None # MIME type from fragment (e.g. "video/mp4") + is_path: bool = False # True if file_ref is a path (starts with "/") + + +def parse_workspace_uri(uri: str) -> WorkspaceUri: + """Parse a workspace:// URI into its components. 
diff --git a/autogpt_platform/backend/backend/util/file.py b/autogpt_platform/backend/backend/util/file.py
index baa9225629..1b8dbdea82 100644
--- a/autogpt_platform/backend/backend/util/file.py
+++ b/autogpt_platform/backend/backend/util/file.py
@@ -8,6 +8,8 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Literal
 from urllib.parse import urlparse
 
+from pydantic import BaseModel
+
 from backend.util.cloud_storage import get_cloud_storage_handler
 from backend.util.request import Requests
 from backend.util.settings import Config
@@ -17,6 +19,35 @@ from backend.util.virus_scanner import scan_content_safe
 if TYPE_CHECKING:
     from backend.data.execution import ExecutionContext
 
+
+class WorkspaceUri(BaseModel):
+    """Parsed workspace:// URI."""
+
+    file_ref: str  # File ID or path (e.g. "abc123" or "/path/to/file.txt")
+    mime_type: str | None = None  # MIME type from fragment (e.g. "video/mp4")
+    is_path: bool = False  # True if file_ref is a path (starts with "/")
+
+
+def parse_workspace_uri(uri: str) -> WorkspaceUri:
+    """Parse a workspace:// URI into its components.
+
+    Examples:
+        "workspace://abc123" → WorkspaceUri(file_ref="abc123", mime_type=None, is_path=False)
+        "workspace://abc123#video/mp4" → WorkspaceUri(file_ref="abc123", mime_type="video/mp4", is_path=False)
+        "workspace:///path/to/file.txt" → WorkspaceUri(file_ref="/path/to/file.txt", mime_type=None, is_path=True)
+    """
+    raw = uri.removeprefix("workspace://")
+    mime_type: str | None = None
+    if "#" in raw:
+        raw, fragment = raw.split("#", 1)
+        mime_type = fragment or None
+    return WorkspaceUri(
+        file_ref=raw,
+        mime_type=mime_type,
+        is_path=raw.startswith("/"),
+    )
+
+
 # Return format options for store_media_file
 # - "for_local_processing": Returns local file path - use with ffmpeg, MoviePy, PIL, etc.
 # - "for_external_api": Returns data URI (base64) - use when sending content to external APIs
@@ -183,22 +214,20 @@ async def store_media_file(
                 "This file type is only available in CoPilot sessions."
             )
 
-        # Parse workspace reference
-        # workspace://abc123 - by file ID
-        # workspace:///path/to/file.txt - by virtual path
-        file_ref = file[12:]  # Remove "workspace://"
+        # Parse workspace reference (strips #mimeType fragment from file ID)
+        ws = parse_workspace_uri(file)
 
-        if file_ref.startswith("/"):
-            # Path reference
-            workspace_content = await workspace_manager.read_file(file_ref)
-            file_info = await workspace_manager.get_file_info_by_path(file_ref)
+        if ws.is_path:
+            # Path reference: workspace:///path/to/file.txt
+            workspace_content = await workspace_manager.read_file(ws.file_ref)
+            file_info = await workspace_manager.get_file_info_by_path(ws.file_ref)
             filename = sanitize_filename(
                 file_info.name if file_info else f"{uuid.uuid4()}.bin"
             )
         else:
-            # ID reference
-            workspace_content = await workspace_manager.read_file_by_id(file_ref)
-            file_info = await workspace_manager.get_file_info(file_ref)
+            # ID reference: workspace://abc123 or workspace://abc123#video/mp4
+            workspace_content = await workspace_manager.read_file_by_id(ws.file_ref)
+            file_info = await workspace_manager.get_file_info(ws.file_ref)
             filename = sanitize_filename(
                 file_info.name if file_info else f"{uuid.uuid4()}.bin"
             )
@@ -334,7 +363,21 @@ async def store_media_file(
 
     # Don't re-save if input was already from workspace
     if is_from_workspace:
-        # Return original workspace reference
+        # Return original workspace reference, ensuring MIME type fragment
+        ws = parse_workspace_uri(file)
+        if not ws.mime_type:
+            # Add MIME type fragment if missing (older refs without it)
+            try:
+                if ws.is_path:
+                    info = await workspace_manager.get_file_info_by_path(
+                        ws.file_ref
+                    )
+                else:
+                    info = await workspace_manager.get_file_info(ws.file_ref)
+                if info:
+                    return MediaFileType(f"{file}#{info.mimeType}")
+            except Exception:
+                pass
         return MediaFileType(file)
 
     # Save new content to workspace
@@ -346,7 +389,7 @@ async def store_media_file(
             filename=filename,
             overwrite=True,
         )
-        return MediaFileType(f"workspace://{file_record.id}")
+        return MediaFileType(f"workspace://{file_record.id}#{file_record.mimeType}")
     else:
         raise ValueError(f"Invalid return_format: {return_format}")
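The behavior of the new helper follows directly from its docstring; the assertions below double as a quick regression check for the `#mimeType` fragment convention that `store_media_file` now emits.

```python
from backend.util.file import parse_workspace_uri

# ID reference, with and without a MIME fragment.
ws = parse_workspace_uri("workspace://abc123#video/mp4")
assert (ws.file_ref, ws.mime_type, ws.is_path) == ("abc123", "video/mp4", False)
assert parse_workspace_uri("workspace://abc123").mime_type is None

# Path reference (note the third slash).
ws = parse_workspace_uri("workspace:///path/to/file.txt")
assert ws.is_path and ws.file_ref == "/path/to/file.txt"

# An empty fragment normalizes to None ("fragment or None").
assert parse_workspace_uri("workspace://abc123#").mime_type is None
```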
diff --git a/autogpt_platform/backend/backend/util/settings.py b/autogpt_platform/backend/backend/util/settings.py
index aa28a4c9ac..50b7428160 100644
--- a/autogpt_platform/backend/backend/util/settings.py
+++ b/autogpt_platform/backend/backend/util/settings.py
@@ -656,6 +656,7 @@ class Secrets(UpdateTrackingModel["Secrets"], BaseSettings):
     e2b_api_key: str = Field(default="", description="E2B API key")
     nvidia_api_key: str = Field(default="", description="Nvidia API key")
     mem0_api_key: str = Field(default="", description="Mem0 API key")
+    elevenlabs_api_key: str = Field(default="", description="ElevenLabs API key")
 
     linear_client_id: str = Field(default="", description="Linear client ID")
     linear_client_secret: str = Field(default="", description="Linear client secret")
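`Secrets` is a pydantic `BaseSettings` model, so the new field is populated from the `ELEVENLABS_API_KEY` environment variable (hence the matching `.env.default` entry). A quick sketch, assuming pydantic-settings' default case-insensitive env mapping:

```python
import os

os.environ["ELEVENLABS_API_KEY"] = "sk-test"  # normally set in backend/.env
from backend.util.settings import Secrets

assert Secrets().elevenlabs_api_key == "sk-test"
```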
diff --git a/autogpt_platform/backend/poetry.lock b/autogpt_platform/backend/poetry.lock
index 91ac358ade..61da8c974f 100644
--- a/autogpt_platform/backend/poetry.lock
+++ b/autogpt_platform/backend/poetry.lock
@@ -1169,6 +1169,29 @@ attrs = ">=21.3.0"
 e2b = ">=1.5.4,<2.0.0"
 httpx = ">=0.20.0,<1.0.0"
 
+[[package]]
+name = "elevenlabs"
+version = "1.59.0"
+description = ""
+optional = false
+python-versions = "<4.0,>=3.8"
+groups = ["main"]
+files = [
+    {file = "elevenlabs-1.59.0-py3-none-any.whl", hash = "sha256:468145db81a0bc867708b4a8619699f75583e9481b395ec1339d0b443da771ed"},
+    {file = "elevenlabs-1.59.0.tar.gz", hash = "sha256:16e735bd594e86d415dd445d249c8cc28b09996cfd627fbc10102c0a84698859"},
+]
+
+[package.dependencies]
+httpx = ">=0.21.2"
+pydantic = ">=1.9.2"
+pydantic-core = ">=2.18.2,<3.0.0"
+requests = ">=2.20"
+typing_extensions = ">=4.0.0"
+websockets = ">=11.0"
+
+[package.extras]
+pyaudio = ["pyaudio (>=0.2.14)"]
+
 [[package]]
 name = "email-validator"
 version = "2.2.0"
@@ -7361,6 +7384,28 @@ files = [
 defusedxml = ">=0.7.1,<0.8.0"
 requests = "*"
 
+[[package]]
+name = "yt-dlp"
+version = "2025.12.8"
+description = "A feature-rich command-line audio/video downloader"
+optional = false
+python-versions = ">=3.10"
+groups = ["main"]
+files = [
+    {file = "yt_dlp-2025.12.8-py3-none-any.whl", hash = "sha256:36e2584342e409cfbfa0b5e61448a1c5189e345cf4564294456ee509e7d3e065"},
+    {file = "yt_dlp-2025.12.8.tar.gz", hash = "sha256:b773c81bb6b71cb2c111cfb859f453c7a71cf2ef44eff234ff155877184c3e4f"},
+]
+
+[package.extras]
+build = ["build", "hatchling (>=1.27.0)", "pip", "setuptools (>=71.0.2)", "wheel"]
+curl-cffi = ["curl-cffi (>=0.5.10,<0.6.dev0 || >=0.10.dev0,<0.14) ; implementation_name == \"cpython\""]
+default = ["brotli ; implementation_name == \"cpython\"", "brotlicffi ; implementation_name != \"cpython\"", "certifi", "mutagen", "pycryptodomex", "requests (>=2.32.2,<3)", "urllib3 (>=2.0.2,<3)", "websockets (>=13.0)", "yt-dlp-ejs (==0.3.2)"]
+dev = ["autopep8 (>=2.0,<3.0)", "pre-commit", "pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)", "ruff (>=0.14.0,<0.15.0)"]
+pyinstaller = ["pyinstaller (>=6.17.0)"]
+secretstorage = ["cffi", "secretstorage"]
+static-analysis = ["autopep8 (>=2.0,<3.0)", "ruff (>=0.14.0,<0.15.0)"]
+test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
+
 [[package]]
 name = "zerobouncesdk"
 version = "1.1.2"
@@ -7512,4 +7557,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.10,<3.14"
-content-hash = "ee5742dc1a9df50dfc06d4b26a1682cbb2b25cab6b79ce5625ec272f93e4f4bf"
+content-hash = "8239323f9ae6713224dffd1fe8ba8b449fe88b6c3c7a90940294a74f43a0387a"
diff --git a/autogpt_platform/backend/pyproject.toml b/autogpt_platform/backend/pyproject.toml
index fe263e47c0..24aea39f33 100644
--- a/autogpt_platform/backend/pyproject.toml
+++ b/autogpt_platform/backend/pyproject.toml
@@ -20,6 +20,7 @@ click = "^8.2.0"
 cryptography = "^45.0"
 discord-py = "^2.5.2"
 e2b-code-interpreter = "^1.5.2"
+elevenlabs = "^1.50.0"
 fastapi = "^0.116.1"
 feedparser = "^6.0.11"
 flake8 = "^7.3.0"
@@ -71,6 +72,7 @@ tweepy = "^4.16.0"
 uvicorn = { extras = ["standard"], version = "^0.35.0" }
 websockets = "^15.0"
 youtube-transcript-api = "^1.2.1"
+yt-dlp = "2025.12.08"
 zerobouncesdk = "^1.1.2"
 # NOTE: please insert new dependencies in their alphabetical location
 pytest-snapshot = "^0.9.0"
diff --git a/autogpt_platform/frontend/src/app/(platform)/build/components/legacy-builder/DataTable.tsx b/autogpt_platform/frontend/src/app/(platform)/build/components/legacy-builder/DataTable.tsx
index 4213711447..c58bdac642 100644
--- a/autogpt_platform/frontend/src/app/(platform)/build/components/legacy-builder/DataTable.tsx
+++ b/autogpt_platform/frontend/src/app/(platform)/build/components/legacy-builder/DataTable.tsx
@@ -1,6 +1,6 @@
 import { beautifyString } from "@/lib/utils";
 import { Clipboard, Maximize2 } from "lucide-react";
-import React, { useState } from "react";
+import React, { useMemo, useState } from "react";
 import { Button } from "../../../../../components/__legacy__/ui/button";
 import { ContentRenderer } from "../../../../../components/__legacy__/ui/render";
 import {
@@ -11,6 +11,12 @@ import {
   TableHeader,
   TableRow,
 } from "../../../../../components/__legacy__/ui/table";
+import type { OutputMetadata } from "@/components/contextual/OutputRenderers";
+import {
+  globalRegistry,
+  OutputItem,
+} from "@/components/contextual/OutputRenderers";
+import { Flag, useGetFlag } from "@/services/feature-flags/use-get-flag";
 import { useToast } from "../../../../../components/molecules/Toast/use-toast";
 import ExpandableOutputDialog from "./ExpandableOutputDialog";
@@ -26,6 +32,9 @@ export default function DataTable({
   data,
 }: DataTableProps) {
   const { toast } = useToast();
+  const enableEnhancedOutputHandling = useGetFlag(
+    Flag.ENABLE_ENHANCED_OUTPUT_HANDLING,
+  );
   const [expandedDialog, setExpandedDialog] = useState<{
     isOpen: boolean;
     execId: string;
@@ -33,6 +42,15 @@ export default function DataTable({
     data: any[];
   } | null>(null);
 
+  // Prepare renderers for each item when enhanced mode is enabled
+  const getItemRenderer = useMemo(() => {
+    if (!enableEnhancedOutputHandling) return null;
+    return (item: unknown) => {
+      const metadata: OutputMetadata = {};
+      return globalRegistry.getRenderer(item, metadata);
+    };
+  }, [enableEnhancedOutputHandling]);
+
   const copyData = (pin: string, data: string) => {
     navigator.clipboard.writeText(data).then(() => {
       toast({
@@ -102,15 +120,31 @@ export default function DataTable({
-                    {value.map((item, index) => (
-                      
-                        
-                        {index < value.length - 1 && ", "}
-                      
-                    ))}
+                    {value.map((item, index) => {
+                      const renderer = getItemRenderer?.(item);
+                      if (enableEnhancedOutputHandling && renderer) {
+                        const metadata: OutputMetadata = {};
+                        return (
+                          
+                            
+                            {index < value.length - 1 && ", "}
+                          
+                        );
+                      }
+                      return (
+                        
+                          
+                          {index < value.length - 1 && ", "}
+                        
+                      );
+                    })}
"@/components/contextual/OutputRenderers"; +import { + globalRegistry, + OutputItem, +} from "@/components/contextual/OutputRenderers"; +import { Flag, useGetFlag } from "@/services/feature-flags/use-get-flag"; import { beautifyString } from "@/lib/utils"; @@ -21,6 +27,9 @@ export default function NodeOutputs({ data, }: NodeOutputsProps) { const builderContext = useContext(BuilderContext); + const enableEnhancedOutputHandling = useGetFlag( + Flag.ENABLE_ENHANCED_OUTPUT_HANDLING, + ); const [expandedDialog, setExpandedDialog] = useState<{ isOpen: boolean; @@ -37,6 +46,15 @@ export default function NodeOutputs({ const { getNodeTitle } = builderContext; + // Prepare renderers for each item when enhanced mode is enabled + const getItemRenderer = useMemo(() => { + if (!enableEnhancedOutputHandling) return null; + return (item: unknown) => { + const metadata: OutputMetadata = {}; + return globalRegistry.getRenderer(item, metadata); + }; + }, [enableEnhancedOutputHandling]); + const getBeautifiedPinName = (pin: string) => { if (!pin.startsWith("tools_^_")) { return beautifyString(pin); @@ -87,15 +105,31 @@ export default function NodeOutputs({
Data:
- {dataArray.slice(0, 10).map((item, index) => ( - - - {index < Math.min(dataArray.length, 10) - 1 && ", "} - - ))} + {dataArray.slice(0, 10).map((item, index) => { + const renderer = getItemRenderer?.(item); + if (enableEnhancedOutputHandling && renderer) { + const metadata: OutputMetadata = {}; + return ( + + + {index < Math.min(dataArray.length, 10) - 1 && ", "} + + ); + } + return ( + + + {index < Math.min(dataArray.length, 10) - 1 && ", "} + + ); + })} {dataArray.length > 10 && (
diff --git a/autogpt_platform/frontend/src/components/__legacy__/ui/render.tsx b/autogpt_platform/frontend/src/components/__legacy__/ui/render.tsx
index 5173326f23..b290c51809 100644
--- a/autogpt_platform/frontend/src/components/__legacy__/ui/render.tsx
+++ b/autogpt_platform/frontend/src/components/__legacy__/ui/render.tsx
@@ -22,7 +22,7 @@ const isValidVideoUrl = (url: string): boolean => {
   if (url.startsWith("data:video")) {
     return true;
   }
-  const videoExtensions = /\.(mp4|webm|ogg)$/i;
+  const videoExtensions = /\.(mp4|webm|ogg|mov|avi|mkv|m4v)$/i;
   const youtubeRegex = /^(https?:\/\/)?(www\.)?(youtube\.com|youtu\.?be)\/.+$/;
   const cleanedUrl = url.split("?")[0];
   return (
@@ -44,11 +44,29 @@ const isValidAudioUrl = (url: string): boolean => {
   if (url.startsWith("data:audio")) {
     return true;
   }
-  const audioExtensions = /\.(mp3|wav)$/i;
+  const audioExtensions = /\.(mp3|wav|ogg|m4a|aac|flac)$/i;
   const cleanedUrl = url.split("?")[0];
   return isValidMediaUri(url) && audioExtensions.test(cleanedUrl);
 };
 
+const getVideoMimeType = (url: string): string => {
+  if (url.startsWith("data:video/")) {
+    const match = url.match(/^data:(video\/[^;]+)/);
+    return match?.[1] || "video/mp4";
+  }
+  const extension = url.split("?")[0].split(".").pop()?.toLowerCase();
+  const mimeMap: Record<string, string> = {
+    mp4: "video/mp4",
+    webm: "video/webm",
+    ogg: "video/ogg",
+    mov: "video/quicktime",
+    avi: "video/x-msvideo",
+    mkv: "video/x-matroska",
+    m4v: "video/mp4",
+  };
+  return mimeMap[extension || ""] || "video/mp4";
+};
+
 const VideoRenderer: React.FC<{ videoUrl: string }> = ({ videoUrl }) => {
   const videoId = getYouTubeVideoId(videoUrl);
   return (
@@ -63,7 +81,7 @@ const VideoRenderer: React.FC<{ videoUrl: string }> = ({ videoUrl }) => {
       >
     ) : (
-        
+        
     )}
diff --git a/autogpt_platform/frontend/src/components/contextual/Chat/components/MarkdownContent/MarkdownContent.tsx b/autogpt_platform/frontend/src/components/contextual/Chat/components/MarkdownContent/MarkdownContent.tsx
index 3dd5eca692..ecadbe938b 100644
--- a/autogpt_platform/frontend/src/components/contextual/Chat/components/MarkdownContent/MarkdownContent.tsx
+++ b/autogpt_platform/frontend/src/components/contextual/Chat/components/MarkdownContent/MarkdownContent.tsx
@@ -3,7 +3,7 @@ import { getGetWorkspaceDownloadFileByIdUrl } from "@/app/api/__generated__/endpoints/workspace/workspace";
 import { cn } from "@/lib/utils";
 import { EyeSlash } from "@phosphor-icons/react";
-import React from "react";
+import React, { useState } from "react";
 import ReactMarkdown from "react-markdown";
 import remarkGfm from "remark-gfm";
@@ -48,7 +48,9 @@ interface InputProps extends React.InputHTMLAttributes<HTMLInputElement> {
  */
 function resolveWorkspaceUrl(src: string): string {
   if (src.startsWith("workspace://")) {
-    const fileId = src.replace("workspace://", "");
+    // Strip MIME type fragment if present (e.g., workspace://abc123#video/mp4 → abc123)
+    const withoutPrefix = src.replace("workspace://", "");
+    const fileId = withoutPrefix.split("#")[0];
     // Use the generated API URL helper to get the correct path
     const apiPath = getGetWorkspaceDownloadFileByIdUrl(fileId);
     // Route through the Next.js proxy (same pattern as customMutator for client-side)
@@ -65,13 +67,49 @@ function isWorkspaceImage(src: string | undefined): boolean {
   return src?.includes("/workspace/files/") ?? false;
 }
 
+/**
+ * Renders a workspace video with controls and an optional "AI cannot see" badge.
+ */
+function WorkspaceVideo({
+  src,
+  aiCannotSee,
+}: {
+  src: string;
+  aiCannotSee: boolean;
+}) {
+  return (
+    
+      
+      {aiCannotSee && (
+        
+          
+          AI cannot see this video
+        
+      )}
+    
+  );
+}
+
 /**
  * Custom image component that shows an indicator when the AI cannot see the image.
+ * Also handles the "video:" alt-text prefix convention to render