"""VideoNarrationBlock - Generate AI voice narration and add to video."""

import os
import tempfile
import uuid
from typing import Literal

import requests
from moviepy.editor import AudioFileClip, CompositeAudioClip, VideoFileClip

from backend.data.block import (
    Block,
    BlockCategory,
    BlockOutput,
    BlockSchemaInput,
    BlockSchemaOutput,
)
from backend.data.model import APIKeyCredentials, CredentialsMetaInput, SchemaField
from backend.integrations.providers import ProviderName
from backend.util.exceptions import BlockExecutionError


class VideoNarrationBlock(Block):
    """Generate AI narration via ElevenLabs text-to-speech and mix it into a video."""

    class Input(BlockSchemaInput):
        credentials: CredentialsMetaInput[
            Literal[ProviderName.ELEVENLABS], Literal["api_key"]
        ] = SchemaField(description="ElevenLabs API key for voice synthesis")
        video_in: str = SchemaField(
            description="Input video file",
            json_schema_extra={"format": "file"},
        )
        script: str = SchemaField(description="Narration script text")
        voice_id: str = SchemaField(
            description="ElevenLabs voice ID",
            default="21m00Tcm4TlvDq8ikWAM",  # Rachel
        )
        mix_mode: Literal["replace", "mix", "ducking"] = SchemaField(
            description="How to combine with original audio. 'ducking' applies stronger attenuation than 'mix'.",
            default="ducking",
        )
        narration_volume: float = SchemaField(
            description="Narration volume (0.0 to 2.0)",
            default=1.0,
            ge=0.0,
            le=2.0,
            advanced=True,
        )
        original_volume: float = SchemaField(
            description="Original audio volume when mixing (0.0 to 1.0)",
            default=0.3,
            ge=0.0,
            le=1.0,
            advanced=True,
        )

    class Output(BlockSchemaOutput):
        video_out: str = SchemaField(
            description="Video with narration",
            json_schema_extra={"format": "file"},
        )
        audio_file: str = SchemaField(
            description="Generated audio file",
            json_schema_extra={"format": "file"},
        )

    def __init__(self):
        super().__init__(
            id="e5f6a7b8-c9d0-1234-ef56-789012345678",
            description="Generate AI narration and add to video",
            categories={BlockCategory.MULTIMEDIA, BlockCategory.AI},
            input_schema=self.Input,
            output_schema=self.Output,
            test_input={
                "video_in": "/tmp/test.mp4",
                "script": "Hello world",
                "credentials": {
                    "provider": "elevenlabs",
                    "id": "test",
                    "type": "api_key",
                },
            },
            test_output=[("video_out", str), ("audio_file", str)],
            # The whole synthesis/compositing pipeline is mocked so block
            # tests never hit the ElevenLabs API or run moviepy.
            test_mock={
                "_generate_and_add_narration": lambda *args: (
                    "/tmp/narrated.mp4",
                    "/tmp/audio.mp3",
                )
            },
        )

    def _generate_and_add_narration(
        self,
        api_key: str,
        video_in: str,
        script: str,
        voice_id: str,
        mix_mode: str,
        narration_volume: float,
        original_volume: float,
    ) -> tuple[str, str]:
        """Generate narration audio and composite it onto the video.

        Extracted as a single helper so tests can mock the whole pipeline.

        Returns:
            Tuple of (output video path, narration audio path). Both are
            temp files; the caller is responsible for consuming them. The
            audio temp file is removed automatically if this method fails.

        Raises:
            requests.exceptions.RequestException: on ElevenLabs API failure.
        """
        video = None
        final = None
        narration = None
        original = None
        final_audio = None
        audio_path = None

        try:
            # Synthesize the script with ElevenLabs text-to-speech.
            response = requests.post(
                f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}",
                headers={
                    "xi-api-key": api_key,
                    "Content-Type": "application/json",
                },
                json={
                    "text": script,
                    "model_id": "eleven_monolingual_v1",
                },
                timeout=120,
            )
            response.raise_for_status()

            # mkstemp gives a collision-free path; fdopen guarantees the fd
            # is closed even if the write raises.
            fd, audio_path = tempfile.mkstemp(suffix=".mp3")
            with os.fdopen(fd, "wb") as f:
                f.write(response.content)

            # Combine narration with the source video's audio track.
            video = VideoFileClip(video_in)
            narration = AudioFileClip(audio_path).volumex(narration_volume)

            if mix_mode == "replace":
                final_audio = narration
            elif mix_mode == "mix":
                if video.audio:
                    original = video.audio.volumex(original_volume)
                    final_audio = CompositeAudioClip([original, narration])
                else:
                    final_audio = narration
            else:  # "ducking" - same mix, with stronger attenuation
                if video.audio:
                    # "Ducking" here is a static volume reduction (30% of the
                    # configured original_volume), not envelope-following
                    # sidechain compression.
                    original = video.audio.volumex(original_volume * 0.3)
                    final_audio = CompositeAudioClip([original, narration])
                else:
                    final_audio = narration

            final = video.set_audio(final_audio)

            fd, output_path = tempfile.mkstemp(suffix=".mp4")
            os.close(fd)
            final.write_videofile(output_path, logger=None)

            return output_path, audio_path

        except Exception:
            # Don't leak the narration temp file on a failed run; the video
            # output temp file only exists once write_videofile succeeded.
            if audio_path is not None:
                try:
                    os.remove(audio_path)
                except OSError:
                    pass
            raise

        finally:
            # Close composites before their source clips, de-duplicated
            # because final_audio may be the same object as narration.
            # Each close is guarded so one failure can't skip the rest.
            closed: set[int] = set()
            for clip in (final, final_audio, narration, original, video):
                if clip is not None and id(clip) not in closed:
                    closed.add(id(clip))
                    try:
                        clip.close()
                    except Exception:
                        pass

    async def run(
        self,
        input_data: Input,
        *,
        credentials: APIKeyCredentials,
        **kwargs,
    ) -> BlockOutput:
        """Synthesize narration, composite it onto the video, yield file paths.

        Yields:
            ("video_out", path): video with the narration applied.
            ("audio_file", path): the generated narration MP3.

        Raises:
            BlockExecutionError: wrapping API failures or processing errors,
                with the original exception chained as the cause.
        """
        try:
            output_path, audio_path = self._generate_and_add_narration(
                credentials.api_key.get_secret_value(),
                input_data.video_in,
                input_data.script,
                input_data.voice_id,
                input_data.mix_mode,
                input_data.narration_volume,
                input_data.original_volume,
            )
            yield "video_out", output_path
            yield "audio_file", audio_path

        except requests.exceptions.RequestException as e:
            raise BlockExecutionError(
                message=f"ElevenLabs API error: {e}",
                block_name=self.name,
                block_id=str(self.id),
            ) from e
        except Exception as e:
            raise BlockExecutionError(
                message=f"Failed to add narration: {e}",
                block_name=self.name,
                block_id=str(self.id),
            ) from e