Mirror of https://github.com/Significant-Gravitas/AutoGPT.git, synced 2026-01-23 05:57:58 -05:00
feat(blocks): add VideoNarrationBlock
- Move imports to top level
- Use tempfile for secure temp paths
- Add exception chaining (from e)
- Close AudioFileClip in finally block
- Document that ducking = reduced volume mix
- Extract helper method for test mocking (see the test sketch after the diff)
- Proper resource cleanup
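On the "ducking = reduced volume mix" point: in the file below, ducking reuses the mix path but scales the original track by an extra fixed factor of 0.3 before compositing it with the narration. A minimal sketch of the resulting volume math (the 0.3 factor is taken from the code below; the helper name is illustrative, not part of the commit):

def effective_original_volume(mix_mode: str, original_volume: float = 0.3) -> float:
    """Scale applied to the original audio track per mix mode, mirroring
    the branch logic in _generate_and_add_narration below."""
    if mix_mode == "replace":
        return 0.0                    # original audio is dropped entirely
    if mix_mode == "mix":
        return original_volume        # 0.30 with the default setting
    if mix_mode == "ducking":
        return original_volume * 0.3  # 0.09 with the default, well under the narration
    raise ValueError(f"unknown mix_mode: {mix_mode}")

With the defaults, "mix" keeps the original audio at 30% and "ducking" pushes it down to 9%, while the narration itself plays at narration_volume (1.0 by default).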
autogpt_platform/backend/backend/blocks/video/narration.py (new file, 190 lines added)
@@ -0,0 +1,190 @@
"""VideoNarrationBlock - Generate AI voice narration and add to video."""

import os
import tempfile
import uuid
from typing import Literal

import requests
from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip

from backend.data.block import Block, BlockCategory, BlockOutput
from backend.data.block import BlockSchemaInput, BlockSchemaOutput
from backend.data.model import SchemaField, CredentialsMetaInput, APIKeyCredentials
from backend.integrations.providers import ProviderName
from backend.util.exceptions import BlockExecutionError

class VideoNarrationBlock(Block):
    """Generate AI narration and add to video."""

    class Input(BlockSchemaInput):
        credentials: CredentialsMetaInput[
            Literal[ProviderName.ELEVENLABS], Literal["api_key"]
        ] = SchemaField(
            description="ElevenLabs API key for voice synthesis"
        )
        video_in: str = SchemaField(
            description="Input video file",
            json_schema_extra={"format": "file"}
        )
        script: str = SchemaField(
            description="Narration script text"
        )
        voice_id: str = SchemaField(
            description="ElevenLabs voice ID",
            default="21m00Tcm4TlvDq8ikWAM"  # Rachel
        )
        mix_mode: Literal["replace", "mix", "ducking"] = SchemaField(
            description="How to combine with original audio. 'ducking' applies stronger attenuation than 'mix'.",
            default="ducking"
        )
        narration_volume: float = SchemaField(
            description="Narration volume (0.0 to 2.0)",
            default=1.0,
            ge=0.0,
            le=2.0,
            advanced=True
        )
        original_volume: float = SchemaField(
            description="Original audio volume when mixing (0.0 to 1.0)",
            default=0.3,
            ge=0.0,
            le=1.0,
            advanced=True
        )

    class Output(BlockSchemaOutput):
        video_out: str = SchemaField(
            description="Video with narration",
            json_schema_extra={"format": "file"}
        )
        audio_file: str = SchemaField(
            description="Generated audio file",
            json_schema_extra={"format": "file"}
        )

    def __init__(self):
        super().__init__(
            id="e5f6a7b8-c9d0-1234-ef56-789012345678",
            description="Generate AI narration and add to video",
            categories={BlockCategory.MULTIMEDIA, BlockCategory.AI},
            input_schema=self.Input,
            output_schema=self.Output,
            test_input={
                "video_in": "/tmp/test.mp4",
                "script": "Hello world",
                "credentials": {"provider": "elevenlabs", "id": "test", "type": "api_key"}
            },
            test_output=[("video_out", str), ("audio_file", str)],
            test_mock={"_generate_and_add_narration": lambda *args: ("/tmp/narrated.mp4", "/tmp/audio.mp3")}
        )

    def _generate_and_add_narration(
        self,
        api_key: str,
        video_in: str,
        script: str,
        voice_id: str,
        mix_mode: str,
        narration_volume: float,
        original_volume: float,
    ) -> tuple[str, str]:
        """Generate narration and add to video. Extracted for testability."""
        video = None
        final = None
        narration = None
        original = None

        try:
            # Generate narration via ElevenLabs
            response = requests.post(
                f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}",
                headers={
                    "xi-api-key": api_key,
                    "Content-Type": "application/json"
                },
                json={
                    "text": script,
                    "model_id": "eleven_monolingual_v1"
                },
                timeout=120
            )
            response.raise_for_status()

            fd, audio_path = tempfile.mkstemp(suffix=".mp3")
            with os.fdopen(fd, "wb") as f:
                f.write(response.content)

            # Combine with video
            video = VideoFileClip(video_in)
            narration = AudioFileClip(audio_path)
            narration = narration.volumex(narration_volume)

            if mix_mode == "replace":
                final_audio = narration
            elif mix_mode == "mix":
                if video.audio:
                    original = video.audio.volumex(original_volume)
                    final_audio = CompositeAudioClip([original, narration])
                else:
                    final_audio = narration
            else:  # ducking - apply stronger attenuation
                if video.audio:
                    # Ducking uses a much lower volume for original audio
                    ducking_volume = original_volume * 0.3
                    original = video.audio.volumex(ducking_volume)
                    final_audio = CompositeAudioClip([original, narration])
                else:
                    final_audio = narration

            final = video.set_audio(final_audio)

            fd, output_path = tempfile.mkstemp(suffix=".mp4")
            os.close(fd)
            final.write_videofile(output_path, logger=None)

            return output_path, audio_path

        finally:
            if original:
                original.close()
            if narration:
                narration.close()
            if final:
                final.close()
            if video:
                video.close()

    async def run(
        self,
        input_data: Input,
        *,
        credentials: APIKeyCredentials,
        **kwargs
    ) -> BlockOutput:
        try:
            output_path, audio_path = self._generate_and_add_narration(
                credentials.api_key.get_secret_value(),
                input_data.video_in,
                input_data.script,
                input_data.voice_id,
                input_data.mix_mode,
                input_data.narration_volume,
                input_data.original_volume,
            )
            yield "video_out", output_path
            yield "audio_file", audio_path

        except requests.exceptions.RequestException as e:
            raise BlockExecutionError(
                message=f"ElevenLabs API error: {e}",
                block_name=self.name,
                block_id=str(self.id)
            ) from e
        except Exception as e:
            raise BlockExecutionError(
                message=f"Failed to add narration: {e}",
                block_name=self.name,
                block_id=str(self.id)
            ) from e
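A hedged test sketch (not part of this commit) illustrating the "extract helper method for test mocking" point from the commit message: patching _generate_and_add_narration lets run() be exercised without calling ElevenLabs or rendering video. It assumes pytest with pytest-asyncio and unittest.mock; the APIKeyCredentials constructor arguments and the test function name are assumptions, not taken from the repository.

import pytest
from unittest.mock import patch
from pydantic import SecretStr

from backend.blocks.video.narration import VideoNarrationBlock
from backend.data.model import APIKeyCredentials


@pytest.mark.asyncio
async def test_run_yields_paths_without_network():
    block = VideoNarrationBlock()
    # Assumed field names for APIKeyCredentials; adjust to the real model.
    creds = APIKeyCredentials(
        id="test", provider="elevenlabs", api_key=SecretStr("fake"), title="test"
    )
    # Stub the extracted helper so no HTTP request or moviepy work happens.
    with patch.object(
        VideoNarrationBlock,
        "_generate_and_add_narration",
        return_value=("/tmp/narrated.mp4", "/tmp/audio.mp3"),
    ):
        outputs = [
            out
            async for out in block.run(
                VideoNarrationBlock.Input(
                    credentials={"provider": "elevenlabs", "id": "test", "type": "api_key"},
                    video_in="/tmp/test.mp4",
                    script="Hello world",
                ),
                credentials=creds,
            )
        ]
    assert ("video_out", "/tmp/narrated.mp4") in outputs
    assert ("audio_file", "/tmp/audio.mp3") in outputs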