From fda10563e74fa5acb4748ad59222f03b4c3a0e76 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Mon, 9 Feb 2026 07:51:52 +0000 Subject: [PATCH] feat(blocks): add video transcription and editing blocks on dev - Add TranscribeVideoBlock and EditVideoByTextBlock to blocks/video/ - Update video/__init__.py with new block exports - Generate block documentation via generate_block_docs.py - Fix wait=False bug in Replicate API calls (was returning Prediction object instead of actual output) - Format fixes Co-authored-by: Nicholas Tindle --- .../backend/backend/blocks/video/__init__.py | 7 + .../backend/blocks/video/edit_by_text.py | 159 ++++++++++++++++++ .../backend/blocks/video/transcribe.py | 139 +++++++++++++++ .../backend/data/execution_queue_test.py | 2 - docs/integrations/README.md | 2 + docs/integrations/SUMMARY.md | 2 + .../block-integrations/video/edit_by_text.md | 37 ++++ .../block-integrations/video/transcribe.md | 34 ++++ 8 files changed, 380 insertions(+), 2 deletions(-) create mode 100644 autogpt_platform/backend/backend/blocks/video/edit_by_text.py create mode 100644 autogpt_platform/backend/backend/blocks/video/transcribe.py create mode 100644 docs/integrations/block-integrations/video/edit_by_text.md create mode 100644 docs/integrations/block-integrations/video/transcribe.md diff --git a/autogpt_platform/backend/backend/blocks/video/__init__.py b/autogpt_platform/backend/backend/blocks/video/__init__.py index 4974ae8a87..af3d42a6d8 100644 --- a/autogpt_platform/backend/backend/blocks/video/__init__.py +++ b/autogpt_platform/backend/backend/blocks/video/__init__.py @@ -9,11 +9,14 @@ This module provides blocks for: - Getting media duration - Looping videos - Adding audio to videos +- Transcribing video speech to text +- Editing videos by modifying their transcript Dependencies: - yt-dlp: For video downloading - moviepy: For video editing operations - elevenlabs: For AI narration (optional) +- replicate: For 
video transcription and text-based editing """ from backend.blocks.video.add_audio import AddAudioToVideoBlock @@ -21,14 +24,18 @@ from backend.blocks.video.clip import VideoClipBlock from backend.blocks.video.concat import VideoConcatBlock from backend.blocks.video.download import VideoDownloadBlock from backend.blocks.video.duration import MediaDurationBlock +from backend.blocks.video.edit_by_text import EditVideoByTextBlock from backend.blocks.video.loop import LoopVideoBlock from backend.blocks.video.narration import VideoNarrationBlock from backend.blocks.video.text_overlay import VideoTextOverlayBlock +from backend.blocks.video.transcribe import TranscribeVideoBlock __all__ = [ "AddAudioToVideoBlock", + "EditVideoByTextBlock", "LoopVideoBlock", "MediaDurationBlock", + "TranscribeVideoBlock", "VideoClipBlock", "VideoConcatBlock", "VideoDownloadBlock", diff --git a/autogpt_platform/backend/backend/blocks/video/edit_by_text.py b/autogpt_platform/backend/backend/blocks/video/edit_by_text.py new file mode 100644 index 0000000000..a606063fc3 --- /dev/null +++ b/autogpt_platform/backend/backend/blocks/video/edit_by_text.py @@ -0,0 +1,159 @@ +"""EditVideoByTextBlock - Edit a video by modifying its transcript via Replicate.""" + +from __future__ import annotations + +import logging +from typing import Literal + +from replicate.client import Client as ReplicateClient +from replicate.helpers import FileOutput + +from backend.blocks.replicate._auth import ( + TEST_CREDENTIALS, + TEST_CREDENTIALS_INPUT, + ReplicateCredentials, + ReplicateCredentialsInput, +) +from backend.data.block import ( + Block, + BlockCategory, + BlockOutput, + BlockSchemaInput, + BlockSchemaOutput, +) +from backend.data.execution import ExecutionContext +from backend.data.model import CredentialsField, SchemaField +from backend.util.exceptions import BlockExecutionError +from backend.util.file import MediaFileType, store_media_file + +logger = logging.getLogger(__name__) + + +class 
EditVideoByTextBlock(Block): + """Edit a video by modifying its transcript, cutting segments via Replicate API.""" + + class Input(BlockSchemaInput): + credentials: ReplicateCredentialsInput = CredentialsField( + description="Replicate API key for video editing.", + ) + video_in: MediaFileType = SchemaField( + description="Input video file to edit (URL, data URI, or local path)", + ) + transcription: str = SchemaField( + description="Desired transcript for the output video", + ) + split_at: Literal["word", "character"] = SchemaField( + description="Granularity for transcript matching", + default="word", + ) + + class Output(BlockSchemaOutput): + video_url: str = SchemaField( + description="URL of the edited video", + ) + transcription: str = SchemaField( + description="Transcription used for editing", + ) + + def __init__(self): + super().__init__( + id="98d40049-a1de-465f-bba1-47411298ad1a", + description="Edit a video by modifying its transcript", + categories={BlockCategory.MULTIMEDIA}, + input_schema=self.Input, + output_schema=self.Output, + test_input={ + "credentials": TEST_CREDENTIALS_INPUT, + "video_in": "data:video/mp4;base64,AAAA", + "transcription": "edited transcript", + }, + test_output=[ + ("video_url", "https://replicate.com/output/video.mp4"), + ("transcription", "edited transcript"), + ], + test_mock={ + "_edit_video": lambda *args: "https://replicate.com/output/video.mp4", + "_store_input_video": lambda *args, **kwargs: "data:video/mp4;base64,AAAA", + }, + test_credentials=TEST_CREDENTIALS, + ) + + async def _store_input_video( + self, execution_context: ExecutionContext, file: MediaFileType + ) -> MediaFileType: + """Store input video locally. 
Extracted for testability.""" + return await store_media_file( + file=file, + execution_context=execution_context, + return_format="for_external_api", + ) + + async def _edit_video( + self, data_uri: str, transcription: str, split_at: str, api_key: str + ) -> str: + """Call Replicate API to edit the video based on the transcript.""" + client = ReplicateClient(api_token=api_key) + + output = await client.async_run( + "jd7h/edit-video-by-editing-text:e010b880347314d07e3ce3b21cbd4c57add51fea3474677a6cb1316751c4cb90", + input={ + "mode": "edit", + "video_in": data_uri, + "transcription": transcription, + "split_at": split_at, + }, + ) + + # Get video URL from output + if isinstance(output, dict) and "video" in output: + video_output = output["video"] + if isinstance(video_output, FileOutput): + return video_output.url + return str(video_output) + + if isinstance(output, list) and len(output) > 0: + video_url = output[0] + if isinstance(video_url, FileOutput): + return video_url.url + return str(video_url) + + if isinstance(output, FileOutput): + return output.url + + if isinstance(output, str): + return output + + raise ValueError(f"Unexpected output format from Replicate API: {output}") + + async def run( + self, + input_data: Input, + *, + credentials: ReplicateCredentials, + execution_context: ExecutionContext, + **kwargs, + ) -> BlockOutput: + try: + # Store video and get data URI for API submission + data_uri = await self._store_input_video( + execution_context, input_data.video_in + ) + + video_url = await self._edit_video( + data_uri, + input_data.transcription, + input_data.split_at, + credentials.api_key.get_secret_value(), + ) + + yield "video_url", video_url + yield "transcription", input_data.transcription + + except BlockExecutionError: + raise + except Exception as e: + raise BlockExecutionError( + message=f"Failed to edit video: {e}", + block_name=self.name, + block_id=str(self.id), + ) from e diff --git 
a/autogpt_platform/backend/backend/blocks/video/transcribe.py b/autogpt_platform/backend/backend/blocks/video/transcribe.py new file mode 100644 index 0000000000..f26de3c777 --- /dev/null +++ b/autogpt_platform/backend/backend/blocks/video/transcribe.py @@ -0,0 +1,139 @@ +"""TranscribeVideoBlock - Transcribe speech from a video file using Replicate.""" + +from __future__ import annotations + +import logging + +from replicate.client import Client as ReplicateClient +from replicate.helpers import FileOutput + +from backend.blocks.replicate._auth import ( + TEST_CREDENTIALS, + TEST_CREDENTIALS_INPUT, + ReplicateCredentials, + ReplicateCredentialsInput, +) +from backend.data.block import ( + Block, + BlockCategory, + BlockOutput, + BlockSchemaInput, + BlockSchemaOutput, +) +from backend.data.execution import ExecutionContext +from backend.data.model import CredentialsField, SchemaField +from backend.util.exceptions import BlockExecutionError +from backend.util.file import MediaFileType, store_media_file + +logger = logging.getLogger(__name__) + + +class TranscribeVideoBlock(Block): + """Transcribe speech from a video file to text via Replicate API.""" + + class Input(BlockSchemaInput): + credentials: ReplicateCredentialsInput = CredentialsField( + description="Replicate API key for video transcription.", + ) + video_in: MediaFileType = SchemaField( + description="Input video file to transcribe (URL, data URI, or local path)", + ) + + class Output(BlockSchemaOutput): + transcription: str = SchemaField( + description="Text transcription extracted from the video", + ) + + def __init__(self): + super().__init__( + id="fa49dad0-a5fc-441c-ba04-2ac206e392d8", + description="Transcribe speech from a video file to text", + categories={BlockCategory.MULTIMEDIA}, + input_schema=self.Input, + output_schema=self.Output, + test_input={ + "credentials": TEST_CREDENTIALS_INPUT, + "video_in": "data:video/mp4;base64,AAAA", + }, + test_output=[("transcription", "example transcript")], + 
test_mock={ + "_transcribe": lambda *args: "example transcript", + "_store_input_video": lambda *args, **kwargs: "test.mp4", + }, + test_credentials=TEST_CREDENTIALS, + ) + + async def _store_input_video( + self, execution_context: ExecutionContext, file: MediaFileType + ) -> MediaFileType: + """Store input video locally. Extracted for testability.""" + return await store_media_file( + file=file, + execution_context=execution_context, + return_format="for_external_api", + ) + + async def _transcribe(self, data_uri: str, api_key: str) -> str: + """Call Replicate API to transcribe the video.""" + client = ReplicateClient(api_token=api_key) + + output = await client.async_run( + "jd7h/edit-video-by-editing-text:e010b880347314d07e3ce3b21cbd4c57add51fea3474677a6cb1316751c4cb90", + input={ + "mode": "transcribe", + "video_in": data_uri, + }, + ) + + # Handle dictionary response format + if isinstance(output, dict): + if "transcription" in output: + return str(output["transcription"]) + if "error" in output: + raise ValueError(f"API returned error: {output['error']}") + + # Handle list formats + if isinstance(output, list) and len(output) > 0: + if isinstance(output[0], FileOutput): + return output[0].url + if isinstance(output[0], dict) and "text" in output[0]: + return " ".join( + segment.get("text", "") for segment in output # type: ignore + ) + return str(output[0]) + + if isinstance(output, FileOutput): + return output.url + + if isinstance(output, str): + return output + + raise ValueError(f"Unexpected output format from Replicate API: {output}") + + async def run( + self, + input_data: Input, + *, + credentials: ReplicateCredentials, + execution_context: ExecutionContext, + **kwargs, + ) -> BlockOutput: + try: + # Store video and get data URI for API submission + data_uri = await self._store_input_video( + execution_context, input_data.video_in + ) + + transcript = await self._transcribe( + data_uri, credentials.api_key.get_secret_value() + ) + yield 
"transcription", transcript + + except BlockExecutionError: + raise + except Exception as e: + raise BlockExecutionError( + message=f"Failed to transcribe video: {e}", + block_name=self.name, + block_id=str(self.id), + ) from e diff --git a/autogpt_platform/backend/backend/data/execution_queue_test.py b/autogpt_platform/backend/backend/data/execution_queue_test.py index ffe0fb265b..7a76adfe05 100644 --- a/autogpt_platform/backend/backend/data/execution_queue_test.py +++ b/autogpt_platform/backend/backend/data/execution_queue_test.py @@ -3,8 +3,6 @@ import queue import threading -import pytest - from backend.data.execution import ExecutionQueue diff --git a/docs/integrations/README.md b/docs/integrations/README.md index 97a4d98709..182f3d73e6 100644 --- a/docs/integrations/README.md +++ b/docs/integrations/README.md @@ -474,8 +474,10 @@ Below is a comprehensive list of all available blocks, categorized by their prim | Block Name | Description | |------------|-------------| | [Add Audio To Video](block-integrations/video/add_audio.md#add-audio-to-video) | Block to attach an audio file to a video file using moviepy | +| [Edit Video By Text](block-integrations/video/edit_by_text.md#edit-video-by-text) | Edit a video by modifying its transcript | | [Loop Video](block-integrations/video/loop.md#loop-video) | Block to loop a video to a given duration or number of repeats | | [Media Duration](block-integrations/video/duration.md#media-duration) | Block to get the duration of a media file | +| [Transcribe Video](block-integrations/video/transcribe.md#transcribe-video) | Transcribe speech from a video file to text | | [Video Clip](block-integrations/video/clip.md#video-clip) | Extract a time segment from a video | | [Video Concat](block-integrations/video/concat.md#video-concat) | Merge multiple video clips into one continuous video | | [Video Download](block-integrations/video/download.md#video-download) | Download video from URL (YouTube, Vimeo, news sites, direct links) | 
diff --git a/docs/integrations/SUMMARY.md b/docs/integrations/SUMMARY.md index f481ae2e0a..85a0eb3802 100644 --- a/docs/integrations/SUMMARY.md +++ b/docs/integrations/SUMMARY.md @@ -133,8 +133,10 @@ * [Video Concat](block-integrations/video/concat.md) * [Video Download](block-integrations/video/download.md) * [Video Duration](block-integrations/video/duration.md) +* [Video Edit By Text](block-integrations/video/edit_by_text.md) * [Video Loop](block-integrations/video/loop.md) * [Video Narration](block-integrations/video/narration.md) * [Video Text Overlay](block-integrations/video/text_overlay.md) +* [Video Transcribe](block-integrations/video/transcribe.md) * [Wolfram LLM API](block-integrations/wolfram/llm_api.md) * [Zerobounce Validate Emails](block-integrations/zerobounce/validate_emails.md) diff --git a/docs/integrations/block-integrations/video/edit_by_text.md b/docs/integrations/block-integrations/video/edit_by_text.md new file mode 100644 index 0000000000..c1e9f500d2 --- /dev/null +++ b/docs/integrations/block-integrations/video/edit_by_text.md @@ -0,0 +1,37 @@ +# Video Edit By Text + +Blocks for editing a video by changing the text of its transcript. + + +## Edit Video By Text + +### What it is +Edit a video by modifying its transcript + +### How it works + +The block stores the input video in a form suitable for an external API and sends it, together with the desired transcript and the `split_at` granularity, to the Replicate model `jd7h/edit-video-by-editing-text` in `edit` mode, authenticated with your Replicate API key. It outputs the URL of the edited video returned by the model along with the transcript that was supplied; any failure is raised as a block execution error. + + +### Inputs + +| Input | Description | Type | Required | +|-------|-------------|------|----------| +| video_in | Input video file to edit (URL, data URI, or local path) | str (file) | Yes | +| transcription | Desired transcript for the output video | str | Yes | +| split_at | Granularity for transcript matching | "word" \| "character" | No | + +### Outputs + +| Output | Description | Type | +|--------|-------------|------| +| error | Error message if the operation failed | str | +| video_url | URL of the edited video | str | +| transcription | Transcription used for editing | str | + +### Possible use case + +Cut filler words, false starts, or off-topic passages out of a recording: transcribe it with the Transcribe Video block, delete the unwanted text from the transcript, and pass the edited transcript to this block. + + +--- diff --git
a/docs/integrations/block-integrations/video/transcribe.md b/docs/integrations/block-integrations/video/transcribe.md new file mode 100644 index 0000000000..b67c9b05bc --- /dev/null +++ b/docs/integrations/block-integrations/video/transcribe.md @@ -0,0 +1,34 @@ +# Video Transcribe + +Blocks for transcribing the speech in a video file to text. + + +## Transcribe Video + +### What it is +Transcribe speech from a video file to text + +### How it works + +The block stores the input video in a form suitable for an external API and runs the Replicate model `jd7h/edit-video-by-editing-text` in `transcribe` mode, authenticated with your Replicate API key. The transcription is extracted from the model's response and returned as text; any failure is raised as a block execution error. + + +### Inputs + +| Input | Description | Type | Required | +|-------|-------------|------|----------| +| video_in | Input video file to transcribe (URL, data URI, or local path) | str (file) | Yes | + +### Outputs + +| Output | Description | Type | +|--------|-------------|------| +| error | Error message if the operation failed | str | +| transcription | Text transcription extracted from the video | str | + +### Possible use case + +Produce a text transcript of a recorded meeting, interview, or tutorial, for example to summarize it or to edit the transcript and feed it to the Edit Video By Text block. + + +---