docs(blocks): improve video block descriptions and regenerate docs

Better "What it is" descriptions so the generated docs are self-explanatory without a separate "What it does" section. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
fix(blocks): use store_media_file for_block_output on edit video output
2026-03-17 03:00:27 -04:00 · 2026-03-07 16:45:20 -06:00 · 2026-03-07 16:35:05 -06:00 · 2026-03-07 16:20:08 -06:00 · 2026-03-04 03:15:39 -06:00 · 2026-02-09 07:51:52 +00:00
7 changed files with 406 additions and 0 deletions
--- a/autogpt_platform/backend/backend/blocks/video/__init__.py
+++ b/autogpt_platform/backend/backend/blocks/video/__init__.py
@@ -9,11 +9,14 @@ This module provides blocks for:
 - Getting media duration
 - Looping videos
 - Adding audio to videos
+- Transcribing video speech to text
+- Editing videos by modifying their transcript

 Dependencies:
 - yt-dlp: For video downloading
 - moviepy: For video editing operations
 - elevenlabs: For AI narration (optional)
+- replicate: For video transcription and text-based editing
 """

 from backend.blocks.video.add_audio import AddAudioToVideoBlock
@@ -21,14 +24,18 @@ from backend.blocks.video.clip import VideoClipBlock
 from backend.blocks.video.concat import VideoConcatBlock
 from backend.blocks.video.download import VideoDownloadBlock
 from backend.blocks.video.duration import MediaDurationBlock
+from backend.blocks.video.edit_by_text import EditVideoByTextBlock
 from backend.blocks.video.loop import LoopVideoBlock
 from backend.blocks.video.narration import VideoNarrationBlock
 from backend.blocks.video.text_overlay import VideoTextOverlayBlock
+from backend.blocks.video.transcribe import TranscribeVideoBlock

 __all__ = [
    "AddAudioToVideoBlock",
+    "EditVideoByTextBlock",
    "LoopVideoBlock",
    "MediaDurationBlock",
+    "TranscribeVideoBlock",
    "VideoClipBlock",
    "VideoConcatBlock",
    "VideoDownloadBlock",
--- a/autogpt_platform/backend/backend/blocks/video/edit_by_text.py
+++ b/autogpt_platform/backend/backend/blocks/video/edit_by_text.py
@@ -0,0 +1,175 @@
+"""EditVideoByTextBlock - Edit a video by modifying its transcript via Replicate."""
+
+from __future__ import annotations
+
+import logging
+from typing import Literal
+
+from replicate.client import Client as ReplicateClient
+from replicate.helpers import FileOutput
+
+from backend.blocks._base import (
+    Block,
+    BlockCategory,
+    BlockOutput,
+    BlockSchemaInput,
+    BlockSchemaOutput,
+)
+from backend.blocks.replicate._auth import (
+    TEST_CREDENTIALS,
+    TEST_CREDENTIALS_INPUT,
+    ReplicateCredentials,
+    ReplicateCredentialsInput,
+)
+from backend.data.execution import ExecutionContext
+from backend.data.model import CredentialsField, SchemaField
+from backend.util.exceptions import BlockExecutionError
+from backend.util.file import MediaFileType, store_media_file
+
+logger = logging.getLogger(__name__)
+
+
+class EditVideoByTextBlock(Block):
+    """Edit a video by modifying its transcript, cutting segments via Replicate API.
+
+    ``run`` prepares the input video for the external API, calls the
+    ``jd7h/edit-video-by-editing-text`` Replicate model in ``"edit"`` mode,
+    stores the resulting video via ``store_media_file`` and yields it together
+    with the transcript that was submitted.
+    """
+
+    # NOTE: schema classes are documented with ``#`` comments only, so nothing
+    # is added to the schema itself.
+    class Input(BlockSchemaInput):
+        credentials: ReplicateCredentialsInput = CredentialsField(
+            description="Replicate API key for video editing.",
+        )
+        video_in: MediaFileType = SchemaField(
+            description="Input video file to edit (URL, data URI, or local path)",
+        )
+        transcription: str = SchemaField(
+            description="Modified transcript of the input video — segments absent from this text will be cut from the output video",
+        )
+        split_at: Literal["word", "character"] = SchemaField(
+            description="Alignment granularity for transcript matching: 'word' aligns cuts at word boundaries, 'character' allows finer sub-word alignment",
+            default="word",
+        )
+
+    class Output(BlockSchemaOutput):
+        video_out: MediaFileType = SchemaField(
+            description="Edited video file (path or data URI)",
+        )
+        transcription: str = SchemaField(
+            description="Transcription used for editing",
+        )
+
+    def __init__(self):
+        super().__init__(
+            id="98d40049-a1de-465f-bba1-47411298ad1a",
+            description="Edit a video by modifying its transcript — segments you remove from the transcript are cut from the output video",
+            categories={BlockCategory.MULTIMEDIA},
+            input_schema=self.Input,
+            output_schema=self.Output,
+            test_input={
+                "credentials": TEST_CREDENTIALS_INPUT,
+                "video_in": "data:video/mp4;base64,AAAA",
+                "transcription": "edited transcript",
+            },
+            test_output=[
+                ("video_out", str),
+                ("transcription", "edited transcript"),
+            ],
+            # The three I/O helpers are replaced during block tests so that no
+            # file storage or Replicate call takes place.
+            test_mock={
+                "_edit_video": lambda *args: "https://replicate.com/output/video.mp4",
+                "_store_input_video": lambda *args, **kwargs: "data:video/mp4;base64,AAAA",
+                "_store_output_video": lambda *args, **kwargs: "edited_video.mp4",
+            },
+            test_credentials=TEST_CREDENTIALS,
+        )
+
+    async def _store_input_video(
+        self, execution_context: ExecutionContext, file: MediaFileType
+    ) -> MediaFileType:
+        """Prepare the input video for submission to the external API.
+
+        Delegates to ``store_media_file`` with
+        ``return_format="for_external_api"``. Extracted for testability.
+        """
+        return await store_media_file(
+            file=file,
+            execution_context=execution_context,
+            return_format="for_external_api",
+        )
+
+    async def _store_output_video(
+        self, execution_context: ExecutionContext, file: MediaFileType
+    ) -> MediaFileType:
+        """Store the edited video as a block output.
+
+        Delegates to ``store_media_file`` with
+        ``return_format="for_block_output"``. Extracted for testability.
+        """
+        return await store_media_file(
+            file=file,
+            execution_context=execution_context,
+            return_format="for_block_output",
+        )
+
+    @staticmethod
+    def _output_to_url(value: object) -> str:
+        """Convert a single Replicate output value into a URL string.
+
+        Args:
+            value: A ``FileOutput`` or any other value returned by the model.
+
+        Returns:
+            ``value.url`` for a ``FileOutput``, otherwise ``str(value)``.
+
+        Raises:
+            ValueError: If ``value`` is ``None`` or an empty string, which
+                would otherwise be passed on as ``"None"`` / ``""``.
+        """
+        if isinstance(value, FileOutput):
+            return value.url
+        if value is None or value == "":
+            raise ValueError(
+                f"Replicate API returned an empty video reference: {value!r}"
+            )
+        return str(value)
+
+    async def _edit_video(
+        self, data_uri: str, transcription: str, split_at: str, api_key: str
+    ) -> str:
+        """Call Replicate API to edit the video based on the transcript.
+
+        Args:
+            data_uri: Video reference sent to the model as ``video_in``.
+            transcription: Transcript the output video should match.
+            split_at: Alignment granularity sent to the model.
+            api_key: Replicate API token.
+
+        Returns:
+            URL (or string form) of the edited video.
+
+        Raises:
+            ValueError: If the response has an unexpected shape or carries an
+                empty video reference.
+        """
+        client = ReplicateClient(api_token=api_key)
+
+        output = await client.async_run(
+            "jd7h/edit-video-by-editing-text:e010b880347314d07e3ce3b21cbd4c57add51fea3474677a6cb1316751c4cb90",
+            input={
+                "mode": "edit",
+                "video_in": data_uri,
+                "transcription": transcription,
+                "split_at": split_at,
+            },
+        )
+
+        # Pick the video reference out of whichever response shape came back:
+        # {"video": ...}, a non-empty list, or a bare FileOutput / string.
+        if isinstance(output, dict) and "video" in output:
+            candidate = output["video"]
+        elif isinstance(output, list) and output:
+            candidate = output[0]
+        elif isinstance(output, (FileOutput, str)):
+            candidate = output
+        else:
+            raise ValueError(f"Unexpected output format from Replicate API: {output}")
+
+        return self._output_to_url(candidate)
+
+    async def run(
+        self,
+        input_data: Input,
+        *,
+        credentials: ReplicateCredentials,
+        execution_context: ExecutionContext,
+        **kwargs,
+    ) -> BlockOutput:
+        """Edit the input video and yield ``video_out`` and ``transcription``.
+
+        Raises:
+            BlockExecutionError: For any failure while storing files or calling
+                the API; the original exception is chained as the cause.
+        """
+        try:
+            # Store video and get data URI for API submission
+            data_uri = await self._store_input_video(
+                execution_context, input_data.video_in
+            )
+
+            video_url = await self._edit_video(
+                data_uri,
+                input_data.transcription,
+                input_data.split_at,
+                credentials.api_key.get_secret_value(),
+            )
+
+            # Store output through workspace so CoPilot gets workspace:// URIs
+            video_out = await self._store_output_video(
+                execution_context, MediaFileType(video_url)
+            )
+
+            yield "video_out", video_out
+            yield "transcription", input_data.transcription
+
+        except BlockExecutionError:
+            # Already carries block context; do not wrap it a second time.
+            raise
+        except Exception as e:
+            raise BlockExecutionError(
+                message=f"Failed to edit video: {e}",
+                block_name=self.name,
+                block_id=str(self.id),
+            ) from e
--- a/autogpt_platform/backend/backend/blocks/video/transcribe.py
+++ b/autogpt_platform/backend/backend/blocks/video/transcribe.py
@@ -0,0 +1,141 @@
+"""TranscribeVideoBlock - Transcribe speech from a video file using Replicate."""
+
+from __future__ import annotations
+
+import logging
+
+from replicate.client import Client as ReplicateClient
+from replicate.helpers import FileOutput
+
+from backend.blocks._base import (
+    Block,
+    BlockCategory,
+    BlockOutput,
+    BlockSchemaInput,
+    BlockSchemaOutput,
+)
+from backend.blocks.replicate._auth import (
+    TEST_CREDENTIALS,
+    TEST_CREDENTIALS_INPUT,
+    ReplicateCredentials,
+    ReplicateCredentialsInput,
+)
+from backend.data.execution import ExecutionContext
+from backend.data.model import CredentialsField, SchemaField
+from backend.util.exceptions import BlockExecutionError
+from backend.util.file import MediaFileType, store_media_file
+
+logger = logging.getLogger(__name__)
+
+
+class TranscribeVideoBlock(Block):
+    """Transcribe speech from a video file to text via Replicate API.
+
+    ``run`` prepares the input video for the external API, calls the
+    ``jd7h/edit-video-by-editing-text`` Replicate model in ``"transcribe"``
+    mode and yields the resulting text as ``transcription``.
+    """
+
+    # NOTE: schema classes are documented with ``#`` comments only, so nothing
+    # is added to the schema itself.
+    class Input(BlockSchemaInput):
+        credentials: ReplicateCredentialsInput = CredentialsField(
+            description="Replicate API key for video transcription.",
+        )
+        video_in: MediaFileType = SchemaField(
+            description="Input video file to transcribe (URL, data URI, or local path)",
+        )
+
+    class Output(BlockSchemaOutput):
+        transcription: str = SchemaField(
+            description="Text transcription extracted from the video",
+        )
+
+    def __init__(self):
+        super().__init__(
+            id="fa49dad0-a5fc-441c-ba04-2ac206e392d8",
+            description="Extract spoken words from a video and return them as a text transcription",
+            categories={BlockCategory.MULTIMEDIA},
+            input_schema=self.Input,
+            output_schema=self.Output,
+            test_input={
+                "credentials": TEST_CREDENTIALS_INPUT,
+                "video_in": "data:video/mp4;base64,AAAA",
+            },
+            test_output=[("transcription", "example transcript")],
+            # Both I/O helpers are replaced during block tests so that no file
+            # storage or Replicate call takes place.
+            test_mock={
+                "_transcribe": lambda *args: "example transcript",
+                "_store_input_video": lambda *args, **kwargs: "test.mp4",
+            },
+            test_credentials=TEST_CREDENTIALS,
+        )
+
+    async def _store_input_video(
+        self, execution_context: ExecutionContext, file: MediaFileType
+    ) -> MediaFileType:
+        """Prepare the input video for submission to the external API.
+
+        Delegates to ``store_media_file`` with
+        ``return_format="for_external_api"``. Extracted for testability.
+        """
+        return await store_media_file(
+            file=file,
+            execution_context=execution_context,
+            return_format="for_external_api",
+        )
+
+    @staticmethod
+    async def _read_file_output(file_output: FileOutput) -> str:
+        """Read a Replicate ``FileOutput`` and decode its bytes as UTF-8 text."""
+        content = await file_output.aread()
+        return content.decode("utf-8")
+
+    async def _transcribe(self, data_uri: str, api_key: str) -> str:
+        """Call Replicate API to transcribe the video.
+
+        Args:
+            data_uri: Video reference sent to the model as ``video_in``.
+            api_key: Replicate API token.
+
+        Returns:
+            The transcript text extracted from the model response.
+
+        Raises:
+            ValueError: If the response reports an error or has an unexpected
+                shape.
+        """
+        client = ReplicateClient(api_token=api_key)
+
+        output = await client.async_run(
+            "jd7h/edit-video-by-editing-text:e010b880347314d07e3ce3b21cbd4c57add51fea3474677a6cb1316751c4cb90",
+            input={
+                "mode": "transcribe",
+                "video_in": data_uri,
+            },
+        )
+
+        # Handle dictionary response format. A dict with neither key falls
+        # through to the generic "unexpected format" error at the end.
+        if isinstance(output, dict):
+            if "transcription" in output:
+                return str(output["transcription"])
+            if "error" in output:
+                raise ValueError(f"API returned error: {output['error']}")
+
+        # Handle list formats; the first element decides how the list is read.
+        if isinstance(output, list) and output:
+            first = output[0]
+            if isinstance(first, FileOutput):
+                return await self._read_file_output(first)
+            if isinstance(first, dict) and "text" in first:
+                # Only dict segments are joined: calling .get() on a stray
+                # non-dict entry would raise AttributeError.
+                return " ".join(
+                    segment.get("text", "")
+                    for segment in output
+                    if isinstance(segment, dict)
+                )
+            return str(first)
+
+        if isinstance(output, FileOutput):
+            return await self._read_file_output(output)
+
+        if isinstance(output, str):
+            return output
+
+        raise ValueError(f"Unexpected output format from Replicate API: {output}")
+
+    async def run(
+        self,
+        input_data: Input,
+        *,
+        credentials: ReplicateCredentials,
+        execution_context: ExecutionContext,
+        **kwargs,
+    ) -> BlockOutput:
+        """Transcribe the input video and yield ``transcription``.
+
+        Raises:
+            BlockExecutionError: For any failure while storing the file or
+                calling the API; the original exception is chained as the cause.
+        """
+        try:
+            # Store video and get data URI for API submission
+            data_uri = await self._store_input_video(
+                execution_context, input_data.video_in
+            )
+
+            transcript = await self._transcribe(
+                data_uri, credentials.api_key.get_secret_value()
+            )
+            yield "transcription", transcript
+
+        except BlockExecutionError:
+            # Already carries block context; do not wrap it a second time.
+            raise
+        except Exception as e:
+            raise BlockExecutionError(
+                message=f"Failed to transcribe video: {e}",
+                block_name=self.name,
+                block_id=str(self.id),
+            ) from e
--- a/docs/integrations/README.md
+++ b/docs/integrations/README.md
@@ -492,8 +492,10 @@ Below is a comprehensive list of all available blocks, categorized by their prim
 | Block Name | Description |
 |------------|-------------|
 | [Add Audio To Video](block-integrations/video/add_audio.md#add-audio-to-video) | Block to attach an audio file to a video file using moviepy |
+| [Edit Video By Text](block-integrations/video/edit_by_text.md#edit-video-by-text) | Edit a video by modifying its transcript — segments you remove from the transcript are cut from the output video |
 | [Loop Video](block-integrations/video/loop.md#loop-video) | Block to loop a video to a given duration or number of repeats |
 | [Media Duration](block-integrations/video/duration.md#media-duration) | Block to get the duration of a media file |
+| [Transcribe Video](block-integrations/video/transcribe.md#transcribe-video) | Extract spoken words from a video and return them as a text transcription |
 | [Video Clip](block-integrations/video/clip.md#video-clip) | Extract a time segment from a video |
 | [Video Concat](block-integrations/video/concat.md#video-concat) | Merge multiple video clips into one continuous video |
 | [Video Download](block-integrations/video/download.md#video-download) | Download video from URL (YouTube, Vimeo, news sites, direct links) |
--- a/docs/integrations/SUMMARY.md
+++ b/docs/integrations/SUMMARY.md
@@ -136,8 +136,10 @@
 * [Video Concat](block-integrations/video/concat.md)
 * [Video Download](block-integrations/video/download.md)
 * [Video Duration](block-integrations/video/duration.md)
+* [Video Edit By Text](block-integrations/video/edit_by_text.md)
 * [Video Loop](block-integrations/video/loop.md)
 * [Video Narration](block-integrations/video/narration.md)
 * [Video Text Overlay](block-integrations/video/text_overlay.md)
+* [Video Transcribe](block-integrations/video/transcribe.md)
 * [Wolfram LLM API](block-integrations/wolfram/llm_api.md)
 * [Zerobounce Validate Emails](block-integrations/zerobounce/validate_emails.md)
--- a/docs/integrations/block-integrations/video/edit_by_text.md
+++ b/docs/integrations/block-integrations/video/edit_by_text.md
@@ -0,0 +1,41 @@
+# Video Edit By Text
+<!-- MANUAL: file_description -->
+This block edits a video by modifying its transcript — segments absent from the supplied transcript are cut from the output video, powered by the Replicate API.
+<!-- END MANUAL -->
+
+## Edit Video By Text
+
+### What it is
+Edit a video by modifying its transcript — segments you remove from the transcript are cut from the output video
+
+### How it works
+<!-- MANUAL: how_it_works -->
+The block sends the input video and the desired transcript to the Replicate API using the `jd7h/edit-video-by-editing-text` model in "edit" mode. The model aligns the provided transcript against the original speech in the video and removes any video segments whose speech is not present in the supplied transcript. The `split_at` parameter controls alignment granularity: `word` (default) aligns cuts at word boundaries for natural-sounding edits, while `character` allows finer sub-word alignment for more precise control. The block returns the edited video (stored via the workspace file system) along with the transcript that was used.
+<!-- END MANUAL -->
+
+### Inputs
+
+| Input | Description | Type | Required |
+|-------|-------------|------|----------|
+| video_in | Input video file to edit (URL, data URI, or local path) | str (file) | Yes |
+| transcription | Modified transcript of the input video — segments absent from this text will be cut from the output video | str | Yes |
+| split_at | Alignment granularity for transcript matching: 'word' aligns cuts at word boundaries, 'character' allows finer sub-word alignment | "word" \| "character" | No |
+
+### Outputs
+
+| Output | Description | Type |
+|--------|-------------|------|
+| error | Error message if the operation failed | str |
+| video_out | Edited video file (path or data URI) | str (file) |
+| transcription | Transcription used for editing | str |
+
+### Possible use case
+<!-- MANUAL: use_case -->
+**Interview Cleanup**: Remove filler words, false starts, or off-topic tangents from recorded interviews by editing the transcript and regenerating the video.
+
+**Content Highlights**: Extract key segments from long-form video content by keeping only the relevant portions of the transcript.
+
+**Automated Moderation**: Remove flagged or inappropriate speech segments from user-generated video content by stripping those lines from the transcript.
+<!-- END MANUAL -->
+
+---
--- a/docs/integrations/block-integrations/video/transcribe.md
+++ b/docs/integrations/block-integrations/video/transcribe.md
@@ -0,0 +1,38 @@
+# Video Transcribe
+<!-- MANUAL: file_description -->
+This block transcribes speech from a video file to text using the Replicate API.
+<!-- END MANUAL -->
+
+## Transcribe Video
+
+### What it is
+Extract spoken words from a video and return them as a text transcription
+
+### How it works
+<!-- MANUAL: how_it_works -->
+The block sends the input video to the Replicate API using the `jd7h/edit-video-by-editing-text` model in "transcribe" mode. This model analyzes the audio track of the video, performs speech recognition, and returns the detected speech as text. The block handles multiple API response formats (dictionary, list, string, and file output) to reliably extract the transcript text.
+<!-- END MANUAL -->
+
+### Inputs
+
+| Input | Description | Type | Required |
+|-------|-------------|------|----------|
+| video_in | Input video file to transcribe (URL, data URI, or local path) | str (file) | Yes |
+
+### Outputs
+
+| Output | Description | Type |
+|--------|-------------|------|
+| error | Error message if the operation failed | str |
+| transcription | Text transcription extracted from the video | str |
+
+### Possible use case
+<!-- MANUAL: use_case -->
+**Subtitle Generation**: Transcribe video dialogue to create subtitle or caption files for accessibility and localization.
+
+**Searchable Video Archives**: Convert speech in recorded meetings, interviews, or lectures into searchable text for indexing and retrieval.
+
+**LLM Content Pipeline**: Feed video transcripts into language models for summarization, analysis, or content repurposing workflows.
+<!-- END MANUAL -->
+
+---
Author	SHA1	Message	Date
Nicholas Tindle	5797afd28b	docs(blocks): improve video block descriptions and regenerate docs Better "What it is" descriptions so the generated docs are self-explanatory without a separate "What it does" section. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-07 16:45:20 -06:00
Nicholas Tindle	ac27f1b825	fix(blocks): use store_media_file for_block_output on edit video output The edit block was returning a raw Replicate URL instead of piping the output through store_media_file with for_block_output. This broke workspace:// URI generation in CoPilot. Matches the pattern used by all sibling video blocks (clip, narration, concat, etc). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-07 16:35:05 -06:00
Nicholas Tindle	063a379c64	fix(blocks): correct imports, improve descriptions, and read FileOutput content - Fix imports to use backend.blocks._base instead of backend.data.block - Improve field descriptions for transcription and split_at inputs - Read FileOutput content with aread() instead of returning URL - Fill in manual documentation sections for both video blocks Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-07 16:20:08 -06:00
Nicholas Tindle	b98c02278a	Merge branch 'dev' into codex/add-edit-video-and-transcribe-video-blocks	2026-03-04 03:15:39 -06:00
claude[bot]	fda10563e7	feat(blocks): add video transcription and editing blocks on dev - Add TranscribeVideoBlock and EditVideoByTextBlock to blocks/video/ - Update video/__init__.py with new block exports - Generate block documentation via generate_block_docs.py - Fix wait=False bug in Replicate API calls (was returning Prediction object instead of actual output) - Format fixes Co-authored-by: Nicholas Tindle <ntindle@users.noreply.github.com>	2026-02-09 07:51:52 +00:00