fix(blocks): correct imports, improve descriptions, and read FileOutput content

- Fix imports to use backend.blocks._base instead of backend.data.block
- Improve field descriptions for transcription and split_at inputs
- Read FileOutput content with aread() instead of returning its URL
- Fill in manual documentation sections for both video blocks

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-08 03:00:28 -04:00 · 2026-03-07 16:20:08 -06:00
parent b98c02278a
commit 063a379c64
4 changed files with 42 additions and 26 deletions
--- a/autogpt_platform/backend/backend/blocks/video/edit_by_text.py
+++ b/autogpt_platform/backend/backend/blocks/video/edit_by_text.py
@@ -8,19 +8,19 @@ from typing import Literal
 from replicate.client import Client as ReplicateClient
 from replicate.helpers import FileOutput

-from backend.blocks.replicate._auth import (
-    TEST_CREDENTIALS,
-    TEST_CREDENTIALS_INPUT,
-    ReplicateCredentials,
-    ReplicateCredentialsInput,
-)
-from backend.data.block import (
+from backend.blocks._base import (
    Block,
    BlockCategory,
    BlockOutput,
    BlockSchemaInput,
    BlockSchemaOutput,
 )
+from backend.blocks.replicate._auth import (
+    TEST_CREDENTIALS,
+    TEST_CREDENTIALS_INPUT,
+    ReplicateCredentials,
+    ReplicateCredentialsInput,
+)
 from backend.data.execution import ExecutionContext
 from backend.data.model import CredentialsField, SchemaField
 from backend.util.exceptions import BlockExecutionError
@@ -40,10 +40,10 @@ class EditVideoByTextBlock(Block):
            description="Input video file to edit (URL, data URI, or local path)",
        )
        transcription: str = SchemaField(
-            description="Desired transcript for the output video",
+            description="Modified transcript of the input video — segments absent from this text will be cut from the output video",
        )
        split_at: Literal["word", "character"] = SchemaField(
-            description="Granularity for transcript matching",
+            description="Alignment granularity for transcript matching: 'word' aligns cuts at word boundaries, 'character' allows finer sub-word alignment",
            default="word",
        )

--- a/autogpt_platform/backend/backend/blocks/video/transcribe.py
+++ b/autogpt_platform/backend/backend/blocks/video/transcribe.py
@@ -7,19 +7,19 @@ import logging
 from replicate.client import Client as ReplicateClient
 from replicate.helpers import FileOutput

-from backend.blocks.replicate._auth import (
-    TEST_CREDENTIALS,
-    TEST_CREDENTIALS_INPUT,
-    ReplicateCredentials,
-    ReplicateCredentialsInput,
-)
-from backend.data.block import (
+from backend.blocks._base import (
    Block,
    BlockCategory,
    BlockOutput,
    BlockSchemaInput,
    BlockSchemaOutput,
 )
+from backend.blocks.replicate._auth import (
+    TEST_CREDENTIALS,
+    TEST_CREDENTIALS_INPUT,
+    ReplicateCredentials,
+    ReplicateCredentialsInput,
+)
 from backend.data.execution import ExecutionContext
 from backend.data.model import CredentialsField, SchemaField
 from backend.util.exceptions import BlockExecutionError
@@ -95,7 +95,8 @@ class TranscribeVideoBlock(Block):
        # Handle list formats
        if isinstance(output, list) and len(output) > 0:
            if isinstance(output[0], FileOutput):
-                return output[0].url
+                content = await output[0].aread()
+                return content.decode("utf-8")
            if isinstance(output[0], dict) and "text" in output[0]:
                return " ".join(
                    segment.get("text", "") for segment in output  # type: ignore
@@ -103,7 +104,8 @@ class TranscribeVideoBlock(Block):
            return str(output[0])

        if isinstance(output, FileOutput):
-            return output.url
+            content = await output.aread()
+            return content.decode("utf-8")

        if isinstance(output, str):
            return output
--- a/docs/integrations/block-integrations/video/edit_by_text.md
+++ b/docs/integrations/block-integrations/video/edit_by_text.md
@@ -1,6 +1,6 @@
 # Video Edit By Text
 <!-- MANUAL: file_description -->
-_Add a description of this category of blocks._
+This block uses the Replicate API to edit a video by modifying its transcript — segments absent from the supplied transcript are cut from the output video.
 <!-- END MANUAL -->

 ## Edit Video By Text
@@ -8,9 +8,12 @@ _Add a description of this category of blocks._
 ### What it is
 Edit a video by modifying its transcript

+### What it does
+Takes a video and a modified version of its transcript, then produces a new video with only the segments that match the provided transcript. Any spoken segments you remove from the transcript will be cut from the output video.
+
 ### How it works
 <!-- MANUAL: how_it_works -->
-_Add technical explanation here._
+The block sends the input video and the desired transcript to the Replicate API using the `jd7h/edit-video-by-editing-text` model in "edit" mode. The model aligns the provided transcript against the original speech in the video and removes any video segments whose speech is not present in the supplied transcript. The `split_at` parameter controls alignment granularity: `word` (default) aligns cuts at word boundaries for natural-sounding edits, while `character` allows finer sub-word alignment for more precise control. The block returns the URL of the edited video along with the transcript that was used.
 <!-- END MANUAL -->

 ### Inputs
@@ -18,8 +21,8 @@ _Add technical explanation here._
 | Input | Description | Type | Required |
 |-------|-------------|------|----------|
 | video_in | Input video file to edit (URL, data URI, or local path) | str (file) | Yes |
-| transcription | Desired transcript for the output video | str | Yes |
-| split_at | Granularity for transcript matching | "word" \| "character" | No |
+| transcription | Modified transcript of the input video — segments absent from this text will be cut from the output video | str | Yes |
+| split_at | Alignment granularity for transcript matching: `word` aligns cuts at word boundaries (default), `character` allows finer sub-word alignment | "word" \| "character" | No (default: `word`) |

 ### Outputs

@@ -31,7 +34,11 @@ _Add technical explanation here._

 ### Possible use case
 <!-- MANUAL: use_case -->
-_Add practical use case examples here._
+**Interview Cleanup**: Remove filler words, false starts, or off-topic tangents from recorded interviews by editing the transcript and regenerating the video.
+
+**Content Highlights**: Extract key segments from long-form video content by keeping only the relevant portions of the transcript.
+
+**Automated Moderation**: Remove flagged or inappropriate speech segments from user-generated video content by stripping those lines from the transcript.
 <!-- END MANUAL -->

 ---
--- a/docs/integrations/block-integrations/video/transcribe.md
+++ b/docs/integrations/block-integrations/video/transcribe.md
@@ -1,6 +1,6 @@
 # Video Transcribe
 <!-- MANUAL: file_description -->
-_Add a description of this category of blocks._
+This block transcribes speech from a video file to text using the Replicate API.
 <!-- END MANUAL -->

 ## Transcribe Video
@@ -8,9 +8,12 @@ _Add a description of this category of blocks._
 ### What it is
 Transcribe speech from a video file to text

+### What it does
+Extracts spoken words from a video and returns them as a text transcription. The block accepts video input as a URL, data URI, or local path and outputs the full transcript as a string.
+
 ### How it works
 <!-- MANUAL: how_it_works -->
-_Add technical explanation here._
+The block sends the input video to the Replicate API using the `jd7h/edit-video-by-editing-text` model in "transcribe" mode. This model analyzes the audio track of the video, performs speech recognition, and returns the detected speech as text. The block handles multiple API response formats (dictionary, list, string, and file output) to reliably extract the transcript text.
 <!-- END MANUAL -->

 ### Inputs
@@ -28,7 +31,11 @@ _Add technical explanation here._

 ### Possible use case
 <!-- MANUAL: use_case -->
-_Add practical use case examples here._
+**Subtitle Generation**: Transcribe video dialogue to create subtitle or caption files for accessibility and localization.
+
+**Searchable Video Archives**: Convert speech in recorded meetings, interviews, or lectures into searchable text for indexing and retrieval.
+
+**LLM Content Pipeline**: Feed video transcripts into language models for summarization, analysis, or content repurposing workflows.
 <!-- END MANUAL -->

 ---