mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-03-17 03:00:27 -04:00
Compare commits
5 Commits
feat/githu
...
codex/add-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5797afd28b | ||
|
|
ac27f1b825 | ||
|
|
063a379c64 | ||
|
|
b98c02278a | ||
|
|
fda10563e7 |
@@ -9,11 +9,14 @@ This module provides blocks for:
|
||||
- Getting media duration
|
||||
- Looping videos
|
||||
- Adding audio to videos
|
||||
- Transcribing video speech to text
|
||||
- Editing videos by modifying their transcript
|
||||
|
||||
Dependencies:
|
||||
- yt-dlp: For video downloading
|
||||
- moviepy: For video editing operations
|
||||
- elevenlabs: For AI narration (optional)
|
||||
- replicate: For video transcription and text-based editing
|
||||
"""
|
||||
|
||||
from backend.blocks.video.add_audio import AddAudioToVideoBlock
|
||||
@@ -21,14 +24,18 @@ from backend.blocks.video.clip import VideoClipBlock
|
||||
from backend.blocks.video.concat import VideoConcatBlock
|
||||
from backend.blocks.video.download import VideoDownloadBlock
|
||||
from backend.blocks.video.duration import MediaDurationBlock
|
||||
from backend.blocks.video.edit_by_text import EditVideoByTextBlock
|
||||
from backend.blocks.video.loop import LoopVideoBlock
|
||||
from backend.blocks.video.narration import VideoNarrationBlock
|
||||
from backend.blocks.video.text_overlay import VideoTextOverlayBlock
|
||||
from backend.blocks.video.transcribe import TranscribeVideoBlock
|
||||
|
||||
__all__ = [
|
||||
"AddAudioToVideoBlock",
|
||||
"EditVideoByTextBlock",
|
||||
"LoopVideoBlock",
|
||||
"MediaDurationBlock",
|
||||
"TranscribeVideoBlock",
|
||||
"VideoClipBlock",
|
||||
"VideoConcatBlock",
|
||||
"VideoDownloadBlock",
|
||||
|
||||
175
autogpt_platform/backend/backend/blocks/video/edit_by_text.py
Normal file
175
autogpt_platform/backend/backend/blocks/video/edit_by_text.py
Normal file
@@ -0,0 +1,175 @@
|
||||
"""EditVideoByTextBlock - Edit a video by modifying its transcript via Replicate."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Literal
|
||||
|
||||
from replicate.client import Client as ReplicateClient
|
||||
from replicate.helpers import FileOutput
|
||||
|
||||
from backend.blocks._base import (
|
||||
Block,
|
||||
BlockCategory,
|
||||
BlockOutput,
|
||||
BlockSchemaInput,
|
||||
BlockSchemaOutput,
|
||||
)
|
||||
from backend.blocks.replicate._auth import (
|
||||
TEST_CREDENTIALS,
|
||||
TEST_CREDENTIALS_INPUT,
|
||||
ReplicateCredentials,
|
||||
ReplicateCredentialsInput,
|
||||
)
|
||||
from backend.data.execution import ExecutionContext
|
||||
from backend.data.model import CredentialsField, SchemaField
|
||||
from backend.util.exceptions import BlockExecutionError
|
||||
from backend.util.file import MediaFileType, store_media_file
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EditVideoByTextBlock(Block):
    """Edit a video by modifying its transcript, cutting segments via Replicate API.

    The block stores the input video, submits it together with the edited
    transcript to the ``jd7h/edit-video-by-editing-text`` Replicate model in
    ``"edit"`` mode, stores the resulting video, and yields the stored video
    plus the transcript that was used.
    """

    class Input(BlockSchemaInput):
        # Replicate credentials used to authenticate the API call.
        credentials: ReplicateCredentialsInput = CredentialsField(
            description="Replicate API key for video editing.",
        )
        # Source video to be edited.
        video_in: MediaFileType = SchemaField(
            description="Input video file to edit (URL, data URI, or local path)",
        )
        # Edited transcript; it is forwarded verbatim to the model.
        transcription: str = SchemaField(
            description="Modified transcript of the input video — segments absent from this text will be cut from the output video",
        )
        # Alignment granularity forwarded to the model as ``split_at``.
        split_at: Literal["word", "character"] = SchemaField(
            description="Alignment granularity for transcript matching: 'word' aligns cuts at word boundaries, 'character' allows finer sub-word alignment",
            default="word",
        )

    class Output(BlockSchemaOutput):
        # Stored form of the edited video returned by the model.
        video_out: MediaFileType = SchemaField(
            description="Edited video file (path or data URI)",
        )
        # Echo of the transcript supplied on input.
        transcription: str = SchemaField(
            description="Transcription used for editing",
        )

    def __init__(self):
        """Register the block's id, schemas and test fixtures with the base class."""
        super().__init__(
            id="98d40049-a1de-465f-bba1-47411298ad1a",
            description="Edit a video by modifying its transcript — segments you remove from the transcript are cut from the output video",
            categories={BlockCategory.MULTIMEDIA},
            input_schema=self.Input,
            output_schema=self.Output,
            test_input={
                "credentials": TEST_CREDENTIALS_INPUT,
                "video_in": "data:video/mp4;base64,AAAA",
                "transcription": "edited transcript",
            },
            test_output=[
                ("video_out", str),
                ("transcription", "edited transcript"),
            ],
            # The three I/O helpers are replaced in tests so that no network
            # or storage access happens.
            test_mock={
                "_edit_video": lambda *args: "https://replicate.com/output/video.mp4",
                "_store_input_video": lambda *args, **kwargs: "data:video/mp4;base64,AAAA",
                "_store_output_video": lambda *args, **kwargs: "edited_video.mp4",
            },
            test_credentials=TEST_CREDENTIALS,
        )

    async def _store_input_video(
        self, execution_context: ExecutionContext, file: MediaFileType
    ) -> MediaFileType:
        """Store the input video in the form used for external API calls.

        Extracted into its own method so tests can mock it.
        """
        return await store_media_file(
            file=file,
            execution_context=execution_context,
            return_format="for_external_api",
        )

    async def _store_output_video(
        self, execution_context: ExecutionContext, file: MediaFileType
    ) -> MediaFileType:
        """Store the edited video in the form used for block outputs.

        Extracted into its own method so tests can mock it.
        """
        return await store_media_file(
            file=file,
            execution_context=execution_context,
            return_format="for_block_output",
        )

    @staticmethod
    def _as_url(value: object) -> str:
        """Return the URL of a Replicate output item (``FileOutput`` or other)."""
        if isinstance(value, FileOutput):
            return value.url
        return str(value)

    async def _edit_video(
        self, data_uri: str, transcription: str, split_at: str, api_key: str
    ) -> str:
        """Call the Replicate API to edit the video based on the transcript.

        Args:
            data_uri: Video reference produced by ``_store_input_video``.
            transcription: Edited transcript forwarded to the model.
            split_at: Alignment granularity forwarded to the model.
            api_key: Replicate API token.

        Returns:
            The URL (or string form) of the edited video.

        Raises:
            ValueError: If the API reports an error or the response has a
                shape this method does not recognise.
        """
        client = ReplicateClient(api_token=api_key)

        output = await client.async_run(
            "jd7h/edit-video-by-editing-text:e010b880347314d07e3ce3b21cbd4c57add51fea3474677a6cb1316751c4cb90",
            input={
                "mode": "edit",
                "video_in": data_uri,
                "transcription": transcription,
                "split_at": split_at,
            },
        )

        # Dictionary response: prefer the video, otherwise surface an explicit
        # API error (same handling as the transcription block).
        if isinstance(output, dict):
            if "video" in output:
                return self._as_url(output["video"])
            if "error" in output:
                raise ValueError(f"API returned error: {output['error']}")

        # List response: the first element is taken as the video.
        if isinstance(output, list) and len(output) > 0:
            return self._as_url(output[0])

        if isinstance(output, FileOutput):
            return output.url

        if isinstance(output, str):
            return output

        raise ValueError(f"Unexpected output format from Replicate API: {output}")

    async def run(
        self,
        input_data: Input,
        *,
        credentials: ReplicateCredentials,
        execution_context: ExecutionContext,
        **kwargs,
    ) -> BlockOutput:
        """Edit the video and yield the stored result and the transcript.

        Yields:
            ``("video_out", <stored video>)`` followed by
            ``("transcription", <input transcript>)``.

        Raises:
            BlockExecutionError: On any failure; errors that are not already a
                ``BlockExecutionError`` are wrapped with the block name and id.
        """
        try:
            # Store video and get data URI for API submission
            data_uri = await self._store_input_video(
                execution_context, input_data.video_in
            )

            video_url = await self._edit_video(
                data_uri,
                input_data.transcription,
                input_data.split_at,
                credentials.api_key.get_secret_value(),
            )

            # Store output through workspace so CoPilot gets workspace:// URIs
            video_out = await self._store_output_video(
                execution_context, MediaFileType(video_url)
            )

            yield "video_out", video_out
            yield "transcription", input_data.transcription

        except BlockExecutionError:
            # Already in the block's error type; do not double-wrap.
            raise
        except Exception as e:
            raise BlockExecutionError(
                message=f"Failed to edit video: {e}",
                block_name=self.name,
                block_id=str(self.id),
            ) from e
|
||||
141
autogpt_platform/backend/backend/blocks/video/transcribe.py
Normal file
141
autogpt_platform/backend/backend/blocks/video/transcribe.py
Normal file
@@ -0,0 +1,141 @@
|
||||
"""TranscribeVideoBlock - Transcribe speech from a video file using Replicate."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from replicate.client import Client as ReplicateClient
|
||||
from replicate.helpers import FileOutput
|
||||
|
||||
from backend.blocks._base import (
|
||||
Block,
|
||||
BlockCategory,
|
||||
BlockOutput,
|
||||
BlockSchemaInput,
|
||||
BlockSchemaOutput,
|
||||
)
|
||||
from backend.blocks.replicate._auth import (
|
||||
TEST_CREDENTIALS,
|
||||
TEST_CREDENTIALS_INPUT,
|
||||
ReplicateCredentials,
|
||||
ReplicateCredentialsInput,
|
||||
)
|
||||
from backend.data.execution import ExecutionContext
|
||||
from backend.data.model import CredentialsField, SchemaField
|
||||
from backend.util.exceptions import BlockExecutionError
|
||||
from backend.util.file import MediaFileType, store_media_file
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TranscribeVideoBlock(Block):
    """Transcribe speech from a video file to text via Replicate API.

    The block stores the input video, submits it to the
    ``jd7h/edit-video-by-editing-text`` Replicate model in ``"transcribe"``
    mode, and yields the transcript text extracted from the response.
    """

    class Input(BlockSchemaInput):
        # Replicate credentials used to authenticate the API call.
        credentials: ReplicateCredentialsInput = CredentialsField(
            description="Replicate API key for video transcription.",
        )
        # Source video whose speech is transcribed.
        video_in: MediaFileType = SchemaField(
            description="Input video file to transcribe (URL, data URI, or local path)",
        )

    class Output(BlockSchemaOutput):
        # Transcript text extracted from the API response.
        transcription: str = SchemaField(
            description="Text transcription extracted from the video",
        )

    def __init__(self):
        """Register the block's id, schemas and test fixtures with the base class."""
        super().__init__(
            id="fa49dad0-a5fc-441c-ba04-2ac206e392d8",
            description="Extract spoken words from a video and return them as a text transcription",
            categories={BlockCategory.MULTIMEDIA},
            input_schema=self.Input,
            output_schema=self.Output,
            test_input={
                "credentials": TEST_CREDENTIALS_INPUT,
                "video_in": "data:video/mp4;base64,AAAA",
            },
            test_output=[("transcription", "example transcript")],
            # Both I/O helpers are replaced in tests so that no network or
            # storage access happens.
            test_mock={
                "_transcribe": lambda *args: "example transcript",
                "_store_input_video": lambda *args, **kwargs: "test.mp4",
            },
            test_credentials=TEST_CREDENTIALS,
        )

    async def _store_input_video(
        self, execution_context: ExecutionContext, file: MediaFileType
    ) -> MediaFileType:
        """Store the input video in the form used for external API calls.

        Extracted into its own method so tests can mock it.
        """
        return await store_media_file(
            file=file,
            execution_context=execution_context,
            return_format="for_external_api",
        )

    async def _transcribe(self, data_uri: str, api_key: str) -> str:
        """Call the Replicate API to transcribe the video.

        Args:
            data_uri: Video reference produced by ``_store_input_video``.
            api_key: Replicate API token.

        Returns:
            The transcript text.

        Raises:
            ValueError: If the API reports an error or the response has a
                shape this method does not recognise.
        """
        client = ReplicateClient(api_token=api_key)

        output = await client.async_run(
            "jd7h/edit-video-by-editing-text:e010b880347314d07e3ce3b21cbd4c57add51fea3474677a6cb1316751c4cb90",
            input={
                "mode": "transcribe",
                "video_in": data_uri,
            },
        )

        # Handle dictionary response format
        if isinstance(output, dict):
            if "transcription" in output:
                return str(output["transcription"])
            if "error" in output:
                raise ValueError(f"API returned error: {output['error']}")

        # Handle list formats
        if isinstance(output, list) and len(output) > 0:
            first = output[0]
            if isinstance(first, FileOutput):
                content = await first.aread()
                return content.decode("utf-8")
            if isinstance(first, dict) and "text" in first:
                # Skip non-dict elements instead of failing on ``.get``.
                return " ".join(
                    segment.get("text", "")
                    for segment in output
                    if isinstance(segment, dict)
                )
            if all(isinstance(chunk, str) for chunk in output):
                # A list of text chunks: concatenate all of them so that the
                # transcript is not truncated to its first chunk.
                return "".join(output)
            return str(first)

        if isinstance(output, FileOutput):
            content = await output.aread()
            return content.decode("utf-8")

        if isinstance(output, str):
            return output

        raise ValueError(f"Unexpected output format from Replicate API: {output}")

    async def run(
        self,
        input_data: Input,
        *,
        credentials: ReplicateCredentials,
        execution_context: ExecutionContext,
        **kwargs,
    ) -> BlockOutput:
        """Transcribe the video and yield the transcript.

        Yields:
            ``("transcription", <transcript text>)``.

        Raises:
            BlockExecutionError: On any failure; errors that are not already a
                ``BlockExecutionError`` are wrapped with the block name and id.
        """
        try:
            # Store video and get data URI for API submission
            data_uri = await self._store_input_video(
                execution_context, input_data.video_in
            )

            transcript = await self._transcribe(
                data_uri, credentials.api_key.get_secret_value()
            )
            yield "transcription", transcript

        except BlockExecutionError:
            # Already in the block's error type; do not double-wrap.
            raise
        except Exception as e:
            raise BlockExecutionError(
                message=f"Failed to transcribe video: {e}",
                block_name=self.name,
                block_id=str(self.id),
            ) from e
|
||||
@@ -492,8 +492,10 @@ Below is a comprehensive list of all available blocks, categorized by their prim
|
||||
| Block Name | Description |
|
||||
|------------|-------------|
|
||||
| [Add Audio To Video](block-integrations/video/add_audio.md#add-audio-to-video) | Block to attach an audio file to a video file using moviepy |
|
||||
| [Edit Video By Text](block-integrations/video/edit_by_text.md#edit-video-by-text) | Edit a video by modifying its transcript — segments you remove from the transcript are cut from the output video |
|
||||
| [Loop Video](block-integrations/video/loop.md#loop-video) | Block to loop a video to a given duration or number of repeats |
|
||||
| [Media Duration](block-integrations/video/duration.md#media-duration) | Block to get the duration of a media file |
|
||||
| [Transcribe Video](block-integrations/video/transcribe.md#transcribe-video) | Extract spoken words from a video and return them as a text transcription |
|
||||
| [Video Clip](block-integrations/video/clip.md#video-clip) | Extract a time segment from a video |
|
||||
| [Video Concat](block-integrations/video/concat.md#video-concat) | Merge multiple video clips into one continuous video |
|
||||
| [Video Download](block-integrations/video/download.md#video-download) | Download video from URL (YouTube, Vimeo, news sites, direct links) |
|
||||
|
||||
@@ -136,8 +136,10 @@
|
||||
* [Video Concat](block-integrations/video/concat.md)
|
||||
* [Video Download](block-integrations/video/download.md)
|
||||
* [Video Duration](block-integrations/video/duration.md)
|
||||
* [Video Edit By Text](block-integrations/video/edit_by_text.md)
|
||||
* [Video Loop](block-integrations/video/loop.md)
|
||||
* [Video Narration](block-integrations/video/narration.md)
|
||||
* [Video Text Overlay](block-integrations/video/text_overlay.md)
|
||||
* [Video Transcribe](block-integrations/video/transcribe.md)
|
||||
* [Wolfram LLM API](block-integrations/wolfram/llm_api.md)
|
||||
* [Zerobounce Validate Emails](block-integrations/zerobounce/validate_emails.md)
|
||||
|
||||
41
docs/integrations/block-integrations/video/edit_by_text.md
Normal file
41
docs/integrations/block-integrations/video/edit_by_text.md
Normal file
@@ -0,0 +1,41 @@
|
||||
# Video Edit By Text
|
||||
<!-- MANUAL: file_description -->
|
||||
This block edits a video by modifying its transcript — segments absent from the supplied transcript are cut from the output video, powered by the Replicate API.
|
||||
<!-- END MANUAL -->
|
||||
|
||||
## Edit Video By Text
|
||||
|
||||
### What it is
|
||||
Edit a video by modifying its transcript — segments you remove from the transcript are cut from the output video
|
||||
|
||||
### How it works
|
||||
<!-- MANUAL: how_it_works -->
|
||||
The block sends the input video and the desired transcript to the Replicate API using the `jd7h/edit-video-by-editing-text` model in "edit" mode. The model aligns the provided transcript against the original speech in the video and removes any video segments whose speech is not present in the supplied transcript. The `split_at` parameter controls alignment granularity: `word` (default) aligns cuts at word boundaries for natural-sounding edits, while `character` allows finer sub-word alignment for more precise control. The block returns the edited video (stored via the workspace file system) along with the transcript that was used.
|
||||
<!-- END MANUAL -->
|
||||
|
||||
### Inputs
|
||||
|
||||
| Input | Description | Type | Required |
|
||||
|-------|-------------|------|----------|
|
||||
| video_in | Input video file to edit (URL, data URI, or local path) | str (file) | Yes |
|
||||
| transcription | Modified transcript of the input video — segments absent from this text will be cut from the output video | str | Yes |
|
||||
| split_at | Alignment granularity for transcript matching: 'word' aligns cuts at word boundaries, 'character' allows finer sub-word alignment | "word" \| "character" | No |
|
||||
|
||||
### Outputs
|
||||
|
||||
| Output | Description | Type |
|
||||
|--------|-------------|------|
|
||||
| error | Error message if the operation failed | str |
|
||||
| video_out | Edited video file (path or data URI) | str (file) |
|
||||
| transcription | Transcription used for editing | str |
|
||||
|
||||
### Possible use case
|
||||
<!-- MANUAL: use_case -->
|
||||
**Interview Cleanup**: Remove filler words, false starts, or off-topic tangents from recorded interviews by editing the transcript and regenerating the video.
|
||||
|
||||
**Content Highlights**: Extract key segments from long-form video content by keeping only the relevant portions of the transcript.
|
||||
|
||||
**Automated Moderation**: Remove flagged or inappropriate speech segments from user-generated video content by stripping those lines from the transcript.
|
||||
<!-- END MANUAL -->
|
||||
|
||||
---
|
||||
38
docs/integrations/block-integrations/video/transcribe.md
Normal file
38
docs/integrations/block-integrations/video/transcribe.md
Normal file
@@ -0,0 +1,38 @@
|
||||
# Video Transcribe
|
||||
<!-- MANUAL: file_description -->
|
||||
This block transcribes speech from a video file to text using the Replicate API.
|
||||
<!-- END MANUAL -->
|
||||
|
||||
## Transcribe Video
|
||||
|
||||
### What it is
|
||||
Extract spoken words from a video and return them as a text transcription
|
||||
|
||||
### How it works
|
||||
<!-- MANUAL: how_it_works -->
|
||||
The block sends the input video to the Replicate API using the `jd7h/edit-video-by-editing-text` model in "transcribe" mode. This model analyzes the audio track of the video, performs speech recognition, and returns the detected speech as text. The block handles multiple API response formats (dictionary, list, string, and file output) to reliably extract the transcript text.
|
||||
<!-- END MANUAL -->
|
||||
|
||||
### Inputs
|
||||
|
||||
| Input | Description | Type | Required |
|
||||
|-------|-------------|------|----------|
|
||||
| video_in | Input video file to transcribe (URL, data URI, or local path) | str (file) | Yes |
|
||||
|
||||
### Outputs
|
||||
|
||||
| Output | Description | Type |
|
||||
|--------|-------------|------|
|
||||
| error | Error message if the operation failed | str |
|
||||
| transcription | Text transcription extracted from the video | str |
|
||||
|
||||
### Possible use case
|
||||
<!-- MANUAL: use_case -->
|
||||
**Subtitle Generation**: Transcribe video dialogue to create subtitle or caption files for accessibility and localization.
|
||||
|
||||
**Searchable Video Archives**: Convert speech in recorded meetings, interviews, or lectures into searchable text for indexing and retrieval.
|
||||
|
||||
**LLM Content Pipeline**: Feed video transcripts into language models for summarization, analysis, or content repurposing workflows.
|
||||
<!-- END MANUAL -->
|
||||
|
||||
---
|
||||
Reference in New Issue
Block a user