mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-01-20 20:48:11 -05:00
Compare commits
2 Commits
testing-cl
...
swiftyos/a
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4e2bcebbc6 | ||
|
|
0d1aafbf10 |
@@ -0,0 +1,48 @@
|
||||
"""
|
||||
ElevenLabs integration blocks for AutoGPT Platform.
|
||||
"""
|
||||
|
||||
# Speech generation blocks
|
||||
from .speech import (
|
||||
ElevenLabsGenerateSpeechBlock,
|
||||
ElevenLabsGenerateSpeechWithTimestampsBlock,
|
||||
)
|
||||
|
||||
# Speech-to-text blocks
|
||||
from .transcription import (
|
||||
ElevenLabsTranscribeAudioAsyncBlock,
|
||||
ElevenLabsTranscribeAudioSyncBlock,
|
||||
)
|
||||
|
||||
# Webhook trigger blocks
|
||||
from .triggers import ElevenLabsWebhookTriggerBlock
|
||||
|
||||
# Utility blocks
|
||||
from .utility import ElevenLabsGetUsageStatsBlock, ElevenLabsListModelsBlock
|
||||
|
||||
# Voice management blocks
|
||||
from .voices import (
|
||||
ElevenLabsCreateVoiceCloneBlock,
|
||||
ElevenLabsDeleteVoiceBlock,
|
||||
ElevenLabsGetVoiceDetailsBlock,
|
||||
ElevenLabsListVoicesBlock,
|
||||
)
|
||||
|
||||
# Public API of the ElevenLabs block package, grouped by feature area.
# Order matters only for readability; names must match the classes
# imported above.
__all__ = [
    # Voice management (list / inspect / clone / delete voices)
    "ElevenLabsListVoicesBlock",
    "ElevenLabsGetVoiceDetailsBlock",
    "ElevenLabsCreateVoiceCloneBlock",
    "ElevenLabsDeleteVoiceBlock",
    # Speech generation (text-to-speech)
    "ElevenLabsGenerateSpeechBlock",
    "ElevenLabsGenerateSpeechWithTimestampsBlock",
    # Speech-to-text (transcription)
    "ElevenLabsTranscribeAudioSyncBlock",
    "ElevenLabsTranscribeAudioAsyncBlock",
    # Utility (models catalogue, usage/billing stats)
    "ElevenLabsListModelsBlock",
    "ElevenLabsGetUsageStatsBlock",
    # Webhook triggers (events pushed by ElevenLabs)
    "ElevenLabsWebhookTriggerBlock",
]
|
||||
@@ -0,0 +1,16 @@
|
||||
"""
|
||||
Shared configuration for all ElevenLabs blocks using the SDK pattern.
|
||||
"""
|
||||
|
||||
from backend.sdk import BlockCostType, ProviderBuilder
|
||||
|
||||
from ._webhook import ElevenLabsWebhookManager
|
||||
|
||||
# Configure the ElevenLabs provider with API key authentication
|
||||
# Shared ElevenLabs provider: API-key auth, dashboard-managed webhooks, and a
# flat per-run base cost applied to every block that uses these credentials.
_builder = ProviderBuilder("elevenlabs")
_builder = _builder.with_api_key("ELEVENLABS_API_KEY", "ElevenLabs API Key")
_builder = _builder.with_webhook_manager(ElevenLabsWebhookManager)
# Base cost of 2 charged once per block run.
_builder = _builder.with_base_cost(2, BlockCostType.RUN)
elevenlabs = _builder.build()
|
||||
@@ -0,0 +1,82 @@
|
||||
"""
|
||||
ElevenLabs webhook manager for handling webhook events.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import hmac
|
||||
from typing import Tuple
|
||||
|
||||
from backend.data.model import Credentials
|
||||
from backend.sdk import BaseWebhooksManager, ProviderName, Webhook
|
||||
|
||||
|
||||
class ElevenLabsWebhookManager(BaseWebhooksManager):
    """Webhook manager for ElevenLabs events.

    ElevenLabs does not expose a webhook-management API: endpoints are
    configured manually in the ElevenLabs dashboard. The register/deregister
    hooks below are therefore placeholders that only carry setup instructions;
    payload/signature validation is the real work done here.
    """

    PROVIDER_NAME = ProviderName("elevenlabs")

    @classmethod
    async def validate_payload(cls, webhook: Webhook, request) -> tuple[dict, str]:
        """Validate an incoming webhook request and extract its event type.

        Returns:
            (payload, event_type): the parsed JSON body and the value of its
            "type" field ("unknown" when absent).

        Raises:
            ValueError: when a signing secret is configured and the request's
                signature header is missing or does not match the HMAC of the
                raw body.
        """
        payload = await request.json()

        # Verify the HMAC signature when a secret is configured for this hook.
        if webhook.secret:
            webhook_secret = webhook.config.get("webhook_secret")
            if webhook_secret:
                # The signature is computed over the raw request bytes, not
                # over re-serialized JSON.
                body = await request.body()
                expected_signature = hmac.new(
                    webhook_secret.encode(), body, hashlib.sha256
                ).hexdigest()

                # NOTE(review): header name and plain hex-digest format are as
                # this code originally assumed — confirm against ElevenLabs'
                # current webhook signing scheme.
                signature = request.headers.get("x-elevenlabs-signature")

                # Fix: previously a missing signature header silently skipped
                # verification, allowing unsigned requests through even though
                # a secret was configured. Require a valid signature.
                if not signature:
                    raise ValueError("Missing webhook signature")
                if not hmac.compare_digest(signature, expected_signature):
                    raise ValueError("Invalid webhook signature")

        # The event type rides in the payload's "type" field.
        event_type = payload.get("type", "unknown")
        return payload, event_type

    async def _register_webhook(
        self,
        credentials: Credentials,
        webhook_type: str,
        resource: str,
        events: list[str],
        ingress_url: str,
        secret: str,
    ) -> tuple[str, dict]:
        """Placeholder registration.

        ElevenLabs webhooks are created manually in the dashboard, so no API
        call is made. Returns an empty webhook ID plus a config dict carrying
        the signing secret and user-facing setup instructions.
        """
        config = {
            "manual_setup_required": True,
            "webhook_secret": secret,
            "instructions": "Please configure webhook URL in ElevenLabs dashboard",
        }
        return "", config

    async def _deregister_webhook(
        self, webhook: Webhook, credentials: Credentials
    ) -> None:
        """Placeholder deregistration.

        Webhook removal also happens in the ElevenLabs dashboard; nothing to
        do via API.
        """
        pass
|
||||
179
autogpt_platform/backend/backend/blocks/elevenlabs/speech.py
Normal file
179
autogpt_platform/backend/backend/blocks/elevenlabs/speech.py
Normal file
@@ -0,0 +1,179 @@
|
||||
"""
|
||||
ElevenLabs speech generation (text-to-speech) blocks.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from backend.sdk import (
|
||||
APIKeyCredentials,
|
||||
Block,
|
||||
BlockCategory,
|
||||
BlockOutput,
|
||||
BlockSchema,
|
||||
CredentialsMetaInput,
|
||||
Requests,
|
||||
SchemaField,
|
||||
)
|
||||
|
||||
from ._config import elevenlabs
|
||||
|
||||
|
||||
class ElevenLabsGenerateSpeechBlock(Block):
    """Convert text into spoken audio, returned base64-encoded."""

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = elevenlabs.credentials_field(
            description="ElevenLabs API credentials"
        )
        voice_id: str = SchemaField(description="ID of the voice to use")
        text: str = SchemaField(description="Text to convert to speech")
        model_id: str = SchemaField(
            description="Model ID to use for generation",
            default="eleven_multilingual_v2",
        )
        output_format: str = SchemaField(
            description="Audio format (e.g., mp3_44100_128)",
            default="mp3_44100_128",
        )
        voice_settings: Optional[dict] = SchemaField(
            description="Override voice settings (stability, similarity_boost, etc.)",
            default=None,
        )
        language_code: Optional[str] = SchemaField(
            description="Language code to enforce output language", default=None
        )
        seed: Optional[int] = SchemaField(
            description="Seed for reproducible output", default=None
        )

    class Output(BlockSchema):
        audio: str = SchemaField(description="Base64-encoded audio data")

    def __init__(self):
        super().__init__(
            id="c5d6e7f8-a9b0-c1d2-e3f4-a5b6c7d8e9f0",
            description="Generate speech audio from text using a specified voice",
            categories={BlockCategory.AI},
            input_schema=self.Input,
            output_schema=self.Output,
        )

    async def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        import base64

        # Required fields first; optional ones are attached only when supplied.
        request_body: dict[str, str | int | dict] = {
            "text": input_data.text,
            "model_id": input_data.model_id,
        }
        if input_data.voice_settings:
            request_body["voice_settings"] = input_data.voice_settings
        if input_data.language_code:
            request_body["language_code"] = input_data.language_code
        if input_data.seed is not None:
            request_body["seed"] = input_data.seed

        # The voice is addressed in the URL path; the container/bitrate is
        # selected via the output_format query parameter.
        api_response = await Requests().post(
            f"https://api.elevenlabs.io/v1/text-to-speech/{input_data.voice_id}",
            headers={
                "xi-api-key": credentials.api_key.get_secret_value(),
                "Content-Type": "application/json",
            },
            json=request_body,
            params={"output_format": input_data.output_format},
        )

        # The endpoint returns raw audio bytes; expose them as base64 text.
        yield "audio", base64.b64encode(api_response.content).decode("utf-8")
|
||||
|
||||
|
||||
class ElevenLabsGenerateSpeechWithTimestampsBlock(Block):
    """Convert text into audio plus per-character timing alignment data."""

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = elevenlabs.credentials_field(
            description="ElevenLabs API credentials"
        )
        voice_id: str = SchemaField(description="ID of the voice to use")
        text: str = SchemaField(description="Text to convert to speech")
        model_id: str = SchemaField(
            description="Model ID to use for generation",
            default="eleven_multilingual_v2",
        )
        output_format: str = SchemaField(
            description="Audio format (e.g., mp3_44100_128)",
            default="mp3_44100_128",
        )
        voice_settings: Optional[dict] = SchemaField(
            description="Override voice settings (stability, similarity_boost, etc.)",
            default=None,
        )
        language_code: Optional[str] = SchemaField(
            description="Language code to enforce output language", default=None
        )

    class Output(BlockSchema):
        audio_base64: str = SchemaField(description="Base64-encoded audio data")
        alignment: dict = SchemaField(
            description="Character-level timing alignment data"
        )
        normalized_alignment: dict = SchemaField(
            description="Normalized text alignment data"
        )

    def __init__(self):
        super().__init__(
            id="d6e7f8a9-b0c1-d2e3-f4a5-b6c7d8e9f0a1",
            description="Generate speech with character-level timestamp information",
            categories={BlockCategory.AI},
            input_schema=self.Input,
            output_schema=self.Output,
        )

    async def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        # Required fields first; optional ones only when supplied.
        payload: dict[str, str | dict] = {
            "text": input_data.text,
            "model_id": input_data.model_id,
        }
        if input_data.voice_settings:
            payload["voice_settings"] = input_data.voice_settings
        if input_data.language_code:
            payload["language_code"] = input_data.language_code

        # The /with-timestamps variant returns JSON (audio as base64 plus
        # alignment structures) instead of raw audio bytes.
        api_response = await Requests().post(
            f"https://api.elevenlabs.io/v1/text-to-speech/{input_data.voice_id}/with-timestamps",
            headers={
                "xi-api-key": credentials.api_key.get_secret_value(),
                "Content-Type": "application/json",
            },
            json=payload,
            params={"output_format": input_data.output_format},
        )

        result = api_response.json()
        yield "audio_base64", result.get("audio_base64", "")
        yield "alignment", result.get("alignment", {})
        yield "normalized_alignment", result.get("normalized_alignment", {})
|
||||
@@ -0,0 +1,232 @@
|
||||
"""
|
||||
ElevenLabs speech-to-text (transcription) blocks.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from backend.sdk import (
|
||||
APIKeyCredentials,
|
||||
Block,
|
||||
BlockCategory,
|
||||
BlockOutput,
|
||||
BlockSchema,
|
||||
CredentialsMetaInput,
|
||||
Requests,
|
||||
SchemaField,
|
||||
)
|
||||
|
||||
from ._config import elevenlabs
|
||||
|
||||
|
||||
class ElevenLabsTranscribeAudioSyncBlock(Block):
    """Synchronously transcribe audio to text with word timings and optional diarization."""

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = elevenlabs.credentials_field(
            description="ElevenLabs API credentials"
        )
        model_id: str = SchemaField(
            description="Model ID for transcription", default="scribe_v1"
        )
        file: Optional[str] = SchemaField(
            description="Base64-encoded audio file", default=None
        )
        cloud_storage_url: Optional[str] = SchemaField(
            description="URL to audio file in cloud storage", default=None
        )
        language_code: Optional[str] = SchemaField(
            description="Language code (ISO 639-1 or -3) to improve accuracy",
            default=None,
        )
        diarize: bool = SchemaField(
            description="Enable speaker diarization", default=False
        )
        num_speakers: Optional[int] = SchemaField(
            description="Expected number of speakers (max 32)", default=None
        )
        timestamps_granularity: str = SchemaField(
            description="Timestamp detail level: word, character, or none",
            default="word",
        )
        tag_audio_events: bool = SchemaField(
            description="Tag non-speech sounds (laughter, noise)", default=True
        )

    class Output(BlockSchema):
        text: str = SchemaField(description="Full transcribed text")
        words: list[dict] = SchemaField(
            description="Array with word timing and speaker info"
        )
        language_code: str = SchemaField(description="Detected language code")
        language_probability: float = SchemaField(
            description="Confidence in language detection"
        )

    def __init__(self):
        super().__init__(
            id="e7f8a9b0-c1d2-e3f4-a5b6-c7d8e9f0a1b2",
            description="Transcribe audio to text with timing and speaker information",
            categories={BlockCategory.AI},
            input_schema=self.Input,
            output_schema=self.Output,
        )

    async def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        import base64
        from io import BytesIO

        # Exactly one audio source (inline file XOR cloud URL) must be given.
        if not input_data.file and not input_data.cloud_storage_url:
            raise ValueError("Either 'file' or 'cloud_storage_url' must be provided")
        if input_data.file and input_data.cloud_storage_url:
            raise ValueError(
                "Only one of 'file' or 'cloud_storage_url' should be provided"
            )

        # Multipart form fields; booleans are serialized as "true"/"false".
        form = {
            "model_id": input_data.model_id,
            "diarize": str(input_data.diarize).lower(),
            "timestamps_granularity": input_data.timestamps_granularity,
            "tag_audio_events": str(input_data.tag_audio_events).lower(),
        }
        if input_data.language_code:
            form["language_code"] = input_data.language_code
        if input_data.num_speakers is not None:
            form["num_speakers"] = str(input_data.num_speakers)

        attachments = None
        if input_data.file:
            # NOTE(review): filename/MIME are hard-coded as WAV even though the
            # input may be another format — confirm the API sniffs content.
            decoded_audio = base64.b64decode(input_data.file)
            attachments = [("file", ("audio.wav", BytesIO(decoded_audio), "audio/wav"))]
        else:
            form["cloud_storage_url"] = input_data.cloud_storage_url

        stt_response = await Requests().post(
            "https://api.elevenlabs.io/v1/speech-to-text",
            headers={"xi-api-key": credentials.api_key.get_secret_value()},
            data=form,
            files=attachments,
        )

        result = stt_response.json()
        yield "text", result.get("text", "")
        yield "words", result.get("words", [])
        yield "language_code", result.get("language_code", "")
        yield "language_probability", result.get("language_probability", 0.0)
|
||||
|
||||
|
||||
class ElevenLabsTranscribeAudioAsyncBlock(Block):
    """Start an asynchronous transcription whose result is delivered via webhook."""

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = elevenlabs.credentials_field(
            description="ElevenLabs API credentials"
        )
        model_id: str = SchemaField(
            description="Model ID for transcription", default="scribe_v1"
        )
        file: Optional[str] = SchemaField(
            description="Base64-encoded audio file", default=None
        )
        cloud_storage_url: Optional[str] = SchemaField(
            description="URL to audio file in cloud storage", default=None
        )
        language_code: Optional[str] = SchemaField(
            description="Language code (ISO 639-1 or -3) to improve accuracy",
            default=None,
        )
        diarize: bool = SchemaField(
            description="Enable speaker diarization", default=False
        )
        num_speakers: Optional[int] = SchemaField(
            description="Expected number of speakers (max 32)", default=None
        )
        timestamps_granularity: str = SchemaField(
            description="Timestamp detail level: word, character, or none",
            default="word",
        )
        webhook_url: str = SchemaField(
            description="URL to receive transcription result",
            default="",
        )

    class Output(BlockSchema):
        tracking_id: str = SchemaField(description="ID to track the transcription job")

    def __init__(self):
        super().__init__(
            id="f8a9b0c1-d2e3-f4a5-b6c7-d8e9f0a1b2c3",
            description="Start async transcription with webhook callback",
            categories={BlockCategory.AI},
            input_schema=self.Input,
            output_schema=self.Output,
        )

    async def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        import base64
        import uuid
        from io import BytesIO

        # Exactly one audio source (inline file XOR cloud URL) must be given.
        if not input_data.file and not input_data.cloud_storage_url:
            raise ValueError("Either 'file' or 'cloud_storage_url' must be provided")
        if input_data.file and input_data.cloud_storage_url:
            raise ValueError(
                "Only one of 'file' or 'cloud_storage_url' should be provided"
            )

        # "webhook": "true" switches the endpoint into async (callback) mode.
        form = {
            "model_id": input_data.model_id,
            "diarize": str(input_data.diarize).lower(),
            "timestamps_granularity": input_data.timestamps_granularity,
            "webhook": "true",
        }
        if input_data.language_code:
            form["language_code"] = input_data.language_code
        if input_data.num_speakers is not None:
            form["num_speakers"] = str(input_data.num_speakers)
        if input_data.webhook_url:
            form["webhook_url"] = input_data.webhook_url

        attachments = None
        if input_data.file:
            # NOTE(review): filename/MIME are hard-coded as WAV regardless of
            # the actual audio format — confirm the API tolerates this.
            decoded_audio = base64.b64decode(input_data.file)
            attachments = [("file", ("audio.wav", BytesIO(decoded_audio), "audio/wav"))]
        else:
            form["cloud_storage_url"] = input_data.cloud_storage_url

        stt_response = await Requests().post(
            "https://api.elevenlabs.io/v1/speech-to-text",
            headers={"xi-api-key": credentials.api_key.get_secret_value()},
            data=form,
            files=attachments,
        )

        # Prefer a server-issued tracking ID; fall back to a locally generated
        # UUID so downstream nodes always get a non-empty identifier.
        result = stt_response.json()
        yield "tracking_id", result.get("tracking_id", str(uuid.uuid4()))
|
||||
160
autogpt_platform/backend/backend/blocks/elevenlabs/triggers.py
Normal file
160
autogpt_platform/backend/backend/blocks/elevenlabs/triggers.py
Normal file
@@ -0,0 +1,160 @@
|
||||
"""
|
||||
ElevenLabs webhook trigger blocks.
|
||||
"""
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from backend.sdk import (
|
||||
Block,
|
||||
BlockCategory,
|
||||
BlockOutput,
|
||||
BlockSchema,
|
||||
BlockType,
|
||||
BlockWebhookConfig,
|
||||
CredentialsMetaInput,
|
||||
ProviderName,
|
||||
SchemaField,
|
||||
)
|
||||
|
||||
from ._config import elevenlabs
|
||||
|
||||
|
||||
class ElevenLabsWebhookTriggerBlock(Block):
    """Start a flow when ElevenLabs POSTs an event (STT finished, voice removal, etc.)."""

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = elevenlabs.credentials_field(
            description="ElevenLabs API credentials"
        )
        webhook_url: str = SchemaField(
            description="URL to receive webhooks (auto-generated)",
            default="",
            hidden=True,
        )

        class EventsFilter(BaseModel):
            """ElevenLabs event types to subscribe to"""

            speech_to_text_completed: bool = SchemaField(
                description="Speech-to-text transcription completed", default=True
            )
            post_call_transcription: bool = SchemaField(
                description="Conversational AI call transcription completed",
                default=True,
            )
            voice_removal_notice: bool = SchemaField(
                description="Voice scheduled for removal", default=True
            )
            voice_removed: bool = SchemaField(
                description="Voice has been removed", default=True
            )
            voice_removal_notice_withdrawn: bool = SchemaField(
                description="Voice removal cancelled", default=True
            )

        events: EventsFilter = SchemaField(
            title="Events", description="The events to subscribe to"
        )

        # Webhook payload - populated by the system
        payload: dict = SchemaField(
            description="Webhook payload data",
            default={},
            hidden=True,
        )

    class Output(BlockSchema):
        type: str = SchemaField(description="Event type")
        event_timestamp: int = SchemaField(description="Unix timestamp of the event")
        data: dict = SchemaField(description="Event-specific data payload")

    def __init__(self):
        super().__init__(
            id="c1d2e3f4-a5b6-c7d8-e9f0-a1b2c3d4e5f6",
            description="Receive webhook events from ElevenLabs",
            categories={BlockCategory.DEVELOPER_TOOLS},
            input_schema=self.Input,
            output_schema=self.Output,
            block_type=BlockType.WEBHOOK,
            webhook_config=BlockWebhookConfig(
                provider=ProviderName("elevenlabs"),
                webhook_type="notification",
                event_filter_input="events",
                resource_format="",
            ),
        )

    async def run(self, input_data: Input, **kwargs) -> BlockOutput:
        """Filter the incoming event and emit its normalized fields."""
        event_payload = input_data.payload
        event_type = event_payload.get("type", "unknown")

        # Only forward event types the user enabled; unknown types (absent
        # from this map) are dropped as well.
        enabled_events = {
            "speech_to_text_completed": input_data.events.speech_to_text_completed,
            "post_call_transcription": input_data.events.post_call_transcription,
            "voice_removal_notice": input_data.events.voice_removal_notice,
            "voice_removed": input_data.events.voice_removed,
            "voice_removal_notice_withdrawn": input_data.events.voice_removal_notice_withdrawn,
        }
        if not enabled_events.get(event_type, False):
            return

        yield "type", event_type
        yield "event_timestamp", event_payload.get("event_timestamp", 0)

        data = event_payload.get("data", {})

        # Normalize the event-specific section of the payload per event type.
        if event_type == "speech_to_text_completed":
            details = {
                "transcription_id": data.get("transcription_id"),
                "text": data.get("text"),
                "words": data.get("words", []),
                "language_code": data.get("language_code"),
                "language_probability": data.get("language_probability"),
            }
        elif event_type == "post_call_transcription":
            details = {
                "agent_id": data.get("agent_id"),
                "conversation_id": data.get("conversation_id"),
                "transcript": data.get("transcript"),
                "metadata": data.get("metadata", {}),
            }
        elif event_type == "voice_removal_notice":
            details = {
                "voice_id": data.get("voice_id"),
                "voice_name": data.get("voice_name"),
                "removal_date": data.get("removal_date"),
                "reason": data.get("reason"),
            }
        elif event_type == "voice_removal_notice_withdrawn":
            details = {
                "voice_id": data.get("voice_id"),
                "voice_name": data.get("voice_name"),
            }
        elif event_type == "voice_removed":
            details = {
                "voice_id": data.get("voice_id"),
                "voice_name": data.get("voice_name"),
                "removed_at": data.get("removed_at"),
            }
        else:
            # Defensive fallback; unreachable today because unknown event
            # types are filtered out above.
            details = data

        yield "data", details
|
||||
116
autogpt_platform/backend/backend/blocks/elevenlabs/utility.py
Normal file
116
autogpt_platform/backend/backend/blocks/elevenlabs/utility.py
Normal file
@@ -0,0 +1,116 @@
|
||||
"""
|
||||
ElevenLabs utility blocks for models and usage stats.
|
||||
"""
|
||||
|
||||
from backend.sdk import (
|
||||
APIKeyCredentials,
|
||||
Block,
|
||||
BlockCategory,
|
||||
BlockOutput,
|
||||
BlockSchema,
|
||||
CredentialsMetaInput,
|
||||
Requests,
|
||||
SchemaField,
|
||||
)
|
||||
|
||||
from ._config import elevenlabs
|
||||
|
||||
|
||||
class ElevenLabsListModelsBlock(Block):
    """Fetch the catalogue of available model IDs and their capabilities."""

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = elevenlabs.credentials_field(
            description="ElevenLabs API credentials"
        )

    class Output(BlockSchema):
        models: list[dict] = SchemaField(
            description="Array of model objects with capabilities"
        )

    def __init__(self):
        super().__init__(
            id="a9b0c1d2-e3f4-a5b6-c7d8-e9f0a1b2c3d4",
            description="List all available voice models and their capabilities",
            categories={BlockCategory.DEVELOPER_TOOLS},
            input_schema=self.Input,
            output_schema=self.Output,
        )

    async def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        # The models endpoint returns a JSON array of model objects directly.
        models_response = await Requests().get(
            "https://api.elevenlabs.io/v1/models",
            headers={"xi-api-key": credentials.api_key.get_secret_value()},
        )
        yield "models", models_response.json()
|
||||
|
||||
|
||||
class ElevenLabsGetUsageStatsBlock(Block):
    """Fetch character/credit usage statistics for billing dashboards."""

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = elevenlabs.credentials_field(
            description="ElevenLabs API credentials"
        )
        start_unix: int = SchemaField(
            description="Start timestamp in Unix epoch seconds"
        )
        end_unix: int = SchemaField(description="End timestamp in Unix epoch seconds")
        aggregation_interval: str = SchemaField(
            description="Aggregation interval: daily or monthly",
            default="daily",
        )

    class Output(BlockSchema):
        usage: list[dict] = SchemaField(description="Array of usage data per interval")
        total_character_count: int = SchemaField(
            description="Total characters used in period"
        )
        total_requests: int = SchemaField(description="Total API requests in period")

    def __init__(self):
        super().__init__(
            id="b0c1d2e3-f4a5-b6c7-d8e9-f0a1b2c3d4e5",
            description="Get character and credit usage statistics",
            categories={BlockCategory.DEVELOPER_TOOLS},
            input_schema=self.Input,
            output_schema=self.Output,
        )

    async def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        # Query the character-stats endpoint for the requested time window.
        stats_response = await Requests().get(
            "https://api.elevenlabs.io/v1/usage/character-stats",
            headers={"xi-api-key": credentials.api_key.get_secret_value()},
            params={
                "start_unix": input_data.start_unix,
                "end_unix": input_data.end_unix,
                "aggregation_interval": input_data.aggregation_interval,
            },
        )

        stats = stats_response.json()
        yield "usage", stats.get("usage", [])
        yield "total_character_count", stats.get("total_character_count", 0)
        yield "total_requests", stats.get("total_requests", 0)
|
||||
249
autogpt_platform/backend/backend/blocks/elevenlabs/voices.py
Normal file
249
autogpt_platform/backend/backend/blocks/elevenlabs/voices.py
Normal file
@@ -0,0 +1,249 @@
|
||||
"""
|
||||
ElevenLabs voice management blocks.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from backend.sdk import (
|
||||
APIKeyCredentials,
|
||||
Block,
|
||||
BlockCategory,
|
||||
BlockOutput,
|
||||
BlockSchema,
|
||||
CredentialsMetaInput,
|
||||
Requests,
|
||||
SchemaField,
|
||||
)
|
||||
|
||||
from ._config import elevenlabs
|
||||
|
||||
|
||||
class ElevenLabsListVoicesBlock(Block):
    """List every voice the account can use (for pick-lists, UI menus, etc.)."""

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = elevenlabs.credentials_field(
            description="ElevenLabs API credentials"
        )
        search: str = SchemaField(
            description="Search term to filter voices", default=""
        )
        voice_type: Optional[str] = SchemaField(
            description="Filter by voice type: premade, cloned, or professional",
            default=None,
        )
        page_size: int = SchemaField(
            description="Number of voices per page (max 100)", default=10
        )
        next_page_token: str = SchemaField(
            description="Token for fetching next page", default=""
        )

    class Output(BlockSchema):
        voices: list[dict] = SchemaField(
            description="Array of voice objects with id, name, category, etc."
        )
        next_page_token: Optional[str] = SchemaField(
            description="Token for fetching next page, null if no more pages"
        )

    def __init__(self):
        super().__init__(
            id="e1a2b3c4-d5e6-f7a8-b9c0-d1e2f3a4b5c6",
            description="List all available voices with filtering and pagination",
            categories={BlockCategory.AI},
            input_schema=self.Input,
            output_schema=self.Output,
        )

    async def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        # page_size is always sent; the other filters only when non-empty.
        query: dict[str, str | int] = {"page_size": input_data.page_size}
        for field_name in ("search", "voice_type", "next_page_token"):
            value = getattr(input_data, field_name)
            if value:
                query[field_name] = value

        voices_response = await Requests().get(
            "https://api.elevenlabs.io/v2/voices",
            headers={"xi-api-key": credentials.api_key.get_secret_value()},
            params=query,
        )

        page = voices_response.json()
        yield "voices", page.get("voices", [])
        # None signals there are no further pages.
        yield "next_page_token", page.get("next_page_token")
|
||||
|
||||
|
||||
class ElevenLabsGetVoiceDetailsBlock(Block):
    """Retrieve the metadata and settings of a single voice."""

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = elevenlabs.credentials_field(
            description="ElevenLabs API credentials"
        )
        voice_id: str = SchemaField(description="The ID of the voice to retrieve")

    class Output(BlockSchema):
        voice: dict = SchemaField(
            description="Voice object with name, labels, settings, etc."
        )

    def __init__(self):
        super().__init__(
            id="f2a3b4c5-d6e7-f8a9-b0c1-d2e3f4a5b6c7",
            description="Get detailed information about a specific voice",
            categories={BlockCategory.AI},
            input_schema=self.Input,
            output_schema=self.Output,
        )

    async def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        # Single-voice lookup; the voice is addressed in the URL path.
        voice_response = await Requests().get(
            f"https://api.elevenlabs.io/v1/voices/{input_data.voice_id}",
            headers={"xi-api-key": credentials.api_key.get_secret_value()},
        )
        yield "voice", voice_response.json()
|
||||
|
||||
|
||||
class ElevenLabsCreateVoiceCloneBlock(Block):
    """Upload sample clips to create a custom (IVC) voice."""

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = elevenlabs.credentials_field(
            description="ElevenLabs API credentials"
        )
        name: str = SchemaField(description="Name for the new voice")
        files: list[str] = SchemaField(
            description="Base64-encoded audio files (1-10 files, max 25MB each)"
        )
        description: str = SchemaField(
            description="Description of the voice", default=""
        )
        labels: dict = SchemaField(
            description="Metadata labels (e.g., accent, age)", default={}
        )
        remove_background_noise: bool = SchemaField(
            description="Whether to remove background noise from samples", default=False
        )

    class Output(BlockSchema):
        voice_id: str = SchemaField(description="ID of the newly created voice")
        requires_verification: bool = SchemaField(
            description="Whether the voice requires verification"
        )

    def __init__(self):
        # The UUID is the block's stable platform identifier — never change it.
        super().__init__(
            id="a3b4c5d6-e7f8-a9b0-c1d2-e3f4a5b6c7d8",
            description="Create a new voice clone from audio samples",
            categories={BlockCategory.AI},
            input_schema=self.Input,
            output_schema=self.Output,
        )

    async def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        """POST the decoded samples plus optional metadata to the add-voice API."""
        import base64
        import json
        from io import BytesIO

        secret = credentials.api_key.get_secret_value()

        # Multipart form fields: name is mandatory; the rest are sent only
        # when the caller supplied a value.
        payload = {"name": input_data.name}
        if input_data.description:
            payload["description"] = input_data.description
        if input_data.labels:
            payload["labels"] = json.dumps(input_data.labels)
        if input_data.remove_background_noise:
            payload["remove_background_noise"] = "true"

        # Decode each base64 clip into an in-memory MP3 upload part.
        uploads = [
            (
                "files",
                (f"sample_{idx}.mp3", BytesIO(base64.b64decode(clip)), "audio/mpeg"),
            )
            for idx, clip in enumerate(input_data.files)
        ]

        response = await Requests().post(
            "https://api.elevenlabs.io/v1/voices/add",
            headers={"xi-api-key": secret},
            data=payload,
            files=uploads,
        )

        result = response.json()

        yield "voice_id", result.get("voice_id", "")
        yield "requires_verification", result.get("requires_verification", False)
|
||||
|
||||
|
||||
class ElevenLabsDeleteVoiceBlock(Block):
    """Permanently remove a custom voice."""

    class Input(BlockSchema):
        credentials: CredentialsMetaInput = elevenlabs.credentials_field(
            description="ElevenLabs API credentials"
        )
        voice_id: str = SchemaField(description="The ID of the voice to delete")

    class Output(BlockSchema):
        status: str = SchemaField(description="Deletion status (ok or error)")

    def __init__(self):
        # The UUID is the block's stable platform identifier — never change it.
        super().__init__(
            id="b4c5d6e7-f8a9-b0c1-d2e3-f4a5b6c7d8e9",
            description="Delete a custom voice from your account",
            categories={BlockCategory.AI},
            input_schema=self.Input,
            output_schema=self.Output,
        )

    async def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        """Issue the DELETE call and report whether the API accepted it."""
        secret = credentials.api_key.get_secret_value()

        response = await Requests().delete(
            f"https://api.elevenlabs.io/v1/voices/{input_data.voice_id}",
            headers={"xi-api-key": secret},
        )

        # The API answers 200 or 204 on success; anything else maps to "error".
        deleted = response.status in (200, 204)
        yield "status", "ok" if deleted else "error"
|
||||
Reference in New Issue
Block a user