tts v1 working

2026-04-08 03:00:28 -04:00 · 2024-08-14 11:22:33 +02:00
parent b5b47146c6
commit 42af69648d
1 changed files with 166 additions and 0 deletions
--- a/rnd/autogpt_server/autogpt_server/blocks/tts.py
+++ b/rnd/autogpt_server/autogpt_server/blocks/tts.py
@@ -0,0 +1,166 @@
+"""Text-to-speech block backed by the OpenAI audio API."""
+
+from pathlib import Path
+
+from openai import OpenAI
+
+from autogpt_server.data.block import Block, BlockCategory, BlockOutput, BlockSchema
+from autogpt_server.data.model import BlockSecret, SchemaField, SecretField
+
+# Rough speaking rate used to estimate clip length: 150 words per minute.
+WORDS_PER_SECOND = 2.5
+
+# Placeholder payload the test mock writes in place of real audio.
+MOCK_AUDIO_CONTENT = b"Mock audio content"
+
+
+class MockResponse:
+    """Test stand-in for a speech response; only `stream_to_file` is provided."""
+
+    def stream_to_file(self, path):
+        """Write the placeholder payload to *path*."""
+        # Binary mode: the file stands in for audio data, not text.
+        with open(path, "wb") as f:
+            f.write(MOCK_AUDIO_CONTENT)
+
+
+def _mock_create_speech(*args, **kwargs) -> Path:
+    """Offline replacement for `TextToSpeechBlock.create_speech`.
+
+    Writes the placeholder payload to `output_path` and returns that location
+    as a `Path`, matching what `run` expects back from `create_speech`.
+    `output_path` must be passed by keyword, which is how `run` calls it;
+    positional arguments are accepted and ignored.
+    """
+    speech_file_path = Path(kwargs["output_path"])
+    MockResponse().stream_to_file(speech_file_path)
+    return speech_file_path
+
+
+class TextToSpeechBlock(Block):
+    """Block that turns text into an audio file using OpenAI text-to-speech."""
+
+    class Input(BlockSchema):
+        api_key: BlockSecret = SecretField(
+            key="openai_api_key",
+            description="Your OpenAI API key",
+            placeholder="Enter your OpenAI API key",
+        )
+        text: str = SchemaField(
+            description="The text to convert to speech",
+            placeholder="Enter the text you want to convert to speech",
+        )
+        voice: str = SchemaField(
+            description="The voice to use for speech synthesis",
+            placeholder="alloy",
+            default="alloy",
+        )
+        model: str = SchemaField(
+            default="tts-1",
+            description="The TTS model to use",
+            placeholder="tts-1",
+        )
+        # NOTE(review): the path is used exactly as supplied; confirm callers
+        # are trusted or restrict it to a safe directory.
+        output_path: str = SchemaField(
+            description="The path where the output audio file will be saved",
+            placeholder="/path/to/output/speech.mp3",
+        )
+
+    class Output(BlockSchema):
+        file_path: str = SchemaField(description="The path of the generated audio file")
+        file_size: int = SchemaField(description="The size of the generated audio file in bytes")
+        # The duration is estimated from the word count, not measured.
+        duration: float = SchemaField(description="The estimated duration of the generated audio in seconds")
+        error: str = SchemaField(description="Error message if the TTS conversion failed")
+
+    def __init__(self):
+        super().__init__(
+            # NOTE(review): this id contains non-hexadecimal characters, so it
+            # is not a well-formed UUID. Kept unchanged because the id may
+            # already be referenced elsewhere; confirm and replace if block
+            # ids are required to be UUIDs.
+            id="1a2b3c4d-5e6f-7g8h-9i0j-1k2l3m4n5o6p",
+            input_schema=TextToSpeechBlock.Input,
+            output_schema=TextToSpeechBlock.Output,
+            description="Uses OpenAI to convert the input string into audio stored as an mp3 at the specified output path",
+            categories=[BlockCategory.AI, BlockCategory.OUTPUT],
+            test_input={
+                "api_key": "your_test_api_key",
+                "text": "Hello, this is a test for text-to-speech conversion.",
+                "voice": "alloy",
+                "model": "tts-1",
+                "output_path": "/tmp/test_speech.mp3",
+            },
+            test_output=[
+                ("file_path", "/tmp/test_speech.mp3"),
+                # Size of the placeholder payload written by the mock.
+                ("file_size", len(MOCK_AUDIO_CONTENT)),
+                # The test text has 8 whitespace-separated words: 8 / 2.5.
+                ("duration", 3.2),
+            ],
+            # The mock must return a Path, just like the real create_speech.
+            test_mock={"create_speech": _mock_create_speech},
+        )
+
+    @staticmethod
+    def _estimate_duration(text: str) -> float:
+        """Estimate the spoken length of *text* in seconds from its word count."""
+        return len(text.split()) / WORDS_PER_SECOND
+
+    def create_speech(self, api_key: str, text: str, voice: str, model: str, output_path: str) -> Path:
+        """Synthesize *text* with OpenAI and save the audio to *output_path*.
+
+        Args:
+            api_key: OpenAI API key used to build the client.
+            text: Text to convert to speech.
+            voice: Voice name passed to the speech endpoint.
+            model: TTS model name passed to the speech endpoint.
+            output_path: Destination of the audio file; missing parent
+                directories are created.
+
+        Returns:
+            The destination as a `Path`.
+        """
+        client = OpenAI(api_key=api_key)
+        speech_file_path = Path(output_path)
+        # Create the target directory first so a new location does not fail
+        # only after the API request has already been made.
+        speech_file_path.parent.mkdir(parents=True, exist_ok=True)
+
+        response = client.audio.speech.create(model=model, voice=voice, input=text)
+        # NOTE(review): newer openai SDK releases appear to deprecate this call
+        # in favor of `with_streaming_response`; confirm for the pinned version.
+        response.stream_to_file(speech_file_path)
+        return speech_file_path
+
+    def run(self, input_data: Input) -> BlockOutput:
+        """Generate speech for *input_data* and yield the block outputs.
+
+        Yields `file_path`, `file_size` and `duration` on success, or a
+        single `error` output when the text is blank or the conversion fails.
+        """
+        if not input_data.text.strip():
+            yield "error", "Error occurred during text-to-speech conversion: text must not be empty"
+            return
+
+        # Keep every fallible step inside the try so a failure yields only
+        # `error`, never a partial set of success outputs.
+        try:
+            output_file = self.create_speech(
+                api_key=input_data.api_key.get_secret_value(),
+                text=input_data.text,
+                voice=input_data.voice,
+                model=input_data.model,
+                output_path=input_data.output_path,
+            )
+            file_size = output_file.stat().st_size
+        except Exception as e:
+            # Broad on purpose: this block reports any failure via `error`.
+            yield "error", f"Error occurred during text-to-speech conversion: {e}"
+            return
+
+        yield "file_path", str(output_file)
+        yield "file_size", file_size
+        # A real duration would need an audio library; estimate it instead.
+        yield "duration", self._estimate_duration(input_data.text)