mirror of
https://github.com/danielmiessler/Fabric.git
synced 2026-01-07 21:44:02 -05:00
feat: add Gemini TTS voice selection and listing functionality
## CHANGES - Add `--voice` flag for TTS voice selection - Add `--list-gemini-voices` command for voice discovery - Implement voice validation for Gemini TTS models - Update shell completions for voice options - Add comprehensive Gemini TTS documentation - Create voice samples directory structure - Extend spell checker dictionary with voice names
This commit is contained in:
16
.vscode/settings.json
vendored
16
.vscode/settings.json
vendored
@@ -1,23 +1,32 @@
|
||||
{
|
||||
"cSpell.words": [
|
||||
"Achird",
|
||||
"addextension",
|
||||
"adduser",
|
||||
"AIML",
|
||||
"anthropics",
|
||||
"Aoede",
|
||||
"atotto",
|
||||
"Autonoe",
|
||||
"badfile",
|
||||
"Behrens",
|
||||
"blindspots",
|
||||
"Bombal",
|
||||
"Callirhoe",
|
||||
"Callirrhoe",
|
||||
"Cerebras",
|
||||
"compadd",
|
||||
"compdef",
|
||||
"compinit",
|
||||
"creatordate",
|
||||
"curcontext",
|
||||
"custompatterns",
|
||||
"danielmiessler",
|
||||
"davidanson",
|
||||
"Debugf",
|
||||
"dedup",
|
||||
"deepseek",
|
||||
"Despina",
|
||||
"direnv",
|
||||
"dryrun",
|
||||
"dsrp",
|
||||
@@ -25,6 +34,7 @@
|
||||
"Eisler",
|
||||
"elif",
|
||||
"envrc",
|
||||
"Erinome",
|
||||
"Errorf",
|
||||
"eugeis",
|
||||
"Eugen",
|
||||
@@ -66,6 +76,7 @@
|
||||
"Kore",
|
||||
"ksylvan",
|
||||
"Langdock",
|
||||
"Laomedeia",
|
||||
"ldflags",
|
||||
"libexec",
|
||||
"listcontexts",
|
||||
@@ -89,6 +100,7 @@
|
||||
"openaiapi",
|
||||
"opencode",
|
||||
"openrouter",
|
||||
"Orus",
|
||||
"otiai",
|
||||
"pdflatex",
|
||||
"pipx",
|
||||
@@ -97,11 +109,14 @@
|
||||
"presencepenalty",
|
||||
"printcontext",
|
||||
"printsession",
|
||||
"Pulcherrima",
|
||||
"pycache",
|
||||
"pyperclip",
|
||||
"readystream",
|
||||
"restapi",
|
||||
"rmextension",
|
||||
"Sadachbia",
|
||||
"Sadaltager",
|
||||
"samber",
|
||||
"sashabaranov",
|
||||
"sdist",
|
||||
@@ -112,6 +127,7 @@
|
||||
"Streamlit",
|
||||
"stretchr",
|
||||
"subchunk",
|
||||
"Sulafat",
|
||||
"talkpanel",
|
||||
"Telos",
|
||||
"testpattern",
|
||||
|
||||
@@ -548,6 +548,9 @@ Application Options:
|
||||
--think-start-tag= Start tag for thinking sections (default: <think>)
|
||||
--think-end-tag= End tag for thinking sections (default: </think>)
|
||||
--disable-responses-api Disable OpenAI Responses API (default: false)
|
||||
--voice= TTS voice name for supported models (e.g., Kore, Charon, Puck)
|
||||
(default: Kore)
|
||||
--list-gemini-voices List all available Gemini TTS voices
|
||||
|
||||
Help Options:
|
||||
-h, --help Show this help message
|
||||
|
||||
@@ -14,16 +14,19 @@ _fabric_models() {
|
||||
models=(${(f)"$(fabric --listmodels --shell-complete-list 2>/dev/null)"})
|
||||
compadd -X "Models:" ${models}
|
||||
}
|
||||
|
||||
_fabric_contexts() {
|
||||
local -a contexts
|
||||
contexts=(${(f)"$(fabric --listcontexts --shell-complete-list 2>/dev/null)"})
|
||||
compadd -X "Contexts:" ${contexts}
|
||||
}
|
||||
|
||||
_fabric_sessions() {
|
||||
local -a sessions
|
||||
sessions=(${(f)"$(fabric --listsessions --shell-complete-list 2>/dev/null)"})
|
||||
compadd -X "Sessions:" ${sessions}
|
||||
}
|
||||
|
||||
_fabric_strategies() {
|
||||
local -a strategies
|
||||
strategies=(${(f)"$(fabric --liststrategies --shell-complete-list 2>/dev/null)"})
|
||||
@@ -34,14 +37,12 @@ _fabric_extensions() {
|
||||
local -a extensions
|
||||
extensions=(${(f)"$(fabric --listextensions --shell-complete-list 2>/dev/null)"})
|
||||
compadd -X "Extensions:" ${extensions}
|
||||
'(-L --listmodels)'{-L,--listmodels}'[List all available models]:list models:_fabric_models' \
|
||||
'(-x --listcontexts)'{-x,--listcontexts}'[List all contexts]:list contexts:_fabric_contexts' \
|
||||
'(-X --listsessions)'{-X,--listsessions}'[List all sessions]:list sessions:_fabric_sessions' \
|
||||
'(--listextensions)--listextensions[List all registered extensions]' \
|
||||
'(--liststrategies)--liststrategies[List all strategies]:list strategies:_fabric_strategies' \
|
||||
'(--listvendors)--listvendors[List all vendors]' \
|
||||
vendors=(${(f)"$(fabric --listvendors 2>/dev/null)"})
|
||||
compadd -X "Vendors:" ${vendors}
|
||||
}
|
||||
|
||||
_fabric_gemini_voices() {
|
||||
local -a voices
|
||||
voices=(${(f)"$(fabric --list-gemini-voices --shell-complete-list 2>/dev/null)"})
|
||||
compadd -X "Gemini TTS Voices:" ${voices}
|
||||
}
|
||||
|
||||
_fabric() {
|
||||
@@ -109,6 +110,8 @@ _fabric() {
|
||||
'(--strategy)--strategy[Choose a strategy from the available strategies]:strategy:_fabric_strategies' \
|
||||
'(--liststrategies)--liststrategies[List all strategies]' \
|
||||
'(--listvendors)--listvendors[List all vendors]' \
|
||||
'(--voice)--voice[TTS voice name for supported models]:voice:_fabric_gemini_voices' \
|
||||
'(--list-gemini-voices)--list-gemini-voices[List all available Gemini TTS voices]' \
|
||||
'(--shell-complete-list)--shell-complete-list[Output raw list without headers/formatting (for shell completion)]' \
|
||||
'(--suppress-think)--suppress-think[Suppress text enclosed in thinking tags]' \
|
||||
'(--think-start-tag)--think-start-tag[Start tag for thinking sections (default: <think>)]:start tag:' \
|
||||
@@ -119,4 +122,3 @@ _fabric() {
|
||||
}
|
||||
|
||||
_fabric "$@"
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ _fabric() {
|
||||
_get_comp_words_by_ref -n : cur prev words cword
|
||||
|
||||
# Define all possible options/flags
|
||||
local opts="--pattern -p --variable -v --context -C --session --attachment -a --setup -S --temperature -t --topp -T --stream -s --presencepenalty -P --raw -r --frequencypenalty -F --listpatterns -l --listmodels -L --listcontexts -x --listsessions -X --updatepatterns -U --copy -c --model -m --modelContextLength --output -o --output-session --latest -n --changeDefaultModel -d --youtube -y --playlist --transcript --transcript-with-timestamps --comments --metadata --language -g --scrape_url -u --scrape_question -q --seed -e --wipecontext -w --wipesession -W --printcontext --printsession --readability --input-has-vars --dry-run --serve --serveOllama --address --api-key --config --search --search-location --image-file --image-size --image-quality --image-compression --image-background --suppress-think --think-start-tag --think-end-tag --disable-responses-api --version --listextensions --addextension --rmextension --strategy --liststrategies --listvendors --shell-complete-list --help -h"
|
||||
local opts="--pattern -p --variable -v --context -C --session --attachment -a --setup -S --temperature -t --topp -T --stream -s --presencepenalty -P --raw -r --frequencypenalty -F --listpatterns -l --listmodels -L --listcontexts -x --listsessions -X --updatepatterns -U --copy -c --model -m --modelContextLength --output -o --output-session --latest -n --changeDefaultModel -d --youtube -y --playlist --transcript --transcript-with-timestamps --comments --metadata --language -g --scrape_url -u --scrape_question -q --seed -e --wipecontext -w --wipesession -W --printcontext --printsession --readability --input-has-vars --dry-run --serve --serveOllama --address --api-key --config --search --search-location --image-file --image-size --image-quality --image-compression --image-background --suppress-think --think-start-tag --think-end-tag --disable-responses-api --voice --list-gemini-voices --version --listextensions --addextension --rmextension --strategy --liststrategies --listvendors --shell-complete-list --help -h"
|
||||
|
||||
# Helper function for dynamic completions
|
||||
_fabric_get_list() {
|
||||
@@ -62,6 +62,10 @@ _fabric() {
|
||||
COMPREPLY=($(compgen -W "$(_fabric_get_list --liststrategies)" -- "${cur}"))
|
||||
return 0
|
||||
;;
|
||||
--voice)
|
||||
COMPREPLY=($(compgen -W "$(_fabric_get_list --list-gemini-voices)" -- "${cur}"))
|
||||
return 0
|
||||
;;
|
||||
# Options requiring file/directory paths
|
||||
-a | --attachment | -o | --output | --config | --addextension | --image-file)
|
||||
_filedir
|
||||
|
||||
@@ -31,6 +31,10 @@ function __fabric_get_extensions
|
||||
fabric --listextensions --shell-complete-list 2>/dev/null
|
||||
end
|
||||
|
||||
function __fabric_get_gemini_voices
|
||||
fabric --list-gemini-voices --shell-complete-list 2>/dev/null
|
||||
end
|
||||
|
||||
# Main completion function
|
||||
complete -c fabric -f
|
||||
|
||||
@@ -71,6 +75,7 @@ complete -c fabric -l rmextension -d "Remove a registered extension by name" -a
|
||||
complete -c fabric -l strategy -d "Choose a strategy from the available strategies" -a "(__fabric_get_strategies)"
|
||||
complete -c fabric -l think-start-tag -d "Start tag for thinking sections (default: <think>)"
|
||||
complete -c fabric -l think-end-tag -d "End tag for thinking sections (default: </think>)"
|
||||
complete -c fabric -l voice -d "TTS voice name for supported models (e.g., Kore, Charon, Puck)" -a "(__fabric_get_gemini_voices)"
|
||||
|
||||
# Boolean flags (no arguments)
|
||||
complete -c fabric -s S -l setup -d "Run setup for all reconfigurable parts of fabric"
|
||||
@@ -99,6 +104,7 @@ complete -c fabric -l version -d "Print current version"
|
||||
complete -c fabric -l listextensions -d "List all registered extensions"
|
||||
complete -c fabric -l liststrategies -d "List all strategies"
|
||||
complete -c fabric -l listvendors -d "List all vendors"
|
||||
complete -c fabric -l list-gemini-voices -d "List all available Gemini TTS voices"
|
||||
complete -c fabric -l shell-complete-list -d "Output raw list without headers/formatting (for shell completion)"
|
||||
complete -c fabric -l suppress-think -d "Suppress text enclosed in thinking tags"
|
||||
complete -c fabric -l disable-responses-api -d "Disable OpenAI Responses API (default: false)"
|
||||
|
||||
155
docs/Gemini-TTS.md
Normal file
155
docs/Gemini-TTS.md
Normal file
@@ -0,0 +1,155 @@
|
||||
# Gemini Text-to-Speech (TTS) Guide
|
||||
|
||||
Fabric supports Google Gemini's text-to-speech (TTS) capabilities, allowing you to convert text into high-quality audio using various AI-generated voices.
|
||||
|
||||
## Overview
|
||||
|
||||
The Gemini TTS feature in Fabric allows you to:
|
||||
|
||||
- Convert text input into audio using Google's Gemini TTS models
|
||||
- Choose from 30+ different AI voices with varying characteristics
|
||||
- Generate high-quality WAV audio files
|
||||
- Integrate TTS generation into your existing Fabric workflows
|
||||
|
||||
## Usage
|
||||
|
||||
### Basic TTS Generation
|
||||
|
||||
To generate audio from text using TTS:
|
||||
|
||||
```bash
|
||||
# Basic TTS with default voice (Kore)
|
||||
echo "Hello, this is a test of Gemini TTS" | fabric -m gemini-2.0-flash-tts -o output.wav
|
||||
|
||||
# Using a specific voice
|
||||
echo "Hello, this is a test with the Charon voice" | fabric -m gemini-2.0-flash-tts --voice Charon -o output.wav
|
||||
|
||||
# Using TTS with a pattern
|
||||
fabric -p summarize --voice Puck -m gemini-2.0-flash-tts -o summary.wav < document.txt
|
||||
```
|
||||
|
||||
### Voice Selection
|
||||
|
||||
Use the `--voice` flag to specify which voice to use for TTS generation:
|
||||
|
||||
```bash
|
||||
fabric -m gemini-2.0-flash-tts --voice Zephyr -o output.wav "Your text here"
|
||||
```
|
||||
|
||||
If no voice is specified, the default voice "Kore" will be used.
|
||||
|
||||
## Available Voices
|
||||
|
||||
Gemini TTS supports 30+ different voices, each with unique characteristics:
|
||||
|
||||
### Popular Voices
|
||||
|
||||
- **Kore** - Firm and confident (default)
|
||||
- **Charon** - Informative and clear
|
||||
- **Puck** - Upbeat and energetic
|
||||
- **Zephyr** - Bright and cheerful
|
||||
- **Leda** - Youthful and energetic
|
||||
- **Aoede** - Breezy and natural
|
||||
|
||||
### Complete Voice List
|
||||
|
||||
- Kore, Charon, Puck, Fenrir, Aoede, Leda, Orus, Zephyr
|
||||
- Autonoe, Callirhoe, Despina, Erinome, Gacrux, Laomedeia
|
||||
- Pulcherrima, Sulafat, Vindemiatrix, Achernar, Achird
|
||||
- Algenib, Algieba, Alnilam, Enceladus, Iapetus, Rasalgethi
|
||||
- Sadachbia, Zubenelgenubi, Vega, Capella, Lyra
|
||||
|
||||
### Listing Available Voices
|
||||
|
||||
To see all available voices with descriptions:
|
||||
|
||||
```bash
|
||||
# List all voices with characteristics
|
||||
fabric --list-gemini-voices
|
||||
|
||||
# List voice names only (for shell completion)
|
||||
fabric --list-gemini-voices --shell-complete-list
|
||||
```
|
||||
|
||||
## Rate Limits
|
||||
|
||||
Google Gemini TTS has usage quotas that vary by plan:
|
||||
|
||||
### Free Tier
|
||||
|
||||
- **15 requests per day** per project per TTS model
|
||||
- Quota resets daily
|
||||
- Applies to all TTS models (e.g., `gemini-2.5-flash-preview-tts`)
|
||||
|
||||
### Rate Limit Errors
|
||||
|
||||
If you exceed your quota, you'll see an error like:
|
||||
|
||||
```text
|
||||
Error 429: You exceeded your current quota, please check your plan and billing details
|
||||
```
|
||||
|
||||
**Solutions:**
|
||||
|
||||
- Wait for daily quota reset (typically at midnight UTC)
|
||||
- Upgrade to a paid plan for higher limits
|
||||
- Use TTS generation strategically for important content
|
||||
|
||||
For current rate limits and pricing, visit: <https://ai.google.dev/gemini-api/docs/rate-limits>
|
||||
|
||||
## Configuration
|
||||
|
||||
### Command Line Options
|
||||
|
||||
- `--voice <voice_name>` - Specify the TTS voice to use
|
||||
- `-o <filename.wav>` - Output audio file (required for TTS models)
|
||||
- `-m <tts_model>` - Specify a TTS-capable model (e.g., `gemini-2.0-flash-tts`)
|
||||
|
||||
### YAML Configuration
|
||||
|
||||
You can also set a default voice in your Fabric configuration file (`~/.config/fabric/config.yaml`):
|
||||
|
||||
```yaml
|
||||
voice: "Charon" # Set your preferred default voice
|
||||
```
|
||||
|
||||
## Requirements
|
||||
|
||||
- Valid Google Gemini API key configured in Fabric
|
||||
- TTS-capable Gemini model (models containing "tts" in the name)
|
||||
- Audio output must be specified with `-o filename.wav`
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
#### Error: "TTS model requires audio output"
|
||||
|
||||
- Solution: Always specify an output file with `-o filename.wav` when using TTS models
|
||||
|
||||
#### Error: "Invalid voice 'X'"
|
||||
|
||||
- Solution: Check that the voice name is spelled correctly and matches one of the supported voices listed above
|
||||
|
||||
#### Error: "TTS generation failed"
|
||||
|
||||
- Solution: Verify your Gemini API key is valid and you have sufficient quota
|
||||
|
||||
### Getting Help
|
||||
|
||||
For additional help with TTS features:
|
||||
|
||||
```bash
|
||||
fabric --help
|
||||
```
|
||||
|
||||
## Technical Details
|
||||
|
||||
- **Audio Format**: WAV files with 24kHz sample rate, 16-bit depth, mono channel
|
||||
- **Language Support**: Automatic language detection for 24+ languages
|
||||
- **Model Requirements**: Models must contain "tts", "preview-tts", or "text-to-speech" in the name
|
||||
- **Voice Selection**: Uses Google's PrebuiltVoiceConfig system for consistent voice quality
|
||||
|
||||
---
|
||||
|
||||
For more information about Fabric, visit the [main documentation](../README.md).
|
||||
36
docs/voices/README.md
Normal file
36
docs/voices/README.md
Normal file
@@ -0,0 +1,36 @@
|
||||
# Voice Samples
|
||||
|
||||
This directory contains sample audio files demonstrating different Gemini TTS voices.
|
||||
|
||||
## Sample Files
|
||||
|
||||
Each voice sample says "The quick brown fox jumped over the lazy dog" to demonstrate the voice characteristics:
|
||||
|
||||
- **Kore.wav** - Firm and confident (default voice)
|
||||
- **Charon.wav** - Informative and clear
|
||||
- **Vega.wav** - Smooth and pleasant
|
||||
- **Capella.wav** - Warm and welcoming
|
||||
- **Achird.wav** - Friendly and approachable
|
||||
- **Lyra.wav** - Melodic and expressive
|
||||
|
||||
## Generating Samples
|
||||
|
||||
To generate these samples, use the following commands:
|
||||
|
||||
```bash
|
||||
# Generate each voice sample
|
||||
echo "The quick brown fox jumped over the lazy dog" | fabric -m gemini-2.5-flash-preview-tts --voice Kore -o docs/voices/Kore.wav
|
||||
echo "The quick brown fox jumped over the lazy dog" | fabric -m gemini-2.5-flash-preview-tts --voice Charon -o docs/voices/Charon.wav
|
||||
echo "The quick brown fox jumped over the lazy dog" | fabric -m gemini-2.5-flash-preview-tts --voice Vega -o docs/voices/Vega.wav
|
||||
echo "The quick brown fox jumped over the lazy dog" | fabric -m gemini-2.5-flash-preview-tts --voice Capella -o docs/voices/Capella.wav
|
||||
echo "The quick brown fox jumped over the lazy dog" | fabric -m gemini-2.5-flash-preview-tts --voice Achird -o docs/voices/Achird.wav
|
||||
echo "The quick brown fox jumped over the lazy dog" | fabric -m gemini-2.5-flash-preview-tts --voice Lyra -o docs/voices/Lyra.wav
|
||||
```
|
||||
|
||||
## Audio Format
|
||||
|
||||
- **Format**: WAV (uncompressed)
|
||||
- **Sample Rate**: 24kHz
|
||||
- **Bit Depth**: 16-bit
|
||||
- **Channels**: Mono
|
||||
- **Approximate Size**: ~500KB per sample
|
||||
@@ -87,6 +87,8 @@ type Flags struct {
|
||||
ThinkStartTag string `long:"think-start-tag" yaml:"thinkStartTag" description:"Start tag for thinking sections" default:"<think>"`
|
||||
ThinkEndTag string `long:"think-end-tag" yaml:"thinkEndTag" description:"End tag for thinking sections" default:"</think>"`
|
||||
DisableResponsesAPI bool `long:"disable-responses-api" yaml:"disableResponsesAPI" description:"Disable OpenAI Responses API (default: false)"`
|
||||
Voice string `long:"voice" yaml:"voice" description:"TTS voice name for supported models (e.g., Kore, Charon, Puck)" default:"Kore"`
|
||||
ListGeminiVoices bool `long:"list-gemini-voices" description:"List all available Gemini TTS voices"`
|
||||
}
|
||||
|
||||
var debug = false
|
||||
@@ -441,6 +443,7 @@ func (o *Flags) BuildChatOptions() (ret *domain.ChatOptions, err error) {
|
||||
SuppressThink: o.SuppressThink,
|
||||
ThinkStartTag: startTag,
|
||||
ThinkEndTag: endTag,
|
||||
Voice: o.Voice,
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
package cli
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
|
||||
"github.com/danielmiessler/fabric/internal/core"
|
||||
"github.com/danielmiessler/fabric/internal/plugins/ai"
|
||||
"github.com/danielmiessler/fabric/internal/plugins/ai/gemini"
|
||||
"github.com/danielmiessler/fabric/internal/plugins/db/fsdb"
|
||||
)
|
||||
|
||||
@@ -58,5 +60,11 @@ func handleListingCommands(currentFlags *Flags, fabricDb *fsdb.Db, registry *cor
|
||||
return true, err
|
||||
}
|
||||
|
||||
if currentFlags.ListGeminiVoices {
|
||||
voicesList := gemini.ListGeminiVoices(currentFlags.ShellCompleteOutput)
|
||||
fmt.Print(voicesList)
|
||||
return true, nil
|
||||
}
|
||||
|
||||
return false, nil
|
||||
}
|
||||
|
||||
@@ -38,6 +38,7 @@ type ChatOptions struct {
|
||||
ThinkEndTag string
|
||||
AudioOutput bool
|
||||
AudioFormat string
|
||||
Voice string
|
||||
}
|
||||
|
||||
// NormalizeMessages remove empty messages and ensure messages order user-assist-user
|
||||
|
||||
@@ -194,6 +194,12 @@ func (o *Client) generateTTSAudio(ctx context.Context, msgs []*chat.ChatCompleti
|
||||
return "", err
|
||||
}
|
||||
|
||||
// Validate voice name before making API call
|
||||
if opts.Voice != "" && !IsValidGeminiVoice(opts.Voice) {
|
||||
validVoices := GetGeminiVoiceNames()
|
||||
return "", fmt.Errorf("invalid voice '%s'. Valid voices are: %v", opts.Voice, validVoices)
|
||||
}
|
||||
|
||||
client, err := o.createGenaiClient(ctx)
|
||||
if err != nil {
|
||||
return "", err
|
||||
@@ -211,12 +217,17 @@ func (o *Client) performTTSGeneration(ctx context.Context, client *genai.Client,
|
||||
}}
|
||||
|
||||
// Configure for TTS generation
|
||||
voiceName := opts.Voice
|
||||
if voiceName == "" {
|
||||
voiceName = "Kore" // Default voice if none specified
|
||||
}
|
||||
|
||||
config := &genai.GenerateContentConfig{
|
||||
ResponseModalities: []string{"AUDIO"},
|
||||
SpeechConfig: &genai.SpeechConfig{
|
||||
VoiceConfig: &genai.VoiceConfig{
|
||||
PrebuiltVoiceConfig: &genai.PrebuiltVoiceConfig{
|
||||
VoiceName: "Kore", // Default voice
|
||||
VoiceName: voiceName,
|
||||
},
|
||||
},
|
||||
},
|
||||
|
||||
218
internal/plugins/ai/gemini/voices.go
Normal file
218
internal/plugins/ai/gemini/voices.go
Normal file
@@ -0,0 +1,218 @@
|
||||
package gemini
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
)
|
||||
|
||||
// GeminiVoice represents a Gemini TTS voice with its characteristics
|
||||
type GeminiVoice struct {
|
||||
Name string
|
||||
Description string
|
||||
Characteristics []string
|
||||
}
|
||||
|
||||
// GetGeminiVoices returns the current list of supported Gemini TTS voices
|
||||
// This list is maintained based on official Google Gemini documentation
|
||||
// https://ai.google.dev/gemini-api/docs/speech-generation
|
||||
func GetGeminiVoices() []GeminiVoice {
|
||||
return []GeminiVoice{
|
||||
// Firm voices
|
||||
{Name: "Kore", Description: "Firm and confident", Characteristics: []string{"firm", "confident", "default"}},
|
||||
{Name: "Orus", Description: "Firm and decisive", Characteristics: []string{"firm", "decisive"}},
|
||||
{Name: "Alnilam", Description: "Firm and strong", Characteristics: []string{"firm", "strong"}},
|
||||
|
||||
// Upbeat voices
|
||||
{Name: "Puck", Description: "Upbeat and energetic", Characteristics: []string{"upbeat", "energetic"}},
|
||||
{Name: "Laomedeia", Description: "Upbeat and lively", Characteristics: []string{"upbeat", "lively"}},
|
||||
|
||||
// Bright voices
|
||||
{Name: "Zephyr", Description: "Bright and cheerful", Characteristics: []string{"bright", "cheerful"}},
|
||||
{Name: "Autonoe", Description: "Bright and optimistic", Characteristics: []string{"bright", "optimistic"}},
|
||||
|
||||
// Informative voices
|
||||
{Name: "Charon", Description: "Informative and clear", Characteristics: []string{"informative", "clear"}},
|
||||
{Name: "Rasalgethi", Description: "Informative and professional", Characteristics: []string{"informative", "professional"}},
|
||||
|
||||
// Natural voices
|
||||
{Name: "Aoede", Description: "Breezy and natural", Characteristics: []string{"breezy", "natural"}},
|
||||
{Name: "Leda", Description: "Youthful and energetic", Characteristics: []string{"youthful", "energetic"}},
|
||||
|
||||
// Gentle voices
|
||||
{Name: "Vindemiatrix", Description: "Gentle and kind", Characteristics: []string{"gentle", "kind"}},
|
||||
{Name: "Achernar", Description: "Soft and gentle", Characteristics: []string{"soft", "gentle"}},
|
||||
{Name: "Enceladus", Description: "Breathy and soft", Characteristics: []string{"breathy", "soft"}},
|
||||
|
||||
// Warm voices
|
||||
{Name: "Sulafat", Description: "Warm and welcoming", Characteristics: []string{"warm", "welcoming"}},
|
||||
{Name: "Capella", Description: "Warm and approachable", Characteristics: []string{"warm", "approachable"}},
|
||||
|
||||
// Clear voices
|
||||
{Name: "Iapetus", Description: "Clear and articulate", Characteristics: []string{"clear", "articulate"}},
|
||||
{Name: "Erinome", Description: "Clear and precise", Characteristics: []string{"clear", "precise"}},
|
||||
|
||||
// Pleasant voices
|
||||
{Name: "Algieba", Description: "Smooth and pleasant", Characteristics: []string{"smooth", "pleasant"}},
|
||||
{Name: "Vega", Description: "Smooth and flowing", Characteristics: []string{"smooth", "flowing"}},
|
||||
|
||||
// Textured voices
|
||||
{Name: "Algenib", Description: "Gravelly texture", Characteristics: []string{"gravelly", "textured"}},
|
||||
|
||||
// Relaxed voices
|
||||
{Name: "Callirrhoe", Description: "Easy-going and relaxed", Characteristics: []string{"relaxed", "easy-going"}},
|
||||
{Name: "Despina", Description: "Smooth and flowing", Characteristics: []string{"smooth", "flowing"}},
|
||||
|
||||
// Mature voices
|
||||
{Name: "Gacrux", Description: "Mature and experienced", Characteristics: []string{"mature", "experienced"}},
|
||||
|
||||
// Expressive voices
|
||||
{Name: "Pulcherrima", Description: "Forward and expressive", Characteristics: []string{"forward", "expressive"}},
|
||||
{Name: "Lyra", Description: "Melodic and expressive", Characteristics: []string{"melodic", "expressive"}},
|
||||
|
||||
// Dynamic voices
|
||||
{Name: "Fenrir", Description: "Excitable and dynamic", Characteristics: []string{"excitable", "dynamic"}},
|
||||
{Name: "Sadachbia", Description: "Lively and animated", Characteristics: []string{"lively", "animated"}},
|
||||
|
||||
// Friendly voices
|
||||
{Name: "Achird", Description: "Friendly and approachable", Characteristics: []string{"friendly", "approachable"}},
|
||||
|
||||
// Casual voices
|
||||
{Name: "Zubenelgenubi", Description: "Casual and conversational", Characteristics: []string{"casual", "conversational"}},
|
||||
|
||||
// Additional voices from latest API
|
||||
{Name: "Sadaltager", Description: "Additional voice option", Characteristics: []string{"additional"}},
|
||||
{Name: "Schedar", Description: "Additional voice option", Characteristics: []string{"additional"}},
|
||||
{Name: "Umbriel", Description: "Additional voice option", Characteristics: []string{"additional"}},
|
||||
}
|
||||
}
|
||||
|
||||
// GetGeminiVoiceNames returns just the voice names in alphabetical order
|
||||
func GetGeminiVoiceNames() []string {
|
||||
voices := GetGeminiVoices()
|
||||
names := make([]string, len(voices))
|
||||
for i, voice := range voices {
|
||||
names[i] = voice.Name
|
||||
}
|
||||
sort.Strings(names)
|
||||
return names
|
||||
}
|
||||
|
||||
// IsValidGeminiVoice checks if a voice name is valid
|
||||
func IsValidGeminiVoice(voiceName string) bool {
|
||||
if voiceName == "" {
|
||||
return true // Empty voice is valid (will use default)
|
||||
}
|
||||
|
||||
for _, voice := range GetGeminiVoices() {
|
||||
if voice.Name == voiceName {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// GetGeminiVoiceByName returns a specific voice by name
|
||||
func GetGeminiVoiceByName(name string) (*GeminiVoice, error) {
|
||||
for _, voice := range GetGeminiVoices() {
|
||||
if voice.Name == name {
|
||||
return &voice, nil
|
||||
}
|
||||
}
|
||||
return nil, fmt.Errorf("voice '%s' not found", name)
|
||||
}
|
||||
|
||||
// ListGeminiVoices formats the voice list for display
|
||||
func ListGeminiVoices(shellCompleteMode bool) string {
|
||||
if shellCompleteMode {
|
||||
// For shell completion, just return voice names
|
||||
names := GetGeminiVoiceNames()
|
||||
result := ""
|
||||
for _, name := range names {
|
||||
result += name + "\n"
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// For human-readable output
|
||||
voices := GetGeminiVoices()
|
||||
result := "Available Gemini Text-to-Speech voices:\n\n"
|
||||
|
||||
// Group by characteristics for better readability
|
||||
groups := map[string][]GeminiVoice{
|
||||
"Firm & Confident": {},
|
||||
"Bright & Cheerful": {},
|
||||
"Warm & Welcoming": {},
|
||||
"Clear & Professional": {},
|
||||
"Natural & Expressive": {},
|
||||
"Other Voices": {},
|
||||
}
|
||||
|
||||
for _, voice := range voices {
|
||||
placed := false
|
||||
for _, char := range voice.Characteristics {
|
||||
switch char {
|
||||
case "firm", "confident", "decisive", "strong":
|
||||
if !placed {
|
||||
groups["Firm & Confident"] = append(groups["Firm & Confident"], voice)
|
||||
placed = true
|
||||
}
|
||||
case "bright", "cheerful", "upbeat", "energetic", "lively":
|
||||
if !placed {
|
||||
groups["Bright & Cheerful"] = append(groups["Bright & Cheerful"], voice)
|
||||
placed = true
|
||||
}
|
||||
case "warm", "welcoming", "friendly", "approachable":
|
||||
if !placed {
|
||||
groups["Warm & Welcoming"] = append(groups["Warm & Welcoming"], voice)
|
||||
placed = true
|
||||
}
|
||||
case "clear", "informative", "professional", "articulate":
|
||||
if !placed {
|
||||
groups["Clear & Professional"] = append(groups["Clear & Professional"], voice)
|
||||
placed = true
|
||||
}
|
||||
case "natural", "expressive", "melodic", "breezy":
|
||||
if !placed {
|
||||
groups["Natural & Expressive"] = append(groups["Natural & Expressive"], voice)
|
||||
placed = true
|
||||
}
|
||||
}
|
||||
}
|
||||
if !placed {
|
||||
groups["Other Voices"] = append(groups["Other Voices"], voice)
|
||||
}
|
||||
}
|
||||
|
||||
// Output grouped voices
|
||||
for groupName, groupVoices := range groups {
|
||||
if len(groupVoices) > 0 {
|
||||
result += fmt.Sprintf("%s:\n", groupName)
|
||||
for _, voice := range groupVoices {
|
||||
defaultStr := ""
|
||||
if voice.Name == "Kore" {
|
||||
defaultStr = " (default)"
|
||||
}
|
||||
result += fmt.Sprintf(" %-15s - %s%s\n", voice.Name, voice.Description, defaultStr)
|
||||
}
|
||||
result += "\n"
|
||||
}
|
||||
}
|
||||
|
||||
result += "Use --voice <voice_name> to select a specific voice.\n"
|
||||
result += "Example: fabric --voice Charon -m gemini-2.0-flash-tts -o output.wav \"Hello world\"\n"
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// NOTE: This implementation maintains a curated list based on official Google documentation.
|
||||
// In the future, if Google provides a dynamic voice discovery API, this can be updated
|
||||
// to make API calls for real-time voice discovery.
|
||||
//
|
||||
// The current approach ensures:
|
||||
// 1. Fast response times (no API calls needed)
|
||||
// 2. Reliable voice information with descriptions
|
||||
// 3. Easy maintenance when new voices are added
|
||||
// 4. Offline functionality
|
||||
//
|
||||
// To update voices: Monitor Google's Gemini TTS documentation at:
|
||||
// https://ai.google.dev/gemini-api/docs/speech-generation
|
||||
Reference in New Issue
Block a user