mirror of
https://github.com/danielmiessler/Fabric.git
synced 2026-04-24 03:00:15 -04:00
feat: add Gemini TTS voice selection and listing functionality
## CHANGES

- Add `--voice` flag for TTS voice selection
- Add `--list-gemini-voices` command for voice discovery
- Implement voice validation for Gemini TTS models
- Update shell completions for voice options
- Add comprehensive Gemini TTS documentation
- Create voice samples directory structure
- Extend spell checker dictionary with voice names
This commit is contained in:
@@ -194,6 +194,12 @@ func (o *Client) generateTTSAudio(ctx context.Context, msgs []*chat.ChatCompleti
|
||||
return "", err
|
||||
}
|
||||
|
||||
// Validate voice name before making API call
|
||||
if opts.Voice != "" && !IsValidGeminiVoice(opts.Voice) {
|
||||
validVoices := GetGeminiVoiceNames()
|
||||
return "", fmt.Errorf("invalid voice '%s'. Valid voices are: %v", opts.Voice, validVoices)
|
||||
}
|
||||
|
||||
client, err := o.createGenaiClient(ctx)
|
||||
if err != nil {
|
||||
return "", err
|
||||
@@ -211,12 +217,17 @@ func (o *Client) performTTSGeneration(ctx context.Context, client *genai.Client,
|
||||
}}
|
||||
|
||||
// Configure for TTS generation
|
||||
voiceName := opts.Voice
|
||||
if voiceName == "" {
|
||||
voiceName = "Kore" // Default voice if none specified
|
||||
}
|
||||
|
||||
config := &genai.GenerateContentConfig{
|
||||
ResponseModalities: []string{"AUDIO"},
|
||||
SpeechConfig: &genai.SpeechConfig{
|
||||
VoiceConfig: &genai.VoiceConfig{
|
||||
PrebuiltVoiceConfig: &genai.PrebuiltVoiceConfig{
|
||||
VoiceName: "Kore", // Default voice
|
||||
VoiceName: voiceName,
|
||||
},
|
||||
},
|
||||
},
|
||||
|
||||
218
internal/plugins/ai/gemini/voices.go
Normal file
218
internal/plugins/ai/gemini/voices.go
Normal file
@@ -0,0 +1,218 @@
|
||||
package gemini
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
)
|
||||
|
||||
// GeminiVoice is one entry in the Gemini text-to-speech voice catalogue.
type GeminiVoice struct {
	// Name is the voice identifier that lookups and validation compare
	// against; Description is the short summary shown in the voice listing.
	Name, Description string
	// Characteristics holds lowercase trait keywords for the voice; the
	// listing uses them to decide which display group the voice falls into.
	Characteristics []string
}
|
||||
|
||||
// GetGeminiVoices returns the current list of supported Gemini TTS voices
|
||||
// This list is maintained based on official Google Gemini documentation
|
||||
// https://ai.google.dev/gemini-api/docs/speech-generation
|
||||
func GetGeminiVoices() []GeminiVoice {
|
||||
return []GeminiVoice{
|
||||
// Firm voices
|
||||
{Name: "Kore", Description: "Firm and confident", Characteristics: []string{"firm", "confident", "default"}},
|
||||
{Name: "Orus", Description: "Firm and decisive", Characteristics: []string{"firm", "decisive"}},
|
||||
{Name: "Alnilam", Description: "Firm and strong", Characteristics: []string{"firm", "strong"}},
|
||||
|
||||
// Upbeat voices
|
||||
{Name: "Puck", Description: "Upbeat and energetic", Characteristics: []string{"upbeat", "energetic"}},
|
||||
{Name: "Laomedeia", Description: "Upbeat and lively", Characteristics: []string{"upbeat", "lively"}},
|
||||
|
||||
// Bright voices
|
||||
{Name: "Zephyr", Description: "Bright and cheerful", Characteristics: []string{"bright", "cheerful"}},
|
||||
{Name: "Autonoe", Description: "Bright and optimistic", Characteristics: []string{"bright", "optimistic"}},
|
||||
|
||||
// Informative voices
|
||||
{Name: "Charon", Description: "Informative and clear", Characteristics: []string{"informative", "clear"}},
|
||||
{Name: "Rasalgethi", Description: "Informative and professional", Characteristics: []string{"informative", "professional"}},
|
||||
|
||||
// Natural voices
|
||||
{Name: "Aoede", Description: "Breezy and natural", Characteristics: []string{"breezy", "natural"}},
|
||||
{Name: "Leda", Description: "Youthful and energetic", Characteristics: []string{"youthful", "energetic"}},
|
||||
|
||||
// Gentle voices
|
||||
{Name: "Vindemiatrix", Description: "Gentle and kind", Characteristics: []string{"gentle", "kind"}},
|
||||
{Name: "Achernar", Description: "Soft and gentle", Characteristics: []string{"soft", "gentle"}},
|
||||
{Name: "Enceladus", Description: "Breathy and soft", Characteristics: []string{"breathy", "soft"}},
|
||||
|
||||
// Warm voices
|
||||
{Name: "Sulafat", Description: "Warm and welcoming", Characteristics: []string{"warm", "welcoming"}},
|
||||
{Name: "Capella", Description: "Warm and approachable", Characteristics: []string{"warm", "approachable"}},
|
||||
|
||||
// Clear voices
|
||||
{Name: "Iapetus", Description: "Clear and articulate", Characteristics: []string{"clear", "articulate"}},
|
||||
{Name: "Erinome", Description: "Clear and precise", Characteristics: []string{"clear", "precise"}},
|
||||
|
||||
// Pleasant voices
|
||||
{Name: "Algieba", Description: "Smooth and pleasant", Characteristics: []string{"smooth", "pleasant"}},
|
||||
{Name: "Vega", Description: "Smooth and flowing", Characteristics: []string{"smooth", "flowing"}},
|
||||
|
||||
// Textured voices
|
||||
{Name: "Algenib", Description: "Gravelly texture", Characteristics: []string{"gravelly", "textured"}},
|
||||
|
||||
// Relaxed voices
|
||||
{Name: "Callirrhoe", Description: "Easy-going and relaxed", Characteristics: []string{"relaxed", "easy-going"}},
|
||||
{Name: "Despina", Description: "Smooth and flowing", Characteristics: []string{"smooth", "flowing"}},
|
||||
|
||||
// Mature voices
|
||||
{Name: "Gacrux", Description: "Mature and experienced", Characteristics: []string{"mature", "experienced"}},
|
||||
|
||||
// Expressive voices
|
||||
{Name: "Pulcherrima", Description: "Forward and expressive", Characteristics: []string{"forward", "expressive"}},
|
||||
{Name: "Lyra", Description: "Melodic and expressive", Characteristics: []string{"melodic", "expressive"}},
|
||||
|
||||
// Dynamic voices
|
||||
{Name: "Fenrir", Description: "Excitable and dynamic", Characteristics: []string{"excitable", "dynamic"}},
|
||||
{Name: "Sadachbia", Description: "Lively and animated", Characteristics: []string{"lively", "animated"}},
|
||||
|
||||
// Friendly voices
|
||||
{Name: "Achird", Description: "Friendly and approachable", Characteristics: []string{"friendly", "approachable"}},
|
||||
|
||||
// Casual voices
|
||||
{Name: "Zubenelgenubi", Description: "Casual and conversational", Characteristics: []string{"casual", "conversational"}},
|
||||
|
||||
// Additional voices from latest API
|
||||
{Name: "Sadaltager", Description: "Additional voice option", Characteristics: []string{"additional"}},
|
||||
{Name: "Schedar", Description: "Additional voice option", Characteristics: []string{"additional"}},
|
||||
{Name: "Umbriel", Description: "Additional voice option", Characteristics: []string{"additional"}},
|
||||
}
|
||||
}
|
||||
|
||||
// GetGeminiVoiceNames returns just the voice names in alphabetical order
|
||||
func GetGeminiVoiceNames() []string {
|
||||
voices := GetGeminiVoices()
|
||||
names := make([]string, len(voices))
|
||||
for i, voice := range voices {
|
||||
names[i] = voice.Name
|
||||
}
|
||||
sort.Strings(names)
|
||||
return names
|
||||
}
|
||||
|
||||
// IsValidGeminiVoice checks if a voice name is valid
|
||||
func IsValidGeminiVoice(voiceName string) bool {
|
||||
if voiceName == "" {
|
||||
return true // Empty voice is valid (will use default)
|
||||
}
|
||||
|
||||
for _, voice := range GetGeminiVoices() {
|
||||
if voice.Name == voiceName {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// GetGeminiVoiceByName returns a specific voice by name
|
||||
func GetGeminiVoiceByName(name string) (*GeminiVoice, error) {
|
||||
for _, voice := range GetGeminiVoices() {
|
||||
if voice.Name == name {
|
||||
return &voice, nil
|
||||
}
|
||||
}
|
||||
return nil, fmt.Errorf("voice '%s' not found", name)
|
||||
}
|
||||
|
||||
// ListGeminiVoices formats the voice list for display
|
||||
func ListGeminiVoices(shellCompleteMode bool) string {
|
||||
if shellCompleteMode {
|
||||
// For shell completion, just return voice names
|
||||
names := GetGeminiVoiceNames()
|
||||
result := ""
|
||||
for _, name := range names {
|
||||
result += name + "\n"
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// For human-readable output
|
||||
voices := GetGeminiVoices()
|
||||
result := "Available Gemini Text-to-Speech voices:\n\n"
|
||||
|
||||
// Group by characteristics for better readability
|
||||
groups := map[string][]GeminiVoice{
|
||||
"Firm & Confident": {},
|
||||
"Bright & Cheerful": {},
|
||||
"Warm & Welcoming": {},
|
||||
"Clear & Professional": {},
|
||||
"Natural & Expressive": {},
|
||||
"Other Voices": {},
|
||||
}
|
||||
|
||||
for _, voice := range voices {
|
||||
placed := false
|
||||
for _, char := range voice.Characteristics {
|
||||
switch char {
|
||||
case "firm", "confident", "decisive", "strong":
|
||||
if !placed {
|
||||
groups["Firm & Confident"] = append(groups["Firm & Confident"], voice)
|
||||
placed = true
|
||||
}
|
||||
case "bright", "cheerful", "upbeat", "energetic", "lively":
|
||||
if !placed {
|
||||
groups["Bright & Cheerful"] = append(groups["Bright & Cheerful"], voice)
|
||||
placed = true
|
||||
}
|
||||
case "warm", "welcoming", "friendly", "approachable":
|
||||
if !placed {
|
||||
groups["Warm & Welcoming"] = append(groups["Warm & Welcoming"], voice)
|
||||
placed = true
|
||||
}
|
||||
case "clear", "informative", "professional", "articulate":
|
||||
if !placed {
|
||||
groups["Clear & Professional"] = append(groups["Clear & Professional"], voice)
|
||||
placed = true
|
||||
}
|
||||
case "natural", "expressive", "melodic", "breezy":
|
||||
if !placed {
|
||||
groups["Natural & Expressive"] = append(groups["Natural & Expressive"], voice)
|
||||
placed = true
|
||||
}
|
||||
}
|
||||
}
|
||||
if !placed {
|
||||
groups["Other Voices"] = append(groups["Other Voices"], voice)
|
||||
}
|
||||
}
|
||||
|
||||
// Output grouped voices
|
||||
for groupName, groupVoices := range groups {
|
||||
if len(groupVoices) > 0 {
|
||||
result += fmt.Sprintf("%s:\n", groupName)
|
||||
for _, voice := range groupVoices {
|
||||
defaultStr := ""
|
||||
if voice.Name == "Kore" {
|
||||
defaultStr = " (default)"
|
||||
}
|
||||
result += fmt.Sprintf(" %-15s - %s%s\n", voice.Name, voice.Description, defaultStr)
|
||||
}
|
||||
result += "\n"
|
||||
}
|
||||
}
|
||||
|
||||
result += "Use --voice <voice_name> to select a specific voice.\n"
|
||||
result += "Example: fabric --voice Charon -m gemini-2.0-flash-tts -o output.wav \"Hello world\"\n"
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// NOTE: This implementation maintains a curated list based on official Google documentation.
|
||||
// In the future, if Google provides a dynamic voice discovery API, this can be updated
|
||||
// to make API calls for real-time voice discovery.
|
||||
//
|
||||
// The current approach ensures:
|
||||
// 1. Fast response times (no API calls needed)
|
||||
// 2. Reliable voice information with descriptions
|
||||
// 3. Easy maintenance when new voices are added
|
||||
// 4. Offline functionality
|
||||
//
|
||||
// To update voices: Monitor Google's Gemini TTS documentation at:
|
||||
// https://ai.google.dev/gemini-api/docs/speech-generation
|
||||
Reference in New Issue
Block a user