feat: add Gemini TTS voice selection and listing functionality

## CHANGES

- Add `--voice` flag for TTS voice selection
- Add `--list-gemini-voices` command for voice discovery
- Implement voice validation for Gemini TTS models
- Update shell completions for voice options
- Add comprehensive Gemini TTS documentation
- Create voice samples directory structure
- Extend spell checker dictionary with voice names
2026-04-24 03:00:15 -04:00 · 2025-07-26 15:11:30 -07:00
parent eab335873e
commit 614b1322d5
12 changed files with 474 additions and 11 deletions
--- a/internal/plugins/ai/gemini/gemini.go
+++ b/internal/plugins/ai/gemini/gemini.go
@@ -194,6 +194,12 @@ func (o *Client) generateTTSAudio(ctx context.Context, msgs []*chat.ChatCompleti
 		return "", err
 	}

+	// Validate voice name before making API call
+	if opts.Voice != "" && !IsValidGeminiVoice(opts.Voice) {
+		validVoices := GetGeminiVoiceNames()
+		return "", fmt.Errorf("invalid voice '%s'. Valid voices are: %v", opts.Voice, validVoices)
+	}
+
 	client, err := o.createGenaiClient(ctx)
 	if err != nil {
 		return "", err
@@ -211,12 +217,17 @@ func (o *Client) performTTSGeneration(ctx context.Context, client *genai.Client,
 	}}

 	// Configure for TTS generation
+	voiceName := opts.Voice
+	if voiceName == "" {
+		voiceName = "Kore" // Default voice if none specified
+	}
+
 	config := &genai.GenerateContentConfig{
 		ResponseModalities: []string{"AUDIO"},
 		SpeechConfig: &genai.SpeechConfig{
 			VoiceConfig: &genai.VoiceConfig{
 				PrebuiltVoiceConfig: &genai.PrebuiltVoiceConfig{
-					VoiceName: "Kore", // Default voice
+					VoiceName: voiceName,
 				},
 			},
 		},
--- a/internal/plugins/ai/gemini/voices.go
+++ b/internal/plugins/ai/gemini/voices.go
@@ -0,0 +1,218 @@
+package gemini
+
+import (
+	"fmt"
+	"sort"
+	"strings"
+)
+
+// GeminiVoice describes a single prebuilt Gemini text-to-speech voice.
+type GeminiVoice struct {
+	Name, Description string   // voice identifier (matched exactly on lookup) and a short human-readable summary
+	Characteristics   []string // lowercase trait tags; used to pick the display group when listing voices
+}
+
+// GetGeminiVoices returns the curated list of supported Gemini TTS voices.
+//
+// The list is maintained by hand from the official Google Gemini documentation:
+// https://ai.google.dev/gemini-api/docs/speech-generation
+//
+// A new slice is built on every call, so callers may modify the result without
+// affecting later calls. The order of the entries is the order in which voices
+// appear inside each group of the human-readable listing.
+//
+// NOTE(review): Capella, Vega and Lyra do not seem to appear in the voice table
+// of the documentation linked above — confirm that the API accepts them, and
+// drop them from this list if it does not.
+func GetGeminiVoices() []GeminiVoice {
+	return []GeminiVoice{
+		// Firm voices
+		{Name: "Kore", Description: "Firm and confident", Characteristics: []string{"firm", "confident", "default"}},
+		{Name: "Orus", Description: "Firm and decisive", Characteristics: []string{"firm", "decisive"}},
+		{Name: "Alnilam", Description: "Firm and strong", Characteristics: []string{"firm", "strong"}},
+
+		// Upbeat voices
+		{Name: "Puck", Description: "Upbeat and energetic", Characteristics: []string{"upbeat", "energetic"}},
+		{Name: "Laomedeia", Description: "Upbeat and lively", Characteristics: []string{"upbeat", "lively"}},
+
+		// Bright voices
+		{Name: "Zephyr", Description: "Bright and cheerful", Characteristics: []string{"bright", "cheerful"}},
+		{Name: "Autonoe", Description: "Bright and optimistic", Characteristics: []string{"bright", "optimistic"}},
+
+		// Informative voices
+		{Name: "Charon", Description: "Informative and clear", Characteristics: []string{"informative", "clear"}},
+		{Name: "Rasalgethi", Description: "Informative and professional", Characteristics: []string{"informative", "professional"}},
+
+		// Natural voices
+		{Name: "Aoede", Description: "Breezy and natural", Characteristics: []string{"breezy", "natural"}},
+		{Name: "Leda", Description: "Youthful and energetic", Characteristics: []string{"youthful", "energetic"}},
+
+		// Gentle voices
+		{Name: "Vindemiatrix", Description: "Gentle and kind", Characteristics: []string{"gentle", "kind"}},
+		{Name: "Achernar", Description: "Soft and gentle", Characteristics: []string{"soft", "gentle"}},
+		{Name: "Enceladus", Description: "Breathy and soft", Characteristics: []string{"breathy", "soft"}},
+
+		// Warm voices
+		{Name: "Sulafat", Description: "Warm and welcoming", Characteristics: []string{"warm", "welcoming"}},
+		{Name: "Capella", Description: "Warm and approachable", Characteristics: []string{"warm", "approachable"}},
+
+		// Clear voices
+		{Name: "Iapetus", Description: "Clear and articulate", Characteristics: []string{"clear", "articulate"}},
+		{Name: "Erinome", Description: "Clear and precise", Characteristics: []string{"clear", "precise"}},
+
+		// Pleasant voices
+		{Name: "Algieba", Description: "Smooth and pleasant", Characteristics: []string{"smooth", "pleasant"}},
+		{Name: "Vega", Description: "Smooth and flowing", Characteristics: []string{"smooth", "flowing"}},
+
+		// Textured voices
+		{Name: "Algenib", Description: "Gravelly texture", Characteristics: []string{"gravelly", "textured"}},
+
+		// Relaxed voices
+		{Name: "Callirrhoe", Description: "Easy-going and relaxed", Characteristics: []string{"relaxed", "easy-going"}},
+		{Name: "Despina", Description: "Smooth and flowing", Characteristics: []string{"smooth", "flowing"}},
+
+		// Mature voices
+		{Name: "Gacrux", Description: "Mature and experienced", Characteristics: []string{"mature", "experienced"}},
+
+		// Expressive voices
+		{Name: "Pulcherrima", Description: "Forward and expressive", Characteristics: []string{"forward", "expressive"}},
+		{Name: "Lyra", Description: "Melodic and expressive", Characteristics: []string{"melodic", "expressive"}},
+
+		// Dynamic voices
+		{Name: "Fenrir", Description: "Excitable and dynamic", Characteristics: []string{"excitable", "dynamic"}},
+		{Name: "Sadachbia", Description: "Lively and animated", Characteristics: []string{"lively", "animated"}},
+
+		// Friendly voices
+		{Name: "Achird", Description: "Friendly and approachable", Characteristics: []string{"friendly", "approachable"}},
+
+		// Casual voices
+		{Name: "Zubenelgenubi", Description: "Casual and conversational", Characteristics: []string{"casual", "conversational"}},
+
+		// Additional voices from latest API. Descriptions follow the one-word
+		// descriptors in the documentation's voice table; none of these tags
+		// matches a display group, so the voices are listed under "Other Voices".
+		{Name: "Sadaltager", Description: "Knowledgeable", Characteristics: []string{"knowledgeable"}},
+		{Name: "Schedar", Description: "Even", Characteristics: []string{"even"}},
+		{Name: "Umbriel", Description: "Easy-going", Characteristics: []string{"easy-going"}},
+	}
+}
+
+// GetGeminiVoiceNames lists the name of every supported voice, sorted in
+// ascending lexical order.
+func GetGeminiVoiceNames() []string {
+	catalog := GetGeminiVoices()
+	sorted := make([]string, 0, len(catalog))
+	for idx := range catalog {
+		sorted = append(sorted, catalog[idx].Name)
+	}
+	sort.Strings(sorted)
+	return sorted
+}
+
+// IsValidGeminiVoice reports whether voiceName is acceptable as a Gemini TTS
+// voice. The empty string is accepted (it stands for "use the default voice");
+// any other value must equal a supported voice name exactly, including case.
+func IsValidGeminiVoice(voiceName string) bool {
+	// An unset voice is always fine; only non-empty names need a lookup.
+	known := voiceName == ""
+	if !known {
+		catalog := GetGeminiVoices()
+		for idx := 0; idx < len(catalog) && !known; idx++ {
+			known = catalog[idx].Name == voiceName
+		}
+	}
+	return known
+}
+
+// GetGeminiVoiceByName looks up a supported voice by its exact name.
+//
+// On success it returns a pointer to a copy of the matching entry and a nil
+// error. When no voice has that name (including the empty string) it returns
+// nil and an error that mentions the requested name.
+func GetGeminiVoiceByName(name string) (*GeminiVoice, error) {
+	catalog := GetGeminiVoices()
+	for idx := range catalog {
+		if catalog[idx].Name != name {
+			continue
+		}
+		match := catalog[idx]
+		return &match, nil
+	}
+	return nil, fmt.Errorf("voice '%s' not found", name)
+}
+
+// ListGeminiVoices renders the supported Gemini TTS voices as text.
+//
+// When shellCompleteMode is true the result is one voice name per line, in
+// alphabetical order and without any decoration, for use by shell completion.
+//
+// Otherwise the result is a human-readable listing: a header, the voices
+// grouped by character (each line showing name and description, with the
+// default voice marked), and a short usage hint. Groups are emitted in a fixed
+// order and empty groups are omitted, so repeated calls return identical text.
+func ListGeminiVoices(shellCompleteMode bool) string {
+	var b strings.Builder
+
+	if shellCompleteMode {
+		for _, name := range GetGeminiVoiceNames() {
+			b.WriteString(name)
+			b.WriteByte('\n')
+		}
+		return b.String()
+	}
+
+	// Display order of the groups. This is an explicit slice because ranging
+	// over a map yields its keys in an unspecified, varying order, which would
+	// shuffle the sections of the listing between runs.
+	groupOrder := []string{
+		"Firm & Confident",
+		"Bright & Cheerful",
+		"Warm & Welcoming",
+		"Clear & Professional",
+		"Natural & Expressive",
+		"Other Voices",
+	}
+
+	// Bucket the voices by group, keeping their catalog order within a group.
+	groups := make(map[string][]GeminiVoice, len(groupOrder))
+	for _, voice := range GetGeminiVoices() {
+		groupName := geminiVoiceGroup(voice)
+		groups[groupName] = append(groups[groupName], voice)
+	}
+
+	b.WriteString("Available Gemini Text-to-Speech voices:\n\n")
+	for _, groupName := range groupOrder {
+		groupVoices := groups[groupName]
+		if len(groupVoices) == 0 {
+			continue
+		}
+		fmt.Fprintf(&b, "%s:\n", groupName)
+		for _, voice := range groupVoices {
+			defaultStr := ""
+			if voice.Name == "Kore" {
+				defaultStr = " (default)"
+			}
+			fmt.Fprintf(&b, "  %-15s - %s%s\n", voice.Name, voice.Description, defaultStr)
+		}
+		b.WriteByte('\n')
+	}
+
+	b.WriteString("Use --voice <voice_name> to select a specific voice.\n")
+	b.WriteString("Example: fabric --voice Charon -m gemini-2.0-flash-tts -o output.wav \"Hello world\"\n")
+
+	return b.String()
+}
+
+// geminiVoiceGroup returns the display group for voice. The first entry of
+// voice.Characteristics that belongs to a known group decides the result;
+// voices with no recognized characteristic fall into "Other Voices".
+func geminiVoiceGroup(voice GeminiVoice) string {
+	for _, trait := range voice.Characteristics {
+		switch trait {
+		case "firm", "confident", "decisive", "strong":
+			return "Firm & Confident"
+		case "bright", "cheerful", "upbeat", "energetic", "lively":
+			return "Bright & Cheerful"
+		case "warm", "welcoming", "friendly", "approachable":
+			return "Warm & Welcoming"
+		case "clear", "informative", "professional", "articulate":
+			return "Clear & Professional"
+		case "natural", "expressive", "melodic", "breezy":
+			return "Natural & Expressive"
+		}
+	}
+	return "Other Voices"
+}
+
+// NOTE: This implementation maintains a curated list based on official Google documentation.
+// In the future, if Google provides a dynamic voice discovery API, this can be updated
+// to make API calls for real-time voice discovery.
+//
+// The current approach ensures:
+// 1. Fast response times (no API calls needed)
+// 2. Reliable voice information with descriptions
+// 3. Easy maintenance when new voices are added
+// 4. Offline functionality
+//
+// To update voices: Monitor Google's Gemini TTS documentation at:
+// https://ai.google.dev/gemini-api/docs/speech-generation