chore(release): Update version to v1.4.291

Merge pull request #1715 from ksylvan/0818-openai-transcribe-using-openai-models
Add speech-to-text via OpenAI with transcription flags and completions
2026-01-11 07:18:03 -05:00 · 2025-08-18 15:05:02 +00:00 · 2025-08-18 08:02:36 -07:00 · 2025-08-18 07:59:50 -07:00
15 changed files with 277 additions and 7 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -99,6 +99,7 @@
 		"mbed",
 		"metacharacters",
 		"Miessler",
+		"mpga",
 		"nometa",
 		"numpy",
 		"ollama",
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,15 @@
 # Changelog

+## v1.4.291 (2025-08-18)
+
+### PR [#1715](https://github.com/danielmiessler/Fabric/pull/1715) by [ksylvan](https://github.com/ksylvan): feat: add speech-to-text via OpenAI with transcription flags and completions
+
+- Add --transcribe-file flag to transcribe audio or video
+- Add --transcribe-model flag with model listing and completion
+- Add --split-media-file flag to chunk files over 25MB
+- Implement OpenAI transcription using Whisper and GPT-4o Transcribe
+- Integrate transcription pipeline into CLI before readability processing
+
 ## v1.4.290 (2025-08-17)

 ### PR [#1714](https://github.com/danielmiessler/Fabric/pull/1714) by [ksylvan](https://github.com/ksylvan): feat: add per-pattern model mapping support via environment variables
--- a/README.md
+++ b/README.md
@@ -57,6 +57,7 @@ Below are the **new features and capabilities** we've added (newest first):

 ### Recent Major Features

+- [v1.4.291](https://github.com/danielmiessler/fabric/releases/tag/v1.4.291) (Aug 18, 2025) — **Speech To Text**: Add OpenAI speech-to-text support with `--transcribe-file`, `--transcribe-model`, and `--split-media-file` flags.
 - [v1.4.287](https://github.com/danielmiessler/fabric/releases/tag/v1.4.287) (Aug 16, 2025) — **AI Reasoning**: Add Thinking to Gemini models and introduce `readme_updates` python script
 - [v1.4.286](https://github.com/danielmiessler/fabric/releases/tag/v1.4.286) (Aug 14, 2025) — **AI Reasoning**: Introduce Thinking Config Across Anthropic and OpenAI Providers
 - [v1.4.285](https://github.com/danielmiessler/fabric/releases/tag/v1.4.285) (Aug 13, 2025) — **Extended Context**: Enable One Million Token Context Beta Feature for Sonnet-4
@@ -648,7 +649,7 @@ Fabric _Patterns_ are different than most prompts you'll see.
 Here's an example of a Fabric Pattern.

 ```bash
-https://github.com/danielmiessler/fabric/blob/main/patterns/extract_wisdom/system.md
+https://github.com/danielmiessler/Fabric/blob/main/data/patterns/extract_wisdom/system.md
 ```

 <img width="1461" alt="pattern-example" src="https://github.com/danielmiessler/fabric/assets/50654/b910c551-9263-405f-9735-71ca69bbab6d">
--- a/cmd/fabric/version.go
+++ b/cmd/fabric/version.go
@@ -1,3 +1,3 @@
 package main

-var version = "v1.4.290"
+var version = "v1.4.291"
--- a/cmd/generate_changelog/changelog.db
+++ b/cmd/generate_changelog/changelog.db
--- a/completions/_fabric
+++ b/completions/_fabric
@@ -59,6 +59,13 @@ _fabric_gemini_voices() {
  compadd -X "Gemini TTS Voices:" ${voices}
 }

+# Completion helper for --transcribe-model: asks the command being completed
+# (the first word on the command line) for its transcription model list and
+# offers each line of output as a candidate.
+_fabric_transcription_models() {
+  local -a models
+  local cmd=${words[1]}
+  # (f) splits the command output on newlines, one array element per line;
+  # stderr is discarded so error text never becomes a candidate.
+  models=(${(f)"$($cmd --list-transcription-models --shell-complete-list 2>/dev/null)"})
+  compadd -X "Transcription Models:" ${models}
+}
+
 _fabric() {
  local curcontext="$curcontext" state line
  typeset -A opt_args
@@ -135,6 +142,9 @@ _fabric() {
    '(--think-start-tag)--think-start-tag[Start tag for thinking sections (default: <think>)]:start tag:' \
    '(--think-end-tag)--think-end-tag[End tag for thinking sections (default: </think>)]:end tag:' \
    '(--disable-responses-api)--disable-responses-api[Disable OpenAI Responses API (default: false)]' \
+    '(--transcribe-file)--transcribe-file[Audio or video file to transcribe]:audio file:_files -g "*.mp3 *.mp4 *.mpeg *.mpga *.m4a *.wav *.webm"' \
+    '(--transcribe-model)--transcribe-model[Model to use for transcription (separate from chat model)]:transcribe model:_fabric_transcription_models' \
+    '(--split-media-file)--split-media-file[Split audio/video files larger than 25MB using ffmpeg]' \
    '(--notification)--notification[Send desktop notification when command completes]' \
    '(--notification-command)--notification-command[Custom command to run for notifications]:notification command:' \
    '(-h --help)'{-h,--help}'[Show this help message]' \
--- a/completions/fabric.bash
+++ b/completions/fabric.bash
@@ -13,7 +13,7 @@ _fabric() {
  _get_comp_words_by_ref -n : cur prev words cword

  # Define all possible options/flags
-  local opts="--pattern -p --variable -v --context -C --session --attachment -a --setup -S --temperature -t --topp -T --stream -s --presencepenalty -P --raw -r --frequencypenalty -F --listpatterns -l --listmodels -L --listcontexts -x --listsessions -X --updatepatterns -U --copy -c --model -m --vendor -V --modelContextLength --output -o --output-session --latest -n --changeDefaultModel -d --youtube -y --playlist --transcript --transcript-with-timestamps --comments --metadata --yt-dlp-args --language -g --scrape_url -u --scrape_question -q --seed -e --thinking --wipecontext -w --wipesession -W --printcontext --printsession --readability --input-has-vars --no-variable-replacement --dry-run --serve --serveOllama --address --api-key --config --search --search-location --image-file --image-size --image-quality --image-compression --image-background --suppress-think --think-start-tag --think-end-tag --disable-responses-api --voice --list-gemini-voices --notification --notification-command --version --listextensions --addextension --rmextension --strategy --liststrategies --listvendors --shell-complete-list --help -h"
+  local opts="--pattern -p --variable -v --context -C --session --attachment -a --setup -S --temperature -t --topp -T --stream -s --presencepenalty -P --raw -r --frequencypenalty -F --listpatterns -l --listmodels -L --listcontexts -x --listsessions -X --updatepatterns -U --copy -c --model -m --vendor -V --modelContextLength --output -o --output-session --latest -n --changeDefaultModel -d --youtube -y --playlist --transcript --transcript-with-timestamps --comments --metadata --yt-dlp-args --language -g --scrape_url -u --scrape_question -q --seed -e --thinking --wipecontext -w --wipesession -W --printcontext --printsession --readability --input-has-vars --no-variable-replacement --dry-run --serve --serveOllama --address --api-key --config --search --search-location --image-file --image-size --image-quality --image-compression --image-background --suppress-think --think-start-tag --think-end-tag --disable-responses-api --transcribe-file --transcribe-model --split-media-file --voice --list-gemini-voices --notification --notification-command --version --listextensions --addextension --rmextension --strategy --liststrategies --listvendors --shell-complete-list --help -h"

  # Helper function for dynamic completions
  _fabric_get_list() {
@@ -74,8 +74,12 @@ _fabric() {
    COMPREPLY=($(compgen -W "$(_fabric_get_list --list-gemini-voices)" -- "${cur}"))
    return 0
    ;;
+  --transcribe-model)
+    COMPREPLY=($(compgen -W "$(_fabric_get_list --list-transcription-models)" -- "${cur}"))
+    return 0
+    ;;
  # Options requiring file/directory paths
-  -a | --attachment | -o | --output | --config | --addextension | --image-file)
+  -a | --attachment | -o | --output | --config | --addextension | --image-file | --transcribe-file)
    _filedir
    return 0
    ;;
--- a/completions/fabric.fish
+++ b/completions/fabric.fish
@@ -47,6 +47,11 @@ function __fabric_get_gemini_voices
        $cmd --list-gemini-voices --shell-complete-list 2>/dev/null
 end

+# Print the transcription model names, one per line, for use as completion
+# candidates. The command to run is taken from the first token of the command
+# line being completed; its stderr is discarded.
+function __fabric_get_transcription_models
+        set cmd (commandline -opc)[1]
+        $cmd --list-transcription-models --shell-complete-list 2>/dev/null
+end
+
 # Main completion function
 function __fabric_register_completions
        set cmd $argv[1]
@@ -92,6 +97,8 @@ function __fabric_register_completions
        complete -c $cmd -l think-start-tag -d "Start tag for thinking sections (default: <think>)"
        complete -c $cmd -l think-end-tag -d "End tag for thinking sections (default: </think>)"
        complete -c $cmd -l voice -d "TTS voice name for supported models (e.g., Kore, Charon, Puck)" -a "(__fabric_get_gemini_voices)"
+        complete -c $cmd -l transcribe-file -d "Audio or video file to transcribe" -r -a "*.mp3 *.mp4 *.mpeg *.mpga *.m4a *.wav *.webm"
+        complete -c $cmd -l transcribe-model -d "Model to use for transcription (separate from chat model)" -a "(__fabric_get_transcription_models)"
        complete -c $cmd -l notification-command -d "Custom command to run for notifications (overrides built-in notifications)"

        # Boolean flags (no arguments)
@@ -127,6 +134,7 @@ function __fabric_register_completions
        complete -c $cmd -l shell-complete-list -d "Output raw list without headers/formatting (for shell completion)"
        complete -c $cmd -l suppress-think -d "Suppress text enclosed in thinking tags"
        complete -c $cmd -l disable-responses-api -d "Disable OpenAI Responses API (default: false)"
+        complete -c $cmd -l split-media-file -d "Split audio/video files larger than 25MB using ffmpeg"
        complete -c $cmd -l notification -d "Send desktop notification when command completes"
        complete -c $cmd -s h -l help -d "Show this help message"
 end
--- a/internal/cli/cli.go
+++ b/internal/cli/cli.go
@@ -74,6 +74,15 @@ func Cli(version string) (err error) {
 		return
 	}

+	// Handle transcription if specified
+	if currentFlags.TranscribeFile != "" {
+		var transcriptionMessage string
+		if transcriptionMessage, err = handleTranscription(currentFlags, registry); err != nil {
+			return
+		}
+		currentFlags.Message = AppendMessage(currentFlags.Message, transcriptionMessage)
+	}
+
 	// Process HTML readability if needed
 	if currentFlags.HtmlReadability {
 		if msg, cleanErr := converter.HtmlReadability(currentFlags.Message); cleanErr != nil {
--- a/internal/cli/flags.go
+++ b/internal/cli/flags.go
@@ -92,8 +92,12 @@ type Flags struct {
 	ThinkStartTag                   string               `long:"think-start-tag" yaml:"thinkStartTag" description:"Start tag for thinking sections" default:"<think>"`
 	ThinkEndTag                     string               `long:"think-end-tag" yaml:"thinkEndTag" description:"End tag for thinking sections" default:"</think>"`
 	DisableResponsesAPI             bool                 `long:"disable-responses-api" yaml:"disableResponsesAPI" description:"Disable OpenAI Responses API (default: false)"`
+	TranscribeFile                  string               `long:"transcribe-file" yaml:"transcribeFile" description:"Audio or video file to transcribe"`
+	TranscribeModel                 string               `long:"transcribe-model" yaml:"transcribeModel" description:"Model to use for transcription (separate from chat model)"`
+	SplitMediaFile                  bool                 `long:"split-media-file" yaml:"splitMediaFile" description:"Split audio/video files larger than 25MB using ffmpeg"`
 	Voice                           string               `long:"voice" yaml:"voice" description:"TTS voice name for supported models (e.g., Kore, Charon, Puck)" default:"Kore"`
 	ListGeminiVoices                bool                 `long:"list-gemini-voices" description:"List all available Gemini TTS voices"`
+	ListTranscriptionModels         bool                 `long:"list-transcription-models" description:"List all available transcription models"`
 	Notification                    bool                 `long:"notification" yaml:"notification" description:"Send desktop notification when command completes"`
 	NotificationCommand             string               `long:"notification-command" yaml:"notificationCommand" description:"Custom command to run for notifications (overrides built-in notifications)"`
 	Thinking                        domain.ThinkingLevel `long:"thinking" yaml:"thinking" description:"Set reasoning/thinking level (e.g., off, low, medium, high, or numeric tokens for Anthropic or Google Gemini)"`
--- a/internal/cli/listing.go
+++ b/internal/cli/listing.go
@@ -5,6 +5,8 @@ import (
 	"os"
 	"strconv"

+	openai "github.com/openai/openai-go"
+
 	"github.com/danielmiessler/fabric/internal/core"
 	"github.com/danielmiessler/fabric/internal/plugins/ai"
 	"github.com/danielmiessler/fabric/internal/plugins/ai/gemini"
@@ -70,5 +72,30 @@ func handleListingCommands(currentFlags *Flags, fabricDb *fsdb.Db, registry *cor
 		return true, nil
 	}

+	if currentFlags.ListTranscriptionModels {
+		listTranscriptionModels(currentFlags.ShellCompleteOutput)
+		return true, nil
+	}
+
 	return false, nil
 }
+
+// listTranscriptionModels writes the supported transcription model names to
+// stdout, one per line. With shellComplete set, only the bare names are
+// printed; otherwise a heading is printed first and each name is indented by
+// two spaces.
+func listTranscriptionModels(shellComplete bool) {
+	indent := "  "
+	if shellComplete {
+		// Raw output for shell completion scripts: no heading, no indentation.
+		indent = ""
+	} else {
+		fmt.Println("Available transcription models:")
+	}
+
+	for _, name := range []string{
+		string(openai.AudioModelWhisper1),
+		string(openai.AudioModelGPT4oMiniTranscribe),
+		string(openai.AudioModelGPT4oTranscribe),
+	} {
+		fmt.Println(indent + name)
+	}
+}
--- a/internal/cli/transcribe.go
+++ b/internal/cli/transcribe.go
@@ -0,0 +1,35 @@
+package cli
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/danielmiessler/fabric/internal/core"
+)
+
+// transcriber is the capability a vendor must provide to serve --transcribe-file;
+// handleTranscription type-asserts the selected vendor against it.
+type transcriber interface {
+	// TranscribeFile returns the transcript of the media file at filePath produced
+	// with model. split is passed through from the --split-media-file flag.
+	TranscribeFile(ctx context.Context, filePath, model string, split bool) (string, error)
+}
+
+// handleTranscription transcribes flags.TranscribeFile and returns the transcript.
+//
+// The vendor named by flags.Vendor is used, falling back to "OpenAI" when the flag
+// is empty. An error is returned, in this order, when the vendor is not present in
+// the registry, when it does not implement transcriber, or when
+// flags.TranscribeModel is empty. Otherwise the vendor's TranscribeFile result is
+// returned unchanged.
+func handleTranscription(flags *Flags, registry *core.PluginRegistry) (message string, err error) {
+	name := flags.Vendor
+	if name == "" {
+		name = "OpenAI"
+	}
+
+	plugin, configured := registry.VendorManager.VendorsByName[name]
+	if !configured {
+		err = fmt.Errorf("vendor %s not configured", name)
+		return
+	}
+
+	engine, supported := plugin.(transcriber)
+	if !supported {
+		err = fmt.Errorf("vendor %s does not support audio transcription", name)
+		return
+	}
+
+	if flags.TranscribeModel == "" {
+		err = fmt.Errorf("transcription model is required (use --transcribe-model)")
+		return
+	}
+
+	return engine.TranscribeFile(context.Background(), flags.TranscribeFile, flags.TranscribeModel, flags.SplitMediaFile)
+}
--- a/internal/core/plugin_registry_test.go
+++ b/internal/core/plugin_registry_test.go
@@ -81,8 +81,10 @@ func TestGetChatter_WarnsOnAmbiguousModel(t *testing.T) {
 	if err != nil {
 		t.Fatalf("GetChatter() error = %v", err)
 	}
-	if chatter.vendor.GetName() != "VendorA" {
-		t.Fatalf("expected vendor VendorA, got %s", chatter.vendor.GetName())
+	// Verify that one of the valid vendors was selected (don't care which one due to map iteration randomness)
+	vendorName := chatter.vendor.GetName()
+	if vendorName != "VendorA" && vendorName != "VendorB" {
+		t.Fatalf("expected vendor VendorA or VendorB, got %s", vendorName)
 	}
 	if !strings.Contains(string(warning), "multiple vendors provide model shared-model") {
 		t.Fatalf("expected warning about multiple vendors, got %q", string(warning))
--- a/internal/plugins/ai/openai/openai_audio.go
+++ b/internal/plugins/ai/openai/openai_audio.go
@@ -0,0 +1,159 @@
+package openai
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"slices"
+	"sort"
+	"strings"
+
+	openai "github.com/openai/openai-go"
+)
+
+// MaxAudioFileSize is the largest file size, in bytes (25 * 1024 * 1024), that
+// TranscribeFile accepts without splitting the input into chunks.
+const MaxAudioFileSize int64 = 25 * 1024 * 1024
+
+// AllowedTranscriptionModels lists the model names TranscribeFile accepts; any
+// other model name is rejected with an error.
+var AllowedTranscriptionModels = []string{
+	string(openai.AudioModelWhisper1),
+	string(openai.AudioModelGPT4oMiniTranscribe),
+	string(openai.AudioModelGPT4oTranscribe),
+}
+
+// allowedAudioExtensions is the set of input file extensions TranscribeFile
+// accepts. Keys are lower-case and include the leading dot, matching the
+// lower-cased result of filepath.Ext.
+var allowedAudioExtensions = map[string]struct{}{
+	".mp3":  {},
+	".mp4":  {},
+	".mpeg": {},
+	".mpga": {},
+	".m4a":  {},
+	".wav":  {},
+	".webm": {},
+}
+
+// TranscribeFile transcribes the audio or video file at filePath with the given
+// model and returns the transcript text.
+//
+// model must be one of AllowedTranscriptionModels and the file extension (compared
+// case-insensitively) must be in allowedAudioExtensions; otherwise an error is
+// returned before the file is touched. A nil ctx is replaced with
+// context.Background().
+//
+// A file larger than MaxAudioFileSize is rejected unless split is true. With split
+// set, the file is cut into chunks by splitAudioFile, the chunks are transcribed in
+// order, and the per-chunk transcripts are joined with a single space. The first
+// failing step aborts the whole transcription and its error is returned.
+//
+// Setting the FABRIC_STT_DEBUG environment variable to any non-empty value prints
+// progress messages to stderr.
+func (o *Client) TranscribeFile(ctx context.Context, filePath, model string, split bool) (string, error) {
+	if ctx == nil {
+		ctx = context.Background()
+	}
+
+	if !slices.Contains(AllowedTranscriptionModels, model) {
+		return "", fmt.Errorf("model '%s' is not supported for transcription", model)
+	}
+
+	ext := strings.ToLower(filepath.Ext(filePath))
+	if _, ok := allowedAudioExtensions[ext]; !ok {
+		return "", fmt.Errorf("unsupported audio format '%s'", ext)
+	}
+
+	info, err := os.Stat(filePath)
+	if err != nil {
+		return "", err
+	}
+
+	debug := os.Getenv("FABRIC_STT_DEBUG") != ""
+
+	files := []string{filePath}
+	if info.Size() > MaxAudioFileSize {
+		if !split {
+			return "", fmt.Errorf("file %s exceeds 25MB limit; use --split-media-file to enable automatic splitting", filePath)
+		}
+		if debug {
+			fmt.Fprintf(os.Stderr, "File %s is larger than the size limit... breaking it up into chunks...\n", filePath)
+		}
+		var cleanup func()
+		files, cleanup, err = splitAudioFile(filePath, ext, MaxAudioFileSize, debug)
+		// splitAudioFile returns a cleanup function even on failure once its
+		// temporary directory exists, so register it before looking at err;
+		// otherwise the directory and any partial chunks are left behind.
+		if cleanup != nil {
+			defer cleanup()
+		}
+		if err != nil {
+			return "", err
+		}
+	}
+
+	var builder strings.Builder
+	for i, f := range files {
+		if debug {
+			fmt.Fprintf(os.Stderr, "Using model %s to transcribe part %d (file name: %s)...\n", model, i+1, f)
+		}
+		var chunk *os.File
+		if chunk, err = os.Open(f); err != nil {
+			return "", err
+		}
+		params := openai.AudioTranscriptionNewParams{
+			File:  chunk,
+			Model: openai.AudioModel(model),
+		}
+		var resp *openai.Transcription
+		resp, err = o.ApiClient.Audio.Transcriptions.New(ctx, params)
+		// Close right away rather than deferring: a defer inside the loop would
+		// keep every chunk open until the function returns.
+		chunk.Close()
+		if err != nil {
+			return "", err
+		}
+		if i > 0 {
+			builder.WriteString(" ")
+		}
+		builder.WriteString(resp.Text)
+	}
+
+	return builder.String(), nil
+}
+
+// splitAudioFile splits src into chunks no larger than maxSize bytes using ffmpeg.
+//
+// Chunks are written to a fresh temporary directory as chunk-NNN<ext> using
+// ffmpeg's segment muxer with stream copy. Splitting starts with 600-second
+// segments; whenever any resulting chunk is still larger than maxSize, the chunks
+// are deleted and the segment length is halved, down to one second.
+//
+// On success it returns the chunk paths in lexical order and a cleanup function
+// that removes the temporary directory; the caller must call cleanup when done.
+// On failure the temporary directory is removed before returning. cleanup is nil
+// only when failure happens before the directory is created; otherwise it is
+// non-nil and safe to call again.
+//
+// Errors are returned when ffmpeg is not on PATH, the temporary directory cannot
+// be created, ffmpeg exits with an error (its stderr is included in the message),
+// ffmpeg produces no chunks, or one-second segments are still too large. When
+// debug is true, progress messages are printed to stderr.
+func splitAudioFile(src, ext string, maxSize int64, debug bool) (files []string, cleanup func(), err error) {
+	if _, err = exec.LookPath("ffmpeg"); err != nil {
+		return nil, nil, fmt.Errorf("ffmpeg not found: please install it")
+	}
+
+	var dir string
+	if dir, err = os.MkdirTemp("", "fabric-audio-*"); err != nil {
+		return nil, nil, err
+	}
+	// Best-effort removal: there is nothing useful to do if it fails.
+	cleanup = func() { _ = os.RemoveAll(dir) }
+	// Do not rely on the caller to clean up after a failed split.
+	defer func() {
+		if err != nil {
+			cleanup()
+		}
+	}()
+
+	segmentTime := 600 // start with 10 minutes
+	for {
+		pattern := filepath.Join(dir, "chunk-%03d"+ext)
+		if debug {
+			fmt.Fprintf(os.Stderr, "Running ffmpeg to split audio into %d-second chunks...\n", segmentTime)
+		}
+		cmd := exec.Command("ffmpeg", "-y", "-i", src, "-f", "segment", "-segment_time", fmt.Sprintf("%d", segmentTime), "-c", "copy", pattern)
+		var stderr bytes.Buffer
+		cmd.Stderr = &stderr
+		if err = cmd.Run(); err != nil {
+			return nil, cleanup, fmt.Errorf("ffmpeg failed: %w: %s", err, stderr.String())
+		}
+
+		if files, err = filepath.Glob(filepath.Join(dir, "chunk-*"+ext)); err != nil {
+			return nil, cleanup, err
+		}
+		if len(files) == 0 {
+			// Without this the caller would transcribe nothing and report an
+			// empty transcript as success.
+			return nil, cleanup, fmt.Errorf("ffmpeg produced no chunks for %s", src)
+		}
+		// Zero-padded chunk numbers make lexical order equal to playback order.
+		sort.Strings(files)
+
+		tooBig := false
+		for _, f := range files {
+			var info os.FileInfo
+			if info, err = os.Stat(f); err != nil {
+				return nil, cleanup, err
+			}
+			if info.Size() > maxSize {
+				tooBig = true
+				break
+			}
+		}
+		if !tooBig {
+			return files, cleanup, nil
+		}
+		// Remove this attempt's chunks so a shorter-segment run cannot leave stale
+		// higher-numbered files behind for the next glob to pick up.
+		for _, f := range files {
+			_ = os.Remove(f)
+		}
+		if segmentTime <= 1 {
+			return nil, cleanup, fmt.Errorf("unable to split file into acceptable size chunks")
+		}
+		segmentTime /= 2
+	}
+}
--- a/nix/pkgs/fabric/version.nix
+++ b/nix/pkgs/fabric/version.nix
@@ -1 +1 @@
-"1.4.290"
+"1.4.291"