diff --git a/cmd/generate_changelog/incoming/1915.txt b/cmd/generate_changelog/incoming/1915.txt new file mode 100644 index 00000000..564e7eb6 --- /dev/null +++ b/cmd/generate_changelog/incoming/1915.txt @@ -0,0 +1,3 @@ +### PR [#1915](https://github.com/danielmiessler/Fabric/pull/1915) by [majiayu000](https://github.com/majiayu000): feat: parallelize audio chunk transcription for improved performance + +- Parallelize audio chunk transcription using goroutines for improved performance diff --git a/internal/plugins/ai/openai/openai_audio.go b/internal/plugins/ai/openai/openai_audio.go index b2b3b924..9891f101 100644 --- a/internal/plugins/ai/openai/openai_audio.go +++ b/internal/plugins/ai/openai/openai_audio.go @@ -10,12 +10,20 @@ import ( "slices" "sort" "strings" + "sync" debuglog "github.com/danielmiessler/fabric/internal/log" openai "github.com/openai/openai-go" ) +// transcriptionResult holds the result of a single chunk transcription. +type transcriptionResult struct { + index int + text string + err error +} + // MaxAudioFileSize defines the maximum allowed size for audio uploads (25MB). const MaxAudioFileSize int64 = 25 * 1024 * 1024 @@ -73,27 +81,56 @@ func (o *Client) TranscribeFile(ctx context.Context, filePath, model string, spl files = []string{filePath} } - var builder strings.Builder + resultsChan := make(chan transcriptionResult, len(files)) + var wg sync.WaitGroup + for i, f := range files { - debuglog.Log("Using model %s to transcribe part %d (file name: %s)...\n", model, i+1, f) - var chunk *os.File - if chunk, err = os.Open(f); err != nil { - return "", err - } - params := openai.AudioTranscriptionNewParams{ - File: chunk, - Model: openai.AudioModel(model), - } - var resp *openai.Transcription - resp, err = o.ApiClient.Audio.Transcriptions.New(ctx, params) - chunk.Close() - if err != nil { - return "", err + wg.Add(1) + go func(index int, filePath string) { + defer wg.Done() + debuglog.Log("Using model %s to transcribe part %d (file name: %s)...\n", model, index+1, filePath) + + chunk, openErr := os.Open(filePath) + if openErr != nil { + resultsChan <- transcriptionResult{index: index, err: openErr} + return + } + defer chunk.Close() + + params := openai.AudioTranscriptionNewParams{ + File: chunk, + Model: openai.AudioModel(model), + } + resp, transcribeErr := o.ApiClient.Audio.Transcriptions.New(ctx, params) + if transcribeErr != nil { + resultsChan <- transcriptionResult{index: index, err: transcribeErr} + return + } + resultsChan <- transcriptionResult{index: index, text: resp.Text} + }(i, f) + } + + wg.Wait() + close(resultsChan) + + results := make([]transcriptionResult, 0, len(files)) + for result := range resultsChan { + if result.err != nil { + return "", result.err } + results = append(results, result) + } + + sort.Slice(results, func(i, j int) bool { + return results[i].index < results[j].index + }) + + var builder strings.Builder + for i, result := range results { if i > 0 { builder.WriteString(" ") } - builder.WriteString(resp.Text) + builder.WriteString(result.text) } return builder.String(), nil