mirror of
https://github.com/danielmiessler/Fabric.git
synced 2026-01-09 14:28:01 -05:00
191 lines
5.0 KiB
Go
191 lines
5.0 KiB
Go
package openai
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"slices"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
|
|
debuglog "github.com/danielmiessler/fabric/internal/log"
|
|
|
|
openai "github.com/openai/openai-go"
|
|
)
|
|
|
|
// transcriptionResult holds the result of a single chunk transcription.
|
|
type transcriptionResult struct {
|
|
index int
|
|
text string
|
|
err error
|
|
}
|
|
|
|
// MaxAudioFileSize defines the maximum allowed size for audio uploads (25MB).
|
|
const MaxAudioFileSize int64 = 25 * 1024 * 1024
|
|
|
|
// AllowedTranscriptionModels lists the models supported for transcription.
|
|
var AllowedTranscriptionModels = []string{
|
|
string(openai.AudioModelWhisper1),
|
|
string(openai.AudioModelGPT4oMiniTranscribe),
|
|
string(openai.AudioModelGPT4oTranscribe),
|
|
}
|
|
|
|
// allowedAudioExtensions defines the supported input file extensions.
|
|
var allowedAudioExtensions = map[string]struct{}{
|
|
".mp3": {},
|
|
".mp4": {},
|
|
".mpeg": {},
|
|
".mpga": {},
|
|
".m4a": {},
|
|
".wav": {},
|
|
".webm": {},
|
|
}
|
|
|
|
// TranscribeFile transcribes the given audio file using the specified model. If the file
|
|
// exceeds the size limit, it can optionally be split into chunks using ffmpeg.
|
|
func (o *Client) TranscribeFile(ctx context.Context, filePath, model string, split bool) (string, error) {
|
|
if ctx == nil {
|
|
ctx = context.Background()
|
|
}
|
|
|
|
if !slices.Contains(AllowedTranscriptionModels, model) {
|
|
return "", fmt.Errorf("model '%s' is not supported for transcription", model)
|
|
}
|
|
|
|
ext := strings.ToLower(filepath.Ext(filePath))
|
|
if _, ok := allowedAudioExtensions[ext]; !ok {
|
|
return "", fmt.Errorf("unsupported audio format '%s'", ext)
|
|
}
|
|
|
|
info, err := os.Stat(filePath)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
var files []string
|
|
var cleanup func()
|
|
if info.Size() > MaxAudioFileSize {
|
|
if !split {
|
|
return "", fmt.Errorf("file %s exceeds 25MB limit; use --split-media-file to enable automatic splitting", filePath)
|
|
}
|
|
debuglog.Log("File %s is larger than the size limit... breaking it up into chunks...\n", filePath)
|
|
if files, cleanup, err = splitAudioFile(filePath, ext, MaxAudioFileSize); err != nil {
|
|
return "", err
|
|
}
|
|
defer cleanup()
|
|
} else {
|
|
files = []string{filePath}
|
|
}
|
|
|
|
resultsChan := make(chan transcriptionResult, len(files))
|
|
var wg sync.WaitGroup
|
|
|
|
for i, f := range files {
|
|
wg.Add(1)
|
|
go func(index int, filePath string) {
|
|
defer wg.Done()
|
|
debuglog.Log("Using model %s to transcribe part %d (file name: %s)...\n", model, index+1, filePath)
|
|
|
|
chunk, openErr := os.Open(filePath)
|
|
if openErr != nil {
|
|
resultsChan <- transcriptionResult{index: index, err: openErr}
|
|
return
|
|
}
|
|
defer chunk.Close()
|
|
|
|
params := openai.AudioTranscriptionNewParams{
|
|
File: chunk,
|
|
Model: openai.AudioModel(model),
|
|
}
|
|
resp, transcribeErr := o.ApiClient.Audio.Transcriptions.New(ctx, params)
|
|
if transcribeErr != nil {
|
|
resultsChan <- transcriptionResult{index: index, err: transcribeErr}
|
|
return
|
|
}
|
|
resultsChan <- transcriptionResult{index: index, text: resp.Text}
|
|
}(i, f)
|
|
}
|
|
|
|
wg.Wait()
|
|
close(resultsChan)
|
|
|
|
results := make([]transcriptionResult, 0, len(files))
|
|
for result := range resultsChan {
|
|
if result.err != nil {
|
|
return "", result.err
|
|
}
|
|
results = append(results, result)
|
|
}
|
|
|
|
sort.Slice(results, func(i, j int) bool {
|
|
return results[i].index < results[j].index
|
|
})
|
|
|
|
var builder strings.Builder
|
|
for i, result := range results {
|
|
if i > 0 {
|
|
builder.WriteString(" ")
|
|
}
|
|
builder.WriteString(result.text)
|
|
}
|
|
|
|
return builder.String(), nil
|
|
}
|
|
|
|
// splitAudioFile splits the source file into chunks smaller than maxSize using ffmpeg.
|
|
// It returns the list of chunk file paths and a cleanup function.
|
|
func splitAudioFile(src, ext string, maxSize int64) (files []string, cleanup func(), err error) {
|
|
if _, err = exec.LookPath("ffmpeg"); err != nil {
|
|
return nil, nil, fmt.Errorf("ffmpeg not found: please install it")
|
|
}
|
|
|
|
var dir string
|
|
if dir, err = os.MkdirTemp("", "fabric-audio-*"); err != nil {
|
|
return nil, nil, err
|
|
}
|
|
cleanup = func() { os.RemoveAll(dir) }
|
|
|
|
segmentTime := 600 // start with 10 minutes
|
|
for {
|
|
pattern := filepath.Join(dir, "chunk-%03d"+ext)
|
|
debuglog.Log("Running ffmpeg to split audio into %d-second chunks...\n", segmentTime)
|
|
cmd := exec.Command("ffmpeg", "-y", "-i", src, "-f", "segment", "-segment_time", fmt.Sprintf("%d", segmentTime), "-c", "copy", pattern)
|
|
var stderr bytes.Buffer
|
|
cmd.Stderr = &stderr
|
|
if err = cmd.Run(); err != nil {
|
|
return nil, cleanup, fmt.Errorf("ffmpeg failed: %v: %s", err, stderr.String())
|
|
}
|
|
|
|
if files, err = filepath.Glob(filepath.Join(dir, "chunk-*"+ext)); err != nil {
|
|
return nil, cleanup, err
|
|
}
|
|
sort.Strings(files)
|
|
|
|
tooBig := false
|
|
for _, f := range files {
|
|
var info os.FileInfo
|
|
if info, err = os.Stat(f); err != nil {
|
|
return nil, cleanup, err
|
|
}
|
|
if info.Size() > maxSize {
|
|
tooBig = true
|
|
break
|
|
}
|
|
}
|
|
if !tooBig {
|
|
return files, cleanup, nil
|
|
}
|
|
for _, f := range files {
|
|
_ = os.Remove(f)
|
|
}
|
|
if segmentTime <= 1 {
|
|
return nil, cleanup, fmt.Errorf("unable to split file into acceptable size chunks")
|
|
}
|
|
segmentTime /= 2
|
|
}
|
|
}
|