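/**
 * STT proxy route. Accepts an audio source (uploaded file, internal file
 * reference, or remote URL), extracts audio from video when needed, and
 * forwards the request to the selected speech-to-text provider (Whisper,
 * Deepgram, ElevenLabs, AssemblyAI, or Gemini), normalizing the result.
 */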
import { createLogger } from '@sim/logger'
import { type NextRequest, NextResponse } from 'next/server'
import { extractAudioFromVideo, isVideoFile } from '@/lib/audio/extractor'
import { checkInternalAuth } from '@/lib/auth/hybrid'
import { DEFAULT_EXECUTION_TIMEOUT_MS } from '@/lib/core/execution-limits'
import {
  secureFetchWithPinnedIP,
  validateUrlWithDNS,
} from '@/lib/core/security/input-validation.server'
import { getMimeTypeFromExtension, isInternalFileUrl } from '@/lib/uploads/utils/file-utils'
import {
  downloadFileFromStorage,
  resolveInternalFileUrl,
} from '@/lib/uploads/utils/file-utils.server'
import type { UserFile } from '@/executor/types'
import type { TranscriptSegment } from '@/tools/stt/types'

const logger = createLogger('SttProxyAPI')

export const dynamic = 'force-dynamic'
export const maxDuration = 300 // 5 minutes for large files
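/**
 * Request payload accepted by the proxy. Provider-specific options are
 * ignored by providers that don't support them.
 */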
interface SttRequestBody {
  provider: 'whisper' | 'deepgram' | 'elevenlabs' | 'assemblyai' | 'gemini'
  apiKey: string
  model?: string
  audioFile?: UserFile | UserFile[]
  audioFileReference?: UserFile | UserFile[]
  audioUrl?: string
  language?: string
  timestamps?: 'none' | 'sentence' | 'word'
  diarization?: boolean
  translateToEnglish?: boolean
  // Whisper-specific options
  prompt?: string
  temperature?: number
  // AssemblyAI-specific options
  sentiment?: boolean
  entityDetection?: boolean
  piiRedaction?: boolean
  summarization?: boolean
  workspaceId?: string
  workflowId?: string
  executionId?: string
}
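/**
 * Handles a single transcription request: authenticates the caller, resolves
 * the audio source to a Buffer, and dispatches to the requested provider.
 * Provider and validation failures are returned as JSON error responses.
 */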
export async function POST(request: NextRequest) {
  const requestId = crypto.randomUUID()
  logger.info(`[${requestId}] STT transcription request started`)

  try {
    const authResult = await checkInternalAuth(request, { requireWorkflowId: false })
    if (!authResult.success) {
      return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
    }

    const userId = authResult.userId
    const body: SttRequestBody = await request.json()
    const {
      provider,
      apiKey,
      model,
      language,
      timestamps,
      diarization,
      translateToEnglish,
      sentiment,
      entityDetection,
      piiRedaction,
      summarization,
    } = body

    if (!provider || !apiKey) {
      return NextResponse.json(
        { error: 'Missing required fields: provider and apiKey' },
        { status: 400 }
      )
    }

    let audioBuffer: Buffer
    let audioFileName: string
    let audioMimeType: string
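    // Resolve the audio source: a direct upload, a workflow file reference,
    // or a URL. Exactly one source is expected.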
    if (body.audioFile) {
      if (Array.isArray(body.audioFile) && body.audioFile.length !== 1) {
        return NextResponse.json({ error: 'audioFile must be a single file' }, { status: 400 })
      }
      const file = Array.isArray(body.audioFile) ? body.audioFile[0] : body.audioFile
      logger.info(`[${requestId}] Processing uploaded file: ${file.name}`)

      audioBuffer = await downloadFileFromStorage(file, requestId, logger)
      audioFileName = file.name
      // file.type may be missing if the file came from a block that doesn't preserve it
      // Infer from filename extension as fallback
      const ext = file.name.split('.').pop()?.toLowerCase() || ''
      audioMimeType = file.type || getMimeTypeFromExtension(ext)
    } else if (body.audioFileReference) {
      if (Array.isArray(body.audioFileReference) && body.audioFileReference.length !== 1) {
        return NextResponse.json(
          { error: 'audioFileReference must be a single file' },
          { status: 400 }
        )
      }
      const file = Array.isArray(body.audioFileReference)
        ? body.audioFileReference[0]
        : body.audioFileReference
      logger.info(`[${requestId}] Processing referenced file: ${file.name}`)

      audioBuffer = await downloadFileFromStorage(file, requestId, logger)
      audioFileName = file.name

      const ext = file.name.split('.').pop()?.toLowerCase() || ''
      audioMimeType = file.type || getMimeTypeFromExtension(ext)
    } else if (body.audioUrl) {
      logger.info(`[${requestId}] Downloading from URL: ${body.audioUrl}`)

      let audioUrl = body.audioUrl.trim()
      if (audioUrl.startsWith('/') && !isInternalFileUrl(audioUrl)) {
        return NextResponse.json(
          {
            error: 'Invalid file path. Only uploaded files are supported for internal paths.',
          },
          { status: 400 }
        )
      }

      if (isInternalFileUrl(audioUrl)) {
        if (!userId) {
          return NextResponse.json(
            { error: 'Authentication required for internal file access' },
            { status: 401 }
          )
        }
        const resolution = await resolveInternalFileUrl(audioUrl, userId, requestId, logger)
        if (resolution.error) {
          return NextResponse.json(
            { error: resolution.error.message },
            { status: resolution.error.status }
          )
        }
        audioUrl = resolution.fileUrl || audioUrl
      }
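      // Validate and resolve the URL via DNS, then fetch against the pinned IP
      // so redirects or rebinding cannot route the request to a private host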
      const urlValidation = await validateUrlWithDNS(audioUrl, 'audioUrl')
      if (!urlValidation.isValid) {
        return NextResponse.json({ error: urlValidation.error }, { status: 400 })
      }

      const response = await secureFetchWithPinnedIP(audioUrl, urlValidation.resolvedIP!, {
        method: 'GET',
      })
      if (!response.ok) {
        await response.text().catch(() => {})
        throw new Error(`Failed to download audio from URL: ${response.statusText}`)
      }

      const arrayBuffer = await response.arrayBuffer()
      audioBuffer = Buffer.from(arrayBuffer)
      audioFileName = audioUrl.split('/').pop() || 'audio_file'
      audioMimeType = response.headers.get('content-type') || 'audio/mpeg'
    } else {
      return NextResponse.json(
        { error: 'No audio source provided. Provide audioFile, audioFileReference, or audioUrl' },
        { status: 400 }
      )
    }
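    // Video containers are accepted too: extract a mono 16kHz MP3 track
    // before handing the audio to the provider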
    if (isVideoFile(audioMimeType)) {
      logger.info(`[${requestId}] Extracting audio from video file`)
      try {
        const extracted = await extractAudioFromVideo(audioBuffer, audioMimeType, {
          outputFormat: 'mp3',
          sampleRate: 16000,
          channels: 1,
        })
        audioBuffer = extracted.buffer
        audioMimeType = 'audio/mpeg'
        audioFileName = audioFileName.replace(/\.[^.]+$/, '.mp3')
      } catch (error) {
        logger.error(`[${requestId}] Video extraction failed:`, error)
        return NextResponse.json(
          {
            error: `Failed to extract audio from video: ${error instanceof Error ? error.message : 'Unknown error'}`,
          },
          { status: 500 }
        )
      }
    }

    logger.info(`[${requestId}] Transcribing with ${provider}, file: ${audioFileName}`)

    let transcript: string
    let segments: TranscriptSegment[] | undefined
    let detectedLanguage: string | undefined
    let duration: number | undefined
    let confidence: number | undefined
    let sentimentResults: any[] | undefined
    let entities: any[] | undefined
    let summary: string | undefined
    try {
      if (provider === 'whisper') {
        const result = await transcribeWithWhisper(
          audioBuffer,
          apiKey,
          language,
          timestamps,
          translateToEnglish,
          model,
          body.prompt,
          body.temperature,
          audioMimeType,
          audioFileName
        )
        transcript = result.transcript
        segments = result.segments
        detectedLanguage = result.language
        duration = result.duration
      } else if (provider === 'deepgram') {
        const result = await transcribeWithDeepgram(
          audioBuffer,
          apiKey,
          language,
          timestamps,
          diarization,
          model,
          audioMimeType
        )
        transcript = result.transcript
        segments = result.segments
        detectedLanguage = result.language
        duration = result.duration
        confidence = result.confidence
      } else if (provider === 'elevenlabs') {
        const result = await transcribeWithElevenLabs(
          audioBuffer,
          apiKey,
          language,
          timestamps,
          model
        )
        transcript = result.transcript
        segments = result.segments
        detectedLanguage = result.language
        duration = result.duration
      } else if (provider === 'assemblyai') {
        const result = await transcribeWithAssemblyAI(
          audioBuffer,
          apiKey,
          language,
          timestamps,
          diarization,
          sentiment,
          entityDetection,
          piiRedaction,
          summarization,
          model
        )
        transcript = result.transcript
        segments = result.segments
        detectedLanguage = result.language
        duration = result.duration
        confidence = result.confidence
        sentimentResults = result.sentiment
        entities = result.entities
        summary = result.summary
      } else if (provider === 'gemini') {
        const result = await transcribeWithGemini(
          audioBuffer,
          apiKey,
          audioMimeType,
          language,
          timestamps,
          model
        )
        transcript = result.transcript
        segments = result.segments
        detectedLanguage = result.language
        duration = result.duration
        confidence = result.confidence
      } else {
        return NextResponse.json({ error: `Unknown provider: ${provider}` }, { status: 400 })
      }
    } catch (error) {
      logger.error(`[${requestId}] Transcription failed:`, error)
      const errorMessage = error instanceof Error ? error.message : 'Transcription failed'
      return NextResponse.json({ error: errorMessage }, { status: 500 })
    }

    logger.info(`[${requestId}] Transcription completed successfully`)
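    // Only include optional fields the provider actually returned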
    const response: Record<string, any> = { transcript }
    if (segments !== undefined) response.segments = segments
    if (detectedLanguage !== undefined) response.language = detectedLanguage
    if (duration !== undefined) response.duration = duration
    if (confidence !== undefined) response.confidence = confidence
    if (sentimentResults !== undefined) response.sentiment = sentimentResults
    if (entities !== undefined) response.entities = entities
    if (summary !== undefined) response.summary = summary

    return NextResponse.json(response)
  } catch (error) {
    logger.error(`[${requestId}] STT proxy error:`, error)
    const errorMessage = error instanceof Error ? error.message : 'Unknown error'
    return NextResponse.json({ error: errorMessage }, { status: 500 })
  }
}
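/**
 * Transcribes with OpenAI Whisper. Sends multipart form data to
 * /v1/audio/transcriptions (or /v1/audio/translations when translating to
 * English) and requests verbose_json so timings and language are returned.
 */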
async function transcribeWithWhisper(
  audioBuffer: Buffer,
  apiKey: string,
  language?: string,
  timestamps?: 'none' | 'sentence' | 'word',
  translate?: boolean,
  model?: string,
  prompt?: string,
  temperature?: number,
  mimeType?: string,
  fileName?: string
): Promise<{
  transcript: string
  segments?: TranscriptSegment[]
  language?: string
  duration?: number
}> {
  const formData = new FormData()

  // Use actual MIME type and filename if provided
  const actualMimeType = mimeType || 'audio/mpeg'
  const actualFileName = fileName || 'audio.mp3'
  const blob = new Blob([new Uint8Array(audioBuffer)], { type: actualMimeType })
  formData.append('file', blob, actualFileName)
  formData.append('model', model || 'whisper-1')

  if (language && language !== 'auto') {
    formData.append('language', language)
  }

  if (prompt) {
    formData.append('prompt', prompt)
  }

  if (temperature !== undefined) {
    formData.append('temperature', temperature.toString())
  }

  formData.append('response_format', 'verbose_json')

  // OpenAI API uses array notation for timestamp_granularities
  if (timestamps === 'word') {
    formData.append('timestamp_granularities[]', 'word')
  } else if (timestamps === 'sentence') {
    formData.append('timestamp_granularities[]', 'segment')
  }

  const endpoint = translate ? 'translations' : 'transcriptions'
  const response = await fetch(`https://api.openai.com/v1/audio/${endpoint}`, {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${apiKey}`,
    },
    body: formData,
  })

  if (!response.ok) {
    const error = await response.json()
    const errorMessage = error.error?.message || error.message || JSON.stringify(error)
    throw new Error(`Whisper API error: ${errorMessage}`)
  }

  const data = await response.json()

  let segments: TranscriptSegment[] | undefined
  if (timestamps !== 'none') {
    segments = (data.segments || data.words || []).map((seg: any) => ({
      text: seg.text,
      start: seg.start,
      end: seg.end,
    }))
  }

  return {
    transcript: data.text,
    segments,
    language: data.language,
    duration: data.duration,
  }
}
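/**
 * Transcribes with Deepgram's pre-recorded API. Raw audio bytes are posted to
 * /v1/listen with query-string options; word timings come from `words` and
 * sentence-level segments from `utterances` when requested.
 */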
async function transcribeWithDeepgram(
  audioBuffer: Buffer,
  apiKey: string,
  language?: string,
  timestamps?: 'none' | 'sentence' | 'word',
  diarization?: boolean,
  model?: string,
  mimeType?: string
): Promise<{
  transcript: string
  segments?: TranscriptSegment[]
  language?: string
  duration?: number
  confidence?: number
}> {
  const params = new URLSearchParams({
    model: model || 'nova-3',
    smart_format: 'true',
    punctuate: 'true',
  })

  if (language && language !== 'auto') {
    params.append('language', language)
  } else if (language === 'auto') {
    params.append('detect_language', 'true')
  }

  if (timestamps === 'sentence') {
    params.append('utterances', 'true')
  }

  if (diarization) {
    params.append('diarize', 'true')
  }

  const response = await fetch(`https://api.deepgram.com/v1/listen?${params.toString()}`, {
    method: 'POST',
    headers: {
      Authorization: `Token ${apiKey}`,
      'Content-Type': mimeType || 'audio/mpeg',
    },
    body: new Uint8Array(audioBuffer),
  })

  if (!response.ok) {
    const error = await response.json()
    const errorMessage = error.err_msg || error.message || JSON.stringify(error)
    throw new Error(`Deepgram API error: ${errorMessage}`)
  }

  const data = await response.json()
  const result = data.results?.channels?.[0]?.alternatives?.[0]

  if (!result) {
    throw new Error('No transcription result from Deepgram')
  }

  const transcript = result.transcript
  const detectedLanguage = data.results?.channels?.[0]?.detected_language
  const confidence = result.confidence

  let segments: TranscriptSegment[] | undefined
  if (result.words && timestamps === 'word') {
    segments = result.words.map((word: any) => ({
      text: word.word,
      start: word.start,
      end: word.end,
      speaker: word.speaker !== undefined ? `Speaker ${word.speaker}` : undefined,
      confidence: word.confidence,
    }))
  } else if (data.results?.utterances && timestamps === 'sentence') {
    segments = data.results.utterances.map((utterance: any) => ({
      text: utterance.transcript,
      start: utterance.start,
      end: utterance.end,
      speaker: utterance.speaker !== undefined ? `Speaker ${utterance.speaker}` : undefined,
      confidence: utterance.confidence,
    }))
  }

  return {
    transcript,
    segments,
    language: detectedLanguage,
    duration: data.metadata?.duration,
    confidence,
  }
}
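/**
 * Transcribes with ElevenLabs Scribe. Word-level granularity is requested for
 * every timestamps setting, so sentence-level requests also get word segments.
 */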
async function transcribeWithElevenLabs(
  audioBuffer: Buffer,
  apiKey: string,
  language?: string,
  timestamps?: 'none' | 'sentence' | 'word',
  model?: string
): Promise<{
  transcript: string
  segments?: TranscriptSegment[]
  language?: string
  duration?: number
}> {
  const formData = new FormData()
  const blob = new Blob([new Uint8Array(audioBuffer)], { type: 'audio/mpeg' })
  formData.append('file', blob, 'audio.mp3')
  formData.append('model_id', model || 'scribe_v1')

  if (language && language !== 'auto') {
    formData.append('language_code', language)
  }

  // Every timestamps setting maps to word-level granularity here
  formData.append('timestamps_granularity', 'word')

  const response = await fetch('https://api.elevenlabs.io/v1/speech-to-text', {
    method: 'POST',
    headers: {
      'xi-api-key': apiKey,
    },
    body: formData,
  })

  if (!response.ok) {
    const error = await response.json()
    const errorMessage =
      typeof error.detail === 'string'
        ? error.detail
        : error.detail?.message || error.message || JSON.stringify(error)
    throw new Error(`ElevenLabs API error: ${errorMessage}`)
  }

  const data = await response.json()

  const words = data.words || []
  const segments: TranscriptSegment[] = words
    .filter((w: any) => w.type === 'word')
    .map((w: any) => ({
      text: w.text,
      start: w.start,
      end: w.end,
      speaker: w.speaker_id,
    }))

  return {
    transcript: data.text || '',
    segments: segments.length > 0 ? segments : undefined,
    language: data.language_code,
    duration: undefined, // ElevenLabs doesn't return duration in response
  }
}
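/**
 * Transcribes with AssemblyAI: upload the audio, create a transcript job,
 * then poll the status endpoint until it completes, errors, or the execution
 * timeout budget is exhausted.
 */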
async function transcribeWithAssemblyAI(
  audioBuffer: Buffer,
  apiKey: string,
  language?: string,
  timestamps?: 'none' | 'sentence' | 'word',
  diarization?: boolean,
  sentiment?: boolean,
  entityDetection?: boolean,
  piiRedaction?: boolean,
  summarization?: boolean,
  model?: string
): Promise<{
  transcript: string
  segments?: TranscriptSegment[]
  language?: string
  duration?: number
  confidence?: number
  sentiment?: any[]
  entities?: any[]
  summary?: string
}> {
  const uploadResponse = await fetch('https://api.assemblyai.com/v2/upload', {
    method: 'POST',
    headers: {
      authorization: apiKey,
      'content-type': 'application/octet-stream',
    },
    body: new Uint8Array(audioBuffer),
  })

  if (!uploadResponse.ok) {
    const error = await uploadResponse.json()
    throw new Error(`AssemblyAI upload error: ${error.error || JSON.stringify(error)}`)
  }

  const { upload_url } = await uploadResponse.json()

  const transcriptRequest: any = {
    audio_url: upload_url,
  }

  // AssemblyAI supports 'best', 'slam-1', or 'universal' for speech_model
  if (model === 'best' || model === 'slam-1' || model === 'universal') {
    transcriptRequest.speech_model = model
  }

  if (language && language !== 'auto') {
    transcriptRequest.language_code = language
  } else if (language === 'auto') {
    transcriptRequest.language_detection = true
  }

  if (diarization) {
    transcriptRequest.speaker_labels = true
  }

  if (sentiment) {
    transcriptRequest.sentiment_analysis = true
  }

  if (entityDetection) {
    transcriptRequest.entity_detection = true
  }

  if (piiRedaction) {
    transcriptRequest.redact_pii = true
    transcriptRequest.redact_pii_policies = [
      'us_social_security_number',
      'email_address',
      'phone_number',
    ]
  }

  if (summarization) {
    transcriptRequest.summarization = true
    transcriptRequest.summary_model = 'informative'
    transcriptRequest.summary_type = 'bullets'
  }

  const transcriptResponse = await fetch('https://api.assemblyai.com/v2/transcript', {
    method: 'POST',
    headers: {
      authorization: apiKey,
      'content-type': 'application/json',
    },
    body: JSON.stringify(transcriptRequest),
  })

  if (!transcriptResponse.ok) {
    const error = await transcriptResponse.json()
    throw new Error(`AssemblyAI transcript error: ${error.error || JSON.stringify(error)}`)
  }

  const { id } = await transcriptResponse.json()

  let transcript: any
  let attempts = 0
  const pollIntervalMs = 5000
  const maxAttempts = Math.ceil(DEFAULT_EXECUTION_TIMEOUT_MS / pollIntervalMs)
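  // Poll for completion; total wait is capped by the execution timeout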
  while (attempts < maxAttempts) {
    const statusResponse = await fetch(`https://api.assemblyai.com/v2/transcript/${id}`, {
      headers: {
        authorization: apiKey,
      },
    })

    if (!statusResponse.ok) {
      const error = await statusResponse.json()
      throw new Error(`AssemblyAI status error: ${error.error || JSON.stringify(error)}`)
    }

    transcript = await statusResponse.json()

    if (transcript.status === 'completed') {
      break
    }
    if (transcript.status === 'error') {
      throw new Error(`AssemblyAI transcription failed: ${transcript.error}`)
    }

    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs))
    attempts++
  }

  if (transcript.status !== 'completed') {
    throw new Error('AssemblyAI transcription timed out')
  }

  let segments: TranscriptSegment[] | undefined
  if (timestamps !== 'none' && transcript.words) {
    // AssemblyAI reports word offsets in milliseconds; convert to seconds
    segments = transcript.words.map((word: any) => ({
      text: word.text,
      start: word.start / 1000,
      end: word.end / 1000,
      speaker: word.speaker ? `Speaker ${word.speaker}` : undefined,
      confidence: word.confidence,
    }))
  }

  const result: any = {
    transcript: transcript.text,
    segments,
    language: transcript.language_code,
    duration: transcript.audio_duration,
    confidence: transcript.confidence,
  }

  if (sentiment && transcript.sentiment_analysis_results) {
    result.sentiment = transcript.sentiment_analysis_results
  }

  if (entityDetection && transcript.entities) {
    result.entities = transcript.entities
  }

  if (summarization && transcript.summary) {
    result.summary = transcript.summary
  }

  return result
}
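/**
 * Transcribes with Google Gemini. Audio is sent inline as base64 (hence the
 * 20MB request cap) with transcription instructions in the prompt. Gemini
 * returns free-form text, so no timed segments are produced.
 */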
async function transcribeWithGemini(
  audioBuffer: Buffer,
  apiKey: string,
  mimeType: string,
  language?: string,
  timestamps?: 'none' | 'sentence' | 'word',
  model?: string
): Promise<{
  transcript: string
  segments?: TranscriptSegment[]
  language?: string
  duration?: number
  confidence?: number
}> {
  const modelName = model || 'gemini-2.5-flash'

  // Base64 encoding inflates the payload by roughly 4/3, so estimate the
  // encoded size before hitting the 20MB inline-data limit
  const estimatedSize = audioBuffer.length * 1.34
  if (estimatedSize > 20 * 1024 * 1024) {
    throw new Error('Audio file exceeds 20MB limit for inline data')
  }

  const base64Audio = audioBuffer.toString('base64')

  const languagePrompt = language && language !== 'auto' ? ` The audio is in ${language}.` : ''

  const timestampPrompt =
    timestamps === 'sentence' || timestamps === 'word'
      ? ' Include timestamps in MM:SS format for each sentence.'
      : ''

  const requestBody = {
    contents: [
      {
        parts: [
          {
            inline_data: {
              mime_type: mimeType,
              data: base64Audio,
            },
          },
          {
            text: `Please transcribe this audio file.${languagePrompt}${timestampPrompt} Provide the full transcript.`,
          },
        ],
      },
    ],
  }

  const response = await fetch(
    `https://generativelanguage.googleapis.com/v1beta/models/${modelName}:generateContent?key=${apiKey}`,
    {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
      },
      body: JSON.stringify(requestBody),
    }
  )

  if (!response.ok) {
    const error = await response.json()
    if (response.status === 404) {
      throw new Error(
        `Model not found: ${modelName}. Use gemini-3.1-pro-preview, gemini-3-pro-preview, gemini-2.5-pro, gemini-2.5-flash, gemini-2.5-flash-lite, or gemini-2.0-flash-exp`
      )
    }
    const errorMessage = error.error?.message || JSON.stringify(error)
    throw new Error(`Gemini API error: ${errorMessage}`)
  }

  const data = await response.json()

  if (!data.candidates?.[0]?.content?.parts?.[0]?.text) {
    const candidate = data.candidates?.[0]
    if (candidate?.finishReason === 'SAFETY') {
      throw new Error('Content was blocked by safety filters')
    }
    throw new Error('Invalid response structure from Gemini API')
  }

  const transcript = data.candidates[0].content.parts[0].text

  return {
    transcript,
    language: language !== 'auto' ? language : undefined,
  }
}