feat(tools): add speech-to-text with OpenAI Whisper, ElevenLabs, and Deepgram (#2068)

* feat(tools): add speech-to-text with OpenAI Whisper, ElevenLabs, and Deepgram

* add new file icons; integrate ffmpeg-based audio extraction

* update docs

* revert environment
This commit is contained in:
Waleed
2025-11-19 21:03:54 -08:00
committed by GitHub
parent 7c5d625ca5
commit e64b1c9fcd
27 changed files with 1884 additions and 18 deletions

View File

@@ -4084,3 +4084,27 @@ export function CalendlyIcon(props: SVGProps<SVGSVGElement>) {
</svg>
)
}
export function AudioWaveformIcon(props: SVGProps<SVGSVGElement>) {
return (
<svg
{...props}
xmlns='http://www.w3.org/2000/svg'
width='24'
height='24'
viewBox='0 0 24 24'
fill='none'
stroke='currentColor'
strokeWidth='2'
strokeLinecap='round'
strokeLinejoin='round'
>
<path d='M2 10v3' />
<path d='M6 6v11' />
<path d='M10 3v18' />
<path d='M14 8v7' />
<path d='M18 5v13' />
<path d='M22 10v3' />
</svg>
)
}

View File

@@ -8,6 +8,7 @@ import {
ApolloIcon,
ArxivIcon,
AsanaIcon,
AudioWaveformIcon,
BrainIcon,
BrowserUseIcon,
CalendlyIcon,
@@ -100,6 +101,7 @@ export const blockTypeToIconMap: Record<string, IconComponent> = {
telegram: TelegramIcon,
tavily: TavilyIcon,
supabase: SupabaseIcon,
stt: AudioWaveformIcon,
stripe: StripeIcon,
stagehand_agent: StagehandIcon,
stagehand: StagehandIcon,

View File

@@ -10,6 +10,20 @@ import { BlockInfoCard } from "@/components/ui/block-info-card"
color="#FFFFFF"
/>
{/* MANUAL-CONTENT-START:intro */}
[Calendly](https://calendly.com/) is a popular scheduling automation platform that helps you book meetings, events, and appointments with ease. With Calendly, teams and individuals can streamline scheduling, reduce back-and-forth emails, and automate tasks around events.
With the Sim Calendly integration, your agents can:
- **Retrieve information about your account and scheduled events**: Use tools to fetch user info, event types, and scheduled events for analysis or automation.
- **Manage event types and scheduling**: Access and list available event types for users or organizations, retrieve details about specific event types, and monitor scheduled meetings and invitee data.
- **Automate follow-ups and workflows**: When users schedule, reschedule, or cancel meetings, Sim agents can automatically trigger corresponding workflows—such as sending reminders, updating CRMs, or notifying participants.
- **Integrate easily using webhooks**: Set up Sim workflows to respond to real-time Calendly webhook events, including when invitees schedule, cancel, or interact with routing forms.
Whether you want to automate meeting prep, manage invites, or run custom workflows in response to scheduling activity, the Calendly tools in Sim give you flexible and secure access. Unlock new automation by reacting instantly to scheduling changes—streamlining your team's operations and communications.
{/* MANUAL-CONTENT-END */}
## Usage Instructions
Integrate Calendly into your workflow. Manage event types, scheduled events, invitees, and webhooks. It can also trigger workflows based on Calendly webhook events (invitee scheduled, invitee canceled, routing form submitted). Requires a Personal Access Token.

View File

@@ -61,6 +61,7 @@
"stagehand",
"stagehand_agent",
"stripe",
"stt",
"supabase",
"tavily",
"telegram",

View File

@@ -0,0 +1,122 @@
---
title: Speech-to-Text
description: Convert speech to text using AI
---
import { BlockInfoCard } from "@/components/ui/block-info-card"
<BlockInfoCard
type="stt"
color="#181C1E"
/>
{/* MANUAL-CONTENT-START:intro */}
Transcribe speech to text using state-of-the-art AI models from leading providers. The Sim Speech-to-Text (STT) tools allow you to convert audio and video files into accurate transcripts, supporting multiple languages, timestamps, and optional translation.
Supported providers:
- **[OpenAI Whisper](https://platform.openai.com/docs/guides/speech-to-text/overview)**: OpenAI's open-source STT model, served here through the OpenAI API as `whisper-1`. It handles a wide variety of languages and audio formats.
- **[Deepgram](https://deepgram.com/)**: Real-time and batch STT API with deep learning models like `nova-3`, `nova-2`, and `whisper-large`. Offers features like diarization, intent recognition, and industry-specific tuning.
- **[ElevenLabs](https://elevenlabs.io/)**: Known for high-quality speech AI, ElevenLabs provides STT models focused on accuracy and natural language understanding for numerous languages and dialects.
Choose the provider and model best suited to your task—whether fast, production-grade transcription (Deepgram), highly accurate multi-language capability (Whisper), or advanced understanding and language coverage (ElevenLabs).
{/* MANUAL-CONTENT-END */}
## Usage Instructions
Transcribe audio and video files to text using leading AI providers. Supports multiple languages, timestamps, and speaker diarization.
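All three tools share the same request shape and are served by the shared `/api/proxy/stt` route added in this PR. A minimal sketch of calling that route directly (field names mirror the route's `SttRequestBody`; the API key and URL are placeholders):

```ts
// Sketch only: direct call to the shared STT proxy route.
// provider may be 'whisper' | 'deepgram' | 'elevenlabs'.
const res = await fetch('/api/proxy/stt', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    provider: 'whisper',
    apiKey: process.env.OPENAI_API_KEY, // placeholder; use the matching provider's key
    audioUrl: 'https://example.com/meeting.mp3', // placeholder URL
    language: 'auto',
    timestamps: 'sentence',
  }),
})
const { transcript, segments, language, duration } = await res.json()
```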
## Tools
### `stt_whisper`
Transcribe audio to text using OpenAI Whisper
#### Input
| Parameter | Type | Required | Description |
| --------- | ---- | -------- | ----------- |
| `provider` | string | Yes | STT provider \(whisper\) |
| `apiKey` | string | Yes | OpenAI API key |
| `model` | string | No | Whisper model to use \(default: whisper-1\) |
| `audioFile` | file | No | Audio or video file to transcribe |
| `audioFileReference` | file | No | Reference to audio/video file from previous blocks |
| `audioUrl` | string | No | URL to audio or video file |
| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection |
| `timestamps` | string | No | Timestamp granularity: none, sentence, or word |
| `translateToEnglish` | boolean | No | Translate audio to English |
#### Output
| Parameter | Type | Description |
| --------- | ---- | ----------- |
| `transcript` | string | Full transcribed text |
| `segments` | array | Timestamped segments |
| `language` | string | Detected or specified language |
| `duration` | number | Audio duration in seconds |
| `confidence` | number | Overall confidence score |
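When timestamps are requested, each entry in `segments` carries `text` plus `start` and `end` offsets in seconds. A small sketch that prints a timestamped listing, assuming the `TranscriptSegment` shape added in this PR:

```ts
import type { TranscriptSegment } from '@/tools/stt/types'

// Print one "[m:ss-m:ss] text" line per segment.
function printSegments(segments: TranscriptSegment[]): void {
  const fmt = (t: number) =>
    `${Math.floor(t / 60)}:${String(Math.floor(t % 60)).padStart(2, '0')}`
  for (const seg of segments) {
    console.log(`[${fmt(seg.start)}-${fmt(seg.end)}] ${seg.text}`)
  }
}
```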
### `stt_deepgram`
Transcribe audio to text using Deepgram
#### Input
| Parameter | Type | Required | Description |
| --------- | ---- | -------- | ----------- |
| `provider` | string | Yes | STT provider \(deepgram\) |
| `apiKey` | string | Yes | Deepgram API key |
| `model` | string | No | Deepgram model to use \(nova-3, nova-2, whisper-large, etc.\) |
| `audioFile` | file | No | Audio or video file to transcribe |
| `audioFileReference` | file | No | Reference to audio/video file from previous blocks |
| `audioUrl` | string | No | URL to audio or video file |
| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection |
| `timestamps` | string | No | Timestamp granularity: none, sentence, or word |
| `diarization` | boolean | No | Enable speaker diarization |
#### Output
| Parameter | Type | Description |
| --------- | ---- | ----------- |
| `transcript` | string | Full transcribed text |
| `segments` | array | Timestamped segments with speaker labels |
| `language` | string | Detected or specified language |
| `duration` | number | Audio duration in seconds |
| `confidence` | number | Overall confidence score |
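With `diarization` enabled, word-level segments carry `speaker` labels (e.g. `"Speaker 0"`). One way to collapse them into per-speaker turns, as a sketch rather than part of this PR:

```ts
import type { TranscriptSegment } from '@/tools/stt/types'

// Merge consecutive segments from the same speaker into one turn.
function toSpeakerTurns(segments: TranscriptSegment[]): { speaker: string; text: string }[] {
  const turns: { speaker: string; text: string }[] = []
  for (const seg of segments) {
    const speaker = seg.speaker ?? 'Unknown'
    const last = turns[turns.length - 1]
    if (last && last.speaker === speaker) {
      last.text += ` ${seg.text}`
    } else {
      turns.push({ speaker, text: seg.text })
    }
  }
  return turns
}
```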
### `stt_elevenlabs`
Transcribe audio to text using ElevenLabs
#### Input
| Parameter | Type | Required | Description |
| --------- | ---- | -------- | ----------- |
| `provider` | string | Yes | STT provider \(elevenlabs\) |
| `apiKey` | string | Yes | ElevenLabs API key |
| `model` | string | No | ElevenLabs model to use \(scribe_v1, scribe_v1_experimental\) |
| `audioFile` | file | No | Audio or video file to transcribe |
| `audioFileReference` | file | No | Reference to audio/video file from previous blocks |
| `audioUrl` | string | No | URL to audio or video file |
| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection |
| `timestamps` | string | No | Timestamp granularity: none, sentence, or word |
#### Output
| Parameter | Type | Description |
| --------- | ---- | ----------- |
| `transcript` | string | Full transcribed text |
| `segments` | array | Timestamped segments |
| `language` | string | Detected or specified language |
| `duration` | number | Audio duration in seconds |
| `confidence` | number | Overall confidence score |
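Under the hood, the proxy submits multipart form data to ElevenLabs' speech-to-text endpoint. A condensed sketch of that upstream call, mirroring the route added in this PR (the API key is a placeholder):

```ts
declare const audioBytes: Uint8Array // raw audio file contents

const form = new FormData()
form.append('file', new Blob([audioBytes], { type: 'audio/mpeg' }), 'audio.mp3')
form.append('model_id', 'scribe_v1')

const res = await fetch('https://api.elevenlabs.io/v1/speech-to-text', {
  method: 'POST',
  headers: { 'xi-api-key': process.env.ELEVENLABS_API_KEY ?? '' }, // placeholder key
  body: form,
})
const data = await res.json()
console.log(data.text)
```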
## Notes
- Category: `tools`
- Type: `stt`

View File

@@ -13,21 +13,37 @@ import {
} from '@/app/api/files/utils'
const ALLOWED_EXTENSIONS = new Set([
// Documents
'pdf',
'doc',
'docx',
'txt',
'md',
'csv',
'xlsx',
'xls',
'json',
'yaml',
'yml',
// Images
'png',
'jpg',
'jpeg',
'gif',
// Audio
'mp3',
'm4a',
'wav',
'webm',
'ogg',
'flac',
'aac',
'opus',
// Video
'mp4',
'mov',
'avi',
'mkv',
])
function validateFileExtension(filename: string): boolean {

View File

@@ -0,0 +1,375 @@
import { type NextRequest, NextResponse } from 'next/server'
import { extractAudioFromVideo, isVideoFile } from '@/lib/audio/extractor'
import { checkHybridAuth } from '@/lib/auth/hybrid'
import { createLogger } from '@/lib/logs/console/logger'
import { downloadFileFromStorage } from '@/lib/uploads/utils/file-utils.server'
import type { UserFile } from '@/executor/types'
import type { TranscriptSegment } from '@/tools/stt/types'
const logger = createLogger('SttProxyAPI')
export const dynamic = 'force-dynamic'
export const maxDuration = 300 // 5 minutes for large files
interface SttRequestBody {
provider: 'whisper' | 'deepgram' | 'elevenlabs'
apiKey: string
model?: string
audioFile?: UserFile | UserFile[]
audioFileReference?: UserFile | UserFile[]
audioUrl?: string
language?: string
timestamps?: 'none' | 'sentence' | 'word'
diarization?: boolean
translateToEnglish?: boolean
workspaceId?: string
workflowId?: string
executionId?: string
}
export async function POST(request: NextRequest) {
const requestId = crypto.randomUUID()
logger.info(`[${requestId}] STT transcription request started`)
try {
const authResult = await checkHybridAuth(request, { requireWorkflowId: false })
if (!authResult.success) {
return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
}
const body: SttRequestBody = await request.json()
const { provider, apiKey, model, language, timestamps, diarization, translateToEnglish } = body
if (!provider || !apiKey) {
return NextResponse.json(
{ error: 'Missing required fields: provider and apiKey' },
{ status: 400 }
)
}
let audioBuffer: Buffer
let audioFileName: string
let audioMimeType: string
if (body.audioFile) {
const file = Array.isArray(body.audioFile) ? body.audioFile[0] : body.audioFile
logger.info(`[${requestId}] Processing uploaded file: ${file.name}`)
audioBuffer = await downloadFileFromStorage(file, requestId, logger)
audioFileName = file.name
audioMimeType = file.type
} else if (body.audioFileReference) {
const file = Array.isArray(body.audioFileReference)
? body.audioFileReference[0]
: body.audioFileReference
logger.info(`[${requestId}] Processing referenced file: ${file.name}`)
audioBuffer = await downloadFileFromStorage(file, requestId, logger)
audioFileName = file.name
audioMimeType = file.type
} else if (body.audioUrl) {
logger.info(`[${requestId}] Downloading from URL: ${body.audioUrl}`)
const response = await fetch(body.audioUrl)
if (!response.ok) {
throw new Error(`Failed to download audio from URL: ${response.statusText}`)
}
const arrayBuffer = await response.arrayBuffer()
audioBuffer = Buffer.from(arrayBuffer)
audioFileName = body.audioUrl.split('/').pop() || 'audio_file'
audioMimeType = response.headers.get('content-type') || 'audio/mpeg'
} else {
return NextResponse.json(
{ error: 'No audio source provided. Provide audioFile, audioFileReference, or audioUrl' },
{ status: 400 }
)
}
if (isVideoFile(audioMimeType)) {
logger.info(`[${requestId}] Extracting audio from video file`)
try {
const extracted = await extractAudioFromVideo(audioBuffer, audioMimeType, {
outputFormat: 'mp3',
sampleRate: 16000,
channels: 1,
})
audioBuffer = extracted.buffer
audioMimeType = 'audio/mpeg'
audioFileName = audioFileName.replace(/\.[^.]+$/, '.mp3')
} catch (error) {
logger.error(`[${requestId}] Video extraction failed:`, error)
return NextResponse.json(
{
error: `Failed to extract audio from video: ${error instanceof Error ? error.message : 'Unknown error'}`,
},
{ status: 500 }
)
}
}
logger.info(`[${requestId}] Transcribing with ${provider}, file: ${audioFileName}`)
let transcript: string
let segments: TranscriptSegment[] | undefined
let detectedLanguage: string | undefined
let duration: number | undefined
let confidence: number | undefined
try {
if (provider === 'whisper') {
const result = await transcribeWithWhisper(
audioBuffer,
apiKey,
language,
timestamps,
translateToEnglish,
model
)
transcript = result.transcript
segments = result.segments
detectedLanguage = result.language
duration = result.duration
} else if (provider === 'deepgram') {
const result = await transcribeWithDeepgram(
audioBuffer,
apiKey,
language,
timestamps,
diarization,
model
)
transcript = result.transcript
segments = result.segments
detectedLanguage = result.language
duration = result.duration
confidence = result.confidence
} else if (provider === 'elevenlabs') {
const result = await transcribeWithElevenLabs(
audioBuffer,
apiKey,
language,
timestamps,
model
)
transcript = result.transcript
segments = result.segments
detectedLanguage = result.language
duration = result.duration
} else {
return NextResponse.json({ error: `Unknown provider: ${provider}` }, { status: 400 })
}
} catch (error) {
logger.error(`[${requestId}] Transcription failed:`, error)
const errorMessage = error instanceof Error ? error.message : 'Transcription failed'
return NextResponse.json({ error: errorMessage }, { status: 500 })
}
logger.info(`[${requestId}] Transcription completed successfully`)
return NextResponse.json({
transcript,
segments,
language: detectedLanguage,
duration,
confidence,
})
} catch (error) {
logger.error(`[${requestId}] STT proxy error:`, error)
const errorMessage = error instanceof Error ? error.message : 'Unknown error'
return NextResponse.json({ error: errorMessage }, { status: 500 })
}
}
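/**
* Call OpenAI's audio transcriptions/translations endpoint and normalize the result.
*/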
async function transcribeWithWhisper(
audioBuffer: Buffer,
apiKey: string,
language?: string,
timestamps?: 'none' | 'sentence' | 'word',
translate?: boolean,
model?: string
): Promise<{
transcript: string
segments?: TranscriptSegment[]
language?: string
duration?: number
}> {
const formData = new FormData()
const blob = new Blob([new Uint8Array(audioBuffer)], { type: 'audio/mpeg' })
formData.append('file', blob, 'audio.mp3')
formData.append('model', model || 'whisper-1')
if (language && language !== 'auto') {
formData.append('language', language)
}
if (timestamps === 'word') {
formData.append('response_format', 'verbose_json')
formData.append('timestamp_granularities[]', 'word')
} else if (timestamps === 'sentence') {
formData.append('response_format', 'verbose_json')
formData.append('timestamp_granularities[]', 'segment')
}
const endpoint = translate ? 'translations' : 'transcriptions'
const response = await fetch(`https://api.openai.com/v1/audio/${endpoint}`, {
method: 'POST',
headers: {
Authorization: `Bearer ${apiKey}`,
},
body: formData,
})
if (!response.ok) {
const error = await response.json()
const errorMessage = error.error?.message || error.message || JSON.stringify(error)
throw new Error(`Whisper API error: ${errorMessage}`)
}
const data = await response.json()
if (!timestamps || timestamps === 'none') {
return {
transcript: data.text,
language: data.language,
}
}
const segments: TranscriptSegment[] = (data.segments || data.words || []).map((seg: any) => ({
// verbose_json returns `text` on segments but `word` on word-level entries
text: seg.text ?? seg.word,
start: seg.start,
end: seg.end,
}))
return {
transcript: data.text,
segments,
language: data.language,
duration: data.duration,
}
}
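/**
* Call Deepgram's pre-recorded transcription endpoint and normalize the result.
*/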
async function transcribeWithDeepgram(
audioBuffer: Buffer,
apiKey: string,
language?: string,
timestamps?: 'none' | 'sentence' | 'word',
diarization?: boolean,
model?: string
): Promise<{
transcript: string
segments?: TranscriptSegment[]
language?: string
duration?: number
confidence?: number
}> {
const params = new URLSearchParams({
model: model || 'nova-3',
smart_format: 'true',
punctuate: 'true',
})
if (language && language !== 'auto') {
params.append('language', language)
}
if (timestamps !== 'none') {
params.append('utterances', 'true')
}
if (diarization) {
params.append('diarize', 'true')
}
const response = await fetch(`https://api.deepgram.com/v1/listen?${params.toString()}`, {
method: 'POST',
headers: {
Authorization: `Token ${apiKey}`,
'Content-Type': 'audio/mpeg',
},
body: new Uint8Array(audioBuffer),
})
if (!response.ok) {
const error = await response.json()
const errorMessage = error.err_msg || error.message || JSON.stringify(error)
throw new Error(`Deepgram API error: ${errorMessage}`)
}
const data = await response.json()
const result = data.results?.channels?.[0]?.alternatives?.[0]
if (!result) {
throw new Error('No transcription result from Deepgram')
}
const transcript = result.transcript
const detectedLanguage = data.results?.channels?.[0]?.detected_language
const confidence = result.confidence
let segments: TranscriptSegment[] | undefined
if (timestamps !== 'none' && result.words) {
segments = result.words.map((word: any) => ({
text: word.word,
start: word.start,
end: word.end,
speaker: word.speaker !== undefined ? `Speaker ${word.speaker}` : undefined,
confidence: word.confidence,
}))
}
return {
transcript,
segments,
language: detectedLanguage,
duration: data.metadata?.duration,
confidence,
}
}
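/**
* Call ElevenLabs' speech-to-text endpoint and normalize the result.
*/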
async function transcribeWithElevenLabs(
audioBuffer: Buffer,
apiKey: string,
language?: string,
timestamps?: 'none' | 'sentence' | 'word',
model?: string
): Promise<{
transcript: string
segments?: TranscriptSegment[]
language?: string
duration?: number
}> {
const formData = new FormData()
const blob = new Blob([new Uint8Array(audioBuffer)], { type: 'audio/mpeg' })
formData.append('file', blob, 'audio.mp3')
formData.append('model_id', model || 'scribe_v1')
if (language && language !== 'auto') {
formData.append('language', language)
}
const response = await fetch('https://api.elevenlabs.io/v1/speech-to-text', {
method: 'POST',
headers: {
'xi-api-key': apiKey,
},
body: formData,
})
if (!response.ok) {
const error = await response.json()
const errorMessage =
typeof error.detail === 'string'
? error.detail
: error.detail?.message || error.message || JSON.stringify(error)
throw new Error(`ElevenLabs API error: ${errorMessage}`)
}
const data = await response.json()
return {
transcript: data.text || '',
language: data.language,
duration: data.duration,
}
}

View File

@@ -144,6 +144,62 @@ export const TxtIcon: React.FC<IconProps> = ({ className = 'w-6 h-6' }) => (
</svg>
)
export const AudioIcon: React.FC<IconProps> = ({ className = 'w-6 h-6' }) => (
<svg viewBox='0 0 24 24' fill='none' xmlns='http://www.w3.org/2000/svg' className={className}>
<path
d='M14 2H6C4.9 2 4 2.9 4 4V20C4 21.1 4.9 22 6 22H18C19.1 22 20 21.1 20 20V8L14 2Z'
fill='#0288D1'
/>
<path d='M14 2V8H20' fill='#29B6F6' />
<path
d='M14 2L20 8V20C20 21.1 19.1 22 18 22H6C4.9 22 4 21.1 4 20V4C4 2.9 4.9 2 6 2H14Z'
stroke='#01579B'
strokeWidth='0.5'
strokeLinecap='round'
strokeLinejoin='round'
/>
{/* Speaker icon */}
<path d='M8.5 10.5v3c0 .28.22.5.5.5h1.5l2 2V8l-2 2H9c-.28 0-.5.22-.5.5z' fill='white' />
{/* Sound waves */}
<path
d='M14 10.5c.6.6.6 1.4 0 2M15.5 9c1.2 1.2 1.2 3.8 0 5'
stroke='white'
strokeWidth='0.8'
strokeLinecap='round'
/>
</svg>
)
export const VideoIcon: React.FC<IconProps> = ({ className = 'w-6 h-6' }) => (
<svg viewBox='0 0 24 24' fill='none' xmlns='http://www.w3.org/2000/svg' className={className}>
<path
d='M14 2H6C4.9 2 4 2.9 4 4V20C4 21.1 4.9 22 6 22H18C19.1 22 20 21.1 20 20V8L14 2Z'
fill='#D32F2F'
/>
<path d='M14 2V8H20' fill='#EF5350' />
<path
d='M14 2L20 8V20C20 21.1 19.1 22 18 22H6C4.9 22 4 21.1 4 20V4C4 2.9 4.9 2 6 2H14Z'
stroke='#B71C1C'
strokeWidth='0.5'
strokeLinecap='round'
strokeLinejoin='round'
/>
{/* Video screen */}
<rect
x='7.5'
y='9.5'
width='9'
height='6'
rx='0.5'
stroke='white'
strokeWidth='0.8'
fill='none'
/>
{/* Play button */}
<path d='M10.5 11.5l3 2-3 2v-4z' fill='white' />
</svg>
)
export const DefaultFileIcon: React.FC<IconProps> = ({ className = 'w-6 h-6' }) => (
<svg viewBox='0 0 24 24' fill='none' xmlns='http://www.w3.org/2000/svg' className={className}>
<path
@@ -164,13 +220,23 @@ export const DefaultFileIcon: React.FC<IconProps> = ({ className = 'w-6 h-6' })
</svg>
)
// Helper function to get the appropriate icon component
export function getDocumentIcon(mimeType: string, filename: string): React.FC<IconProps> {
const extension = filename.split('.').pop()?.toLowerCase()
const audioExtensions = ['mp3', 'm4a', 'wav', 'webm', 'ogg', 'flac', 'aac', 'opus']
if (mimeType.startsWith('audio/') || (extension && audioExtensions.includes(extension))) {
return AudioIcon
}
const videoExtensions = ['mp4', 'mov', 'avi', 'mkv']
if (mimeType.startsWith('video/') || (extension && videoExtensions.includes(extension))) {
return VideoIcon
}
if (mimeType === 'application/pdf' || extension === 'pdf') {
return PdfIcon
}
if (
mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
mimeType === 'application/msword' ||
@@ -179,6 +245,7 @@ export function getDocumentIcon(mimeType: string, filename: string): React.FC<Ic
) {
return DocxIcon
}
if (
mimeType === 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' ||
mimeType === 'application/vnd.ms-excel' ||
@@ -187,11 +254,14 @@ export function getDocumentIcon(mimeType: string, filename: string): React.FC<Ic
) {
return XlsxIcon
}
if (mimeType === 'text/csv' || extension === 'csv') {
return CsvIcon
}
if (mimeType === 'text/plain' || extension === 'txt') {
return TxtIcon
}
return DefaultFileIcon
}

View File

@@ -148,21 +148,29 @@ export function FileUpload({
const maxSizeInBytes = maxSize * 1024 * 1024
const validFiles: File[] = []
let totalNewSize = 0
let sizeExceededFile: string | null = null
for (let i = 0; i < files.length; i++) {
const file = files[i]
if (existingTotalSize + totalNewSize + file.size > maxSizeInBytes) {
const errorMessage = `Adding ${file.name} would exceed the maximum size limit of ${maxSize}MB`
logger.error(errorMessage, activeWorkflowId)
if (!sizeExceededFile) {
sizeExceededFile = errorMessage
}
} else {
validFiles.push(file)
totalNewSize += file.size
}
}
if (validFiles.length === 0) {
if (sizeExceededFile) {
setUploadError(sizeExceededFile)
setTimeout(() => setUploadError(null), 5000)
}
return
}
const uploading = validFiles.map((file) => ({
id: `upload-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`,

View File

@@ -32,6 +32,7 @@ const logger = createLogger('FileUploadsSettings')
const isBillingEnabled = isTruthy(getEnv('NEXT_PUBLIC_BILLING_ENABLED'))
const SUPPORTED_EXTENSIONS = [
// Documents
'pdf',
'csv',
'doc',
@@ -47,9 +48,23 @@ const SUPPORTED_EXTENSIONS = [
'json',
'yaml',
'yml',
// Audio formats
'mp3',
'm4a',
'wav',
'webm',
'ogg',
'flac',
'aac',
'opus',
// Video formats
'mp4',
'mov',
'avi',
'mkv',
] as const
const ACCEPT_ATTR =
'.pdf,.csv,.doc,.docx,.txt,.md,.xlsx,.xls,.html,.htm,.pptx,.ppt,.json,.yaml,.yml,.mp3,.m4a,.wav,.webm,.ogg,.flac,.aac,.opus,.mp4,.mov,.avi,.mkv'
export function Files() {
const params = useParams()

View File

@@ -0,0 +1,232 @@
import { AudioWaveformIcon } from '@/components/icons'
import { AuthMode, type BlockConfig } from '@/blocks/types'
import type { SttBlockResponse } from '@/tools/stt/types'
export const SttBlock: BlockConfig<SttBlockResponse> = {
type: 'stt',
name: 'Speech-to-Text',
description: 'Convert speech to text using AI',
authMode: AuthMode.ApiKey,
longDescription:
'Transcribe audio and video files to text using leading AI providers. Supports multiple languages, timestamps, and speaker diarization.',
docsLink: 'https://docs.sim.ai/tools/stt',
category: 'tools',
bgColor: '#181C1E',
icon: AudioWaveformIcon,
subBlocks: [
// Provider selection
{
id: 'provider',
title: 'Provider',
type: 'dropdown',
options: [
{ label: 'OpenAI Whisper', id: 'whisper' },
{ label: 'Deepgram', id: 'deepgram' },
{ label: 'ElevenLabs', id: 'elevenlabs' },
],
value: () => 'whisper',
required: true,
},
// OpenAI Whisper model selection
{
id: 'model',
title: 'Model',
type: 'dropdown',
condition: { field: 'provider', value: 'whisper' },
options: [{ label: 'Whisper-1', id: 'whisper-1' }],
value: () => 'whisper-1',
required: false,
},
// ElevenLabs model selection
{
id: 'model',
title: 'Model',
type: 'dropdown',
condition: { field: 'provider', value: 'elevenlabs' },
options: [
{ label: 'Scribe v1', id: 'scribe_v1' },
{ label: 'Scribe v1 Experimental', id: 'scribe_v1_experimental' },
],
value: () => 'scribe_v1',
required: false,
},
// Deepgram model selection
{
id: 'model',
title: 'Model',
type: 'dropdown',
condition: { field: 'provider', value: 'deepgram' },
options: [
{ label: 'Nova 3', id: 'nova-3' },
{ label: 'Nova 2', id: 'nova-2' },
{ label: 'Nova', id: 'nova' },
{ label: 'Whisper Large', id: 'whisper-large' },
{ label: 'Enhanced', id: 'enhanced' },
{ label: 'Base', id: 'base' },
],
value: () => 'nova-3',
required: false,
},
// Audio/Video file upload (basic mode)
{
id: 'audioFile',
title: 'Audio/Video File',
type: 'file-upload',
canonicalParamId: 'audioFile',
placeholder: 'Upload an audio or video file',
mode: 'basic',
multiple: false,
required: false,
acceptedTypes: '.mp3,.m4a,.wav,.webm,.ogg,.flac,.aac,.opus,.mp4,.mov,.avi,.mkv',
},
// Audio file reference (advanced mode)
{
id: 'audioFileReference',
title: 'Audio/Video File Reference',
type: 'short-input',
canonicalParamId: 'audioFile',
placeholder: 'Reference audio/video from previous blocks',
mode: 'advanced',
required: false,
},
// Audio/Video URL (alternative)
{
id: 'audioUrl',
title: 'Audio/Video URL (alternative)',
type: 'short-input',
placeholder: 'Or enter publicly accessible audio/video URL',
required: false,
},
// Language selection
{
id: 'language',
title: 'Language',
type: 'dropdown',
options: [
{ label: 'Auto-detect', id: 'auto' },
{ label: 'English', id: 'en' },
{ label: 'Spanish', id: 'es' },
{ label: 'French', id: 'fr' },
{ label: 'German', id: 'de' },
{ label: 'Italian', id: 'it' },
{ label: 'Portuguese', id: 'pt' },
{ label: 'Dutch', id: 'nl' },
{ label: 'Russian', id: 'ru' },
{ label: 'Chinese', id: 'zh' },
{ label: 'Japanese', id: 'ja' },
{ label: 'Korean', id: 'ko' },
{ label: 'Arabic', id: 'ar' },
{ label: 'Hindi', id: 'hi' },
{ label: 'Polish', id: 'pl' },
{ label: 'Turkish', id: 'tr' },
{ label: 'Swedish', id: 'sv' },
{ label: 'Danish', id: 'da' },
{ label: 'Norwegian', id: 'no' },
{ label: 'Finnish', id: 'fi' },
],
value: () => 'auto',
},
// Timestamps (word-level, sentence-level, or none)
{
id: 'timestamps',
title: 'Timestamps',
type: 'dropdown',
options: [
{ label: 'None', id: 'none' },
{ label: 'Sentence-level', id: 'sentence' },
{ label: 'Word-level', id: 'word' },
],
value: () => 'none',
},
// Speaker diarization (Deepgram only)
{
id: 'diarization',
title: 'Speaker Diarization',
type: 'switch',
condition: { field: 'provider', value: ['deepgram'] },
},
// Translate to English (Whisper only)
{
id: 'translateToEnglish',
title: 'Translate to English',
type: 'switch',
condition: { field: 'provider', value: 'whisper' },
},
// API Key
{
id: 'apiKey',
title: 'API Key',
type: 'short-input',
placeholder: 'Enter your API key',
password: true,
required: true,
},
],
tools: {
access: ['stt_whisper', 'stt_deepgram', 'stt_elevenlabs'],
config: {
tool: (params) => {
// Select tool based on provider
switch (params.provider) {
case 'whisper':
return 'stt_whisper'
case 'deepgram':
return 'stt_deepgram'
case 'elevenlabs':
return 'stt_elevenlabs'
default:
return 'stt_whisper'
}
},
params: (params) => ({
provider: params.provider,
apiKey: params.apiKey,
model: params.model,
audioFile: params.audioFile,
audioFileReference: params.audioFileReference,
audioUrl: params.audioUrl,
language: params.language,
timestamps: params.timestamps,
diarization: params.diarization,
translateToEnglish: params.translateToEnglish,
}),
},
},
inputs: {
provider: { type: 'string', description: 'STT provider (whisper, deepgram, elevenlabs)' },
apiKey: { type: 'string', description: 'Provider API key' },
model: {
type: 'string',
description: 'Provider-specific model (e.g., scribe_v1 for ElevenLabs, nova-3 for Deepgram)',
},
audioFile: { type: 'json', description: 'Audio/video file (UserFile)' },
audioFileReference: { type: 'json', description: 'Audio/video file reference' },
audioUrl: { type: 'string', description: 'Audio/video URL' },
language: { type: 'string', description: 'Language code or auto' },
timestamps: { type: 'string', description: 'Timestamp granularity (none, sentence, word)' },
diarization: { type: 'boolean', description: 'Enable speaker diarization' },
translateToEnglish: { type: 'boolean', description: 'Translate to English (Whisper only)' },
},
outputs: {
transcript: { type: 'string', description: 'Full transcribed text' },
segments: { type: 'array', description: 'Timestamped segments with speaker labels' },
language: { type: 'string', description: 'Detected or specified language' },
duration: { type: 'number', description: 'Audio duration in seconds' },
confidence: { type: 'number', description: 'Overall confidence score' },
},
}

View File

@@ -77,6 +77,7 @@ import { StagehandAgentBlock } from '@/blocks/blocks/stagehand_agent'
import { StartTriggerBlock } from '@/blocks/blocks/start_trigger'
import { StarterBlock } from '@/blocks/blocks/starter'
import { StripeBlock } from '@/blocks/blocks/stripe'
import { SttBlock } from '@/blocks/blocks/stt'
import { SupabaseBlock } from '@/blocks/blocks/supabase'
import { TavilyBlock } from '@/blocks/blocks/tavily'
import { TelegramBlock } from '@/blocks/blocks/telegram'
@@ -177,6 +178,7 @@ export const registry: Record<string, BlockConfig> = {
stagehand_agent: StagehandAgentBlock,
slack: SlackBlock,
starter: StarterBlock,
stt: SttBlock,
start_trigger: StartTriggerBlock,
input_trigger: InputTriggerBlock,
chat_trigger: ChatTriggerBlock,

View File

@@ -4084,3 +4084,27 @@ export function CalendlyIcon(props: SVGProps<SVGSVGElement>) {
</svg>
)
}
export function AudioWaveformIcon(props: SVGProps<SVGSVGElement>) {
return (
<svg
{...props}
xmlns='http://www.w3.org/2000/svg'
width='24'
height='24'
viewBox='0 0 24 24'
fill='none'
stroke='currentColor'
strokeWidth='2'
strokeLinecap='round'
strokeLinejoin='round'
>
<path d='M2 10v3' />
<path d='M6 6v11' />
<path d='M10 3v18' />
<path d='M14 8v7' />
<path d='M18 5v13' />
<path d='M22 10v3' />
</svg>
)
}

View File

@@ -0,0 +1,294 @@
import { execSync } from 'node:child_process'
import fs from 'node:fs/promises'
import os from 'node:os'
import path from 'node:path'
import ffmpegStatic from 'ffmpeg-static'
import ffmpeg from 'fluent-ffmpeg'
import type {
AudioExtractionOptions,
AudioExtractionResult,
AudioMetadata,
} from '@/lib/audio/types'
// Set ffmpeg binary path with fallback to system ffmpeg
try {
if (ffmpegStatic && typeof ffmpegStatic === 'string') {
ffmpeg.setFfmpegPath(ffmpegStatic)
} else {
// Try to find system ffmpeg
try {
const systemFfmpeg = execSync('which ffmpeg', { encoding: 'utf-8' }).trim()
if (systemFfmpeg) {
ffmpeg.setFfmpegPath(systemFfmpeg)
console.log('[FFmpeg] Using system ffmpeg:', systemFfmpeg)
}
} catch {
console.warn(
'[FFmpeg] ffmpeg-static not available and system ffmpeg not found. Please install ffmpeg: brew install ffmpeg (macOS) or apt-get install ffmpeg (Linux)'
)
}
}
} catch (error) {
console.warn('[FFmpeg] Failed to set ffmpeg path:', error)
}
/**
* Extract audio from video or convert audio format using FFmpeg
*/
export async function extractAudioFromVideo(
inputBuffer: Buffer,
mimeType: string,
options: AudioExtractionOptions = {}
): Promise<AudioExtractionResult> {
const isVideo = mimeType.startsWith('video/')
const isAudio = mimeType.startsWith('audio/')
// If it's already audio and no conversion needed, get metadata and return
if (isAudio && !options.outputFormat) {
try {
const metadata = await getAudioMetadata(inputBuffer, mimeType)
return {
buffer: inputBuffer,
format: mimeType.split('/')[1] || 'unknown',
duration: metadata.duration || 0,
size: inputBuffer.length,
}
} catch (error) {
// If metadata extraction fails, still return the buffer
return {
buffer: inputBuffer,
format: mimeType.split('/')[1] || 'unknown',
duration: 0,
size: inputBuffer.length,
}
}
}
// For video or audio conversion, use ffmpeg
if (isVideo || options.outputFormat) {
return await convertAudioWithFFmpeg(inputBuffer, mimeType, options)
}
// Fallback
return {
buffer: inputBuffer,
format: options.outputFormat || mimeType.split('/')[1] || 'unknown',
duration: 0,
size: inputBuffer.length,
}
}
/**
* Convert audio/video using FFmpeg
*/
async function convertAudioWithFFmpeg(
inputBuffer: Buffer,
mimeType: string,
options: AudioExtractionOptions
): Promise<AudioExtractionResult> {
// Create temporary files
const tempDir = os.tmpdir()
const inputExt = getExtensionFromMimeType(mimeType)
const outputFormat = options.outputFormat || 'mp3'
const inputFile = path.join(tempDir, `ffmpeg-input-${Date.now()}.${inputExt}`)
const outputFile = path.join(tempDir, `ffmpeg-output-${Date.now()}.${outputFormat}`)
try {
// Write input buffer to temporary file
await fs.writeFile(inputFile, inputBuffer)
// Get metadata for duration
let duration = 0
try {
const metadata = await getAudioMetadataFromFile(inputFile)
duration = metadata.duration || 0
} catch (error) {
// Metadata extraction failed, continue without duration
console.warn('Failed to extract metadata:', error)
}
// Convert using FFmpeg
await new Promise<void>((resolve, reject) => {
let command = ffmpeg(inputFile).toFormat(outputFormat).audioCodec(getAudioCodec(outputFormat))
// Apply audio options
if (options.channels) {
command = command.audioChannels(options.channels)
}
if (options.sampleRate) {
command = command.audioFrequency(options.sampleRate)
}
if (options.bitrate) {
command = command.audioBitrate(options.bitrate)
}
command
.on('end', () => resolve())
.on('error', (err) => reject(new Error(`FFmpeg error: ${err.message}`)))
.save(outputFile)
})
// Read output file
const outputBuffer = await fs.readFile(outputFile)
return {
buffer: outputBuffer,
format: outputFormat,
duration,
size: outputBuffer.length,
}
} finally {
// Clean up temporary files
try {
await fs.unlink(inputFile).catch(() => {})
await fs.unlink(outputFile).catch(() => {})
} catch (error) {
// Ignore cleanup errors
}
}
}
/**
* Get audio metadata using ffprobe
*/
export async function getAudioMetadata(buffer: Buffer, mimeType: string): Promise<AudioMetadata> {
const tempDir = os.tmpdir()
const inputExt = getExtensionFromMimeType(mimeType)
const inputFile = path.join(tempDir, `ffprobe-input-${Date.now()}.${inputExt}`)
try {
// Write buffer to temporary file
await fs.writeFile(inputFile, buffer)
// Get metadata using ffprobe
return await getAudioMetadataFromFile(inputFile)
} finally {
// Clean up temporary file
try {
await fs.unlink(inputFile).catch(() => {})
} catch (error) {
// Ignore cleanup errors
}
}
}
/**
* Get audio metadata from a file path using ffprobe
*/
async function getAudioMetadataFromFile(filePath: string): Promise<AudioMetadata> {
return new Promise((resolve, reject) => {
ffmpeg.ffprobe(filePath, (err, metadata) => {
if (err) {
reject(new Error(`FFprobe error: ${err.message}`))
return
}
const audioStream = metadata.streams.find((s) => s.codec_type === 'audio')
const format = metadata.format
resolve({
duration: format.duration || 0,
format: format.format_name || 'unknown',
codec: audioStream?.codec_name,
sampleRate: audioStream?.sample_rate,
channels: audioStream?.channels,
bitrate: format.bit_rate ? Number(format.bit_rate) : undefined,
})
})
})
}
/**
* Get file extension from MIME type
*/
function getExtensionFromMimeType(mimeType: string): string {
const mimeToExt: Record<string, string> = {
// Video
'video/mp4': 'mp4',
'video/quicktime': 'mov',
'video/x-msvideo': 'avi',
'video/x-matroska': 'mkv',
'video/webm': 'webm',
// Audio
'audio/mpeg': 'mp3',
'audio/mp4': 'm4a',
'audio/wav': 'wav',
'audio/webm': 'webm',
'audio/ogg': 'ogg',
'audio/flac': 'flac',
'audio/aac': 'aac',
'audio/opus': 'opus',
}
return mimeToExt[mimeType] || mimeType.split('/')[1] || 'dat'
}
/**
* Get appropriate audio codec for output format
*/
function getAudioCodec(format: string): string {
const codecMap: Record<string, string> = {
mp3: 'libmp3lame',
wav: 'pcm_s16le',
flac: 'flac',
m4a: 'aac',
aac: 'aac',
ogg: 'libvorbis',
opus: 'libopus',
}
return codecMap[format] || 'libmp3lame'
}
/**
* Check if a file is a video file
*/
export function isVideoFile(mimeType: string): boolean {
return mimeType.startsWith('video/')
}
/**
* Check if a file is an audio file
*/
export function isAudioFile(mimeType: string): boolean {
return mimeType.startsWith('audio/')
}
/**
* Get optimal audio format for STT provider
*/
export function getOptimalFormat(provider: 'whisper' | 'deepgram' | 'elevenlabs'): {
format: 'mp3' | 'wav' | 'flac'
sampleRate: number
channels: 1 | 2
} {
switch (provider) {
case 'whisper':
// Whisper prefers 16kHz mono
return {
format: 'mp3',
sampleRate: 16000,
channels: 1,
}
case 'deepgram':
// Deepgram works well with various formats
return {
format: 'mp3',
sampleRate: 16000,
channels: 1,
}
case 'elevenlabs':
// ElevenLabs format preferences
return {
format: 'mp3',
sampleRate: 16000,
channels: 1,
}
default:
return {
format: 'mp3',
sampleRate: 16000,
channels: 1,
}
}
}

View File

@@ -0,0 +1,22 @@
export interface AudioExtractionOptions {
outputFormat?: 'mp3' | 'wav' | 'flac'
sampleRate?: number
channels?: 1 | 2
bitrate?: string
}
export interface AudioExtractionResult {
buffer: Buffer
format: string
duration: number
size: number
}
export interface AudioMetadata {
duration: number
format: string
codec?: string
sampleRate?: number
channels?: number
bitrate?: number
}

View File

@@ -12,7 +12,7 @@ export interface FileAttachment {
}
export interface MessageContent {
type: 'text' | 'image' | 'document' | 'audio' | 'video'
text?: string
source?: {
type: 'base64'
@@ -24,7 +24,7 @@ export interface MessageContent {
/**
* Mapping of MIME types to content types
*/
export const MIME_TYPE_MAPPING: Record<string, 'image' | 'document' | 'audio' | 'video'> = {
// Images
'image/jpeg': 'image',
'image/jpg': 'image',
@@ -49,12 +49,40 @@ export const MIME_TYPE_MAPPING: Record<string, 'image' | 'document'> = {
'application/vnd.ms-powerpoint': 'document', // .ppt
'text/markdown': 'document',
'application/rtf': 'document',
// Audio
'audio/mpeg': 'audio', // .mp3
'audio/mp3': 'audio',
'audio/mp4': 'audio', // .m4a
'audio/x-m4a': 'audio',
'audio/m4a': 'audio',
'audio/wav': 'audio',
'audio/wave': 'audio',
'audio/x-wav': 'audio',
'audio/webm': 'audio',
'audio/ogg': 'audio',
'audio/vorbis': 'audio',
'audio/flac': 'audio',
'audio/x-flac': 'audio',
'audio/aac': 'audio',
'audio/x-aac': 'audio',
'audio/opus': 'audio',
// Video
'video/mp4': 'video',
'video/mpeg': 'video',
'video/quicktime': 'video', // .mov
'video/x-quicktime': 'video',
'video/x-msvideo': 'video', // .avi
'video/avi': 'video',
'video/x-matroska': 'video', // .mkv
'video/webm': 'video',
}
/**
* Get the content type for a given MIME type
*/
export function getContentType(mimeType: string): 'image' | 'document' | 'audio' | 'video' | null {
return MIME_TYPE_MAPPING[mimeType.toLowerCase()] || null
}
@@ -80,6 +108,28 @@ export function isImageFileType(mimeType: string): boolean {
return imageTypes.includes(mimeType.toLowerCase())
}
/**
* Check if a MIME type is an audio type
*/
export function isAudioFileType(mimeType: string): boolean {
return getContentType(mimeType) === 'audio'
}
/**
* Check if a MIME type is a video type
*/
export function isVideoFileType(mimeType: string): boolean {
return getContentType(mimeType) === 'video'
}
/**
* Check if a MIME type is an audio or video type
*/
export function isMediaFileType(mimeType: string): boolean {
const contentType = getContentType(mimeType)
return contentType === 'audio' || contentType === 'video'
}
/**
* Convert a file buffer to base64
*/
@@ -143,6 +193,22 @@ export function getMimeTypeFromExtension(extension: string): string {
ppt: 'application/vnd.ms-powerpoint',
md: 'text/markdown',
rtf: 'application/rtf',
// Audio
mp3: 'audio/mpeg',
m4a: 'audio/mp4',
wav: 'audio/wav',
webm: 'audio/webm',
ogg: 'audio/ogg',
flac: 'audio/flac',
aac: 'audio/aac',
opus: 'audio/opus',
// Video
mp4: 'video/mp4',
mov: 'video/quicktime',
avi: 'video/x-msvideo',
mkv: 'video/x-matroska',
}
return extensionMimeMap[extension.toLowerCase()] || 'application/octet-stream'

View File

@@ -20,7 +20,26 @@ export const SUPPORTED_DOCUMENT_EXTENSIONS = [
'yml',
] as const
export const SUPPORTED_AUDIO_EXTENSIONS = [
'mp3',
'm4a',
'wav',
'webm',
'ogg',
'flac',
'aac',
'opus',
] as const
export const SUPPORTED_VIDEO_EXTENSIONS = ['mp4', 'mov', 'avi', 'mkv', 'webm'] as const
export type SupportedDocumentExtension = (typeof SUPPORTED_DOCUMENT_EXTENSIONS)[number]
export type SupportedAudioExtension = (typeof SUPPORTED_AUDIO_EXTENSIONS)[number]
export type SupportedVideoExtension = (typeof SUPPORTED_VIDEO_EXTENSIONS)[number]
export type SupportedMediaExtension =
| SupportedDocumentExtension
| SupportedAudioExtension
| SupportedVideoExtension
export const SUPPORTED_MIME_TYPES: Record<SupportedDocumentExtension, string[]> = {
pdf: ['application/pdf', 'application/x-pdf'],
@@ -54,7 +73,33 @@ export const SUPPORTED_MIME_TYPES: Record<SupportedDocumentExtension, string[]>
yml: ['text/yaml', 'text/x-yaml', 'application/yaml', 'application/x-yaml'],
}
export const SUPPORTED_AUDIO_MIME_TYPES: Record<SupportedAudioExtension, string[]> = {
mp3: ['audio/mpeg', 'audio/mp3'],
m4a: ['audio/mp4', 'audio/x-m4a', 'audio/m4a'],
wav: ['audio/wav', 'audio/wave', 'audio/x-wav'],
webm: ['audio/webm'],
ogg: ['audio/ogg', 'audio/vorbis'],
flac: ['audio/flac', 'audio/x-flac'],
aac: ['audio/aac', 'audio/x-aac'],
opus: ['audio/opus'],
}
export const SUPPORTED_VIDEO_MIME_TYPES: Record<SupportedVideoExtension, string[]> = {
mp4: ['video/mp4', 'video/mpeg'],
mov: ['video/quicktime', 'video/x-quicktime'],
avi: ['video/x-msvideo', 'video/avi'],
mkv: ['video/x-matroska'],
webm: ['video/webm'],
}
export const ACCEPTED_FILE_TYPES = Object.values(SUPPORTED_MIME_TYPES).flat()
export const ACCEPTED_AUDIO_TYPES = Object.values(SUPPORTED_AUDIO_MIME_TYPES).flat()
export const ACCEPTED_VIDEO_TYPES = Object.values(SUPPORTED_VIDEO_MIME_TYPES).flat()
export const ACCEPTED_MEDIA_TYPES = [
...ACCEPTED_FILE_TYPES,
...ACCEPTED_AUDIO_TYPES,
...ACCEPTED_VIDEO_TYPES,
]
export const ACCEPTED_FILE_EXTENSIONS = SUPPORTED_DOCUMENT_EXTENSIONS.map((ext) => `.${ext}`)
@@ -110,5 +155,61 @@ export function getSupportedMimeTypes(extension: string): string[] {
if (isSupportedExtension(extension)) {
return SUPPORTED_MIME_TYPES[extension as SupportedDocumentExtension]
}
if (SUPPORTED_AUDIO_EXTENSIONS.includes(extension as SupportedAudioExtension)) {
return SUPPORTED_AUDIO_MIME_TYPES[extension as SupportedAudioExtension]
}
if (SUPPORTED_VIDEO_EXTENSIONS.includes(extension as SupportedVideoExtension)) {
return SUPPORTED_VIDEO_MIME_TYPES[extension as SupportedVideoExtension]
}
return []
}
/**
* Check if file extension is a supported audio extension
*/
export function isSupportedAudioExtension(extension: string): extension is SupportedAudioExtension {
return SUPPORTED_AUDIO_EXTENSIONS.includes(extension.toLowerCase() as SupportedAudioExtension)
}
/**
* Check if file extension is a supported video extension
*/
export function isSupportedVideoExtension(extension: string): extension is SupportedVideoExtension {
return SUPPORTED_VIDEO_EXTENSIONS.includes(extension.toLowerCase() as SupportedVideoExtension)
}
/**
* Validate if an audio/video file type is supported for STT processing
*/
export function validateMediaFileType(
fileName: string,
mimeType: string
): FileValidationError | null {
const extension = path.extname(fileName).toLowerCase().substring(1)
const isAudio = SUPPORTED_AUDIO_EXTENSIONS.includes(extension as SupportedAudioExtension)
const isVideo = SUPPORTED_VIDEO_EXTENSIONS.includes(extension as SupportedVideoExtension)
if (!isAudio && !isVideo) {
return {
code: 'UNSUPPORTED_FILE_TYPE',
message: `Unsupported media file type: ${extension}. Supported audio types: ${SUPPORTED_AUDIO_EXTENSIONS.join(', ')}. Supported video types: ${SUPPORTED_VIDEO_EXTENSIONS.join(', ')}`,
supportedTypes: [...SUPPORTED_AUDIO_EXTENSIONS, ...SUPPORTED_VIDEO_EXTENSIONS],
}
}
const baseMimeType = mimeType.split(';')[0].trim()
const allowedMimeTypes = isAudio
? SUPPORTED_AUDIO_MIME_TYPES[extension as SupportedAudioExtension]
: SUPPORTED_VIDEO_MIME_TYPES[extension as SupportedVideoExtension]
if (!allowedMimeTypes.includes(baseMimeType)) {
return {
code: 'MIME_TYPE_MISMATCH',
message: `MIME type ${baseMimeType} does not match file extension ${extension}. Expected: ${allowedMimeTypes.join(', ')}`,
supportedTypes: allowedMimeTypes,
}
}
return null
}

View File

@@ -75,7 +75,7 @@ const nextConfig: NextConfig = {
turbopack: {
resolveExtensions: ['.tsx', '.ts', '.jsx', '.js', '.mjs', '.json'],
},
serverExternalPackages: ['unpdf', 'ffmpeg-static', 'fluent-ffmpeg'],
experimental: {
optimizeCss: true,
turbopackSourceMaps: false,

View File

@@ -605,6 +605,7 @@ import {
stripeUpdateSubscriptionTool,
stripeVoidInvoiceTool,
} from '@/tools/stripe'
import { deepgramSttTool, elevenLabsSttTool, whisperSttTool } from '@/tools/stt'
import {
supabaseCountTool,
supabaseDeleteTool,
@@ -1050,6 +1051,9 @@ export const tools: Record<string, ToolConfig> = {
knowledge_upload_chunk: knowledgeUploadChunkTool,
knowledge_create_document: knowledgeCreateDocumentTool,
elevenlabs_tts: elevenLabsTtsTool,
stt_whisper: whisperSttTool,
stt_deepgram: deepgramSttTool,
stt_elevenlabs: elevenLabsSttTool,
s3_get_object: s3GetObjectTool,
s3_put_object: s3PutObjectTool,
s3_list_objects: s3ListObjectsTool,

View File

@@ -0,0 +1,125 @@
import type { SttParams, SttResponse } from '@/tools/stt/types'
import type { ToolConfig } from '@/tools/types'
export const deepgramSttTool: ToolConfig<SttParams, SttResponse> = {
id: 'stt_deepgram',
name: 'Deepgram STT',
description: 'Transcribe audio to text using Deepgram',
version: '1.0.0',
params: {
provider: {
type: 'string',
required: true,
visibility: 'user-only',
description: 'STT provider (deepgram)',
},
apiKey: {
type: 'string',
required: true,
visibility: 'user-only',
description: 'Deepgram API key',
},
model: {
type: 'string',
required: false,
visibility: 'user-or-llm',
description: 'Deepgram model to use (nova-3, nova-2, whisper-large, etc.)',
},
audioFile: {
type: 'file',
required: false,
visibility: 'user-or-llm',
description: 'Audio or video file to transcribe',
},
audioFileReference: {
type: 'file',
required: false,
visibility: 'user-or-llm',
description: 'Reference to audio/video file from previous blocks',
},
audioUrl: {
type: 'string',
required: false,
visibility: 'user-or-llm',
description: 'URL to audio or video file',
},
language: {
type: 'string',
required: false,
visibility: 'user-or-llm',
description: 'Language code (e.g., "en", "es", "fr") or "auto" for auto-detection',
},
timestamps: {
type: 'string',
required: false,
visibility: 'user-only',
description: 'Timestamp granularity: none, sentence, or word',
},
diarization: {
type: 'boolean',
required: false,
visibility: 'user-only',
description: 'Enable speaker diarization',
},
},
request: {
url: '/api/proxy/stt',
method: 'POST',
headers: () => ({
'Content-Type': 'application/json',
}),
body: (
params: SttParams & {
_context?: { workspaceId?: string; workflowId?: string; executionId?: string }
}
) => ({
provider: 'deepgram',
apiKey: params.apiKey,
model: params.model,
audioFile: params.audioFile,
audioFileReference: params.audioFileReference,
audioUrl: params.audioUrl,
language: params.language || 'auto',
timestamps: params.timestamps || 'none',
diarization: params.diarization || false,
workspaceId: params._context?.workspaceId,
workflowId: params._context?.workflowId,
executionId: params._context?.executionId,
}),
},
transformResponse: async (response: Response) => {
const data = await response.json()
if (!response.ok || data.error) {
return {
success: false,
error: data.error || 'Transcription failed',
output: {
transcript: '',
},
}
}
return {
success: true,
output: {
transcript: data.transcript,
segments: data.segments,
language: data.language,
duration: data.duration,
confidence: data.confidence,
},
}
},
outputs: {
transcript: { type: 'string', description: 'Full transcribed text' },
segments: { type: 'array', description: 'Timestamped segments with speaker labels' },
language: { type: 'string', description: 'Detected or specified language' },
duration: { type: 'number', description: 'Audio duration in seconds' },
confidence: { type: 'number', description: 'Overall confidence score' },
},
}

View File

@@ -0,0 +1,118 @@
import type { SttParams, SttResponse } from '@/tools/stt/types'
import type { ToolConfig } from '@/tools/types'
export const elevenLabsSttTool: ToolConfig<SttParams, SttResponse> = {
id: 'stt_elevenlabs',
name: 'ElevenLabs STT',
description: 'Transcribe audio to text using ElevenLabs',
version: '1.0.0',
params: {
provider: {
type: 'string',
required: true,
visibility: 'user-only',
description: 'STT provider (elevenlabs)',
},
apiKey: {
type: 'string',
required: true,
visibility: 'user-only',
description: 'ElevenLabs API key',
},
model: {
type: 'string',
required: false,
visibility: 'user-or-llm',
description: 'ElevenLabs model to use (scribe_v1, scribe_v1_experimental)',
},
audioFile: {
type: 'file',
required: false,
visibility: 'user-or-llm',
description: 'Audio or video file to transcribe',
},
audioFileReference: {
type: 'file',
required: false,
visibility: 'user-or-llm',
description: 'Reference to audio/video file from previous blocks',
},
audioUrl: {
type: 'string',
required: false,
visibility: 'user-or-llm',
description: 'URL to audio or video file',
},
language: {
type: 'string',
required: false,
visibility: 'user-or-llm',
description: 'Language code (e.g., "en", "es", "fr") or "auto" for auto-detection',
},
timestamps: {
type: 'string',
required: false,
visibility: 'user-only',
description: 'Timestamp granularity: none, sentence, or word',
},
},
request: {
url: '/api/proxy/stt',
method: 'POST',
headers: () => ({
'Content-Type': 'application/json',
}),
body: (
params: SttParams & {
_context?: { workspaceId?: string; workflowId?: string; executionId?: string }
}
) => ({
provider: 'elevenlabs',
apiKey: params.apiKey,
model: params.model,
audioFile: params.audioFile,
audioFileReference: params.audioFileReference,
audioUrl: params.audioUrl,
language: params.language || 'auto',
timestamps: params.timestamps || 'none',
workspaceId: params._context?.workspaceId,
workflowId: params._context?.workflowId,
executionId: params._context?.executionId,
}),
},
transformResponse: async (response: Response) => {
const data = await response.json()
if (!response.ok || data.error) {
return {
success: false,
error: data.error || 'Transcription failed',
output: {
transcript: '',
},
}
}
return {
success: true,
output: {
transcript: data.transcript,
segments: data.segments,
language: data.language,
duration: data.duration,
confidence: data.confidence,
},
}
},
outputs: {
transcript: { type: 'string', description: 'Full transcribed text' },
segments: { type: 'array', description: 'Timestamped segments' },
language: { type: 'string', description: 'Detected or specified language' },
duration: { type: 'number', description: 'Audio duration in seconds' },
confidence: { type: 'number', description: 'Overall confidence score' },
},
}

View File

@@ -0,0 +1,5 @@
import { deepgramSttTool } from '@/tools/stt/deepgram'
import { elevenLabsSttTool } from '@/tools/stt/elevenlabs'
import { whisperSttTool } from '@/tools/stt/whisper'
export { whisperSttTool, deepgramSttTool, elevenLabsSttTool }

View File

@@ -0,0 +1,62 @@
import type { UserFile } from '@/executor/types'
import type { ToolResponse } from '@/tools/types'
export interface SttParams {
provider: 'whisper' | 'deepgram' | 'elevenlabs'
apiKey: string
model?: string
audioFile?: UserFile | UserFile[]
audioFileReference?: UserFile | UserFile[]
audioUrl?: string
language?: string
timestamps?: 'none' | 'sentence' | 'word'
diarization?: boolean
translateToEnglish?: boolean
}
export interface TranscriptSegment {
text: string
start: number
end: number
speaker?: string
confidence?: number
}
export interface SttResponse extends ToolResponse {
output: {
transcript: string
segments?: TranscriptSegment[]
language?: string
duration?: number
confidence?: number
}
}
export interface SttBlockResponse extends ToolResponse {
output: {
transcript: string
segments?: TranscriptSegment[]
language?: string
duration?: number
confidence?: number
}
}
// Provider-specific types
export interface WhisperParams extends Omit<SttParams, 'provider'> {
model?: string
responseFormat?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt'
temperature?: number
}
export interface DeepgramParams extends Omit<SttParams, 'provider'> {
model?: string
punctuate?: boolean
paragraphs?: boolean
utterances?: boolean
}
export interface ElevenLabsSttParams extends Omit<SttParams, 'provider'> {
model?: string
}

View File

@@ -0,0 +1,125 @@
import type { SttParams, SttResponse } from '@/tools/stt/types'
import type { ToolConfig } from '@/tools/types'
export const whisperSttTool: ToolConfig<SttParams, SttResponse> = {
id: 'stt_whisper',
name: 'OpenAI Whisper STT',
description: 'Transcribe audio to text using OpenAI Whisper',
version: '1.0.0',
params: {
provider: {
type: 'string',
required: true,
visibility: 'user-only',
description: 'STT provider (whisper)',
},
apiKey: {
type: 'string',
required: true,
visibility: 'user-only',
description: 'OpenAI API key',
},
model: {
type: 'string',
required: false,
visibility: 'user-or-llm',
description: 'Whisper model to use (default: whisper-1)',
},
audioFile: {
type: 'file',
required: false,
visibility: 'user-or-llm',
description: 'Audio or video file to transcribe',
},
audioFileReference: {
type: 'file',
required: false,
visibility: 'user-or-llm',
description: 'Reference to audio/video file from previous blocks',
},
audioUrl: {
type: 'string',
required: false,
visibility: 'user-or-llm',
description: 'URL to audio or video file',
},
language: {
type: 'string',
required: false,
visibility: 'user-or-llm',
description: 'Language code (e.g., "en", "es", "fr") or "auto" for auto-detection',
},
timestamps: {
type: 'string',
required: false,
visibility: 'user-only',
description: 'Timestamp granularity: none, sentence, or word',
},
translateToEnglish: {
type: 'boolean',
required: false,
visibility: 'user-only',
description: 'Translate audio to English',
},
},
request: {
url: '/api/proxy/stt',
method: 'POST',
headers: () => ({
'Content-Type': 'application/json',
}),
body: (
params: SttParams & {
_context?: { workspaceId?: string; workflowId?: string; executionId?: string }
}
) => ({
provider: 'whisper',
apiKey: params.apiKey,
model: params.model,
audioFile: params.audioFile,
audioFileReference: params.audioFileReference,
audioUrl: params.audioUrl,
language: params.language || 'auto',
timestamps: params.timestamps || 'none',
translateToEnglish: params.translateToEnglish || false,
workspaceId: params._context?.workspaceId,
workflowId: params._context?.workflowId,
executionId: params._context?.executionId,
}),
},
transformResponse: async (response: Response) => {
const data = await response.json()
if (!response.ok || data.error) {
return {
success: false,
error: data.error || 'Transcription failed',
output: {
transcript: '',
},
}
}
return {
success: true,
output: {
transcript: data.transcript,
segments: data.segments,
language: data.language,
duration: data.duration,
confidence: data.confidence,
},
}
},
outputs: {
transcript: { type: 'string', description: 'Full transcribed text' },
segments: { type: 'array', description: 'Timestamped segments' },
language: { type: 'string', description: 'Detected or specified language' },
duration: { type: 'number', description: 'Audio duration in seconds' },
confidence: { type: 'number', description: 'Overall confidence score' },
},
}

View File

@@ -9,8 +9,11 @@
"@t3-oss/env-nextjs": "0.13.4",
"@tanstack/react-query": "5.90.8",
"@tanstack/react-query-devtools": "5.90.2",
"@types/fluent-ffmpeg": "2.1.28",
"cronstrue": "3.3.0",
"drizzle-orm": "^0.44.5",
"ffmpeg-static": "5.3.0",
"fluent-ffmpeg": "2.1.3",
"mongodb": "6.19.0",
"neo4j-driver": "6.0.1",
"onedollarstats": "0.0.10",
@@ -235,6 +238,7 @@
},
},
"trustedDependencies": [
"ffmpeg-static",
"sharp",
],
"overrides": {
@@ -496,6 +500,8 @@
"@csstools/css-tokenizer": ["@csstools/css-tokenizer@3.0.4", "", {}, "sha512-Vd/9EVDiu6PPJt9yAh6roZP6El1xHrdvIVGjyBsHR0RYwNHgL7FJPyIIW4fANJNG6FtyZfvlRPpFI4ZM/lubvw=="],
"@derhuerst/http-basic": ["@derhuerst/http-basic@8.2.4", "", { "dependencies": { "caseless": "^0.12.0", "concat-stream": "^2.0.0", "http-response-object": "^3.0.1", "parse-cache-control": "^1.0.1" } }, "sha512-F9rL9k9Xjf5blCz8HsJRO4diy111cayL2vkY2XE4r4t3n0yPXVYy3KD3nJ1qbrSn9743UWSXH4IwuCa/HWlGFw=="],
"@dimforge/rapier3d-compat": ["@dimforge/rapier3d-compat@0.12.0", "", {}, "sha512-uekIGetywIgopfD97oDL5PfeezkFpNhwlzlaEYNOA0N6ghdsOvh/HYjSMek5Q2O1PYvRSDFcqFVJl4r4ZBwOow=="],
"@drizzle-team/brocli": ["@drizzle-team/brocli@0.10.2", "", {}, "sha512-z33Il7l5dKjUgGULTqBsQBQwckHh5AbIuxhdsIxDDiZAzBOrZO6q9ogcWC65kU382AfynTfgNumVcNIjuIua6w=="],
@@ -1336,6 +1342,8 @@
"@types/estree-jsx": ["@types/estree-jsx@1.0.5", "", { "dependencies": { "@types/estree": "*" } }, "sha512-52CcUVNFyfb1A2ALocQw/Dd1BQFNmSdkuC3BkZ6iqhdMfQz7JWOFRuJFloOzjk+6WijU56m9oKXFAXc7o3Towg=="],
"@types/fluent-ffmpeg": ["@types/fluent-ffmpeg@2.1.28", "", { "dependencies": { "@types/node": "*" } }, "sha512-5ovxsDwBcPfJ+eYs1I/ZpcYCnkce7pvH9AHSvrZllAp1ZPpTRDZAFjF3TRFbukxSgIYTTNYePbS0rKUmaxVbXw=="],
"@types/geojson": ["@types/geojson@7946.0.16", "", {}, "sha512-6C8nqWur3j98U6+lXDfTUWIfgvZU+EumvpHKcYjujKH7woYyLj2sUmff0tRhrqM7BohUw7Pz3ZB1jj2gW9Fvmg=="],
"@types/hast": ["@types/hast@3.0.4", "", { "dependencies": { "@types/unist": "*" } }, "sha512-WPs+bbQw5aCj+x6laNGWLH3wviHtoCv/P3+otBhbOhJgG8qtpdAMlTCxLtsTWA7LH1Oh/bFCHsBn0TPS5m30EQ=="],
@@ -1470,6 +1478,8 @@
"astring": ["astring@1.9.0", "", { "bin": { "astring": "bin/astring" } }, "sha512-LElXdjswlqjWrPpJFg1Fx4wpkOCxj1TDHlSV4PlaRxHGWko024xICaa97ZkMfs6DRKlCguiAI+rbXv5GWwXIkg=="],
"async": ["async@0.2.10", "", {}, "sha512-eAkdoKxU6/LkKDBzLpT+t6Ff5EtfSF4wx1WfJiPEEV7WNLnDaRXk0oVysiEPm262roaachGexwUv94WhSgN5TQ=="],
"asynckit": ["asynckit@0.4.0", "", {}, "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="],
"atomic-sleep": ["atomic-sleep@1.0.0", "", {}, "sha512-kNOjDqAh7px0XWNI+4QbzoiR/nTkHAWNud2uvnJquD1/x5a7EQZMJT0AczqK0Qn67oY/TTQ1LbUKajZpp3I9tQ=="],
@@ -1550,6 +1560,8 @@
"caniuse-lite": ["caniuse-lite@1.0.30001745", "", {}, "sha512-ywt6i8FzvdgrrrGbr1jZVObnVv6adj+0if2/omv9cmR2oiZs30zL4DIyaptKcbOrBdOIc74QTMoJvSE2QHh5UQ=="],
"caseless": ["caseless@0.12.0", "", {}, "sha512-4tYFyifaFfGacoiObjJegolkwSU4xQNGbVgUiNYVUxbQ2x2lUsFvY4hVgVzGiIe6WLOPqycWXA40l+PWsxthUw=="],
"ccount": ["ccount@2.0.1", "", {}, "sha512-eyrF0jiFpY+3drT6383f1qhkbGsLSifNAjA61IUjZjmLCWjItY6LB9ft9YhoDgwfmclB2zhu51Lc7+95b8NRAg=="],
"cfb": ["cfb@1.2.2", "", { "dependencies": { "adler-32": "~1.3.0", "crc-32": "~1.2.0" } }, "sha512-KfdUZsSOw19/ObEWasvBP/Ac4reZvAGauZhs6S/gqNhXhI7cKwvlH7ulj+dOEYnca4bm4SGo8C1bTAQvnTjgQA=="],
@@ -1818,6 +1830,8 @@
"entities": ["entities@6.0.1", "", {}, "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g=="],
"env-paths": ["env-paths@2.2.1", "", {}, "sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A=="],
"environment": ["environment@1.1.0", "", {}, "sha512-xUtoPkMggbz0MPyPiIWr1Kp4aeWJjDZ6SMvURhimjdZgsRuDplF5/s9hcgGhyXMhs+6vpnuoiZ2kFiu3FMnS8Q=="],
"error": ["error@7.0.2", "", { "dependencies": { "string-template": "~0.2.1", "xtend": "~4.0.0" } }, "sha512-UtVv4l5MhijsYUxPJo4390gzfZvAnTHreNnDjnTZaKIiZ/SemXxAhBkYSKtWa5RtBXbLP8tMgn/n0RUa/H7jXw=="],
@@ -1916,6 +1930,8 @@
"fflate": ["fflate@0.8.2", "", {}, "sha512-cPJU47OaAoCbg0pBvzsgpTPhmhqI5eJjh/JIu8tPj5q+T7iLvW/JAYUqmE7KOB4R1ZyEhzBaIQpQpardBF5z8A=="],
"ffmpeg-static": ["ffmpeg-static@5.3.0", "", { "dependencies": { "@derhuerst/http-basic": "^8.2.0", "env-paths": "^2.2.0", "https-proxy-agent": "^5.0.0", "progress": "^2.0.3" } }, "sha512-H+K6sW6TiIX6VGend0KQwthe+kaceeH/luE8dIZyOP35ik7ahYojDuqlTV1bOrtEwl01sy2HFNGQfi5IDJvotg=="],
"figures": ["figures@3.2.0", "", { "dependencies": { "escape-string-regexp": "^1.0.5" } }, "sha512-yaduQFRKLXYOGgEn6AZau90j3ggSOyiqXU0F9JZfeXYhNa+Jk4X+s45A2zg5jns87GAFa34BBm2kXw4XpNcbdg=="],
"file-type": ["file-type@16.5.4", "", { "dependencies": { "readable-web-to-node-stream": "^3.0.0", "strtok3": "^6.2.4", "token-types": "^4.1.1" } }, "sha512-/yFHK0aGjFEgDJjEKP0pWCplsPFPhwyfwevf/pVxiN0tmE4L9LmwWxWukdJSHdoCli4VgQLehjJtwQBnqmsKcw=="],
@@ -1924,6 +1940,8 @@
"finalhandler": ["finalhandler@2.1.0", "", { "dependencies": { "debug": "^4.4.0", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "on-finished": "^2.4.1", "parseurl": "^1.3.3", "statuses": "^2.0.1" } }, "sha512-/t88Ty3d5JWQbWYgaOGCCYfXRwV1+be02WqYYlL6h0lEiUAMPM8o8qKGO01YIkOHzka2up08wvgYD0mDiI+q3Q=="],
"fluent-ffmpeg": ["fluent-ffmpeg@2.1.3", "", { "dependencies": { "async": "^0.2.9", "which": "^1.1.1" } }, "sha512-Be3narBNt2s6bsaqP6Jzq91heDgOEaDCJAXcE3qcma/EJBSy5FB4cvO31XBInuAuKBx8Kptf8dkhjK0IOru39Q=="],
"follow-redirects": ["follow-redirects@1.15.11", "", {}, "sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ=="],
"foreground-child": ["foreground-child@3.3.1", "", { "dependencies": { "cross-spawn": "^7.0.6", "signal-exit": "^4.0.1" } }, "sha512-gIXjKqtFuWEgzFRJA9WCQeSJLZDjgJUOMCMzxtvFq/37KojM1BFGufqsCy0r4qSQmYLsZYMeyRqzIWOMup03sw=="],
@@ -2050,6 +2068,8 @@
"http-proxy-agent": ["http-proxy-agent@7.0.2", "", { "dependencies": { "agent-base": "^7.1.0", "debug": "^4.3.4" } }, "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig=="],
"http-response-object": ["http-response-object@3.0.2", "", { "dependencies": { "@types/node": "^10.0.3" } }, "sha512-bqX0XTF6fnXSQcEJ2Iuyr75yVakyjIDCqroJQ/aHfSdlM743Cwqoi2nDYMzLGWUcuTWGWy8AAvOKXTfiv6q9RA=="],
"https-proxy-agent": ["https-proxy-agent@5.0.1", "", { "dependencies": { "agent-base": "6", "debug": "4" } }, "sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA=="],
"human-signals": ["human-signals@5.0.0", "", {}, "sha512-AXcZb6vzzrFAUE61HnN4mpLqd/cSIwNQjtNWR0euPm6y0iqx3G4gOXaIDdtdDwZmhwe82LA6+zinmW4UBWVePQ=="],
@@ -2538,6 +2558,8 @@
"papaparse": ["papaparse@5.5.3", "", {}, "sha512-5QvjGxYVjxO59MGU2lHVYpRWBBtKHnlIAcSe1uNFCkkptUh63NFRj0FJQm7nR67puEruUci/ZkjmEFrjCAyP4A=="],
"parse-cache-control": ["parse-cache-control@1.0.1", "", {}, "sha512-60zvsJReQPX5/QP0Kzfd/VrpjScIQ7SHBW6bFCYfEP+fp0Eppr1SHhIO5nd1PjZtvclzSzES9D/p5nFJurwfWg=="],
"parse-css-color": ["parse-css-color@0.2.1", "", { "dependencies": { "color-name": "^1.1.4", "hex-rgb": "^4.1.0" } }, "sha512-bwS/GGIFV3b6KS4uwpzCFj4w297Yl3uqnSgIPsoQkx7GMLROXfMnWvxfNkL0oh8HVhZA4hvJoEoEIqonfJ3BWg=="],
"parse-entities": ["parse-entities@4.0.2", "", { "dependencies": { "@types/unist": "^2.0.0", "character-entities-legacy": "^3.0.0", "character-reference-invalid": "^2.0.0", "decode-named-character-reference": "^1.0.0", "is-alphanumerical": "^2.0.0", "is-decimal": "^2.0.0", "is-hexadecimal": "^2.0.0" } }, "sha512-GG2AQYWoLgL877gQIKeRPGO1xF9+eG1ujIb5soS5gPvLQ1y2o8FL90w2QWNdf9I361Mpp7726c+lj3U0qK1uGw=="],
@@ -2638,6 +2660,8 @@
"process-warning": ["process-warning@5.0.0", "", {}, "sha512-a39t9ApHNx2L4+HBnQKqxxHNs1r7KF+Intd8Q/g1bUh6q0WIp9voPXJ/x0j+ZL45KF1pJd9+q2jLIRMfvEshkA=="],
"progress": ["progress@2.0.3", "", {}, "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA=="],
"prom-client": ["prom-client@15.1.3", "", { "dependencies": { "@opentelemetry/api": "^1.4.0", "tdigest": "^0.1.1" } }, "sha512-6ZiOBfCywsD4k1BN9IX0uZhF+tJkV8q8llP64G5Hajs4JOeVLPCwpPVcpXy3BwYiUGgyJzsJJQeOIv7+hDSq8g=="],
"prompts": ["prompts@2.4.2", "", { "dependencies": { "kleur": "^3.0.3", "sisteransi": "^1.0.5" } }, "sha512-NxNv/kLguCA7p3jE8oL2aEBsrJWgAakBpgmgK6lpPWV+WuOmY6r2/zbAVnP+T8bQlA0nzHXSJSJW0Hq7ylaD2Q=="],
@@ -3140,7 +3164,7 @@
"whatwg-url": ["whatwg-url@14.2.0", "", { "dependencies": { "tr46": "^5.1.0", "webidl-conversions": "^7.0.0" } }, "sha512-De72GdQZzNTUBBChsXueQUnPKDkg/5A5zp7pFDuQAj5UFoENpiACU0wlCvzpAGnTkj++ihpKwKyYewn/XNUbKw=="],
"which": ["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="],
"which": ["which@1.3.1", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "which": "./bin/which" } }, "sha512-HxJdYWq1MTIQbJ3nw0cqssHoTNU267KlrDuGZ1WYlxDStUtKUhOaJmh112/TZmHxxUfuJqPXSOm7tDyas0OSIQ=="],
"why-is-node-running": ["why-is-node-running@2.3.0", "", { "dependencies": { "siginfo": "^2.0.0", "stackback": "0.0.2" }, "bin": { "why-is-node-running": "cli.js" } }, "sha512-hUrmaWBdVDcxvYqnyh09zunKzROWjbZTiNy8dBEjkS7ehEDQibXJ7XvlmtbwuTclUiIyN+CyXQD4Vmko8fNm8w=="],
@@ -3418,6 +3442,8 @@
"@types/cors/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="],
"@types/fluent-ffmpeg/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="],
"@types/jsdom/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="],
"@types/node-fetch/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="],
@@ -3454,6 +3480,8 @@
"content-disposition/safe-buffer": ["safe-buffer@5.2.1", "", {}, "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ=="],
"cross-spawn/which": ["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="],
"dom-serializer/entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="],
"ecdsa-sig-formatter/safe-buffer": ["safe-buffer@5.2.1", "", {}, "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ=="],
@@ -3508,6 +3536,8 @@
"http-proxy-agent/agent-base": ["agent-base@7.1.4", "", {}, "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ=="],
"http-response-object/@types/node": ["@types/node@10.17.60", "", {}, "sha512-F0KIgDJfy2nA3zMLmWGKxcH2ZVEtCZXHHdOQs2gSaQ27+lNeEfGxzkIw90aXswATX7AZ33tahPbzy6KAfUreVw=="],
"inquirer/ora": ["ora@5.4.1", "", { "dependencies": { "bl": "^4.1.0", "chalk": "^4.1.0", "cli-cursor": "^3.1.0", "cli-spinners": "^2.5.0", "is-interactive": "^1.0.0", "is-unicode-supported": "^0.1.0", "log-symbols": "^4.1.0", "strip-ansi": "^6.0.0", "wcwidth": "^1.0.1" } }, "sha512-5b6Y85tPxZZ7QytO+BQzysW31HJku27cRIlkbAXaNx+BdcVi+LlRFmVXzeF6a7JCwJpyw5c4b+YSVImQIrBpuQ=="],
"isomorphic-unfetch/node-fetch": ["node-fetch@2.7.0", "", { "dependencies": { "whatwg-url": "^5.0.0" }, "peerDependencies": { "encoding": "^0.1.0" }, "optionalPeers": ["encoding"] }, "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A=="],
@@ -3766,6 +3796,8 @@
"@types/cors/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="],
"@types/fluent-ffmpeg/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="],
"@types/jsdom/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="],
"@types/node-fetch/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="],

View File

@@ -78,7 +78,8 @@ FROM base AS runner
WORKDIR /app
# Install Python and dependencies for guardrails PII detection (cached separately)
-RUN apk add --no-cache python3 py3-pip bash
+# Also install ffmpeg for audio/video processing in STT
+RUN apk add --no-cache python3 py3-pip bash ffmpeg
ENV NODE_ENV=production

View File

@@ -39,8 +39,11 @@
"@t3-oss/env-nextjs": "0.13.4",
"@tanstack/react-query": "5.90.8",
"@tanstack/react-query-devtools": "5.90.2",
"@types/fluent-ffmpeg": "2.1.28",
"cronstrue": "3.3.0",
"drizzle-orm": "^0.44.5",
"ffmpeg-static": "5.3.0",
"fluent-ffmpeg": "2.1.3",
"mongodb": "6.19.0",
"neo4j-driver": "6.0.1",
"onedollarstats": "0.0.10",
@@ -63,5 +66,8 @@
"*.{js,jsx,ts,tsx,json,css,scss}": [
"biome check --write --no-errors-on-unmatched --files-ignore-unknown=true"
]
}
},
"trustedDependencies": [
"ffmpeg-static"
]
}
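
Taken together, the ffmpeg additions (the apk-installed binary in the runner image, plus `ffmpeg-static` and `fluent-ffmpeg` in both package manifests) point at extracting an audio track from uploaded video before transcription. A minimal sketch of that wiring; the helper name, the `FFMPEG_PATH` env var, and the 16 kHz mono WAV target are assumptions, not taken from this commit:

```ts
import ffmpegStatic from 'ffmpeg-static'
import ffmpeg from 'fluent-ffmpeg'

// Prefer a system ffmpeg (e.g. the apk-installed binary in the Docker image);
// fall back to the binary bundled by ffmpeg-static for local development.
ffmpeg.setFfmpegPath(process.env.FFMPEG_PATH ?? ffmpegStatic ?? 'ffmpeg')

// Hypothetical helper: strip the video track and downmix to 16 kHz mono WAV,
// a common least-surprise input format for STT providers.
function extractAudio(inputPath: string, outputPath: string): Promise<string> {
  return new Promise((resolve, reject) => {
    ffmpeg(inputPath)
      .noVideo()
      .audioChannels(1)
      .audioFrequency(16000)
      .format('wav')
      .on('end', () => resolve(outputPath))
      .on('error', reject)
      .save(outputPath)
  })
}

// Usage (illustrative paths):
// await extractAudio('/tmp/upload.mp4', '/tmp/upload.wav')
```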