Mirror of https://github.com/simstudioai/sim.git (synced 2026-01-08 22:48:14 -05:00)
feat(tools): added speech to text with openai whisper, elevenlabs, and deepgram (#2068)
* feat(tools): added speech to text with openai whisper, elevenlabs, and deepgram
* added new file icons, implemented ffmpeg
* updated docs
* revert environment
@@ -8,6 +8,7 @@ import {
   ApolloIcon,
   ArxivIcon,
   AsanaIcon,
+  AudioWaveformIcon,
   BrainIcon,
   BrowserUseIcon,
   CalendlyIcon,
@@ -100,6 +101,7 @@ export const blockTypeToIconMap: Record<string, IconComponent> = {
   telegram: TelegramIcon,
   tavily: TavilyIcon,
   supabase: SupabaseIcon,
+  stt: AudioWaveformIcon,
   stripe: StripeIcon,
   stagehand_agent: StagehandIcon,
   stagehand: StagehandIcon,
@@ -10,6 +10,20 @@ import { BlockInfoCard } from "@/components/ui/block-info-card"
   color="#FFFFFF"
 />
 
+{/* MANUAL-CONTENT-START:intro */}
+[Calendly](https://calendly.com/) is a popular scheduling automation platform that helps you book meetings, events, and appointments with ease. With Calendly, teams and individuals can streamline scheduling, reduce back-and-forth emails, and automate tasks around events.
+
+With the Sim Calendly integration, your agents can:
+
+- **Retrieve information about your account and scheduled events**: Use tools to fetch user info, event types, and scheduled events for analysis or automation.
+- **Manage event types and scheduling**: Access and list available event types for users or organizations, retrieve details about specific event types, and monitor scheduled meetings and invitee data.
+- **Automate follow-ups and workflows**: When users schedule, reschedule, or cancel meetings, Sim agents can automatically trigger corresponding workflows—such as sending reminders, updating CRMs, or notifying participants.
+- **Integrate easily using webhooks**: Set up Sim workflows to respond to real-time Calendly webhook events, including when invitees schedule, cancel, or interact with routing forms.
+
+Whether you want to automate meeting prep, manage invites, or run custom workflows in response to scheduling activity, the Calendly tools in Sim give you flexible and secure access. Unlock new automation by reacting instantly to scheduling changes—streamlining your team's operations and communications.
+{/* MANUAL-CONTENT-END */}
+
+
 ## Usage Instructions
 
 Integrate Calendly into your workflow. Manage event types, scheduled events, invitees, and webhooks. Can also trigger workflows based on Calendly webhook events (invitee scheduled, invitee canceled, routing form submitted). Requires Personal Access Token.
@@ -61,6 +61,7 @@
   "stagehand",
   "stagehand_agent",
   "stripe",
+  "stt",
   "supabase",
   "tavily",
   "telegram",
apps/docs/content/docs/en/tools/stt.mdx (new file, 122 lines)
@@ -0,0 +1,122 @@
---
title: Speech-to-Text
description: Convert speech to text using AI
---

import { BlockInfoCard } from "@/components/ui/block-info-card"

<BlockInfoCard
  type="stt"
  color="#181C1E"
/>

{/* MANUAL-CONTENT-START:intro */}
Transcribe speech to text using state-of-the-art AI models from leading providers. The Sim Speech-to-Text (STT) tools allow you to convert audio and video files into accurate transcripts, supporting multiple languages, timestamps, and optional translation.

Supported providers:

- **[OpenAI Whisper](https://platform.openai.com/docs/guides/speech-to-text/overview)**: Advanced open-source STT model from OpenAI. Supports models such as `whisper-1` and handles a wide variety of languages and audio formats.
- **[Deepgram](https://deepgram.com/)**: Real-time and batch STT API with deep learning models like `nova-3`, `nova-2`, and `whisper-large`. Offers features like diarization, intent recognition, and industry-specific tuning.
- **[ElevenLabs](https://elevenlabs.io/)**: Known for high-quality speech AI, ElevenLabs provides STT models focused on accuracy and natural language understanding for numerous languages and dialects.

Choose the provider and model best suited to your task—whether fast, production-grade transcription (Deepgram), highly accurate multi-language capability (Whisper), or advanced understanding and language coverage (ElevenLabs).
{/* MANUAL-CONTENT-END */}


## Usage Instructions

Transcribe audio and video files to text using leading AI providers. Supports multiple languages, timestamps, and speaker diarization.


## Tools

### `stt_whisper`

Transcribe audio to text using OpenAI Whisper

#### Input

| Parameter | Type | Required | Description |
| --------- | ---- | -------- | ----------- |
| `provider` | string | Yes | STT provider \(whisper\) |
| `apiKey` | string | Yes | OpenAI API key |
| `model` | string | No | Whisper model to use \(default: whisper-1\) |
| `audioFile` | file | No | Audio or video file to transcribe |
| `audioFileReference` | file | No | Reference to audio/video file from previous blocks |
| `audioUrl` | string | No | URL to audio or video file |
| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection |
| `timestamps` | string | No | Timestamp granularity: none, sentence, or word |
| `translateToEnglish` | boolean | No | Translate audio to English |

#### Output

| Parameter | Type | Description |
| --------- | ---- | ----------- |
| `transcript` | string | Full transcribed text |
| `segments` | array | Timestamped segments |
| `language` | string | Detected or specified language |
| `duration` | number | Audio duration in seconds |
| `confidence` | number | Overall confidence score |

### `stt_deepgram`

Transcribe audio to text using Deepgram

#### Input

| Parameter | Type | Required | Description |
| --------- | ---- | -------- | ----------- |
| `provider` | string | Yes | STT provider \(deepgram\) |
| `apiKey` | string | Yes | Deepgram API key |
| `model` | string | No | Deepgram model to use \(nova-3, nova-2, whisper-large, etc.\) |
| `audioFile` | file | No | Audio or video file to transcribe |
| `audioFileReference` | file | No | Reference to audio/video file from previous blocks |
| `audioUrl` | string | No | URL to audio or video file |
| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection |
| `timestamps` | string | No | Timestamp granularity: none, sentence, or word |
| `diarization` | boolean | No | Enable speaker diarization |

#### Output

| Parameter | Type | Description |
| --------- | ---- | ----------- |
| `transcript` | string | Full transcribed text |
| `segments` | array | Timestamped segments with speaker labels |
| `language` | string | Detected or specified language |
| `duration` | number | Audio duration in seconds |
| `confidence` | number | Overall confidence score |

### `stt_elevenlabs`

Transcribe audio to text using ElevenLabs

#### Input

| Parameter | Type | Required | Description |
| --------- | ---- | -------- | ----------- |
| `provider` | string | Yes | STT provider \(elevenlabs\) |
| `apiKey` | string | Yes | ElevenLabs API key |
| `model` | string | No | ElevenLabs model to use \(scribe_v1, scribe_v1_experimental\) |
| `audioFile` | file | No | Audio or video file to transcribe |
| `audioFileReference` | file | No | Reference to audio/video file from previous blocks |
| `audioUrl` | string | No | URL to audio or video file |
| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection |
| `timestamps` | string | No | Timestamp granularity: none, sentence, or word |

#### Output

| Parameter | Type | Description |
| --------- | ---- | ----------- |
| `transcript` | string | Full transcribed text |
| `segments` | array | Timestamped segments |
| `language` | string | Detected or specified language |
| `duration` | number | Audio duration in seconds |
| `confidence` | number | Overall confidence score |


## Notes

- Category: `tools`
- Type: `stt`
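For orientation, here is a minimal sketch of a request to the proxy route that backs these tools. The path follows from the new file apps/sim/app/api/proxy/stt/route.ts, and the body fields mirror the documented parameters; the values are placeholders, and the route additionally enforces Sim's own session/workflow authentication, which is omitted here.

```ts
// Sketch only: placeholder values, not a complete client implementation.
const res = await fetch('/api/proxy/stt', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    provider: 'whisper',              // 'whisper' | 'deepgram' | 'elevenlabs'
    apiKey: '<provider API key>',     // placeholder
    model: 'whisper-1',
    audioUrl: 'https://example.com/meeting.mp3',
    language: 'auto',
    timestamps: 'sentence',
  }),
})
const { transcript, segments, language, duration, confidence } = await res.json()
```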
@@ -13,21 +13,37 @@ import {
 } from '@/app/api/files/utils'
 
 const ALLOWED_EXTENSIONS = new Set([
+  // Documents
   'pdf',
   'doc',
   'docx',
   'txt',
   'md',
-  'png',
-  'jpg',
-  'jpeg',
-  'gif',
   'csv',
   'xlsx',
   'xls',
   'json',
   'yaml',
   'yml',
+  // Images
+  'png',
+  'jpg',
+  'jpeg',
+  'gif',
+  // Audio
+  'mp3',
+  'm4a',
+  'wav',
+  'webm',
+  'ogg',
+  'flac',
+  'aac',
+  'opus',
+  // Video
+  'mp4',
+  'mov',
+  'avi',
+  'mkv',
 ])
 
 function validateFileExtension(filename: string): boolean {
apps/sim/app/api/proxy/stt/route.ts (new file, 375 lines)
@@ -0,0 +1,375 @@
import { type NextRequest, NextResponse } from 'next/server'
import { extractAudioFromVideo, isVideoFile } from '@/lib/audio/extractor'
import { checkHybridAuth } from '@/lib/auth/hybrid'
import { createLogger } from '@/lib/logs/console/logger'
import { downloadFileFromStorage } from '@/lib/uploads/utils/file-utils.server'
import type { UserFile } from '@/executor/types'
import type { TranscriptSegment } from '@/tools/stt/types'

const logger = createLogger('SttProxyAPI')

export const dynamic = 'force-dynamic'
export const maxDuration = 300 // 5 minutes for large files

interface SttRequestBody {
  provider: 'whisper' | 'deepgram' | 'elevenlabs'
  apiKey: string
  model?: string
  audioFile?: UserFile | UserFile[]
  audioFileReference?: UserFile | UserFile[]
  audioUrl?: string
  language?: string
  timestamps?: 'none' | 'sentence' | 'word'
  diarization?: boolean
  translateToEnglish?: boolean
  workspaceId?: string
  workflowId?: string
  executionId?: string
}

export async function POST(request: NextRequest) {
  const requestId = crypto.randomUUID()
  logger.info(`[${requestId}] STT transcription request started`)

  try {
    const authResult = await checkHybridAuth(request, { requireWorkflowId: false })
    if (!authResult.success) {
      return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
    }

    const body: SttRequestBody = await request.json()
    const { provider, apiKey, model, language, timestamps, diarization, translateToEnglish } = body

    if (!provider || !apiKey) {
      return NextResponse.json(
        { error: 'Missing required fields: provider and apiKey' },
        { status: 400 }
      )
    }

    let audioBuffer: Buffer
    let audioFileName: string
    let audioMimeType: string

    if (body.audioFile) {
      const file = Array.isArray(body.audioFile) ? body.audioFile[0] : body.audioFile
      logger.info(`[${requestId}] Processing uploaded file: ${file.name}`)

      audioBuffer = await downloadFileFromStorage(file, requestId, logger)
      audioFileName = file.name
      audioMimeType = file.type
    } else if (body.audioFileReference) {
      const file = Array.isArray(body.audioFileReference)
        ? body.audioFileReference[0]
        : body.audioFileReference
      logger.info(`[${requestId}] Processing referenced file: ${file.name}`)

      audioBuffer = await downloadFileFromStorage(file, requestId, logger)
      audioFileName = file.name
      audioMimeType = file.type
    } else if (body.audioUrl) {
      logger.info(`[${requestId}] Downloading from URL: ${body.audioUrl}`)

      const response = await fetch(body.audioUrl)
      if (!response.ok) {
        throw new Error(`Failed to download audio from URL: ${response.statusText}`)
      }

      const arrayBuffer = await response.arrayBuffer()
      audioBuffer = Buffer.from(arrayBuffer)
      audioFileName = body.audioUrl.split('/').pop() || 'audio_file'
      audioMimeType = response.headers.get('content-type') || 'audio/mpeg'
    } else {
      return NextResponse.json(
        { error: 'No audio source provided. Provide audioFile, audioFileReference, or audioUrl' },
        { status: 400 }
      )
    }

    if (isVideoFile(audioMimeType)) {
      logger.info(`[${requestId}] Extracting audio from video file`)
      try {
        const extracted = await extractAudioFromVideo(audioBuffer, audioMimeType, {
          outputFormat: 'mp3',
          sampleRate: 16000,
          channels: 1,
        })
        audioBuffer = extracted.buffer
        audioMimeType = 'audio/mpeg'
        audioFileName = audioFileName.replace(/\.[^.]+$/, '.mp3')
      } catch (error) {
        logger.error(`[${requestId}] Video extraction failed:`, error)
        return NextResponse.json(
          {
            error: `Failed to extract audio from video: ${error instanceof Error ? error.message : 'Unknown error'}`,
          },
          { status: 500 }
        )
      }
    }

    logger.info(`[${requestId}] Transcribing with ${provider}, file: ${audioFileName}`)

    let transcript: string
    let segments: TranscriptSegment[] | undefined
    let detectedLanguage: string | undefined
    let duration: number | undefined
    let confidence: number | undefined

    try {
      if (provider === 'whisper') {
        const result = await transcribeWithWhisper(
          audioBuffer,
          apiKey,
          language,
          timestamps,
          translateToEnglish,
          model
        )
        transcript = result.transcript
        segments = result.segments
        detectedLanguage = result.language
        duration = result.duration
      } else if (provider === 'deepgram') {
        const result = await transcribeWithDeepgram(
          audioBuffer,
          apiKey,
          language,
          timestamps,
          diarization,
          model
        )
        transcript = result.transcript
        segments = result.segments
        detectedLanguage = result.language
        duration = result.duration
        confidence = result.confidence
      } else if (provider === 'elevenlabs') {
        const result = await transcribeWithElevenLabs(
          audioBuffer,
          apiKey,
          language,
          timestamps,
          model
        )
        transcript = result.transcript
        segments = result.segments
        detectedLanguage = result.language
        duration = result.duration
      } else {
        return NextResponse.json({ error: `Unknown provider: ${provider}` }, { status: 400 })
      }
    } catch (error) {
      logger.error(`[${requestId}] Transcription failed:`, error)
      const errorMessage = error instanceof Error ? error.message : 'Transcription failed'
      return NextResponse.json({ error: errorMessage }, { status: 500 })
    }

    logger.info(`[${requestId}] Transcription completed successfully`)

    return NextResponse.json({
      transcript,
      segments,
      language: detectedLanguage,
      duration,
      confidence,
    })
  } catch (error) {
    logger.error(`[${requestId}] STT proxy error:`, error)
    const errorMessage = error instanceof Error ? error.message : 'Unknown error'
    return NextResponse.json({ error: errorMessage }, { status: 500 })
  }
}

async function transcribeWithWhisper(
  audioBuffer: Buffer,
  apiKey: string,
  language?: string,
  timestamps?: 'none' | 'sentence' | 'word',
  translate?: boolean,
  model?: string
): Promise<{
  transcript: string
  segments?: TranscriptSegment[]
  language?: string
  duration?: number
}> {
  const formData = new FormData()

  const blob = new Blob([new Uint8Array(audioBuffer)], { type: 'audio/mpeg' })
  formData.append('file', blob, 'audio.mp3')
  formData.append('model', model || 'whisper-1')

  if (language && language !== 'auto') {
    formData.append('language', language)
  }

  if (timestamps === 'word') {
    formData.append('response_format', 'verbose_json')
    formData.append('timestamp_granularities[]', 'word')
  } else if (timestamps === 'sentence') {
    formData.append('response_format', 'verbose_json')
    formData.append('timestamp_granularities[]', 'segment')
  }

  const endpoint = translate ? 'translations' : 'transcriptions'
  const response = await fetch(`https://api.openai.com/v1/audio/${endpoint}`, {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${apiKey}`,
    },
    body: formData,
  })

  if (!response.ok) {
    const error = await response.json()
    const errorMessage = error.error?.message || error.message || JSON.stringify(error)
    throw new Error(`Whisper API error: ${errorMessage}`)
  }

  const data = await response.json()

  if (timestamps === 'none') {
    return {
      transcript: data.text,
      language: data.language,
    }
  }

  const segments: TranscriptSegment[] = (data.segments || data.words || []).map((seg: any) => ({
    text: seg.text,
    start: seg.start,
    end: seg.end,
  }))

  return {
    transcript: data.text,
    segments,
    language: data.language,
    duration: data.duration,
  }
}

async function transcribeWithDeepgram(
  audioBuffer: Buffer,
  apiKey: string,
  language?: string,
  timestamps?: 'none' | 'sentence' | 'word',
  diarization?: boolean,
  model?: string
): Promise<{
  transcript: string
  segments?: TranscriptSegment[]
  language?: string
  duration?: number
  confidence?: number
}> {
  const params = new URLSearchParams({
    model: model || 'nova-3',
    smart_format: 'true',
    punctuate: 'true',
  })

  if (language && language !== 'auto') {
    params.append('language', language)
  }

  if (timestamps !== 'none') {
    params.append('utterances', 'true')
  }

  if (diarization) {
    params.append('diarize', 'true')
  }

  const response = await fetch(`https://api.deepgram.com/v1/listen?${params.toString()}`, {
    method: 'POST',
    headers: {
      Authorization: `Token ${apiKey}`,
      'Content-Type': 'audio/mpeg',
    },
    body: new Uint8Array(audioBuffer),
  })

  if (!response.ok) {
    const error = await response.json()
    const errorMessage = error.err_msg || error.message || JSON.stringify(error)
    throw new Error(`Deepgram API error: ${errorMessage}`)
  }

  const data = await response.json()
  const result = data.results?.channels?.[0]?.alternatives?.[0]

  if (!result) {
    throw new Error('No transcription result from Deepgram')
  }

  const transcript = result.transcript
  const detectedLanguage = data.results?.channels?.[0]?.detected_language
  const confidence = result.confidence

  let segments: TranscriptSegment[] | undefined
  if (timestamps !== 'none' && result.words) {
    segments = result.words.map((word: any) => ({
      text: word.word,
      start: word.start,
      end: word.end,
      speaker: word.speaker !== undefined ? `Speaker ${word.speaker}` : undefined,
      confidence: word.confidence,
    }))
  }

  return {
    transcript,
    segments,
    language: detectedLanguage,
    duration: data.metadata?.duration,
    confidence,
  }
}

async function transcribeWithElevenLabs(
  audioBuffer: Buffer,
  apiKey: string,
  language?: string,
  timestamps?: 'none' | 'sentence' | 'word',
  model?: string
): Promise<{
  transcript: string
  segments?: TranscriptSegment[]
  language?: string
  duration?: number
}> {
  const formData = new FormData()
  const blob = new Blob([new Uint8Array(audioBuffer)], { type: 'audio/mpeg' })
  formData.append('file', blob, 'audio.mp3')
  formData.append('model_id', model || 'scribe_v1')

  if (language && language !== 'auto') {
    formData.append('language', language)
  }

  const response = await fetch('https://api.elevenlabs.io/v1/speech-to-text', {
    method: 'POST',
    headers: {
      'xi-api-key': apiKey,
    },
    body: formData,
  })

  if (!response.ok) {
    const error = await response.json()
    const errorMessage =
      typeof error.detail === 'string'
        ? error.detail
        : error.detail?.message || error.message || JSON.stringify(error)
    throw new Error(`ElevenLabs API error: ${errorMessage}`)
  }

  const data = await response.json()

  return {
    transcript: data.text || '',
    language: data.language,
    duration: data.duration,
  }
}
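The route maps each provider's response into TranscriptSegment objects imported from @/tools/stt/types, a file not included in this diff. Judging only from how the mappers above populate segments, the shape is presumably along these lines (an inferred sketch, not the actual definition):

```ts
// Assumed shape of TranscriptSegment, inferred from the mapping code above.
interface TranscriptSegment {
  text: string
  start: number        // segment start time in seconds
  end: number          // segment end time in seconds
  speaker?: string     // e.g. "Speaker 0" when Deepgram diarization is enabled
  confidence?: number  // per-word/segment confidence, when the provider reports one
}
```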
@@ -144,6 +144,62 @@ export const TxtIcon: React.FC<IconProps> = ({ className = 'w-6 h-6' }) => (
   </svg>
 )
 
+export const AudioIcon: React.FC<IconProps> = ({ className = 'w-6 h-6' }) => (
+  <svg viewBox='0 0 24 24' fill='none' xmlns='http://www.w3.org/2000/svg' className={className}>
+    <path
+      d='M14 2H6C4.9 2 4 2.9 4 4V20C4 21.1 4.9 22 6 22H18C19.1 22 20 21.1 20 20V8L14 2Z'
+      fill='#0288D1'
+    />
+    <path d='M14 2V8H20' fill='#29B6F6' />
+    <path
+      d='M14 2L20 8V20C20 21.1 19.1 22 18 22H6C4.9 22 4 21.1 4 20V4C4 2.9 4.9 2 6 2H14Z'
+      stroke='#01579B'
+      strokeWidth='0.5'
+      strokeLinecap='round'
+      strokeLinejoin='round'
+    />
+    {/* Speaker icon */}
+    <path d='M8.5 10.5v3c0 .28.22.5.5.5h1.5l2 2V8l-2 2H9c-.28 0-.5.22-.5.5z' fill='white' />
+    {/* Sound waves */}
+    <path
+      d='M14 10.5c.6.6.6 1.4 0 2M15.5 9c1.2 1.2 1.2 3.8 0 5'
+      stroke='white'
+      strokeWidth='0.8'
+      strokeLinecap='round'
+    />
+  </svg>
+)
+
+export const VideoIcon: React.FC<IconProps> = ({ className = 'w-6 h-6' }) => (
+  <svg viewBox='0 0 24 24' fill='none' xmlns='http://www.w3.org/2000/svg' className={className}>
+    <path
+      d='M14 2H6C4.9 2 4 2.9 4 4V20C4 21.1 4.9 22 6 22H18C19.1 22 20 21.1 20 20V8L14 2Z'
+      fill='#D32F2F'
+    />
+    <path d='M14 2V8H20' fill='#EF5350' />
+    <path
+      d='M14 2L20 8V20C20 21.1 19.1 22 18 22H6C4.9 22 4 21.1 4 20V4C4 2.9 4.9 2 6 2H14Z'
+      stroke='#B71C1C'
+      strokeWidth='0.5'
+      strokeLinecap='round'
+      strokeLinejoin='round'
+    />
+    {/* Video screen */}
+    <rect
+      x='7.5'
+      y='9.5'
+      width='9'
+      height='6'
+      rx='0.5'
+      stroke='white'
+      strokeWidth='0.8'
+      fill='none'
+    />
+    {/* Play button */}
+    <path d='M10.5 11.5l3 2-3 2v-4z' fill='white' />
+  </svg>
+)
+
 export const DefaultFileIcon: React.FC<IconProps> = ({ className = 'w-6 h-6' }) => (
   <svg viewBox='0 0 24 24' fill='none' xmlns='http://www.w3.org/2000/svg' className={className}>
     <path
@@ -164,13 +220,23 @@ export const DefaultFileIcon: React.FC<IconProps> = ({ className = 'w-6 h-6' })
   </svg>
 )
 
-// Helper function to get the appropriate icon component
 export function getDocumentIcon(mimeType: string, filename: string): React.FC<IconProps> {
   const extension = filename.split('.').pop()?.toLowerCase()
 
+  const audioExtensions = ['mp3', 'm4a', 'wav', 'webm', 'ogg', 'flac', 'aac', 'opus']
+  if (mimeType.startsWith('audio/') || (extension && audioExtensions.includes(extension))) {
+    return AudioIcon
+  }
+
+  const videoExtensions = ['mp4', 'mov', 'avi', 'mkv']
+  if (mimeType.startsWith('video/') || (extension && videoExtensions.includes(extension))) {
+    return VideoIcon
+  }
+
   if (mimeType === 'application/pdf' || extension === 'pdf') {
     return PdfIcon
   }
 
   if (
     mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
     mimeType === 'application/msword' ||
@@ -179,6 +245,7 @@ export function getDocumentIcon(mimeType: string, filename: string): React.FC<Ic
   ) {
     return DocxIcon
   }
+
   if (
     mimeType === 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' ||
     mimeType === 'application/vnd.ms-excel' ||
@@ -187,11 +254,14 @@ export function getDocumentIcon(mimeType: string, filename: string): React.FC<Ic
   ) {
     return XlsxIcon
  }
+
   if (mimeType === 'text/csv' || extension === 'csv') {
     return CsvIcon
   }
+
   if (mimeType === 'text/plain' || extension === 'txt') {
     return TxtIcon
   }
+
   return DefaultFileIcon
 }
@@ -148,21 +148,29 @@ export function FileUpload({
   const maxSizeInBytes = maxSize * 1024 * 1024
   const validFiles: File[] = []
   let totalNewSize = 0
+  let sizeExceededFile: string | null = null
 
   for (let i = 0; i < files.length; i++) {
     const file = files[i]
     if (existingTotalSize + totalNewSize + file.size > maxSizeInBytes) {
-      logger.error(
-        `Adding ${file.name} would exceed the maximum size limit of ${maxSize}MB`,
-        activeWorkflowId
-      )
+      const errorMessage = `Adding ${file.name} would exceed the maximum size limit of ${maxSize}MB`
+      logger.error(errorMessage, activeWorkflowId)
+      if (!sizeExceededFile) {
+        sizeExceededFile = errorMessage
+      }
     } else {
       validFiles.push(file)
       totalNewSize += file.size
     }
   }
 
-  if (validFiles.length === 0) return
+  if (validFiles.length === 0) {
+    if (sizeExceededFile) {
+      setUploadError(sizeExceededFile)
+      setTimeout(() => setUploadError(null), 5000)
+    }
+    return
+  }
 
   const uploading = validFiles.map((file) => ({
     id: `upload-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`,
@@ -32,6 +32,7 @@ const logger = createLogger('FileUploadsSettings')
 const isBillingEnabled = isTruthy(getEnv('NEXT_PUBLIC_BILLING_ENABLED'))
 
 const SUPPORTED_EXTENSIONS = [
+  // Documents
   'pdf',
   'csv',
   'doc',
@@ -47,9 +48,23 @@ const SUPPORTED_EXTENSIONS = [
   'json',
   'yaml',
   'yml',
+  // Audio formats
+  'mp3',
+  'm4a',
+  'wav',
+  'webm',
+  'ogg',
+  'flac',
+  'aac',
+  'opus',
+  // Video formats
+  'mp4',
+  'mov',
+  'avi',
+  'mkv',
 ] as const
 const ACCEPT_ATTR =
-  '.pdf,.csv,.doc,.docx,.txt,.md,.xlsx,.xls,.html,.htm,.pptx,.ppt,.json,.yaml,.yml'
+  '.pdf,.csv,.doc,.docx,.txt,.md,.xlsx,.xls,.html,.htm,.pptx,.ppt,.json,.yaml,.yml,.mp3,.m4a,.wav,.webm,.ogg,.flac,.aac,.opus,.mp4,.mov,.avi,.mkv'
 
 export function Files() {
   const params = useParams()
apps/sim/blocks/blocks/stt.ts (new file, 232 lines)
@@ -0,0 +1,232 @@
import { AudioWaveformIcon } from '@/components/icons'
import { AuthMode, type BlockConfig } from '@/blocks/types'
import type { SttBlockResponse } from '@/tools/stt/types'

export const SttBlock: BlockConfig<SttBlockResponse> = {
  type: 'stt',
  name: 'Speech-to-Text',
  description: 'Convert speech to text using AI',
  authMode: AuthMode.ApiKey,
  longDescription:
    'Transcribe audio and video files to text using leading AI providers. Supports multiple languages, timestamps, and speaker diarization.',
  docsLink: 'https://docs.sim.ai/tools/stt',
  category: 'tools',
  bgColor: '#181C1E',
  icon: AudioWaveformIcon,

  subBlocks: [
    // Provider selection
    {
      id: 'provider',
      title: 'Provider',
      type: 'dropdown',
      options: [
        { label: 'OpenAI Whisper', id: 'whisper' },
        { label: 'Deepgram', id: 'deepgram' },
        { label: 'ElevenLabs', id: 'elevenlabs' },
      ],
      value: () => 'whisper',
      required: true,
    },

    // OpenAI Whisper model selection
    {
      id: 'model',
      title: 'Model',
      type: 'dropdown',
      condition: { field: 'provider', value: 'whisper' },
      options: [{ label: 'Whisper-1', id: 'whisper-1' }],
      value: () => 'whisper-1',
      required: false,
    },

    // ElevenLabs model selection
    {
      id: 'model',
      title: 'Model',
      type: 'dropdown',
      condition: { field: 'provider', value: 'elevenlabs' },
      options: [
        { label: 'Scribe v1', id: 'scribe_v1' },
        { label: 'Scribe v1 Experimental', id: 'scribe_v1_experimental' },
      ],
      value: () => 'scribe_v1',
      required: false,
    },

    // Deepgram model selection
    {
      id: 'model',
      title: 'Model',
      type: 'dropdown',
      condition: { field: 'provider', value: 'deepgram' },
      options: [
        { label: 'Nova 3', id: 'nova-3' },
        { label: 'Nova 2', id: 'nova-2' },
        { label: 'Nova', id: 'nova' },
        { label: 'Whisper Large', id: 'whisper-large' },
        { label: 'Enhanced', id: 'enhanced' },
        { label: 'Base', id: 'base' },
      ],
      value: () => 'nova-3',
      required: false,
    },

    // Audio/Video file upload (basic mode)
    {
      id: 'audioFile',
      title: 'Audio/Video File',
      type: 'file-upload',
      canonicalParamId: 'audioFile',
      placeholder: 'Upload an audio or video file',
      mode: 'basic',
      multiple: false,
      required: false,
      acceptedTypes: '.mp3,.m4a,.wav,.webm,.ogg,.flac,.aac,.opus,.mp4,.mov,.avi,.mkv',
    },

    // Audio file reference (advanced mode)
    {
      id: 'audioFileReference',
      title: 'Audio/Video File Reference',
      type: 'short-input',
      canonicalParamId: 'audioFile',
      placeholder: 'Reference audio/video from previous blocks',
      mode: 'advanced',
      required: false,
    },

    // Audio URL (alternative)
    {
      id: 'audioUrl',
      title: 'Audio/Video URL (alternative)',
      type: 'short-input',
      placeholder: 'Or enter publicly accessible audio/video URL',
      required: false,
    },

    // Language selection
    {
      id: 'language',
      title: 'Language',
      type: 'dropdown',
      options: [
        { label: 'Auto-detect', id: 'auto' },
        { label: 'English', id: 'en' },
        { label: 'Spanish', id: 'es' },
        { label: 'French', id: 'fr' },
        { label: 'German', id: 'de' },
        { label: 'Italian', id: 'it' },
        { label: 'Portuguese', id: 'pt' },
        { label: 'Dutch', id: 'nl' },
        { label: 'Russian', id: 'ru' },
        { label: 'Chinese', id: 'zh' },
        { label: 'Japanese', id: 'ja' },
        { label: 'Korean', id: 'ko' },
        { label: 'Arabic', id: 'ar' },
        { label: 'Hindi', id: 'hi' },
        { label: 'Polish', id: 'pl' },
        { label: 'Turkish', id: 'tr' },
        { label: 'Swedish', id: 'sv' },
        { label: 'Danish', id: 'da' },
        { label: 'Norwegian', id: 'no' },
        { label: 'Finnish', id: 'fi' },
      ],
      value: () => 'auto',
    },

    // Timestamps (word-level, sentence-level, or none)
    {
      id: 'timestamps',
      title: 'Timestamps',
      type: 'dropdown',
      options: [
        { label: 'None', id: 'none' },
        { label: 'Sentence-level', id: 'sentence' },
        { label: 'Word-level', id: 'word' },
      ],
      value: () => 'none',
    },

    // Speaker diarization (Deepgram/AssemblyAI only)
    {
      id: 'diarization',
      title: 'Speaker Diarization',
      type: 'switch',
      condition: { field: 'provider', value: ['deepgram'] },
    },

    // Translate to English (Whisper only)
    {
      id: 'translateToEnglish',
      title: 'Translate to English',
      type: 'switch',
      condition: { field: 'provider', value: 'whisper' },
    },

    // API Key
    {
      id: 'apiKey',
      title: 'API Key',
      type: 'short-input',
      placeholder: 'Enter your API key',
      password: true,
      required: true,
    },
  ],

  tools: {
    access: ['stt_whisper', 'stt_deepgram', 'stt_elevenlabs'],
    config: {
      tool: (params) => {
        // Select tool based on provider
        switch (params.provider) {
          case 'whisper':
            return 'stt_whisper'
          case 'deepgram':
            return 'stt_deepgram'
          case 'elevenlabs':
            return 'stt_elevenlabs'
          default:
            return 'stt_whisper'
        }
      },
      params: (params) => ({
        provider: params.provider,
        apiKey: params.apiKey,
        model: params.model,
        audioFile: params.audioFile,
        audioFileReference: params.audioFileReference,
        audioUrl: params.audioUrl,
        language: params.language,
        timestamps: params.timestamps,
        diarization: params.diarization,
        translateToEnglish: params.translateToEnglish,
      }),
    },
  },

  inputs: {
    provider: { type: 'string', description: 'STT provider (whisper, deepgram, elevenlabs)' },
    apiKey: { type: 'string', description: 'Provider API key' },
    model: {
      type: 'string',
      description: 'Provider-specific model (e.g., scribe_v1 for ElevenLabs, nova-3 for Deepgram)',
    },
    audioFile: { type: 'json', description: 'Audio/video file (UserFile)' },
    audioFileReference: { type: 'json', description: 'Audio/video file reference' },
    audioUrl: { type: 'string', description: 'Audio/video URL' },
    language: { type: 'string', description: 'Language code or auto' },
    timestamps: { type: 'string', description: 'Timestamp granularity (none, sentence, word)' },
    diarization: { type: 'boolean', description: 'Enable speaker diarization' },
    translateToEnglish: { type: 'boolean', description: 'Translate to English (Whisper only)' },
  },

  outputs: {
    transcript: { type: 'string', description: 'Full transcribed text' },
    segments: { type: 'array', description: 'Timestamped segments with speaker labels' },
    language: { type: 'string', description: 'Detected or specified language' },
    duration: { type: 'number', description: 'Audio duration in seconds' },
    confidence: { type: 'number', description: 'Overall confidence score' },
  },
}
@@ -77,6 +77,7 @@ import { StagehandAgentBlock } from '@/blocks/blocks/stagehand_agent'
 import { StartTriggerBlock } from '@/blocks/blocks/start_trigger'
 import { StarterBlock } from '@/blocks/blocks/starter'
 import { StripeBlock } from '@/blocks/blocks/stripe'
+import { SttBlock } from '@/blocks/blocks/stt'
 import { SupabaseBlock } from '@/blocks/blocks/supabase'
 import { TavilyBlock } from '@/blocks/blocks/tavily'
 import { TelegramBlock } from '@/blocks/blocks/telegram'
@@ -177,6 +178,7 @@ export const registry: Record<string, BlockConfig> = {
   stagehand_agent: StagehandAgentBlock,
   slack: SlackBlock,
   starter: StarterBlock,
+  stt: SttBlock,
   start_trigger: StartTriggerBlock,
   input_trigger: InputTriggerBlock,
   chat_trigger: ChatTriggerBlock,
@@ -4084,3 +4084,27 @@ export function CalendlyIcon(props: SVGProps<SVGSVGElement>) {
     </svg>
   )
 }
+
+export function AudioWaveformIcon(props: SVGProps<SVGSVGElement>) {
+  return (
+    <svg
+      {...props}
+      xmlns='http://www.w3.org/2000/svg'
+      width='24'
+      height='24'
+      viewBox='0 0 24 24'
+      fill='none'
+      stroke='currentColor'
+      strokeWidth='2'
+      strokeLinecap='round'
+      strokeLinejoin='round'
+    >
+      <path d='M2 10v3' />
+      <path d='M6 6v11' />
+      <path d='M10 3v18' />
+      <path d='M14 8v7' />
+      <path d='M18 5v13' />
+      <path d='M22 10v3' />
+    </svg>
+  )
+}
apps/sim/lib/audio/extractor.ts (new file, 294 lines)
@@ -0,0 +1,294 @@
import { execSync } from 'node:child_process'
import fs from 'node:fs/promises'
import os from 'node:os'
import path from 'node:path'
import ffmpegStatic from 'ffmpeg-static'
import ffmpeg from 'fluent-ffmpeg'
import type {
  AudioExtractionOptions,
  AudioExtractionResult,
  AudioMetadata,
} from '@/lib/audio/types'

// Set ffmpeg binary path with fallback to system ffmpeg
try {
  if (ffmpegStatic && typeof ffmpegStatic === 'string') {
    ffmpeg.setFfmpegPath(ffmpegStatic)
  } else {
    // Try to find system ffmpeg
    try {
      const systemFfmpeg = execSync('which ffmpeg', { encoding: 'utf-8' }).trim()
      if (systemFfmpeg) {
        ffmpeg.setFfmpegPath(systemFfmpeg)
        console.log('[FFmpeg] Using system ffmpeg:', systemFfmpeg)
      }
    } catch {
      console.warn(
        '[FFmpeg] ffmpeg-static not available and system ffmpeg not found. Please install ffmpeg: brew install ffmpeg (macOS) or apt-get install ffmpeg (Linux)'
      )
    }
  }
} catch (error) {
  console.warn('[FFmpeg] Failed to set ffmpeg path:', error)
}

/**
 * Extract audio from video or convert audio format using FFmpeg
 */
export async function extractAudioFromVideo(
  inputBuffer: Buffer,
  mimeType: string,
  options: AudioExtractionOptions = {}
): Promise<AudioExtractionResult> {
  const isVideo = mimeType.startsWith('video/')
  const isAudio = mimeType.startsWith('audio/')

  // If it's already audio and no conversion needed, get metadata and return
  if (isAudio && !options.outputFormat) {
    try {
      const metadata = await getAudioMetadata(inputBuffer, mimeType)
      return {
        buffer: inputBuffer,
        format: mimeType.split('/')[1] || 'unknown',
        duration: metadata.duration || 0,
        size: inputBuffer.length,
      }
    } catch (error) {
      // If metadata extraction fails, still return the buffer
      return {
        buffer: inputBuffer,
        format: mimeType.split('/')[1] || 'unknown',
        duration: 0,
        size: inputBuffer.length,
      }
    }
  }

  // For video or audio conversion, use ffmpeg
  if (isVideo || options.outputFormat) {
    return await convertAudioWithFFmpeg(inputBuffer, mimeType, options)
  }

  // Fallback
  return {
    buffer: inputBuffer,
    format: options.outputFormat || mimeType.split('/')[1] || 'unknown',
    duration: 0,
    size: inputBuffer.length,
  }
}

/**
 * Convert audio/video using FFmpeg
 */
async function convertAudioWithFFmpeg(
  inputBuffer: Buffer,
  mimeType: string,
  options: AudioExtractionOptions
): Promise<AudioExtractionResult> {
  // Create temporary files
  const tempDir = os.tmpdir()
  const inputExt = getExtensionFromMimeType(mimeType)
  const outputFormat = options.outputFormat || 'mp3'
  const inputFile = path.join(tempDir, `ffmpeg-input-${Date.now()}.${inputExt}`)
  const outputFile = path.join(tempDir, `ffmpeg-output-${Date.now()}.${outputFormat}`)

  try {
    // Write input buffer to temporary file
    await fs.writeFile(inputFile, inputBuffer)

    // Get metadata for duration
    let duration = 0
    try {
      const metadata = await getAudioMetadataFromFile(inputFile)
      duration = metadata.duration || 0
    } catch (error) {
      // Metadata extraction failed, continue without duration
      console.warn('Failed to extract metadata:', error)
    }

    // Convert using FFmpeg
    await new Promise<void>((resolve, reject) => {
      let command = ffmpeg(inputFile).toFormat(outputFormat).audioCodec(getAudioCodec(outputFormat))

      // Apply audio options
      if (options.channels) {
        command = command.audioChannels(options.channels)
      }
      if (options.sampleRate) {
        command = command.audioFrequency(options.sampleRate)
      }
      if (options.bitrate) {
        command = command.audioBitrate(options.bitrate)
      }

      command
        .on('end', () => resolve())
        .on('error', (err) => reject(new Error(`FFmpeg error: ${err.message}`)))
        .save(outputFile)
    })

    // Read output file
    const outputBuffer = await fs.readFile(outputFile)

    return {
      buffer: outputBuffer,
      format: outputFormat,
      duration,
      size: outputBuffer.length,
    }
  } finally {
    // Clean up temporary files
    try {
      await fs.unlink(inputFile).catch(() => {})
      await fs.unlink(outputFile).catch(() => {})
    } catch (error) {
      // Ignore cleanup errors
    }
  }
}

/**
 * Get audio metadata using ffprobe
 */
export async function getAudioMetadata(buffer: Buffer, mimeType: string): Promise<AudioMetadata> {
  const tempDir = os.tmpdir()
  const inputExt = getExtensionFromMimeType(mimeType)
  const inputFile = path.join(tempDir, `ffprobe-input-${Date.now()}.${inputExt}`)

  try {
    // Write buffer to temporary file
    await fs.writeFile(inputFile, buffer)

    // Get metadata using ffprobe
    return await getAudioMetadataFromFile(inputFile)
  } finally {
    // Clean up temporary file
    try {
      await fs.unlink(inputFile).catch(() => {})
    } catch (error) {
      // Ignore cleanup errors
    }
  }
}

/**
 * Get audio metadata from a file path using ffprobe
 */
async function getAudioMetadataFromFile(filePath: string): Promise<AudioMetadata> {
  return new Promise((resolve, reject) => {
    ffmpeg.ffprobe(filePath, (err, metadata) => {
      if (err) {
        reject(new Error(`FFprobe error: ${err.message}`))
        return
      }

      const audioStream = metadata.streams.find((s) => s.codec_type === 'audio')
      const format = metadata.format

      resolve({
        duration: format.duration || 0,
        format: format.format_name || 'unknown',
        codec: audioStream?.codec_name,
        sampleRate: audioStream?.sample_rate,
        channels: audioStream?.channels,
        bitrate: format.bit_rate ? Number(format.bit_rate) : undefined,
      })
    })
  })
}

/**
 * Get file extension from MIME type
 */
function getExtensionFromMimeType(mimeType: string): string {
  const mimeToExt: Record<string, string> = {
    // Video
    'video/mp4': 'mp4',
    'video/quicktime': 'mov',
    'video/x-msvideo': 'avi',
    'video/x-matroska': 'mkv',
    'video/webm': 'webm',
    // Audio
    'audio/mpeg': 'mp3',
    'audio/mp4': 'm4a',
    'audio/wav': 'wav',
    'audio/webm': 'webm',
    'audio/ogg': 'ogg',
    'audio/flac': 'flac',
    'audio/aac': 'aac',
    'audio/opus': 'opus',
  }

  return mimeToExt[mimeType] || mimeType.split('/')[1] || 'dat'
}

/**
 * Get appropriate audio codec for output format
 */
function getAudioCodec(format: string): string {
  const codecMap: Record<string, string> = {
    mp3: 'libmp3lame',
    wav: 'pcm_s16le',
    flac: 'flac',
    m4a: 'aac',
    aac: 'aac',
    ogg: 'libvorbis',
    opus: 'libopus',
  }

  return codecMap[format] || 'libmp3lame'
}

/**
 * Check if a file is a video file
 */
export function isVideoFile(mimeType: string): boolean {
  return mimeType.startsWith('video/')
}

/**
 * Check if a file is an audio file
 */
export function isAudioFile(mimeType: string): boolean {
  return mimeType.startsWith('audio/')
}

/**
 * Get optimal audio format for STT provider
 */
export function getOptimalFormat(provider: 'whisper' | 'deepgram' | 'elevenlabs'): {
  format: 'mp3' | 'wav' | 'flac'
  sampleRate: number
  channels: 1 | 2
} {
  switch (provider) {
    case 'whisper':
      // Whisper prefers 16kHz mono
      return {
        format: 'mp3',
        sampleRate: 16000,
        channels: 1,
      }
    case 'deepgram':
      // Deepgram works well with various formats
      return {
        format: 'mp3',
        sampleRate: 16000,
        channels: 1,
      }
    case 'elevenlabs':
      // ElevenLabs format preferences
      return {
        format: 'mp3',
        sampleRate: 16000,
        channels: 1,
      }
    default:
      return {
        format: 'mp3',
        sampleRate: 16000,
        channels: 1,
      }
  }
}
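A minimal usage sketch of the extractor added above, assuming ffmpeg (via ffmpeg-static or the system binary) is available at runtime as the module's setup code expects; the file path and MIME type here are illustrative:

```ts
import fs from 'node:fs/promises'
import { extractAudioFromVideo, getOptimalFormat } from '@/lib/audio/extractor'

// Hypothetical helper: normalize an uploaded file to the format an STT provider prefers.
async function prepareForStt(filePath: string, mimeType: string) {
  const input = await fs.readFile(filePath)
  const target = getOptimalFormat('whisper') // { format: 'mp3', sampleRate: 16000, channels: 1 }

  // Videos get their audio track extracted; audio is converted only when an outputFormat is requested.
  const result = await extractAudioFromVideo(input, mimeType, {
    outputFormat: target.format,
    sampleRate: target.sampleRate,
    channels: target.channels,
  })

  console.log(`prepared ${result.size} bytes of ${result.format}, ~${result.duration}s`)
  return result.buffer
}
```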
apps/sim/lib/audio/types.ts (new file, 22 lines)
@@ -0,0 +1,22 @@
export interface AudioExtractionOptions {
  outputFormat?: 'mp3' | 'wav' | 'flac'
  sampleRate?: number
  channels?: 1 | 2
  bitrate?: string
}

export interface AudioExtractionResult {
  buffer: Buffer
  format: string
  duration: number
  size: number
}

export interface AudioMetadata {
  duration: number
  format: string
  codec?: string
  sampleRate?: number
  channels?: number
  bitrate?: number
}
@@ -12,7 +12,7 @@ export interface FileAttachment {
 }

 export interface MessageContent {
-  type: 'text' | 'image' | 'document'
+  type: 'text' | 'image' | 'document' | 'audio' | 'video'
   text?: string
   source?: {
     type: 'base64'
@@ -24,7 +24,7 @@ export interface MessageContent {
 /**
  * Mapping of MIME types to content types
  */
-export const MIME_TYPE_MAPPING: Record<string, 'image' | 'document'> = {
+export const MIME_TYPE_MAPPING: Record<string, 'image' | 'document' | 'audio' | 'video'> = {
   // Images
   'image/jpeg': 'image',
   'image/jpg': 'image',
@@ -49,12 +49,40 @@ export const MIME_TYPE_MAPPING: Record<string, 'image' | 'document'> = {
   'application/vnd.ms-powerpoint': 'document', // .ppt
   'text/markdown': 'document',
   'application/rtf': 'document',
+
+  // Audio
+  'audio/mpeg': 'audio', // .mp3
+  'audio/mp3': 'audio',
+  'audio/mp4': 'audio', // .m4a
+  'audio/x-m4a': 'audio',
+  'audio/m4a': 'audio',
+  'audio/wav': 'audio',
+  'audio/wave': 'audio',
+  'audio/x-wav': 'audio',
+  'audio/webm': 'audio',
+  'audio/ogg': 'audio',
+  'audio/vorbis': 'audio',
+  'audio/flac': 'audio',
+  'audio/x-flac': 'audio',
+  'audio/aac': 'audio',
+  'audio/x-aac': 'audio',
+  'audio/opus': 'audio',
+
+  // Video
+  'video/mp4': 'video',
+  'video/mpeg': 'video',
+  'video/quicktime': 'video', // .mov
+  'video/x-quicktime': 'video',
+  'video/x-msvideo': 'video', // .avi
+  'video/avi': 'video',
+  'video/x-matroska': 'video', // .mkv
+  'video/webm': 'video',
 }

 /**
  * Get the content type for a given MIME type
  */
-export function getContentType(mimeType: string): 'image' | 'document' | null {
+export function getContentType(mimeType: string): 'image' | 'document' | 'audio' | 'video' | null {
   return MIME_TYPE_MAPPING[mimeType.toLowerCase()] || null
 }
@@ -80,6 +108,28 @@ export function isImageFileType(mimeType: string): boolean {
   return imageTypes.includes(mimeType.toLowerCase())
 }

+/**
+ * Check if a MIME type is an audio type
+ */
+export function isAudioFileType(mimeType: string): boolean {
+  return getContentType(mimeType) === 'audio'
+}
+
+/**
+ * Check if a MIME type is a video type
+ */
+export function isVideoFileType(mimeType: string): boolean {
+  return getContentType(mimeType) === 'video'
+}
+
+/**
+ * Check if a MIME type is an audio or video type
+ */
+export function isMediaFileType(mimeType: string): boolean {
+  const contentType = getContentType(mimeType)
+  return contentType === 'audio' || contentType === 'video'
+}
+
 /**
  * Convert a file buffer to base64
  */
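An illustrative check (not part of the diff) showing how the helpers above might classify an uploaded attachment. It assumes getContentType and isMediaFileType are imported from this module; the function name and return strings are made up for the example.

```ts
function describeAttachment(fileName: string, mimeType: string): string {
  // 'image' | 'document' | 'audio' | 'video' | null
  const contentType = getContentType(mimeType)
  if (contentType === null) return `${fileName}: unsupported MIME type ${mimeType}`
  if (isMediaFileType(mimeType)) return `${fileName}: ${contentType}, eligible for STT transcription`
  return `${fileName}: ${contentType}`
}

// describeAttachment('standup.mp3', 'audio/mpeg')
// -> 'standup.mp3: audio, eligible for STT transcription'
```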
@@ -143,6 +193,22 @@ export function getMimeTypeFromExtension(extension: string): string {
   ppt: 'application/vnd.ms-powerpoint',
   md: 'text/markdown',
   rtf: 'application/rtf',
+
+  // Audio
+  mp3: 'audio/mpeg',
+  m4a: 'audio/mp4',
+  wav: 'audio/wav',
+  webm: 'audio/webm',
+  ogg: 'audio/ogg',
+  flac: 'audio/flac',
+  aac: 'audio/aac',
+  opus: 'audio/opus',
+
+  // Video
+  mp4: 'video/mp4',
+  mov: 'video/quicktime',
+  avi: 'video/x-msvideo',
+  mkv: 'video/x-matroska',
 }

 return extensionMimeMap[extension.toLowerCase()] || 'application/octet-stream'
@@ -20,7 +20,26 @@ export const SUPPORTED_DOCUMENT_EXTENSIONS = [
   'yml',
 ] as const

+export const SUPPORTED_AUDIO_EXTENSIONS = [
+  'mp3',
+  'm4a',
+  'wav',
+  'webm',
+  'ogg',
+  'flac',
+  'aac',
+  'opus',
+] as const
+
+export const SUPPORTED_VIDEO_EXTENSIONS = ['mp4', 'mov', 'avi', 'mkv', 'webm'] as const
+
 export type SupportedDocumentExtension = (typeof SUPPORTED_DOCUMENT_EXTENSIONS)[number]
+export type SupportedAudioExtension = (typeof SUPPORTED_AUDIO_EXTENSIONS)[number]
+export type SupportedVideoExtension = (typeof SUPPORTED_VIDEO_EXTENSIONS)[number]
+export type SupportedMediaExtension =
+  | SupportedDocumentExtension
+  | SupportedAudioExtension
+  | SupportedVideoExtension

 export const SUPPORTED_MIME_TYPES: Record<SupportedDocumentExtension, string[]> = {
   pdf: ['application/pdf', 'application/x-pdf'],
@@ -54,7 +73,33 @@ export const SUPPORTED_MIME_TYPES: Record<SupportedDocumentExtension, string[]>
   yml: ['text/yaml', 'text/x-yaml', 'application/yaml', 'application/x-yaml'],
 }

+export const SUPPORTED_AUDIO_MIME_TYPES: Record<SupportedAudioExtension, string[]> = {
+  mp3: ['audio/mpeg', 'audio/mp3'],
+  m4a: ['audio/mp4', 'audio/x-m4a', 'audio/m4a'],
+  wav: ['audio/wav', 'audio/wave', 'audio/x-wav'],
+  webm: ['audio/webm'],
+  ogg: ['audio/ogg', 'audio/vorbis'],
+  flac: ['audio/flac', 'audio/x-flac'],
+  aac: ['audio/aac', 'audio/x-aac'],
+  opus: ['audio/opus'],
+}
+
+export const SUPPORTED_VIDEO_MIME_TYPES: Record<SupportedVideoExtension, string[]> = {
+  mp4: ['video/mp4', 'video/mpeg'],
+  mov: ['video/quicktime', 'video/x-quicktime'],
+  avi: ['video/x-msvideo', 'video/avi'],
+  mkv: ['video/x-matroska'],
+  webm: ['video/webm'],
+}
+
 export const ACCEPTED_FILE_TYPES = Object.values(SUPPORTED_MIME_TYPES).flat()
+export const ACCEPTED_AUDIO_TYPES = Object.values(SUPPORTED_AUDIO_MIME_TYPES).flat()
+export const ACCEPTED_VIDEO_TYPES = Object.values(SUPPORTED_VIDEO_MIME_TYPES).flat()
+export const ACCEPTED_MEDIA_TYPES = [
+  ...ACCEPTED_FILE_TYPES,
+  ...ACCEPTED_AUDIO_TYPES,
+  ...ACCEPTED_VIDEO_TYPES,
+]

 export const ACCEPTED_FILE_EXTENSIONS = SUPPORTED_DOCUMENT_EXTENSIONS.map((ext) => `.${ext}`)
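A hypothetical consumer (not from this commit) of the new ACCEPTED_MEDIA_TYPES list, assuming it is imported from this constants module and used to build a file picker's accept attribute.

```ts
// Documents plus the newly supported audio/video MIME types, comma-joined for <input accept>.
const accept = ACCEPTED_MEDIA_TYPES.join(',')
// e.g. <input type="file" accept={accept} /> in a React upload component
```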
@@ -110,5 +155,61 @@ export function getSupportedMimeTypes(extension: string): string[] {
   if (isSupportedExtension(extension)) {
     return SUPPORTED_MIME_TYPES[extension as SupportedDocumentExtension]
   }
+  if (SUPPORTED_AUDIO_EXTENSIONS.includes(extension as SupportedAudioExtension)) {
+    return SUPPORTED_AUDIO_MIME_TYPES[extension as SupportedAudioExtension]
+  }
+  if (SUPPORTED_VIDEO_EXTENSIONS.includes(extension as SupportedVideoExtension)) {
+    return SUPPORTED_VIDEO_MIME_TYPES[extension as SupportedVideoExtension]
+  }
   return []
 }
+
+/**
+ * Check if file extension is a supported audio extension
+ */
+export function isSupportedAudioExtension(extension: string): extension is SupportedAudioExtension {
+  return SUPPORTED_AUDIO_EXTENSIONS.includes(extension.toLowerCase() as SupportedAudioExtension)
+}
+
+/**
+ * Check if file extension is a supported video extension
+ */
+export function isSupportedVideoExtension(extension: string): extension is SupportedVideoExtension {
+  return SUPPORTED_VIDEO_EXTENSIONS.includes(extension.toLowerCase() as SupportedVideoExtension)
+}
+
+/**
+ * Validate if an audio/video file type is supported for STT processing
+ */
+export function validateMediaFileType(
+  fileName: string,
+  mimeType: string
+): FileValidationError | null {
+  const extension = path.extname(fileName).toLowerCase().substring(1)
+
+  const isAudio = SUPPORTED_AUDIO_EXTENSIONS.includes(extension as SupportedAudioExtension)
+  const isVideo = SUPPORTED_VIDEO_EXTENSIONS.includes(extension as SupportedVideoExtension)
+
+  if (!isAudio && !isVideo) {
+    return {
+      code: 'UNSUPPORTED_FILE_TYPE',
+      message: `Unsupported media file type: ${extension}. Supported audio types: ${SUPPORTED_AUDIO_EXTENSIONS.join(', ')}. Supported video types: ${SUPPORTED_VIDEO_EXTENSIONS.join(', ')}`,
+      supportedTypes: [...SUPPORTED_AUDIO_EXTENSIONS, ...SUPPORTED_VIDEO_EXTENSIONS],
+    }
+  }
+
+  const baseMimeType = mimeType.split(';')[0].trim()
+  const allowedMimeTypes = isAudio
+    ? SUPPORTED_AUDIO_MIME_TYPES[extension as SupportedAudioExtension]
+    : SUPPORTED_VIDEO_MIME_TYPES[extension as SupportedVideoExtension]
+
+  if (!allowedMimeTypes.includes(baseMimeType)) {
+    return {
+      code: 'MIME_TYPE_MISMATCH',
+      message: `MIME type ${baseMimeType} does not match file extension ${extension}. Expected: ${allowedMimeTypes.join(', ')}`,
+      supportedTypes: allowedMimeTypes,
+    }
+  }
+
+  return null
+}
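A sketch only (not part of the diff) of rejecting a bad upload before it reaches the STT proxy, assuming validateMediaFileType is imported from the module above. The error shape follows the FileValidationError fields used there.

```ts
const error = validateMediaFileType('meeting.mov', 'video/quicktime')
if (error) {
  // error = { code: 'UNSUPPORTED_FILE_TYPE' | 'MIME_TYPE_MISMATCH', message, supportedTypes }
  throw new Error(error.message)
}
```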
@@ -75,7 +75,7 @@ const nextConfig: NextConfig = {
   turbopack: {
     resolveExtensions: ['.tsx', '.ts', '.jsx', '.js', '.mjs', '.json'],
   },
-  serverExternalPackages: ['unpdf'],
+  serverExternalPackages: ['unpdf', 'ffmpeg-static', 'fluent-ffmpeg'],
   experimental: {
     optimizeCss: true,
     turbopackSourceMaps: false,
@@ -605,6 +605,7 @@ import {
   stripeUpdateSubscriptionTool,
   stripeVoidInvoiceTool,
 } from '@/tools/stripe'
+import { deepgramSttTool, elevenLabsSttTool, whisperSttTool } from '@/tools/stt'
 import {
   supabaseCountTool,
   supabaseDeleteTool,
@@ -1050,6 +1051,9 @@ export const tools: Record<string, ToolConfig> = {
   knowledge_upload_chunk: knowledgeUploadChunkTool,
   knowledge_create_document: knowledgeCreateDocumentTool,
   elevenlabs_tts: elevenLabsTtsTool,
+  stt_whisper: whisperSttTool,
+  stt_deepgram: deepgramSttTool,
+  stt_elevenlabs: elevenLabsSttTool,
   s3_get_object: s3GetObjectTool,
   s3_put_object: s3PutObjectTool,
   s3_list_objects: s3ListObjectsTool,
apps/sim/tools/stt/deepgram.ts — new file (125 lines)
@@ -0,0 +1,125 @@
import type { SttParams, SttResponse } from '@/tools/stt/types'
import type { ToolConfig } from '@/tools/types'

export const deepgramSttTool: ToolConfig<SttParams, SttResponse> = {
  id: 'stt_deepgram',
  name: 'Deepgram STT',
  description: 'Transcribe audio to text using Deepgram',
  version: '1.0.0',

  params: {
    provider: {
      type: 'string',
      required: true,
      visibility: 'user-only',
      description: 'STT provider (deepgram)',
    },
    apiKey: {
      type: 'string',
      required: true,
      visibility: 'user-only',
      description: 'Deepgram API key',
    },
    model: {
      type: 'string',
      required: false,
      visibility: 'user-or-llm',
      description: 'Deepgram model to use (nova-3, nova-2, whisper-large, etc.)',
    },
    audioFile: {
      type: 'file',
      required: false,
      visibility: 'user-or-llm',
      description: 'Audio or video file to transcribe',
    },
    audioFileReference: {
      type: 'file',
      required: false,
      visibility: 'user-or-llm',
      description: 'Reference to audio/video file from previous blocks',
    },
    audioUrl: {
      type: 'string',
      required: false,
      visibility: 'user-or-llm',
      description: 'URL to audio or video file',
    },
    language: {
      type: 'string',
      required: false,
      visibility: 'user-or-llm',
      description: 'Language code (e.g., "en", "es", "fr") or "auto" for auto-detection',
    },
    timestamps: {
      type: 'string',
      required: false,
      visibility: 'user-only',
      description: 'Timestamp granularity: none, sentence, or word',
    },
    diarization: {
      type: 'boolean',
      required: false,
      visibility: 'user-only',
      description: 'Enable speaker diarization',
    },
  },

  request: {
    url: '/api/proxy/stt',
    method: 'POST',
    headers: () => ({
      'Content-Type': 'application/json',
    }),
    body: (
      params: SttParams & {
        _context?: { workspaceId?: string; workflowId?: string; executionId?: string }
      }
    ) => ({
      provider: 'deepgram',
      apiKey: params.apiKey,
      model: params.model,
      audioFile: params.audioFile,
      audioFileReference: params.audioFileReference,
      audioUrl: params.audioUrl,
      language: params.language || 'auto',
      timestamps: params.timestamps || 'none',
      diarization: params.diarization || false,
      workspaceId: params._context?.workspaceId,
      workflowId: params._context?.workflowId,
      executionId: params._context?.executionId,
    }),
  },

  transformResponse: async (response: Response) => {
    const data = await response.json()

    if (!response.ok || data.error) {
      return {
        success: false,
        error: data.error || 'Transcription failed',
        output: {
          transcript: '',
        },
      }
    }

    return {
      success: true,
      output: {
        transcript: data.transcript,
        segments: data.segments,
        language: data.language,
        duration: data.duration,
        confidence: data.confidence,
      },
    }
  },

  outputs: {
    transcript: { type: 'string', description: 'Full transcribed text' },
    segments: { type: 'array', description: 'Timestamped segments with speaker labels' },
    language: { type: 'string', description: 'Detected or specified language' },
    duration: { type: 'number', description: 'Audio duration in seconds' },
    confidence: { type: 'number', description: 'Overall confidence score' },
  },
}
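Illustrative only: the tool never calls Deepgram directly; its request.body builds a JSON payload that is POSTed to the internal /api/proxy/stt route. Invoking the body function by hand, as below, is just a way to show the payload shape — the params values and key are made-up examples, not a real workflow run.

```ts
import { deepgramSttTool } from '@/tools/stt'

const payload = deepgramSttTool.request.body({
  provider: 'deepgram',
  apiKey: 'dg_xxx', // placeholder, never hard-code real keys
  model: 'nova-3',
  audioUrl: 'https://example.com/standup.mp3',
  language: 'auto',
  timestamps: 'word',
  diarization: true,
  _context: { workflowId: 'wf_123', executionId: 'exec_456' },
})
// payload is then sent as JSON to /api/proxy/stt, which holds the provider-specific logic.
```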
apps/sim/tools/stt/elevenlabs.ts — new file (118 lines)
@@ -0,0 +1,118 @@
import type { SttParams, SttResponse } from '@/tools/stt/types'
import type { ToolConfig } from '@/tools/types'

export const elevenLabsSttTool: ToolConfig<SttParams, SttResponse> = {
  id: 'stt_elevenlabs',
  name: 'ElevenLabs STT',
  description: 'Transcribe audio to text using ElevenLabs',
  version: '1.0.0',

  params: {
    provider: {
      type: 'string',
      required: true,
      visibility: 'user-only',
      description: 'STT provider (elevenlabs)',
    },
    apiKey: {
      type: 'string',
      required: true,
      visibility: 'user-only',
      description: 'ElevenLabs API key',
    },
    model: {
      type: 'string',
      required: false,
      visibility: 'user-or-llm',
      description: 'ElevenLabs model to use (scribe_v1, scribe_v1_experimental)',
    },
    audioFile: {
      type: 'file',
      required: false,
      visibility: 'user-or-llm',
      description: 'Audio or video file to transcribe',
    },
    audioFileReference: {
      type: 'file',
      required: false,
      visibility: 'user-or-llm',
      description: 'Reference to audio/video file from previous blocks',
    },
    audioUrl: {
      type: 'string',
      required: false,
      visibility: 'user-or-llm',
      description: 'URL to audio or video file',
    },
    language: {
      type: 'string',
      required: false,
      visibility: 'user-or-llm',
      description: 'Language code (e.g., "en", "es", "fr") or "auto" for auto-detection',
    },
    timestamps: {
      type: 'string',
      required: false,
      visibility: 'user-only',
      description: 'Timestamp granularity: none, sentence, or word',
    },
  },

  request: {
    url: '/api/proxy/stt',
    method: 'POST',
    headers: () => ({
      'Content-Type': 'application/json',
    }),
    body: (
      params: SttParams & {
        _context?: { workspaceId?: string; workflowId?: string; executionId?: string }
      }
    ) => ({
      provider: 'elevenlabs',
      apiKey: params.apiKey,
      model: params.model,
      audioFile: params.audioFile,
      audioFileReference: params.audioFileReference,
      audioUrl: params.audioUrl,
      language: params.language || 'auto',
      timestamps: params.timestamps || 'none',
      workspaceId: params._context?.workspaceId,
      workflowId: params._context?.workflowId,
      executionId: params._context?.executionId,
    }),
  },

  transformResponse: async (response: Response) => {
    const data = await response.json()

    if (!response.ok || data.error) {
      return {
        success: false,
        error: data.error || 'Transcription failed',
        output: {
          transcript: '',
        },
      }
    }

    return {
      success: true,
      output: {
        transcript: data.transcript,
        segments: data.segments,
        language: data.language,
        duration: data.duration,
        confidence: data.confidence,
      },
    }
  },

  outputs: {
    transcript: { type: 'string', description: 'Full transcribed text' },
    segments: { type: 'array', description: 'Timestamped segments' },
    language: { type: 'string', description: 'Detected or specified language' },
    duration: { type: 'number', description: 'Audio duration in seconds' },
    confidence: { type: 'number', description: 'Overall confidence score' },
  },
}
apps/sim/tools/stt/index.ts — new file (5 lines)
@@ -0,0 +1,5 @@
import { deepgramSttTool } from '@/tools/stt/deepgram'
import { elevenLabsSttTool } from '@/tools/stt/elevenlabs'
import { whisperSttTool } from '@/tools/stt/whisper'

export { whisperSttTool, deepgramSttTool, elevenLabsSttTool }
apps/sim/tools/stt/types.ts — new file (62 lines)
@@ -0,0 +1,62 @@
import type { UserFile } from '@/executor/types'
import type { ToolResponse } from '@/tools/types'

export interface SttParams {
  provider: 'whisper' | 'deepgram' | 'elevenlabs'
  apiKey: string
  model?: string
  audioFile?: UserFile | UserFile[]
  audioFileReference?: UserFile | UserFile[]
  audioUrl?: string
  language?: string
  timestamps?: 'none' | 'sentence' | 'word'
  diarization?: boolean
  translateToEnglish?: boolean
}

export interface TranscriptSegment {
  text: string
  start: number
  end: number
  speaker?: string
  confidence?: number
}

export interface SttResponse extends ToolResponse {
  output: {
    transcript: string
    segments?: TranscriptSegment[]
    language?: string
    duration?: number
    confidence?: number
  }
}

export interface SttBlockResponse extends ToolResponse {
  output: {
    transcript: string
    segments?: TranscriptSegment[]
    language?: string
    duration?: number
    confidence?: number
  }
}

// Provider-specific types

export interface WhisperParams extends Omit<SttParams, 'provider'> {
  model?: string
  responseFormat?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt'
  temperature?: number
}

export interface DeepgramParams extends Omit<SttParams, 'provider'> {
  model?: string
  punctuate?: boolean
  paragraphs?: boolean
  utterances?: boolean
}

export interface ElevenLabsSttParams extends Omit<SttParams, 'provider'> {
  model?: string
}
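Example values only (not from the diff): a well-formed SttParams object and the kind of TranscriptSegment the providers are expected to return. The URL, key, and text are placeholders.

```ts
import type { SttParams, TranscriptSegment } from '@/tools/stt/types'

const params: SttParams = {
  provider: 'whisper',
  apiKey: 'sk-...', // placeholder
  audioUrl: 'https://example.com/interview.m4a',
  language: 'en',
  timestamps: 'sentence',
  translateToEnglish: false,
}

const segment: TranscriptSegment = {
  text: 'Welcome to the weekly sync.',
  start: 0.0,
  end: 2.4,
  speaker: 'speaker_0',
  confidence: 0.97,
}
```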
apps/sim/tools/stt/whisper.ts — new file (125 lines)
@@ -0,0 +1,125 @@
import type { SttParams, SttResponse } from '@/tools/stt/types'
import type { ToolConfig } from '@/tools/types'

export const whisperSttTool: ToolConfig<SttParams, SttResponse> = {
  id: 'stt_whisper',
  name: 'OpenAI Whisper STT',
  description: 'Transcribe audio to text using OpenAI Whisper',
  version: '1.0.0',

  params: {
    provider: {
      type: 'string',
      required: true,
      visibility: 'user-only',
      description: 'STT provider (whisper)',
    },
    apiKey: {
      type: 'string',
      required: true,
      visibility: 'user-only',
      description: 'OpenAI API key',
    },
    model: {
      type: 'string',
      required: false,
      visibility: 'user-or-llm',
      description: 'Whisper model to use (default: whisper-1)',
    },
    audioFile: {
      type: 'file',
      required: false,
      visibility: 'user-or-llm',
      description: 'Audio or video file to transcribe',
    },
    audioFileReference: {
      type: 'file',
      required: false,
      visibility: 'user-or-llm',
      description: 'Reference to audio/video file from previous blocks',
    },
    audioUrl: {
      type: 'string',
      required: false,
      visibility: 'user-or-llm',
      description: 'URL to audio or video file',
    },
    language: {
      type: 'string',
      required: false,
      visibility: 'user-or-llm',
      description: 'Language code (e.g., "en", "es", "fr") or "auto" for auto-detection',
    },
    timestamps: {
      type: 'string',
      required: false,
      visibility: 'user-only',
      description: 'Timestamp granularity: none, sentence, or word',
    },
    translateToEnglish: {
      type: 'boolean',
      required: false,
      visibility: 'user-only',
      description: 'Translate audio to English',
    },
  },

  request: {
    url: '/api/proxy/stt',
    method: 'POST',
    headers: () => ({
      'Content-Type': 'application/json',
    }),
    body: (
      params: SttParams & {
        _context?: { workspaceId?: string; workflowId?: string; executionId?: string }
      }
    ) => ({
      provider: 'whisper',
      apiKey: params.apiKey,
      model: params.model,
      audioFile: params.audioFile,
      audioFileReference: params.audioFileReference,
      audioUrl: params.audioUrl,
      language: params.language || 'auto',
      timestamps: params.timestamps || 'none',
      translateToEnglish: params.translateToEnglish || false,
      workspaceId: params._context?.workspaceId,
      workflowId: params._context?.workflowId,
      executionId: params._context?.executionId,
    }),
  },

  transformResponse: async (response: Response) => {
    const data = await response.json()

    if (!response.ok || data.error) {
      return {
        success: false,
        error: data.error || 'Transcription failed',
        output: {
          transcript: '',
        },
      }
    }

    return {
      success: true,
      output: {
        transcript: data.transcript,
        segments: data.segments,
        language: data.language,
        duration: data.duration,
        confidence: data.confidence,
      },
    }
  },

  outputs: {
    transcript: { type: 'string', description: 'Full transcribed text' },
    segments: { type: 'array', description: 'Timestamped segments' },
    language: { type: 'string', description: 'Detected or specified language' },
    duration: { type: 'number', description: 'Audio duration in seconds' },
    confidence: { type: 'number', description: 'Overall confidence score' },
  },
}
bun.lock — 34 lines
@@ -9,8 +9,11 @@
       "@t3-oss/env-nextjs": "0.13.4",
       "@tanstack/react-query": "5.90.8",
       "@tanstack/react-query-devtools": "5.90.2",
+      "@types/fluent-ffmpeg": "2.1.28",
       "cronstrue": "3.3.0",
       "drizzle-orm": "^0.44.5",
+      "ffmpeg-static": "5.3.0",
+      "fluent-ffmpeg": "2.1.3",
       "mongodb": "6.19.0",
       "neo4j-driver": "6.0.1",
       "onedollarstats": "0.0.10",
@@ -235,6 +238,7 @@
     },
   },
   "trustedDependencies": [
+    "ffmpeg-static",
     "sharp",
   ],
   "overrides": {
@@ -496,6 +500,8 @@
     "@csstools/css-tokenizer": ["@csstools/css-tokenizer@3.0.4", "", {}, "sha512-Vd/9EVDiu6PPJt9yAh6roZP6El1xHrdvIVGjyBsHR0RYwNHgL7FJPyIIW4fANJNG6FtyZfvlRPpFI4ZM/lubvw=="],

+    "@derhuerst/http-basic": ["@derhuerst/http-basic@8.2.4", "", { "dependencies": { "caseless": "^0.12.0", "concat-stream": "^2.0.0", "http-response-object": "^3.0.1", "parse-cache-control": "^1.0.1" } }, "sha512-F9rL9k9Xjf5blCz8HsJRO4diy111cayL2vkY2XE4r4t3n0yPXVYy3KD3nJ1qbrSn9743UWSXH4IwuCa/HWlGFw=="],
+
     "@dimforge/rapier3d-compat": ["@dimforge/rapier3d-compat@0.12.0", "", {}, "sha512-uekIGetywIgopfD97oDL5PfeezkFpNhwlzlaEYNOA0N6ghdsOvh/HYjSMek5Q2O1PYvRSDFcqFVJl4r4ZBwOow=="],

     "@drizzle-team/brocli": ["@drizzle-team/brocli@0.10.2", "", {}, "sha512-z33Il7l5dKjUgGULTqBsQBQwckHh5AbIuxhdsIxDDiZAzBOrZO6q9ogcWC65kU382AfynTfgNumVcNIjuIua6w=="],
@@ -1336,6 +1342,8 @@
     "@types/estree-jsx": ["@types/estree-jsx@1.0.5", "", { "dependencies": { "@types/estree": "*" } }, "sha512-52CcUVNFyfb1A2ALocQw/Dd1BQFNmSdkuC3BkZ6iqhdMfQz7JWOFRuJFloOzjk+6WijU56m9oKXFAXc7o3Towg=="],

+    "@types/fluent-ffmpeg": ["@types/fluent-ffmpeg@2.1.28", "", { "dependencies": { "@types/node": "*" } }, "sha512-5ovxsDwBcPfJ+eYs1I/ZpcYCnkce7pvH9AHSvrZllAp1ZPpTRDZAFjF3TRFbukxSgIYTTNYePbS0rKUmaxVbXw=="],
+
     "@types/geojson": ["@types/geojson@7946.0.16", "", {}, "sha512-6C8nqWur3j98U6+lXDfTUWIfgvZU+EumvpHKcYjujKH7woYyLj2sUmff0tRhrqM7BohUw7Pz3ZB1jj2gW9Fvmg=="],

     "@types/hast": ["@types/hast@3.0.4", "", { "dependencies": { "@types/unist": "*" } }, "sha512-WPs+bbQw5aCj+x6laNGWLH3wviHtoCv/P3+otBhbOhJgG8qtpdAMlTCxLtsTWA7LH1Oh/bFCHsBn0TPS5m30EQ=="],
@@ -1470,6 +1478,8 @@
     "astring": ["astring@1.9.0", "", { "bin": { "astring": "bin/astring" } }, "sha512-LElXdjswlqjWrPpJFg1Fx4wpkOCxj1TDHlSV4PlaRxHGWko024xICaa97ZkMfs6DRKlCguiAI+rbXv5GWwXIkg=="],

+    "async": ["async@0.2.10", "", {}, "sha512-eAkdoKxU6/LkKDBzLpT+t6Ff5EtfSF4wx1WfJiPEEV7WNLnDaRXk0oVysiEPm262roaachGexwUv94WhSgN5TQ=="],
+
     "asynckit": ["asynckit@0.4.0", "", {}, "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="],

     "atomic-sleep": ["atomic-sleep@1.0.0", "", {}, "sha512-kNOjDqAh7px0XWNI+4QbzoiR/nTkHAWNud2uvnJquD1/x5a7EQZMJT0AczqK0Qn67oY/TTQ1LbUKajZpp3I9tQ=="],
@@ -1550,6 +1560,8 @@
     "caniuse-lite": ["caniuse-lite@1.0.30001745", "", {}, "sha512-ywt6i8FzvdgrrrGbr1jZVObnVv6adj+0if2/omv9cmR2oiZs30zL4DIyaptKcbOrBdOIc74QTMoJvSE2QHh5UQ=="],

+    "caseless": ["caseless@0.12.0", "", {}, "sha512-4tYFyifaFfGacoiObjJegolkwSU4xQNGbVgUiNYVUxbQ2x2lUsFvY4hVgVzGiIe6WLOPqycWXA40l+PWsxthUw=="],
+
     "ccount": ["ccount@2.0.1", "", {}, "sha512-eyrF0jiFpY+3drT6383f1qhkbGsLSifNAjA61IUjZjmLCWjItY6LB9ft9YhoDgwfmclB2zhu51Lc7+95b8NRAg=="],

     "cfb": ["cfb@1.2.2", "", { "dependencies": { "adler-32": "~1.3.0", "crc-32": "~1.2.0" } }, "sha512-KfdUZsSOw19/ObEWasvBP/Ac4reZvAGauZhs6S/gqNhXhI7cKwvlH7ulj+dOEYnca4bm4SGo8C1bTAQvnTjgQA=="],
@@ -1818,6 +1830,8 @@
     "entities": ["entities@6.0.1", "", {}, "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g=="],

+    "env-paths": ["env-paths@2.2.1", "", {}, "sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A=="],
+
     "environment": ["environment@1.1.0", "", {}, "sha512-xUtoPkMggbz0MPyPiIWr1Kp4aeWJjDZ6SMvURhimjdZgsRuDplF5/s9hcgGhyXMhs+6vpnuoiZ2kFiu3FMnS8Q=="],

     "error": ["error@7.0.2", "", { "dependencies": { "string-template": "~0.2.1", "xtend": "~4.0.0" } }, "sha512-UtVv4l5MhijsYUxPJo4390gzfZvAnTHreNnDjnTZaKIiZ/SemXxAhBkYSKtWa5RtBXbLP8tMgn/n0RUa/H7jXw=="],
@@ -1916,6 +1930,8 @@
     "fflate": ["fflate@0.8.2", "", {}, "sha512-cPJU47OaAoCbg0pBvzsgpTPhmhqI5eJjh/JIu8tPj5q+T7iLvW/JAYUqmE7KOB4R1ZyEhzBaIQpQpardBF5z8A=="],

+    "ffmpeg-static": ["ffmpeg-static@5.3.0", "", { "dependencies": { "@derhuerst/http-basic": "^8.2.0", "env-paths": "^2.2.0", "https-proxy-agent": "^5.0.0", "progress": "^2.0.3" } }, "sha512-H+K6sW6TiIX6VGend0KQwthe+kaceeH/luE8dIZyOP35ik7ahYojDuqlTV1bOrtEwl01sy2HFNGQfi5IDJvotg=="],
+
     "figures": ["figures@3.2.0", "", { "dependencies": { "escape-string-regexp": "^1.0.5" } }, "sha512-yaduQFRKLXYOGgEn6AZau90j3ggSOyiqXU0F9JZfeXYhNa+Jk4X+s45A2zg5jns87GAFa34BBm2kXw4XpNcbdg=="],

     "file-type": ["file-type@16.5.4", "", { "dependencies": { "readable-web-to-node-stream": "^3.0.0", "strtok3": "^6.2.4", "token-types": "^4.1.1" } }, "sha512-/yFHK0aGjFEgDJjEKP0pWCplsPFPhwyfwevf/pVxiN0tmE4L9LmwWxWukdJSHdoCli4VgQLehjJtwQBnqmsKcw=="],
@@ -1924,6 +1940,8 @@
     "finalhandler": ["finalhandler@2.1.0", "", { "dependencies": { "debug": "^4.4.0", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "on-finished": "^2.4.1", "parseurl": "^1.3.3", "statuses": "^2.0.1" } }, "sha512-/t88Ty3d5JWQbWYgaOGCCYfXRwV1+be02WqYYlL6h0lEiUAMPM8o8qKGO01YIkOHzka2up08wvgYD0mDiI+q3Q=="],

+    "fluent-ffmpeg": ["fluent-ffmpeg@2.1.3", "", { "dependencies": { "async": "^0.2.9", "which": "^1.1.1" } }, "sha512-Be3narBNt2s6bsaqP6Jzq91heDgOEaDCJAXcE3qcma/EJBSy5FB4cvO31XBInuAuKBx8Kptf8dkhjK0IOru39Q=="],
+
     "follow-redirects": ["follow-redirects@1.15.11", "", {}, "sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ=="],

     "foreground-child": ["foreground-child@3.3.1", "", { "dependencies": { "cross-spawn": "^7.0.6", "signal-exit": "^4.0.1" } }, "sha512-gIXjKqtFuWEgzFRJA9WCQeSJLZDjgJUOMCMzxtvFq/37KojM1BFGufqsCy0r4qSQmYLsZYMeyRqzIWOMup03sw=="],
@@ -2050,6 +2068,8 @@
     "http-proxy-agent": ["http-proxy-agent@7.0.2", "", { "dependencies": { "agent-base": "^7.1.0", "debug": "^4.3.4" } }, "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig=="],

+    "http-response-object": ["http-response-object@3.0.2", "", { "dependencies": { "@types/node": "^10.0.3" } }, "sha512-bqX0XTF6fnXSQcEJ2Iuyr75yVakyjIDCqroJQ/aHfSdlM743Cwqoi2nDYMzLGWUcuTWGWy8AAvOKXTfiv6q9RA=="],
+
     "https-proxy-agent": ["https-proxy-agent@5.0.1", "", { "dependencies": { "agent-base": "6", "debug": "4" } }, "sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA=="],

     "human-signals": ["human-signals@5.0.0", "", {}, "sha512-AXcZb6vzzrFAUE61HnN4mpLqd/cSIwNQjtNWR0euPm6y0iqx3G4gOXaIDdtdDwZmhwe82LA6+zinmW4UBWVePQ=="],
@@ -2538,6 +2558,8 @@
     "papaparse": ["papaparse@5.5.3", "", {}, "sha512-5QvjGxYVjxO59MGU2lHVYpRWBBtKHnlIAcSe1uNFCkkptUh63NFRj0FJQm7nR67puEruUci/ZkjmEFrjCAyP4A=="],

+    "parse-cache-control": ["parse-cache-control@1.0.1", "", {}, "sha512-60zvsJReQPX5/QP0Kzfd/VrpjScIQ7SHBW6bFCYfEP+fp0Eppr1SHhIO5nd1PjZtvclzSzES9D/p5nFJurwfWg=="],
+
     "parse-css-color": ["parse-css-color@0.2.1", "", { "dependencies": { "color-name": "^1.1.4", "hex-rgb": "^4.1.0" } }, "sha512-bwS/GGIFV3b6KS4uwpzCFj4w297Yl3uqnSgIPsoQkx7GMLROXfMnWvxfNkL0oh8HVhZA4hvJoEoEIqonfJ3BWg=="],

     "parse-entities": ["parse-entities@4.0.2", "", { "dependencies": { "@types/unist": "^2.0.0", "character-entities-legacy": "^3.0.0", "character-reference-invalid": "^2.0.0", "decode-named-character-reference": "^1.0.0", "is-alphanumerical": "^2.0.0", "is-decimal": "^2.0.0", "is-hexadecimal": "^2.0.0" } }, "sha512-GG2AQYWoLgL877gQIKeRPGO1xF9+eG1ujIb5soS5gPvLQ1y2o8FL90w2QWNdf9I361Mpp7726c+lj3U0qK1uGw=="],
@@ -2638,6 +2660,8 @@
     "process-warning": ["process-warning@5.0.0", "", {}, "sha512-a39t9ApHNx2L4+HBnQKqxxHNs1r7KF+Intd8Q/g1bUh6q0WIp9voPXJ/x0j+ZL45KF1pJd9+q2jLIRMfvEshkA=="],

+    "progress": ["progress@2.0.3", "", {}, "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA=="],
+
     "prom-client": ["prom-client@15.1.3", "", { "dependencies": { "@opentelemetry/api": "^1.4.0", "tdigest": "^0.1.1" } }, "sha512-6ZiOBfCywsD4k1BN9IX0uZhF+tJkV8q8llP64G5Hajs4JOeVLPCwpPVcpXy3BwYiUGgyJzsJJQeOIv7+hDSq8g=="],

     "prompts": ["prompts@2.4.2", "", { "dependencies": { "kleur": "^3.0.3", "sisteransi": "^1.0.5" } }, "sha512-NxNv/kLguCA7p3jE8oL2aEBsrJWgAakBpgmgK6lpPWV+WuOmY6r2/zbAVnP+T8bQlA0nzHXSJSJW0Hq7ylaD2Q=="],
@@ -3140,7 +3164,7 @@
     "whatwg-url": ["whatwg-url@14.2.0", "", { "dependencies": { "tr46": "^5.1.0", "webidl-conversions": "^7.0.0" } }, "sha512-De72GdQZzNTUBBChsXueQUnPKDkg/5A5zp7pFDuQAj5UFoENpiACU0wlCvzpAGnTkj++ihpKwKyYewn/XNUbKw=="],

-    "which": ["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="],
+    "which": ["which@1.3.1", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "which": "./bin/which" } }, "sha512-HxJdYWq1MTIQbJ3nw0cqssHoTNU267KlrDuGZ1WYlxDStUtKUhOaJmh112/TZmHxxUfuJqPXSOm7tDyas0OSIQ=="],

     "why-is-node-running": ["why-is-node-running@2.3.0", "", { "dependencies": { "siginfo": "^2.0.0", "stackback": "0.0.2" }, "bin": { "why-is-node-running": "cli.js" } }, "sha512-hUrmaWBdVDcxvYqnyh09zunKzROWjbZTiNy8dBEjkS7ehEDQibXJ7XvlmtbwuTclUiIyN+CyXQD4Vmko8fNm8w=="],
@@ -3418,6 +3442,8 @@
     "@types/cors/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="],

+    "@types/fluent-ffmpeg/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="],
+
     "@types/jsdom/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="],

     "@types/node-fetch/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="],
@@ -3454,6 +3480,8 @@
     "content-disposition/safe-buffer": ["safe-buffer@5.2.1", "", {}, "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ=="],

+    "cross-spawn/which": ["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="],
+
     "dom-serializer/entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="],

     "ecdsa-sig-formatter/safe-buffer": ["safe-buffer@5.2.1", "", {}, "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ=="],
@@ -3508,6 +3536,8 @@
     "http-proxy-agent/agent-base": ["agent-base@7.1.4", "", {}, "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ=="],

+    "http-response-object/@types/node": ["@types/node@10.17.60", "", {}, "sha512-F0KIgDJfy2nA3zMLmWGKxcH2ZVEtCZXHHdOQs2gSaQ27+lNeEfGxzkIw90aXswATX7AZ33tahPbzy6KAfUreVw=="],
+
     "inquirer/ora": ["ora@5.4.1", "", { "dependencies": { "bl": "^4.1.0", "chalk": "^4.1.0", "cli-cursor": "^3.1.0", "cli-spinners": "^2.5.0", "is-interactive": "^1.0.0", "is-unicode-supported": "^0.1.0", "log-symbols": "^4.1.0", "strip-ansi": "^6.0.0", "wcwidth": "^1.0.1" } }, "sha512-5b6Y85tPxZZ7QytO+BQzysW31HJku27cRIlkbAXaNx+BdcVi+LlRFmVXzeF6a7JCwJpyw5c4b+YSVImQIrBpuQ=="],

     "isomorphic-unfetch/node-fetch": ["node-fetch@2.7.0", "", { "dependencies": { "whatwg-url": "^5.0.0" }, "peerDependencies": { "encoding": "^0.1.0" }, "optionalPeers": ["encoding"] }, "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A=="],
@@ -3766,6 +3796,8 @@
     "@types/cors/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="],

+    "@types/fluent-ffmpeg/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="],
+
     "@types/jsdom/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="],

     "@types/node-fetch/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="],
@@ -78,7 +78,8 @@ FROM base AS runner
 WORKDIR /app

 # Install Python and dependencies for guardrails PII detection (cached separately)
-RUN apk add --no-cache python3 py3-pip bash
+# Also install ffmpeg for audio/video processing in STT
+RUN apk add --no-cache python3 py3-pip bash ffmpeg

 ENV NODE_ENV=production
@@ -39,8 +39,11 @@
     "@t3-oss/env-nextjs": "0.13.4",
     "@tanstack/react-query": "5.90.8",
     "@tanstack/react-query-devtools": "5.90.2",
+    "@types/fluent-ffmpeg": "2.1.28",
     "cronstrue": "3.3.0",
     "drizzle-orm": "^0.44.5",
+    "ffmpeg-static": "5.3.0",
+    "fluent-ffmpeg": "2.1.3",
     "mongodb": "6.19.0",
     "neo4j-driver": "6.0.1",
     "onedollarstats": "0.0.10",
@@ -63,5 +66,8 @@
     "*.{js,jsx,ts,tsx,json,css,scss}": [
       "biome check --write --no-errors-on-unmatched --files-ignore-unknown=true"
     ]
-  }
+  },
+  "trustedDependencies": [
+    "ffmpeg-static"
+  ]
 }