feat(tools): add speech-to-text with OpenAI Whisper, ElevenLabs, and Deepgram (#2068)

* feat(tools): add speech-to-text with OpenAI Whisper, ElevenLabs, and Deepgram

* add new file icons; integrate ffmpeg-based audio extraction

* update docs

* revert environment
This commit is contained in:
Waleed
2025-11-19 21:03:54 -08:00
committed by GitHub
parent 7c5d625ca5
commit e64b1c9fcd
27 changed files with 1884 additions and 18 deletions

View File

@@ -4084,3 +4084,27 @@ export function CalendlyIcon(props: SVGProps<SVGSVGElement>) {
</svg>
)
}
export function AudioWaveformIcon(props: SVGProps<SVGSVGElement>) {
return (
<svg
{...props}
xmlns='http://www.w3.org/2000/svg'
width='24'
height='24'
viewBox='0 0 24 24'
fill='none'
stroke='currentColor'
strokeWidth='2'
strokeLinecap='round'
strokeLinejoin='round'
>
<path d='M2 10v3' />
<path d='M6 6v11' />
<path d='M10 3v18' />
<path d='M14 8v7' />
<path d='M18 5v13' />
<path d='M22 10v3' />
</svg>
)
}

View File

@@ -8,6 +8,7 @@ import {
ApolloIcon,
ArxivIcon,
AsanaIcon,
AudioWaveformIcon,
BrainIcon,
BrowserUseIcon,
CalendlyIcon,
@@ -100,6 +101,7 @@ export const blockTypeToIconMap: Record<string, IconComponent> = {
telegram: TelegramIcon,
tavily: TavilyIcon,
supabase: SupabaseIcon,
stt: AudioWaveformIcon,
stripe: StripeIcon,
stagehand_agent: StagehandIcon,
stagehand: StagehandIcon,

View File

@@ -10,6 +10,20 @@ import { BlockInfoCard } from "@/components/ui/block-info-card"
color="#FFFFFF"
/>
{/* MANUAL-CONTENT-START:intro */}
[Calendly](https://calendly.com/) is a popular scheduling automation platform that helps you book meetings, events, and appointments with ease. With Calendly, teams and individuals can streamline scheduling, reduce back-and-forth emails, and automate tasks around events.
With the Sim Calendly integration, your agents can:
- **Retrieve information about your account and scheduled events**: Use tools to fetch user info, event types, and scheduled events for analysis or automation.
- **Manage event types and scheduling**: Access and list available event types for users or organizations, retrieve details about specific event types, and monitor scheduled meetings and invitee data.
- **Automate follow-ups and workflows**: When users schedule, reschedule, or cancel meetings, Sim agents can automatically trigger corresponding workflows—such as sending reminders, updating CRMs, or notifying participants.
- **Integrate easily using webhooks**: Set up Sim workflows to respond to real-time Calendly webhook events, including when invitees schedule, cancel, or interact with routing forms.
Whether you want to automate meeting prep, manage invites, or run custom workflows in response to scheduling activity, the Calendly tools in Sim give you flexible and secure access. Unlock new automation by reacting instantly to scheduling changes—streamlining your team's operations and communications.
{/* MANUAL-CONTENT-END */}
## Usage Instructions
Integrate Calendly into your workflow. Manage event types, scheduled events, invitees, and webhooks. It can also trigger workflows based on Calendly webhook events (invitee scheduled, invitee canceled, routing form submitted). Requires a Personal Access Token.

View File

@@ -61,6 +61,7 @@
"stagehand",
"stagehand_agent",
"stripe",
"stt",
"supabase",
"tavily",
"telegram",

View File

@@ -0,0 +1,122 @@
---
title: Speech-to-Text
description: Convert speech to text using AI
---
import { BlockInfoCard } from "@/components/ui/block-info-card"
<BlockInfoCard
type="stt"
color="#181C1E"
/>
{/* MANUAL-CONTENT-START:intro */}
Transcribe speech to text using state-of-the-art AI models from leading providers. The Sim Speech-to-Text (STT) tools allow you to convert audio and video files into accurate transcripts, supporting multiple languages, timestamps, and optional translation.
Supported providers:
- **[OpenAI Whisper](https://platform.openai.com/docs/guides/speech-to-text/overview)**: OpenAI's open-source STT model, served here through the OpenAI API as `whisper-1`. It handles a wide variety of languages and audio formats.
- **[Deepgram](https://deepgram.com/)**: Real-time and batch STT API with deep learning models like `nova-3`, `nova-2`, and `whisper-large`. Offers features like diarization, intent recognition, and industry-specific tuning.
- **[ElevenLabs](https://elevenlabs.io/)**: Known for high-quality speech AI, ElevenLabs provides STT models focused on accuracy and natural language understanding for numerous languages and dialects.
Choose the provider and model best suited to your task—whether fast, production-grade transcription (Deepgram), highly accurate multi-language capability (Whisper), or advanced understanding and language coverage (ElevenLabs).
{/* MANUAL-CONTENT-END */}
## Usage Instructions
Transcribe audio and video files to text using leading AI providers. Supports multiple languages, timestamps, and speaker diarization.
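All three tools share the same request shape and are served by the shared `/api/proxy/stt` route added in this PR. A minimal sketch of calling that route directly (field names mirror the route's `SttRequestBody`; the API key and URL are placeholders):

```ts
// Sketch only: direct call to the shared STT proxy route.
// provider may be 'whisper' | 'deepgram' | 'elevenlabs'.
const res = await fetch('/api/proxy/stt', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    provider: 'whisper',
    apiKey: process.env.OPENAI_API_KEY, // placeholder; use the matching provider's key
    audioUrl: 'https://example.com/meeting.mp3', // placeholder URL
    language: 'auto',
    timestamps: 'sentence',
  }),
})
const { transcript, segments, language, duration } = await res.json()
```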
## Tools
### `stt_whisper`
Transcribe audio to text using OpenAI Whisper
#### Input
| Parameter | Type | Required | Description |
| --------- | ---- | -------- | ----------- |
| `provider` | string | Yes | STT provider \(whisper\) |
| `apiKey` | string | Yes | OpenAI API key |
| `model` | string | No | Whisper model to use \(default: whisper-1\) |
| `audioFile` | file | No | Audio or video file to transcribe |
| `audioFileReference` | file | No | Reference to audio/video file from previous blocks |
| `audioUrl` | string | No | URL to audio or video file |
| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection |
| `timestamps` | string | No | Timestamp granularity: none, sentence, or word |
| `translateToEnglish` | boolean | No | Translate audio to English |
#### Output
| Parameter | Type | Description |
| --------- | ---- | ----------- |
| `transcript` | string | Full transcribed text |
| `segments` | array | Timestamped segments |
| `language` | string | Detected or specified language |
| `duration` | number | Audio duration in seconds |
| `confidence` | number | Overall confidence score |
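When timestamps are requested, each entry in `segments` carries `text` plus `start` and `end` offsets in seconds. A small sketch that prints a timestamped listing, assuming the `TranscriptSegment` shape added in this PR:

```ts
import type { TranscriptSegment } from '@/tools/stt/types'

// Print one "[m:ss-m:ss] text" line per segment.
function printSegments(segments: TranscriptSegment[]): void {
  const fmt = (t: number) =>
    `${Math.floor(t / 60)}:${String(Math.floor(t % 60)).padStart(2, '0')}`
  for (const seg of segments) {
    console.log(`[${fmt(seg.start)}-${fmt(seg.end)}] ${seg.text}`)
  }
}
```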
### `stt_deepgram`
Transcribe audio to text using Deepgram
#### Input
| Parameter | Type | Required | Description |
| --------- | ---- | -------- | ----------- |
| `provider` | string | Yes | STT provider \(deepgram\) |
| `apiKey` | string | Yes | Deepgram API key |
| `model` | string | No | Deepgram model to use \(nova-3, nova-2, whisper-large, etc.\) |
| `audioFile` | file | No | Audio or video file to transcribe |
| `audioFileReference` | file | No | Reference to audio/video file from previous blocks |
| `audioUrl` | string | No | URL to audio or video file |
| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection |
| `timestamps` | string | No | Timestamp granularity: none, sentence, or word |
| `diarization` | boolean | No | Enable speaker diarization |
#### Output
| Parameter | Type | Description |
| --------- | ---- | ----------- |
| `transcript` | string | Full transcribed text |
| `segments` | array | Timestamped segments with speaker labels |
| `language` | string | Detected or specified language |
| `duration` | number | Audio duration in seconds |
| `confidence` | number | Overall confidence score |
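With `diarization` enabled, word-level segments carry `speaker` labels (e.g. `"Speaker 0"`). One way to collapse them into per-speaker turns, as a sketch rather than part of this PR:

```ts
import type { TranscriptSegment } from '@/tools/stt/types'

// Merge consecutive segments from the same speaker into one turn.
function toSpeakerTurns(segments: TranscriptSegment[]): { speaker: string; text: string }[] {
  const turns: { speaker: string; text: string }[] = []
  for (const seg of segments) {
    const speaker = seg.speaker ?? 'Unknown'
    const last = turns[turns.length - 1]
    if (last && last.speaker === speaker) {
      last.text += ` ${seg.text}`
    } else {
      turns.push({ speaker, text: seg.text })
    }
  }
  return turns
}
```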
### `stt_elevenlabs`
Transcribe audio to text using ElevenLabs
#### Input
| Parameter | Type | Required | Description |
| --------- | ---- | -------- | ----------- |
| `provider` | string | Yes | STT provider \(elevenlabs\) |
| `apiKey` | string | Yes | ElevenLabs API key |
| `model` | string | No | ElevenLabs model to use \(scribe_v1, scribe_v1_experimental\) |
| `audioFile` | file | No | Audio or video file to transcribe |
| `audioFileReference` | file | No | Reference to audio/video file from previous blocks |
| `audioUrl` | string | No | URL to audio or video file |
| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection |
| `timestamps` | string | No | Timestamp granularity: none, sentence, or word |
#### Output
| Parameter | Type | Description |
| --------- | ---- | ----------- |
| `transcript` | string | Full transcribed text |
| `segments` | array | Timestamped segments |
| `language` | string | Detected or specified language |
| `duration` | number | Audio duration in seconds |
| `confidence` | number | Overall confidence score |
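Under the hood, the proxy submits multipart form data to ElevenLabs' speech-to-text endpoint. A condensed sketch of that upstream call, mirroring the route added in this PR (the API key is a placeholder):

```ts
declare const audioBytes: Uint8Array // raw audio file contents

const form = new FormData()
form.append('file', new Blob([audioBytes], { type: 'audio/mpeg' }), 'audio.mp3')
form.append('model_id', 'scribe_v1')

const res = await fetch('https://api.elevenlabs.io/v1/speech-to-text', {
  method: 'POST',
  headers: { 'xi-api-key': process.env.ELEVENLABS_API_KEY ?? '' }, // placeholder key
  body: form,
})
const data = await res.json()
console.log(data.text)
```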
## Notes
- Category: `tools`
- Type: `stt`

View File

@@ -13,21 +13,37 @@ import {
} from '@/app/api/files/utils'
const ALLOWED_EXTENSIONS = new Set([
// Documents
'pdf',
'doc',
'docx',
'txt',
'md',
'csv',
'xlsx',
'xls',
'json',
'yaml',
'yml',
// Images
'png',
'jpg',
'jpeg',
'gif',
// Audio
'mp3',
'm4a',
'wav',
'webm',
'ogg',
'flac',
'aac',
'opus',
// Video
'mp4',
'mov',
'avi',
'mkv',
])
function validateFileExtension(filename: string): boolean {

View File

@@ -0,0 +1,375 @@
import { type NextRequest, NextResponse } from 'next/server'
import { extractAudioFromVideo, isVideoFile } from '@/lib/audio/extractor'
import { checkHybridAuth } from '@/lib/auth/hybrid'
import { createLogger } from '@/lib/logs/console/logger'
import { downloadFileFromStorage } from '@/lib/uploads/utils/file-utils.server'
import type { UserFile } from '@/executor/types'
import type { TranscriptSegment } from '@/tools/stt/types'
const logger = createLogger('SttProxyAPI')
export const dynamic = 'force-dynamic'
export const maxDuration = 300 // 5 minutes for large files
interface SttRequestBody {
provider: 'whisper' | 'deepgram' | 'elevenlabs'
apiKey: string
model?: string
audioFile?: UserFile | UserFile[]
audioFileReference?: UserFile | UserFile[]
audioUrl?: string
language?: string
timestamps?: 'none' | 'sentence' | 'word'
diarization?: boolean
translateToEnglish?: boolean
workspaceId?: string
workflowId?: string
executionId?: string
}
export async function POST(request: NextRequest) {
const requestId = crypto.randomUUID()
logger.info(`[${requestId}] STT transcription request started`)
try {
const authResult = await checkHybridAuth(request, { requireWorkflowId: false })
if (!authResult.success) {
return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
}
const body: SttRequestBody = await request.json()
const { provider, apiKey, model, language, timestamps, diarization, translateToEnglish } = body
if (!provider || !apiKey) {
return NextResponse.json(
{ error: 'Missing required fields: provider and apiKey' },
{ status: 400 }
)
}
let audioBuffer: Buffer
let audioFileName: string
let audioMimeType: string
if (body.audioFile) {
const file = Array.isArray(body.audioFile) ? body.audioFile[0] : body.audioFile
logger.info(`[${requestId}] Processing uploaded file: ${file.name}`)
audioBuffer = await downloadFileFromStorage(file, requestId, logger)
audioFileName = file.name
audioMimeType = file.type
} else if (body.audioFileReference) {
const file = Array.isArray(body.audioFileReference)
? body.audioFileReference[0]
: body.audioFileReference
logger.info(`[${requestId}] Processing referenced file: ${file.name}`)
audioBuffer = await downloadFileFromStorage(file, requestId, logger)
audioFileName = file.name
audioMimeType = file.type
} else if (body.audioUrl) {
logger.info(`[${requestId}] Downloading from URL: ${body.audioUrl}`)
const response = await fetch(body.audioUrl)
if (!response.ok) {
throw new Error(`Failed to download audio from URL: ${response.statusText}`)
}
const arrayBuffer = await response.arrayBuffer()
audioBuffer = Buffer.from(arrayBuffer)
audioFileName = body.audioUrl.split('/').pop() || 'audio_file'
audioMimeType = response.headers.get('content-type') || 'audio/mpeg'
} else {
return NextResponse.json(
{ error: 'No audio source provided. Provide audioFile, audioFileReference, or audioUrl' },
{ status: 400 }
)
}
if (isVideoFile(audioMimeType)) {
logger.info(`[${requestId}] Extracting audio from video file`)
try {
const extracted = await extractAudioFromVideo(audioBuffer, audioMimeType, {
outputFormat: 'mp3',
sampleRate: 16000,
channels: 1,
})
audioBuffer = extracted.buffer
audioMimeType = 'audio/mpeg'
audioFileName = audioFileName.replace(/\.[^.]+$/, '.mp3')
} catch (error) {
logger.error(`[${requestId}] Video extraction failed:`, error)
return NextResponse.json(
{
error: `Failed to extract audio from video: ${error instanceof Error ? error.message : 'Unknown error'}`,
},
{ status: 500 }
)
}
}
logger.info(`[${requestId}] Transcribing with ${provider}, file: ${audioFileName}`)
let transcript: string
let segments: TranscriptSegment[] | undefined
let detectedLanguage: string | undefined
let duration: number | undefined
let confidence: number | undefined
try {
if (provider === 'whisper') {
const result = await transcribeWithWhisper(
audioBuffer,
apiKey,
language,
timestamps,
translateToEnglish,
model
)
transcript = result.transcript
segments = result.segments
detectedLanguage = result.language
duration = result.duration
} else if (provider === 'deepgram') {
const result = await transcribeWithDeepgram(
audioBuffer,
apiKey,
language,
timestamps,
diarization,
model
)
transcript = result.transcript
segments = result.segments
detectedLanguage = result.language
duration = result.duration
confidence = result.confidence
} else if (provider === 'elevenlabs') {
const result = await transcribeWithElevenLabs(
audioBuffer,
apiKey,
language,
timestamps,
model
)
transcript = result.transcript
segments = result.segments
detectedLanguage = result.language
duration = result.duration
} else {
return NextResponse.json({ error: `Unknown provider: ${provider}` }, { status: 400 })
}
} catch (error) {
logger.error(`[${requestId}] Transcription failed:`, error)
const errorMessage = error instanceof Error ? error.message : 'Transcription failed'
return NextResponse.json({ error: errorMessage }, { status: 500 })
}
logger.info(`[${requestId}] Transcription completed successfully`)
return NextResponse.json({
transcript,
segments,
language: detectedLanguage,
duration,
confidence,
})
} catch (error) {
logger.error(`[${requestId}] STT proxy error:`, error)
const errorMessage = error instanceof Error ? error.message : 'Unknown error'
return NextResponse.json({ error: errorMessage }, { status: 500 })
}
}
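/**
* Call OpenAI's audio transcriptions/translations endpoint and normalize the result.
*/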
async function transcribeWithWhisper(
audioBuffer: Buffer,
apiKey: string,
language?: string,
timestamps?: 'none' | 'sentence' | 'word',
translate?: boolean,
model?: string
): Promise<{
transcript: string
segments?: TranscriptSegment[]
language?: string
duration?: number
}> {
const formData = new FormData()
const blob = new Blob([new Uint8Array(audioBuffer)], { type: 'audio/mpeg' })
formData.append('file', blob, 'audio.mp3')
formData.append('model', model || 'whisper-1')
if (language && language !== 'auto') {
formData.append('language', language)
}
if (timestamps === 'word') {
formData.append('response_format', 'verbose_json')
formData.append('timestamp_granularities[]', 'word')
} else if (timestamps === 'sentence') {
formData.append('response_format', 'verbose_json')
formData.append('timestamp_granularities[]', 'segment')
}
const endpoint = translate ? 'translations' : 'transcriptions'
const response = await fetch(`https://api.openai.com/v1/audio/${endpoint}`, {
method: 'POST',
headers: {
Authorization: `Bearer ${apiKey}`,
},
body: formData,
})
if (!response.ok) {
const error = await response.json()
const errorMessage = error.error?.message || error.message || JSON.stringify(error)
throw new Error(`Whisper API error: ${errorMessage}`)
}
const data = await response.json()
if (!timestamps || timestamps === 'none') {
return {
transcript: data.text,
language: data.language,
}
}
const segments: TranscriptSegment[] = (data.segments || data.words || []).map((seg: any) => ({
// verbose_json returns `text` on segments but `word` on word-level entries
text: seg.text ?? seg.word,
start: seg.start,
end: seg.end,
}))
return {
transcript: data.text,
segments,
language: data.language,
duration: data.duration,
}
}
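/**
* Call Deepgram's pre-recorded transcription endpoint and normalize the result.
*/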
async function transcribeWithDeepgram(
audioBuffer: Buffer,
apiKey: string,
language?: string,
timestamps?: 'none' | 'sentence' | 'word',
diarization?: boolean,
model?: string
): Promise<{
transcript: string
segments?: TranscriptSegment[]
language?: string
duration?: number
confidence?: number
}> {
const params = new URLSearchParams({
model: model || 'nova-3',
smart_format: 'true',
punctuate: 'true',
})
if (language && language !== 'auto') {
params.append('language', language)
}
if (timestamps !== 'none') {
params.append('utterances', 'true')
}
if (diarization) {
params.append('diarize', 'true')
}
const response = await fetch(`https://api.deepgram.com/v1/listen?${params.toString()}`, {
method: 'POST',
headers: {
Authorization: `Token ${apiKey}`,
'Content-Type': 'audio/mpeg',
},
body: new Uint8Array(audioBuffer),
})
if (!response.ok) {
const error = await response.json()
const errorMessage = error.err_msg || error.message || JSON.stringify(error)
throw new Error(`Deepgram API error: ${errorMessage}`)
}
const data = await response.json()
const result = data.results?.channels?.[0]?.alternatives?.[0]
if (!result) {
throw new Error('No transcription result from Deepgram')
}
const transcript = result.transcript
const detectedLanguage = data.results?.channels?.[0]?.detected_language
const confidence = result.confidence
let segments: TranscriptSegment[] | undefined
if (timestamps !== 'none' && result.words) {
segments = result.words.map((word: any) => ({
text: word.word,
start: word.start,
end: word.end,
speaker: word.speaker !== undefined ? `Speaker ${word.speaker}` : undefined,
confidence: word.confidence,
}))
}
return {
transcript,
segments,
language: detectedLanguage,
duration: data.metadata?.duration,
confidence,
}
}
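/**
* Call ElevenLabs' speech-to-text endpoint and normalize the result.
*/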
async function transcribeWithElevenLabs(
audioBuffer: Buffer,
apiKey: string,
language?: string,
timestamps?: 'none' | 'sentence' | 'word',
model?: string
): Promise<{
transcript: string
segments?: TranscriptSegment[]
language?: string
duration?: number
}> {
const formData = new FormData()
const blob = new Blob([new Uint8Array(audioBuffer)], { type: 'audio/mpeg' })
formData.append('file', blob, 'audio.mp3')
formData.append('model_id', model || 'scribe_v1')
if (language && language !== 'auto') {
formData.append('language', language)
}
const response = await fetch('https://api.elevenlabs.io/v1/speech-to-text', {
method: 'POST',
headers: {
'xi-api-key': apiKey,
},
body: formData,
})
if (!response.ok) {
const error = await response.json()
const errorMessage =
typeof error.detail === 'string'
? error.detail
: error.detail?.message || error.message || JSON.stringify(error)
throw new Error(`ElevenLabs API error: ${errorMessage}`)
}
const data = await response.json()
return {
transcript: data.text || '',
language: data.language,
duration: data.duration,
}
}

View File

@@ -144,6 +144,62 @@ export const TxtIcon: React.FC<IconProps> = ({ className = 'w-6 h-6' }) => (
</svg>
)
export const AudioIcon: React.FC<IconProps> = ({ className = 'w-6 h-6' }) => (
<svg viewBox='0 0 24 24' fill='none' xmlns='http://www.w3.org/2000/svg' className={className}>
<path
d='M14 2H6C4.9 2 4 2.9 4 4V20C4 21.1 4.9 22 6 22H18C19.1 22 20 21.1 20 20V8L14 2Z'
fill='#0288D1'
/>
<path d='M14 2V8H20' fill='#29B6F6' />
<path
d='M14 2L20 8V20C20 21.1 19.1 22 18 22H6C4.9 22 4 21.1 4 20V4C4 2.9 4.9 2 6 2H14Z'
stroke='#01579B'
strokeWidth='0.5'
strokeLinecap='round'
strokeLinejoin='round'
/>
{/* Speaker icon */}
<path d='M8.5 10.5v3c0 .28.22.5.5.5h1.5l2 2V8l-2 2H9c-.28 0-.5.22-.5.5z' fill='white' />
{/* Sound waves */}
<path
d='M14 10.5c.6.6.6 1.4 0 2M15.5 9c1.2 1.2 1.2 3.8 0 5'
stroke='white'
strokeWidth='0.8'
strokeLinecap='round'
/>
</svg>
)
export const VideoIcon: React.FC<IconProps> = ({ className = 'w-6 h-6' }) => (
<svg viewBox='0 0 24 24' fill='none' xmlns='http://www.w3.org/2000/svg' className={className}>
<path
d='M14 2H6C4.9 2 4 2.9 4 4V20C4 21.1 4.9 22 6 22H18C19.1 22 20 21.1 20 20V8L14 2Z'
fill='#D32F2F'
/>
<path d='M14 2V8H20' fill='#EF5350' />
<path
d='M14 2L20 8V20C20 21.1 19.1 22 18 22H6C4.9 22 4 21.1 4 20V4C4 2.9 4.9 2 6 2H14Z'
stroke='#B71C1C'
strokeWidth='0.5'
strokeLinecap='round'
strokeLinejoin='round'
/>
{/* Video screen */}
<rect
x='7.5'
y='9.5'
width='9'
height='6'
rx='0.5'
stroke='white'
strokeWidth='0.8'
fill='none'
/>
{/* Play button */}
<path d='M10.5 11.5l3 2-3 2v-4z' fill='white' />
</svg>
)
export const DefaultFileIcon: React.FC<IconProps> = ({ className = 'w-6 h-6' }) => (
<svg viewBox='0 0 24 24' fill='none' xmlns='http://www.w3.org/2000/svg' className={className}>
<path
@@ -164,13 +220,23 @@ export const DefaultFileIcon: React.FC<IconProps> = ({ className = 'w-6 h-6' })
</svg>
)
// Helper function to get the appropriate icon component
export function getDocumentIcon(mimeType: string, filename: string): React.FC<IconProps> {
const extension = filename.split('.').pop()?.toLowerCase()
const audioExtensions = ['mp3', 'm4a', 'wav', 'webm', 'ogg', 'flac', 'aac', 'opus']
if (mimeType.startsWith('audio/') || (extension && audioExtensions.includes(extension))) {
return AudioIcon
}
const videoExtensions = ['mp4', 'mov', 'avi', 'mkv']
if (mimeType.startsWith('video/') || (extension && videoExtensions.includes(extension))) {
return VideoIcon
}
if (mimeType === 'application/pdf' || extension === 'pdf') {
return PdfIcon
}
if (
mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
mimeType === 'application/msword' ||
@@ -179,6 +245,7 @@ export function getDocumentIcon(mimeType: string, filename: string): React.FC<Ic
) {
return DocxIcon
}
if (
mimeType === 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' ||
mimeType === 'application/vnd.ms-excel' ||
@@ -187,11 +254,14 @@ export function getDocumentIcon(mimeType: string, filename: string): React.FC<Ic
) {
return XlsxIcon
}
if (mimeType === 'text/csv' || extension === 'csv') {
return CsvIcon
}
if (mimeType === 'text/plain' || extension === 'txt') {
return TxtIcon
}
return DefaultFileIcon
}

View File

@@ -148,21 +148,29 @@ export function FileUpload({
const maxSizeInBytes = maxSize * 1024 * 1024
const validFiles: File[] = []
let totalNewSize = 0
let sizeExceededFile: string | null = null
for (let i = 0; i < files.length; i++) {
const file = files[i]
if (existingTotalSize + totalNewSize + file.size > maxSizeInBytes) {
const errorMessage = `Adding ${file.name} would exceed the maximum size limit of ${maxSize}MB`
logger.error(errorMessage, activeWorkflowId)
if (!sizeExceededFile) {
sizeExceededFile = errorMessage
}
} else {
validFiles.push(file)
totalNewSize += file.size
}
}
if (validFiles.length === 0) {
if (sizeExceededFile) {
setUploadError(sizeExceededFile)
setTimeout(() => setUploadError(null), 5000)
}
return
}
const uploading = validFiles.map((file) => ({
id: `upload-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`,

View File

@@ -32,6 +32,7 @@ const logger = createLogger('FileUploadsSettings')
const isBillingEnabled = isTruthy(getEnv('NEXT_PUBLIC_BILLING_ENABLED'))
const SUPPORTED_EXTENSIONS = [
// Documents
'pdf',
'csv',
'doc',
@@ -47,9 +48,23 @@ const SUPPORTED_EXTENSIONS = [
'json',
'yaml',
'yml',
// Audio formats
'mp3',
'm4a',
'wav',
'webm',
'ogg',
'flac',
'aac',
'opus',
// Video formats
'mp4',
'mov',
'avi',
'mkv',
] as const
const ACCEPT_ATTR =
'.pdf,.csv,.doc,.docx,.txt,.md,.xlsx,.xls,.html,.htm,.pptx,.ppt,.json,.yaml,.yml,.mp3,.m4a,.wav,.webm,.ogg,.flac,.aac,.opus,.mp4,.mov,.avi,.mkv'
export function Files() {
const params = useParams()

View File

@@ -0,0 +1,232 @@
import { AudioWaveformIcon } from '@/components/icons'
import { AuthMode, type BlockConfig } from '@/blocks/types'
import type { SttBlockResponse } from '@/tools/stt/types'
export const SttBlock: BlockConfig<SttBlockResponse> = {
type: 'stt',
name: 'Speech-to-Text',
description: 'Convert speech to text using AI',
authMode: AuthMode.ApiKey,
longDescription:
'Transcribe audio and video files to text using leading AI providers. Supports multiple languages, timestamps, and speaker diarization.',
docsLink: 'https://docs.sim.ai/tools/stt',
category: 'tools',
bgColor: '#181C1E',
icon: AudioWaveformIcon,
subBlocks: [
// Provider selection
{
id: 'provider',
title: 'Provider',
type: 'dropdown',
options: [
{ label: 'OpenAI Whisper', id: 'whisper' },
{ label: 'Deepgram', id: 'deepgram' },
{ label: 'ElevenLabs', id: 'elevenlabs' },
],
value: () => 'whisper',
required: true,
},
// OpenAI Whisper model selection
{
id: 'model',
title: 'Model',
type: 'dropdown',
condition: { field: 'provider', value: 'whisper' },
options: [{ label: 'Whisper-1', id: 'whisper-1' }],
value: () => 'whisper-1',
required: false,
},
// ElevenLabs model selection
{
id: 'model',
title: 'Model',
type: 'dropdown',
condition: { field: 'provider', value: 'elevenlabs' },
options: [
{ label: 'Scribe v1', id: 'scribe_v1' },
{ label: 'Scribe v1 Experimental', id: 'scribe_v1_experimental' },
],
value: () => 'scribe_v1',
required: false,
},
// Deepgram model selection
{
id: 'model',
title: 'Model',
type: 'dropdown',
condition: { field: 'provider', value: 'deepgram' },
options: [
{ label: 'Nova 3', id: 'nova-3' },
{ label: 'Nova 2', id: 'nova-2' },
{ label: 'Nova', id: 'nova' },
{ label: 'Whisper Large', id: 'whisper-large' },
{ label: 'Enhanced', id: 'enhanced' },
{ label: 'Base', id: 'base' },
],
value: () => 'nova-3',
required: false,
},
// Audio/Video file upload (basic mode)
{
id: 'audioFile',
title: 'Audio/Video File',
type: 'file-upload',
canonicalParamId: 'audioFile',
placeholder: 'Upload an audio or video file',
mode: 'basic',
multiple: false,
required: false,
acceptedTypes: '.mp3,.m4a,.wav,.webm,.ogg,.flac,.aac,.opus,.mp4,.mov,.avi,.mkv',
},
// Audio file reference (advanced mode)
{
id: 'audioFileReference',
title: 'Audio/Video File Reference',
type: 'short-input',
canonicalParamId: 'audioFile',
placeholder: 'Reference audio/video from previous blocks',
mode: 'advanced',
required: false,
},
// Audio/Video URL (alternative)
{
id: 'audioUrl',
title: 'Audio/Video URL (alternative)',
type: 'short-input',
placeholder: 'Or enter publicly accessible audio/video URL',
required: false,
},
// Language selection
{
id: 'language',
title: 'Language',
type: 'dropdown',
options: [
{ label: 'Auto-detect', id: 'auto' },
{ label: 'English', id: 'en' },
{ label: 'Spanish', id: 'es' },
{ label: 'French', id: 'fr' },
{ label: 'German', id: 'de' },
{ label: 'Italian', id: 'it' },
{ label: 'Portuguese', id: 'pt' },
{ label: 'Dutch', id: 'nl' },
{ label: 'Russian', id: 'ru' },
{ label: 'Chinese', id: 'zh' },
{ label: 'Japanese', id: 'ja' },
{ label: 'Korean', id: 'ko' },
{ label: 'Arabic', id: 'ar' },
{ label: 'Hindi', id: 'hi' },
{ label: 'Polish', id: 'pl' },
{ label: 'Turkish', id: 'tr' },
{ label: 'Swedish', id: 'sv' },
{ label: 'Danish', id: 'da' },
{ label: 'Norwegian', id: 'no' },
{ label: 'Finnish', id: 'fi' },
],
value: () => 'auto',
},
// Timestamps (word-level, sentence-level, or none)
{
id: 'timestamps',
title: 'Timestamps',
type: 'dropdown',
options: [
{ label: 'None', id: 'none' },
{ label: 'Sentence-level', id: 'sentence' },
{ label: 'Word-level', id: 'word' },
],
value: () => 'none',
},
// Speaker diarization (Deepgram only)
{
id: 'diarization',
title: 'Speaker Diarization',
type: 'switch',
condition: { field: 'provider', value: ['deepgram'] },
},
// Translate to English (Whisper only)
{
id: 'translateToEnglish',
title: 'Translate to English',
type: 'switch',
condition: { field: 'provider', value: 'whisper' },
},
// API Key
{
id: 'apiKey',
title: 'API Key',
type: 'short-input',
placeholder: 'Enter your API key',
password: true,
required: true,
},
],
tools: {
access: ['stt_whisper', 'stt_deepgram', 'stt_elevenlabs'],
config: {
tool: (params) => {
// Select tool based on provider
switch (params.provider) {
case 'whisper':
return 'stt_whisper'
case 'deepgram':
return 'stt_deepgram'
case 'elevenlabs':
return 'stt_elevenlabs'
default:
return 'stt_whisper'
}
},
params: (params) => ({
provider: params.provider,
apiKey: params.apiKey,
model: params.model,
audioFile: params.audioFile,
audioFileReference: params.audioFileReference,
audioUrl: params.audioUrl,
language: params.language,
timestamps: params.timestamps,
diarization: params.diarization,
translateToEnglish: params.translateToEnglish,
}),
},
},
inputs: {
provider: { type: 'string', description: 'STT provider (whisper, deepgram, elevenlabs)' },
apiKey: { type: 'string', description: 'Provider API key' },
model: {
type: 'string',
description: 'Provider-specific model (e.g., scribe_v1 for ElevenLabs, nova-3 for Deepgram)',
},
audioFile: { type: 'json', description: 'Audio/video file (UserFile)' },
audioFileReference: { type: 'json', description: 'Audio/video file reference' },
audioUrl: { type: 'string', description: 'Audio/video URL' },
language: { type: 'string', description: 'Language code or auto' },
timestamps: { type: 'string', description: 'Timestamp granularity (none, sentence, word)' },
diarization: { type: 'boolean', description: 'Enable speaker diarization' },
translateToEnglish: { type: 'boolean', description: 'Translate to English (Whisper only)' },
},
outputs: {
transcript: { type: 'string', description: 'Full transcribed text' },
segments: { type: 'array', description: 'Timestamped segments with speaker labels' },
language: { type: 'string', description: 'Detected or specified language' },
duration: { type: 'number', description: 'Audio duration in seconds' },
confidence: { type: 'number', description: 'Overall confidence score' },
},
}

View File

@@ -77,6 +77,7 @@ import { StagehandAgentBlock } from '@/blocks/blocks/stagehand_agent'
import { StartTriggerBlock } from '@/blocks/blocks/start_trigger'
import { StarterBlock } from '@/blocks/blocks/starter'
import { StripeBlock } from '@/blocks/blocks/stripe'
import { SttBlock } from '@/blocks/blocks/stt'
import { SupabaseBlock } from '@/blocks/blocks/supabase'
import { TavilyBlock } from '@/blocks/blocks/tavily'
import { TelegramBlock } from '@/blocks/blocks/telegram'
@@ -177,6 +178,7 @@ export const registry: Record<string, BlockConfig> = {
stagehand_agent: StagehandAgentBlock,
slack: SlackBlock,
starter: StarterBlock,
stt: SttBlock,
start_trigger: StartTriggerBlock,
input_trigger: InputTriggerBlock,
chat_trigger: ChatTriggerBlock,

View File

@@ -4084,3 +4084,27 @@ export function CalendlyIcon(props: SVGProps<SVGSVGElement>) {
</svg>
)
}
export function AudioWaveformIcon(props: SVGProps<SVGSVGElement>) {
return (
<svg
{...props}
xmlns='http://www.w3.org/2000/svg'
width='24'
height='24'
viewBox='0 0 24 24'
fill='none'
stroke='currentColor'
strokeWidth='2'
strokeLinecap='round'
strokeLinejoin='round'
>
<path d='M2 10v3' />
<path d='M6 6v11' />
<path d='M10 3v18' />
<path d='M14 8v7' />
<path d='M18 5v13' />
<path d='M22 10v3' />
</svg>
)
}

View File

@@ -0,0 +1,294 @@
import { execSync } from 'node:child_process'
import fs from 'node:fs/promises'
import os from 'node:os'
import path from 'node:path'
import ffmpegStatic from 'ffmpeg-static'
import ffmpeg from 'fluent-ffmpeg'
import type {
AudioExtractionOptions,
AudioExtractionResult,
AudioMetadata,
} from '@/lib/audio/types'
// Set ffmpeg binary path with fallback to system ffmpeg
try {
if (ffmpegStatic && typeof ffmpegStatic === 'string') {
ffmpeg.setFfmpegPath(ffmpegStatic)
} else {
// Try to find system ffmpeg
try {
const systemFfmpeg = execSync('which ffmpeg', { encoding: 'utf-8' }).trim()
if (systemFfmpeg) {
ffmpeg.setFfmpegPath(systemFfmpeg)
console.log('[FFmpeg] Using system ffmpeg:', systemFfmpeg)
}
} catch {
console.warn(
'[FFmpeg] ffmpeg-static not available and system ffmpeg not found. Please install ffmpeg: brew install ffmpeg (macOS) or apt-get install ffmpeg (Linux)'
)
}
}
} catch (error) {
console.warn('[FFmpeg] Failed to set ffmpeg path:', error)
}
/**
* Extract audio from video or convert audio format using FFmpeg
*/
export async function extractAudioFromVideo(
inputBuffer: Buffer,
mimeType: string,
options: AudioExtractionOptions = {}
): Promise<AudioExtractionResult> {
const isVideo = mimeType.startsWith('video/')
const isAudio = mimeType.startsWith('audio/')
// If it's already audio and no conversion needed, get metadata and return
if (isAudio && !options.outputFormat) {
try {
const metadata = await getAudioMetadata(inputBuffer, mimeType)
return {
buffer: inputBuffer,
format: mimeType.split('/')[1] || 'unknown',
duration: metadata.duration || 0,
size: inputBuffer.length,
}
} catch (error) {
// If metadata extraction fails, still return the buffer
return {
buffer: inputBuffer,
format: mimeType.split('/')[1] || 'unknown',
duration: 0,
size: inputBuffer.length,
}
}
}
// For video or audio conversion, use ffmpeg
if (isVideo || options.outputFormat) {
return await convertAudioWithFFmpeg(inputBuffer, mimeType, options)
}
// Fallback
return {
buffer: inputBuffer,
format: options.outputFormat || mimeType.split('/')[1] || 'unknown',
duration: 0,
size: inputBuffer.length,
}
}
/**
* Convert audio/video using FFmpeg
*/
async function convertAudioWithFFmpeg(
inputBuffer: Buffer,
mimeType: string,
options: AudioExtractionOptions
): Promise<AudioExtractionResult> {
// Create temporary files
const tempDir = os.tmpdir()
const inputExt = getExtensionFromMimeType(mimeType)
const outputFormat = options.outputFormat || 'mp3'
const inputFile = path.join(tempDir, `ffmpeg-input-${Date.now()}.${inputExt}`)
const outputFile = path.join(tempDir, `ffmpeg-output-${Date.now()}.${outputFormat}`)
try {
// Write input buffer to temporary file
await fs.writeFile(inputFile, inputBuffer)
// Get metadata for duration
let duration = 0
try {
const metadata = await getAudioMetadataFromFile(inputFile)
duration = metadata.duration || 0
} catch (error) {
// Metadata extraction failed, continue without duration
console.warn('Failed to extract metadata:', error)
}
// Convert using FFmpeg
await new Promise<void>((resolve, reject) => {
let command = ffmpeg(inputFile).toFormat(outputFormat).audioCodec(getAudioCodec(outputFormat))
// Apply audio options
if (options.channels) {
command = command.audioChannels(options.channels)
}
if (options.sampleRate) {
command = command.audioFrequency(options.sampleRate)
}
if (options.bitrate) {
command = command.audioBitrate(options.bitrate)
}
command
.on('end', () => resolve())
.on('error', (err) => reject(new Error(`FFmpeg error: ${err.message}`)))
.save(outputFile)
})
// Read output file
const outputBuffer = await fs.readFile(outputFile)
return {
buffer: outputBuffer,
format: outputFormat,
duration,
size: outputBuffer.length,
}
} finally {
// Clean up temporary files
try {
await fs.unlink(inputFile).catch(() => {})
await fs.unlink(outputFile).catch(() => {})
} catch (error) {
// Ignore cleanup errors
}
}
}
/**
* Get audio metadata using ffprobe
*/
export async function getAudioMetadata(buffer: Buffer, mimeType: string): Promise<AudioMetadata> {
const tempDir = os.tmpdir()
const inputExt = getExtensionFromMimeType(mimeType)
const inputFile = path.join(tempDir, `ffprobe-input-${Date.now()}.${inputExt}`)
try {
// Write buffer to temporary file
await fs.writeFile(inputFile, buffer)
// Get metadata using ffprobe
return await getAudioMetadataFromFile(inputFile)
} finally {
// Clean up temporary file
try {
await fs.unlink(inputFile).catch(() => {})
} catch (error) {
// Ignore cleanup errors
}
}
}
/**
* Get audio metadata from a file path using ffprobe
*/
async function getAudioMetadataFromFile(filePath: string): Promise<AudioMetadata> {
return new Promise((resolve, reject) => {
ffmpeg.ffprobe(filePath, (err, metadata) => {
if (err) {
reject(new Error(`FFprobe error: ${err.message}`))
return
}
const audioStream = metadata.streams.find((s) => s.codec_type === 'audio')
const format = metadata.format
resolve({
duration: format.duration || 0,
format: format.format_name || 'unknown',
codec: audioStream?.codec_name,
sampleRate: audioStream?.sample_rate,
channels: audioStream?.channels,
bitrate: format.bit_rate ? Number(format.bit_rate) : undefined,
})
})
})
}
/**
* Get file extension from MIME type
*/
function getExtensionFromMimeType(mimeType: string): string {
const mimeToExt: Record<string, string> = {
// Video
'video/mp4': 'mp4',
'video/quicktime': 'mov',
'video/x-msvideo': 'avi',
'video/x-matroska': 'mkv',
'video/webm': 'webm',
// Audio
'audio/mpeg': 'mp3',
'audio/mp4': 'm4a',
'audio/wav': 'wav',
'audio/webm': 'webm',
'audio/ogg': 'ogg',
'audio/flac': 'flac',
'audio/aac': 'aac',
'audio/opus': 'opus',
}
return mimeToExt[mimeType] || mimeType.split('/')[1] || 'dat'
}
/**
* Get appropriate audio codec for output format
*/
function getAudioCodec(format: string): string {
const codecMap: Record<string, string> = {
mp3: 'libmp3lame',
wav: 'pcm_s16le',
flac: 'flac',
m4a: 'aac',
aac: 'aac',
ogg: 'libvorbis',
opus: 'libopus',
}
return codecMap[format] || 'libmp3lame'
}
/**
* Check if a file is a video file
*/
export function isVideoFile(mimeType: string): boolean {
return mimeType.startsWith('video/')
}
/**
* Check if a file is an audio file
*/
export function isAudioFile(mimeType: string): boolean {
return mimeType.startsWith('audio/')
}
/**
* Get optimal audio format for STT provider
*/
export function getOptimalFormat(provider: 'whisper' | 'deepgram' | 'elevenlabs'): {
format: 'mp3' | 'wav' | 'flac'
sampleRate: number
channels: 1 | 2
} {
switch (provider) {
case 'whisper':
// Whisper prefers 16kHz mono
return {
format: 'mp3',
sampleRate: 16000,
channels: 1,
}
case 'deepgram':
// Deepgram works well with various formats
return {
format: 'mp3',
sampleRate: 16000,
channels: 1,
}
case 'elevenlabs':
// ElevenLabs format preferences
return {
format: 'mp3',
sampleRate: 16000,
channels: 1,
}
default:
return {
format: 'mp3',
sampleRate: 16000,
channels: 1,
}
}
}

View File

@@ -0,0 +1,22 @@
export interface AudioExtractionOptions {
outputFormat?: 'mp3' | 'wav' | 'flac'
sampleRate?: number
channels?: 1 | 2
bitrate?: string
}
export interface AudioExtractionResult {
buffer: Buffer
format: string
duration: number
size: number
}
export interface AudioMetadata {
duration: number
format: string
codec?: string
sampleRate?: number
channels?: number
bitrate?: number
}

View File

@@ -12,7 +12,7 @@ export interface FileAttachment {
}
export interface MessageContent {
type: 'text' | 'image' | 'document' | 'audio' | 'video'
text?: string
source?: {
type: 'base64'
@@ -24,7 +24,7 @@ export interface MessageContent {
/**
* Mapping of MIME types to content types
*/
export const MIME_TYPE_MAPPING: Record<string, 'image' | 'document' | 'audio' | 'video'> = {
// Images
'image/jpeg': 'image',
'image/jpg': 'image',
@@ -49,12 +49,40 @@ export const MIME_TYPE_MAPPING: Record<string, 'image' | 'document'> = {
'application/vnd.ms-powerpoint': 'document', // .ppt
'text/markdown': 'document',
'application/rtf': 'document',
// Audio
'audio/mpeg': 'audio', // .mp3
'audio/mp3': 'audio',
'audio/mp4': 'audio', // .m4a
'audio/x-m4a': 'audio',
'audio/m4a': 'audio',
'audio/wav': 'audio',
'audio/wave': 'audio',
'audio/x-wav': 'audio',
'audio/webm': 'audio',
'audio/ogg': 'audio',
'audio/vorbis': 'audio',
'audio/flac': 'audio',
'audio/x-flac': 'audio',
'audio/aac': 'audio',
'audio/x-aac': 'audio',
'audio/opus': 'audio',
// Video
'video/mp4': 'video',
'video/mpeg': 'video',
'video/quicktime': 'video', // .mov
'video/x-quicktime': 'video',
'video/x-msvideo': 'video', // .avi
'video/avi': 'video',
'video/x-matroska': 'video', // .mkv
'video/webm': 'video',
}
/**
* Get the content type for a given MIME type
*/
export function getContentType(mimeType: string): 'image' | 'document' | 'audio' | 'video' | null {
return MIME_TYPE_MAPPING[mimeType.toLowerCase()] || null
}
@@ -80,6 +108,28 @@ export function isImageFileType(mimeType: string): boolean {
return imageTypes.includes(mimeType.toLowerCase())
}
/**
* Check if a MIME type is an audio type
*/
export function isAudioFileType(mimeType: string): boolean {
return getContentType(mimeType) === 'audio'
}
/**
* Check if a MIME type is a video type
*/
export function isVideoFileType(mimeType: string): boolean {
return getContentType(mimeType) === 'video'
}
/**
* Check if a MIME type is an audio or video type
*/
export function isMediaFileType(mimeType: string): boolean {
const contentType = getContentType(mimeType)
return contentType === 'audio' || contentType === 'video'
}
/**
* Convert a file buffer to base64
*/
@@ -143,6 +193,22 @@ export function getMimeTypeFromExtension(extension: string): string {
ppt: 'application/vnd.ms-powerpoint',
md: 'text/markdown',
rtf: 'application/rtf',
// Audio
mp3: 'audio/mpeg',
m4a: 'audio/mp4',
wav: 'audio/wav',
webm: 'audio/webm',
ogg: 'audio/ogg',
flac: 'audio/flac',
aac: 'audio/aac',
opus: 'audio/opus',
// Video
mp4: 'video/mp4',
mov: 'video/quicktime',
avi: 'video/x-msvideo',
mkv: 'video/x-matroska',
}
return extensionMimeMap[extension.toLowerCase()] || 'application/octet-stream'

View File

@@ -20,7 +20,26 @@ export const SUPPORTED_DOCUMENT_EXTENSIONS = [
'yml',
] as const
export const SUPPORTED_AUDIO_EXTENSIONS = [
'mp3',
'm4a',
'wav',
'webm',
'ogg',
'flac',
'aac',
'opus',
] as const
export const SUPPORTED_VIDEO_EXTENSIONS = ['mp4', 'mov', 'avi', 'mkv', 'webm'] as const
export type SupportedDocumentExtension = (typeof SUPPORTED_DOCUMENT_EXTENSIONS)[number]
export type SupportedAudioExtension = (typeof SUPPORTED_AUDIO_EXTENSIONS)[number]
export type SupportedVideoExtension = (typeof SUPPORTED_VIDEO_EXTENSIONS)[number]
export type SupportedMediaExtension =
| SupportedDocumentExtension
| SupportedAudioExtension
| SupportedVideoExtension
export const SUPPORTED_MIME_TYPES: Record<SupportedDocumentExtension, string[]> = {
pdf: ['application/pdf', 'application/x-pdf'],
@@ -54,7 +73,33 @@ export const SUPPORTED_MIME_TYPES: Record<SupportedDocumentExtension, string[]>
yml: ['text/yaml', 'text/x-yaml', 'application/yaml', 'application/x-yaml'],
}
export const SUPPORTED_AUDIO_MIME_TYPES: Record<SupportedAudioExtension, string[]> = {
mp3: ['audio/mpeg', 'audio/mp3'],
m4a: ['audio/mp4', 'audio/x-m4a', 'audio/m4a'],
wav: ['audio/wav', 'audio/wave', 'audio/x-wav'],
webm: ['audio/webm'],
ogg: ['audio/ogg', 'audio/vorbis'],
flac: ['audio/flac', 'audio/x-flac'],
aac: ['audio/aac', 'audio/x-aac'],
opus: ['audio/opus'],
}
export const SUPPORTED_VIDEO_MIME_TYPES: Record<SupportedVideoExtension, string[]> = {
mp4: ['video/mp4', 'video/mpeg'],
mov: ['video/quicktime', 'video/x-quicktime'],
avi: ['video/x-msvideo', 'video/avi'],
mkv: ['video/x-matroska'],
webm: ['video/webm'],
}
export const ACCEPTED_FILE_TYPES = Object.values(SUPPORTED_MIME_TYPES).flat()
export const ACCEPTED_AUDIO_TYPES = Object.values(SUPPORTED_AUDIO_MIME_TYPES).flat()
export const ACCEPTED_VIDEO_TYPES = Object.values(SUPPORTED_VIDEO_MIME_TYPES).flat()
export const ACCEPTED_MEDIA_TYPES = [
...ACCEPTED_FILE_TYPES,
...ACCEPTED_AUDIO_TYPES,
...ACCEPTED_VIDEO_TYPES,
]
export const ACCEPTED_FILE_EXTENSIONS = SUPPORTED_DOCUMENT_EXTENSIONS.map((ext) => `.${ext}`)
@@ -110,5 +155,61 @@ export function getSupportedMimeTypes(extension: string): string[] {
if (isSupportedExtension(extension)) {
return SUPPORTED_MIME_TYPES[extension as SupportedDocumentExtension]
}
if (SUPPORTED_AUDIO_EXTENSIONS.includes(extension as SupportedAudioExtension)) {
return SUPPORTED_AUDIO_MIME_TYPES[extension as SupportedAudioExtension]
}
if (SUPPORTED_VIDEO_EXTENSIONS.includes(extension as SupportedVideoExtension)) {
return SUPPORTED_VIDEO_MIME_TYPES[extension as SupportedVideoExtension]
}
return []
}
/**
* Check if file extension is a supported audio extension
*/
export function isSupportedAudioExtension(extension: string): extension is SupportedAudioExtension {
return SUPPORTED_AUDIO_EXTENSIONS.includes(extension.toLowerCase() as SupportedAudioExtension)
}
/**
* Check if file extension is a supported video extension
*/
export function isSupportedVideoExtension(extension: string): extension is SupportedVideoExtension {
return SUPPORTED_VIDEO_EXTENSIONS.includes(extension.toLowerCase() as SupportedVideoExtension)
}
/**
* Validate if an audio/video file type is supported for STT processing
*/
export function validateMediaFileType(
fileName: string,
mimeType: string
): FileValidationError | null {
const extension = path.extname(fileName).toLowerCase().substring(1)
const isAudio = SUPPORTED_AUDIO_EXTENSIONS.includes(extension as SupportedAudioExtension)
const isVideo = SUPPORTED_VIDEO_EXTENSIONS.includes(extension as SupportedVideoExtension)
if (!isAudio && !isVideo) {
return {
code: 'UNSUPPORTED_FILE_TYPE',
message: `Unsupported media file type: ${extension}. Supported audio types: ${SUPPORTED_AUDIO_EXTENSIONS.join(', ')}. Supported video types: ${SUPPORTED_VIDEO_EXTENSIONS.join(', ')}`,
supportedTypes: [...SUPPORTED_AUDIO_EXTENSIONS, ...SUPPORTED_VIDEO_EXTENSIONS],
}
}
const baseMimeType = mimeType.split(';')[0].trim()
const allowedMimeTypes = isAudio
? SUPPORTED_AUDIO_MIME_TYPES[extension as SupportedAudioExtension]
: SUPPORTED_VIDEO_MIME_TYPES[extension as SupportedVideoExtension]
if (!allowedMimeTypes.includes(baseMimeType)) {
return {
code: 'MIME_TYPE_MISMATCH',
message: `MIME type ${baseMimeType} does not match file extension ${extension}. Expected: ${allowedMimeTypes.join(', ')}`,
supportedTypes: allowedMimeTypes,
}
}
return null
}

View File

@@ -75,7 +75,7 @@ const nextConfig: NextConfig = {
turbopack: {
resolveExtensions: ['.tsx', '.ts', '.jsx', '.js', '.mjs', '.json'],
},
serverExternalPackages: ['unpdf', 'ffmpeg-static', 'fluent-ffmpeg'],
experimental: {
optimizeCss: true,
turbopackSourceMaps: false,

View File

@@ -605,6 +605,7 @@ import {
stripeUpdateSubscriptionTool,
stripeVoidInvoiceTool,
} from '@/tools/stripe'
import { deepgramSttTool, elevenLabsSttTool, whisperSttTool } from '@/tools/stt'
import {
supabaseCountTool,
supabaseDeleteTool,
@@ -1050,6 +1051,9 @@ export const tools: Record<string, ToolConfig> = {
knowledge_upload_chunk: knowledgeUploadChunkTool,
knowledge_create_document: knowledgeCreateDocumentTool,
elevenlabs_tts: elevenLabsTtsTool,
stt_whisper: whisperSttTool,
stt_deepgram: deepgramSttTool,
stt_elevenlabs: elevenLabsSttTool,
s3_get_object: s3GetObjectTool,
s3_put_object: s3PutObjectTool,
s3_list_objects: s3ListObjectsTool,

View File

@@ -0,0 +1,125 @@
import type { SttParams, SttResponse } from '@/tools/stt/types'
import type { ToolConfig } from '@/tools/types'
export const deepgramSttTool: ToolConfig<SttParams, SttResponse> = {
id: 'stt_deepgram',
name: 'Deepgram STT',
description: 'Transcribe audio to text using Deepgram',
version: '1.0.0',
params: {
provider: {
type: 'string',
required: true,
visibility: 'user-only',
description: 'STT provider (deepgram)',
},
apiKey: {
type: 'string',
required: true,
visibility: 'user-only',
description: 'Deepgram API key',
},
model: {
type: 'string',
required: false,
visibility: 'user-or-llm',
description: 'Deepgram model to use (nova-3, nova-2, whisper-large, etc.)',
},
audioFile: {
type: 'file',
required: false,
visibility: 'user-or-llm',
description: 'Audio or video file to transcribe',
},
audioFileReference: {
type: 'file',
required: false,
visibility: 'user-or-llm',
description: 'Reference to audio/video file from previous blocks',
},
audioUrl: {
type: 'string',
required: false,
visibility: 'user-or-llm',
description: 'URL to audio or video file',
},
language: {
type: 'string',
required: false,
visibility: 'user-or-llm',
description: 'Language code (e.g., "en", "es", "fr") or "auto" for auto-detection',
},
timestamps: {
type: 'string',
required: false,
visibility: 'user-only',
description: 'Timestamp granularity: none, sentence, or word',
},
diarization: {
type: 'boolean',
required: false,
visibility: 'user-only',
description: 'Enable speaker diarization',
},
},
request: {
url: '/api/proxy/stt',
method: 'POST',
headers: () => ({
'Content-Type': 'application/json',
}),
body: (
params: SttParams & {
_context?: { workspaceId?: string; workflowId?: string; executionId?: string }
}
) => ({
provider: 'deepgram',
apiKey: params.apiKey,
model: params.model,
audioFile: params.audioFile,
audioFileReference: params.audioFileReference,
audioUrl: params.audioUrl,
language: params.language || 'auto',
timestamps: params.timestamps || 'none',
diarization: params.diarization || false,
workspaceId: params._context?.workspaceId,
workflowId: params._context?.workflowId,
executionId: params._context?.executionId,
}),
},
transformResponse: async (response: Response) => {
const data = await response.json()
if (!response.ok || data.error) {
return {
success: false,
error: data.error || 'Transcription failed',
output: {
transcript: '',
},
}
}
return {
success: true,
output: {
transcript: data.transcript,
segments: data.segments,
language: data.language,
duration: data.duration,
confidence: data.confidence,
},
}
},
outputs: {
transcript: { type: 'string', description: 'Full transcribed text' },
segments: { type: 'array', description: 'Timestamped segments with speaker labels' },
language: { type: 'string', description: 'Detected or specified language' },
duration: { type: 'number', description: 'Audio duration in seconds' },
confidence: { type: 'number', description: 'Overall confidence score' },
},
}

View File

@@ -0,0 +1,118 @@
import type { SttParams, SttResponse } from '@/tools/stt/types'
import type { ToolConfig } from '@/tools/types'
export const elevenLabsSttTool: ToolConfig<SttParams, SttResponse> = {
id: 'stt_elevenlabs',
name: 'ElevenLabs STT',
description: 'Transcribe audio to text using ElevenLabs',
version: '1.0.0',
params: {
provider: {
type: 'string',
required: true,
visibility: 'user-only',
description: 'STT provider (elevenlabs)',
},
apiKey: {
type: 'string',
required: true,
visibility: 'user-only',
description: 'ElevenLabs API key',
},
model: {
type: 'string',
required: false,
visibility: 'user-or-llm',
description: 'ElevenLabs model to use (scribe_v1, scribe_v1_experimental)',
},
audioFile: {
type: 'file',
required: false,
visibility: 'user-or-llm',
description: 'Audio or video file to transcribe',
},
audioFileReference: {
type: 'file',
required: false,
visibility: 'user-or-llm',
description: 'Reference to audio/video file from previous blocks',
},
audioUrl: {
type: 'string',
required: false,
visibility: 'user-or-llm',
description: 'URL to audio or video file',
},
language: {
type: 'string',
required: false,
visibility: 'user-or-llm',
description: 'Language code (e.g., "en", "es", "fr") or "auto" for auto-detection',
},
timestamps: {
type: 'string',
required: false,
visibility: 'user-only',
description: 'Timestamp granularity: none, sentence, or word',
},
},
request: {
url: '/api/proxy/stt',
method: 'POST',
headers: () => ({
'Content-Type': 'application/json',
}),
body: (
params: SttParams & {
_context?: { workspaceId?: string; workflowId?: string; executionId?: string }
}
) => ({
provider: 'elevenlabs',
apiKey: params.apiKey,
model: params.model,
audioFile: params.audioFile,
audioFileReference: params.audioFileReference,
audioUrl: params.audioUrl,
language: params.language || 'auto',
timestamps: params.timestamps || 'none',
workspaceId: params._context?.workspaceId,
workflowId: params._context?.workflowId,
executionId: params._context?.executionId,
}),
},
transformResponse: async (response: Response) => {
const data = await response.json()
if (!response.ok || data.error) {
return {
success: false,
error: data.error || 'Transcription failed',
output: {
transcript: '',
},
}
}
return {
success: true,
output: {
transcript: data.transcript,
segments: data.segments,
language: data.language,
duration: data.duration,
confidence: data.confidence,
},
}
},
outputs: {
transcript: { type: 'string', description: 'Full transcribed text' },
segments: { type: 'array', description: 'Timestamped segments' },
language: { type: 'string', description: 'Detected or specified language' },
duration: { type: 'number', description: 'Audio duration in seconds' },
confidence: { type: 'number', description: 'Overall confidence score' },
},
}

View File

@@ -0,0 +1,5 @@
import { deepgramSttTool } from '@/tools/stt/deepgram'
import { elevenLabsSttTool } from '@/tools/stt/elevenlabs'
import { whisperSttTool } from '@/tools/stt/whisper'
export { whisperSttTool, deepgramSttTool, elevenLabsSttTool }

View File

@@ -0,0 +1,62 @@
import type { UserFile } from '@/executor/types'
import type { ToolResponse } from '@/tools/types'
export interface SttParams {
provider: 'whisper' | 'deepgram' | 'elevenlabs'
apiKey: string
model?: string
audioFile?: UserFile | UserFile[]
audioFileReference?: UserFile | UserFile[]
audioUrl?: string
language?: string
timestamps?: 'none' | 'sentence' | 'word'
diarization?: boolean
translateToEnglish?: boolean
}
export interface TranscriptSegment {
text: string
start: number
end: number
speaker?: string
confidence?: number
}
export interface SttResponse extends ToolResponse {
output: {
transcript: string
segments?: TranscriptSegment[]
language?: string
duration?: number
confidence?: number
}
}
export interface SttBlockResponse extends ToolResponse {
output: {
transcript: string
segments?: TranscriptSegment[]
language?: string
duration?: number
confidence?: number
}
}
// Provider-specific types
export interface WhisperParams extends Omit<SttParams, 'provider'> {
model?: string
responseFormat?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt'
temperature?: number
}
export interface DeepgramParams extends Omit<SttParams, 'provider'> {
model?: string
punctuate?: boolean
paragraphs?: boolean
utterances?: boolean
}
export interface ElevenLabsSttParams extends Omit<SttParams, 'provider'> {
model?: string
}

View File

@@ -0,0 +1,125 @@
import type { SttParams, SttResponse } from '@/tools/stt/types'
import type { ToolConfig } from '@/tools/types'
export const whisperSttTool: ToolConfig<SttParams, SttResponse> = {
id: 'stt_whisper',
name: 'OpenAI Whisper STT',
description: 'Transcribe audio to text using OpenAI Whisper',
version: '1.0.0',
params: {
provider: {
type: 'string',
required: true,
visibility: 'user-only',
description: 'STT provider (whisper)',
},
apiKey: {
type: 'string',
required: true,
visibility: 'user-only',
description: 'OpenAI API key',
},
model: {
type: 'string',
required: false,
visibility: 'user-or-llm',
description: 'Whisper model to use (default: whisper-1)',
},
audioFile: {
type: 'file',
required: false,
visibility: 'user-or-llm',
description: 'Audio or video file to transcribe',
},
audioFileReference: {
type: 'file',
required: false,
visibility: 'user-or-llm',
description: 'Reference to audio/video file from previous blocks',
},
audioUrl: {
type: 'string',
required: false,
visibility: 'user-or-llm',
description: 'URL to audio or video file',
},
language: {
type: 'string',
required: false,
visibility: 'user-or-llm',
description: 'Language code (e.g., "en", "es", "fr") or "auto" for auto-detection',
},
timestamps: {
type: 'string',
required: false,
visibility: 'user-only',
description: 'Timestamp granularity: none, sentence, or word',
},
translateToEnglish: {
type: 'boolean',
required: false,
visibility: 'user-only',
description: 'Translate audio to English',
},
},
request: {
url: '/api/proxy/stt',
method: 'POST',
headers: () => ({
'Content-Type': 'application/json',
}),
body: (
params: SttParams & {
_context?: { workspaceId?: string; workflowId?: string; executionId?: string }
}
) => ({
provider: 'whisper',
apiKey: params.apiKey,
model: params.model,
audioFile: params.audioFile,
audioFileReference: params.audioFileReference,
audioUrl: params.audioUrl,
language: params.language || 'auto',
timestamps: params.timestamps || 'none',
translateToEnglish: params.translateToEnglish || false,
workspaceId: params._context?.workspaceId,
workflowId: params._context?.workflowId,
executionId: params._context?.executionId,
}),
},
transformResponse: async (response: Response) => {
const data = await response.json()
if (!response.ok || data.error) {
return {
success: false,
error: data.error || 'Transcription failed',
output: {
transcript: '',
},
}
}
return {
success: true,
output: {
transcript: data.transcript,
segments: data.segments,
language: data.language,
duration: data.duration,
confidence: data.confidence,
},
}
},
outputs: {
transcript: { type: 'string', description: 'Full transcribed text' },
segments: { type: 'array', description: 'Timestamped segments' },
language: { type: 'string', description: 'Detected or specified language' },
duration: { type: 'number', description: 'Audio duration in seconds' },
confidence: { type: 'number', description: 'Overall confidence score' },
},
}

View File

@@ -9,8 +9,11 @@
"@t3-oss/env-nextjs": "0.13.4",
"@tanstack/react-query": "5.90.8",
"@tanstack/react-query-devtools": "5.90.2",
"@types/fluent-ffmpeg": "2.1.28",
"cronstrue": "3.3.0",
"drizzle-orm": "^0.44.5",
"ffmpeg-static": "5.3.0",
"fluent-ffmpeg": "2.1.3",
"mongodb": "6.19.0",
"neo4j-driver": "6.0.1",
"onedollarstats": "0.0.10",
@@ -235,6 +238,7 @@
},
},
"trustedDependencies": [
"ffmpeg-static",
"sharp",
],
"overrides": {
@@ -496,6 +500,8 @@
"@csstools/css-tokenizer": ["@csstools/css-tokenizer@3.0.4", "", {}, "sha512-Vd/9EVDiu6PPJt9yAh6roZP6El1xHrdvIVGjyBsHR0RYwNHgL7FJPyIIW4fANJNG6FtyZfvlRPpFI4ZM/lubvw=="],
"@derhuerst/http-basic": ["@derhuerst/http-basic@8.2.4", "", { "dependencies": { "caseless": "^0.12.0", "concat-stream": "^2.0.0", "http-response-object": "^3.0.1", "parse-cache-control": "^1.0.1" } }, "sha512-F9rL9k9Xjf5blCz8HsJRO4diy111cayL2vkY2XE4r4t3n0yPXVYy3KD3nJ1qbrSn9743UWSXH4IwuCa/HWlGFw=="],
"@dimforge/rapier3d-compat": ["@dimforge/rapier3d-compat@0.12.0", "", {}, "sha512-uekIGetywIgopfD97oDL5PfeezkFpNhwlzlaEYNOA0N6ghdsOvh/HYjSMek5Q2O1PYvRSDFcqFVJl4r4ZBwOow=="],
"@drizzle-team/brocli": ["@drizzle-team/brocli@0.10.2", "", {}, "sha512-z33Il7l5dKjUgGULTqBsQBQwckHh5AbIuxhdsIxDDiZAzBOrZO6q9ogcWC65kU382AfynTfgNumVcNIjuIua6w=="],
@@ -1336,6 +1342,8 @@
"@types/estree-jsx": ["@types/estree-jsx@1.0.5", "", { "dependencies": { "@types/estree": "*" } }, "sha512-52CcUVNFyfb1A2ALocQw/Dd1BQFNmSdkuC3BkZ6iqhdMfQz7JWOFRuJFloOzjk+6WijU56m9oKXFAXc7o3Towg=="],
"@types/fluent-ffmpeg": ["@types/fluent-ffmpeg@2.1.28", "", { "dependencies": { "@types/node": "*" } }, "sha512-5ovxsDwBcPfJ+eYs1I/ZpcYCnkce7pvH9AHSvrZllAp1ZPpTRDZAFjF3TRFbukxSgIYTTNYePbS0rKUmaxVbXw=="],
"@types/geojson": ["@types/geojson@7946.0.16", "", {}, "sha512-6C8nqWur3j98U6+lXDfTUWIfgvZU+EumvpHKcYjujKH7woYyLj2sUmff0tRhrqM7BohUw7Pz3ZB1jj2gW9Fvmg=="],
"@types/hast": ["@types/hast@3.0.4", "", { "dependencies": { "@types/unist": "*" } }, "sha512-WPs+bbQw5aCj+x6laNGWLH3wviHtoCv/P3+otBhbOhJgG8qtpdAMlTCxLtsTWA7LH1Oh/bFCHsBn0TPS5m30EQ=="],
@@ -1470,6 +1478,8 @@
"astring": ["astring@1.9.0", "", { "bin": { "astring": "bin/astring" } }, "sha512-LElXdjswlqjWrPpJFg1Fx4wpkOCxj1TDHlSV4PlaRxHGWko024xICaa97ZkMfs6DRKlCguiAI+rbXv5GWwXIkg=="],
"async": ["async@0.2.10", "", {}, "sha512-eAkdoKxU6/LkKDBzLpT+t6Ff5EtfSF4wx1WfJiPEEV7WNLnDaRXk0oVysiEPm262roaachGexwUv94WhSgN5TQ=="],
"asynckit": ["asynckit@0.4.0", "", {}, "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="],
"atomic-sleep": ["atomic-sleep@1.0.0", "", {}, "sha512-kNOjDqAh7px0XWNI+4QbzoiR/nTkHAWNud2uvnJquD1/x5a7EQZMJT0AczqK0Qn67oY/TTQ1LbUKajZpp3I9tQ=="],
@@ -1550,6 +1560,8 @@
"caniuse-lite": ["caniuse-lite@1.0.30001745", "", {}, "sha512-ywt6i8FzvdgrrrGbr1jZVObnVv6adj+0if2/omv9cmR2oiZs30zL4DIyaptKcbOrBdOIc74QTMoJvSE2QHh5UQ=="],
"caseless": ["caseless@0.12.0", "", {}, "sha512-4tYFyifaFfGacoiObjJegolkwSU4xQNGbVgUiNYVUxbQ2x2lUsFvY4hVgVzGiIe6WLOPqycWXA40l+PWsxthUw=="],
"ccount": ["ccount@2.0.1", "", {}, "sha512-eyrF0jiFpY+3drT6383f1qhkbGsLSifNAjA61IUjZjmLCWjItY6LB9ft9YhoDgwfmclB2zhu51Lc7+95b8NRAg=="],
"cfb": ["cfb@1.2.2", "", { "dependencies": { "adler-32": "~1.3.0", "crc-32": "~1.2.0" } }, "sha512-KfdUZsSOw19/ObEWasvBP/Ac4reZvAGauZhs6S/gqNhXhI7cKwvlH7ulj+dOEYnca4bm4SGo8C1bTAQvnTjgQA=="],
@@ -1818,6 +1830,8 @@
"entities": ["entities@6.0.1", "", {}, "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g=="],
"env-paths": ["env-paths@2.2.1", "", {}, "sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A=="],
"environment": ["environment@1.1.0", "", {}, "sha512-xUtoPkMggbz0MPyPiIWr1Kp4aeWJjDZ6SMvURhimjdZgsRuDplF5/s9hcgGhyXMhs+6vpnuoiZ2kFiu3FMnS8Q=="],
"error": ["error@7.0.2", "", { "dependencies": { "string-template": "~0.2.1", "xtend": "~4.0.0" } }, "sha512-UtVv4l5MhijsYUxPJo4390gzfZvAnTHreNnDjnTZaKIiZ/SemXxAhBkYSKtWa5RtBXbLP8tMgn/n0RUa/H7jXw=="],
@@ -1916,6 +1930,8 @@
"fflate": ["fflate@0.8.2", "", {}, "sha512-cPJU47OaAoCbg0pBvzsgpTPhmhqI5eJjh/JIu8tPj5q+T7iLvW/JAYUqmE7KOB4R1ZyEhzBaIQpQpardBF5z8A=="],
"ffmpeg-static": ["ffmpeg-static@5.3.0", "", { "dependencies": { "@derhuerst/http-basic": "^8.2.0", "env-paths": "^2.2.0", "https-proxy-agent": "^5.0.0", "progress": "^2.0.3" } }, "sha512-H+K6sW6TiIX6VGend0KQwthe+kaceeH/luE8dIZyOP35ik7ahYojDuqlTV1bOrtEwl01sy2HFNGQfi5IDJvotg=="],
"figures": ["figures@3.2.0", "", { "dependencies": { "escape-string-regexp": "^1.0.5" } }, "sha512-yaduQFRKLXYOGgEn6AZau90j3ggSOyiqXU0F9JZfeXYhNa+Jk4X+s45A2zg5jns87GAFa34BBm2kXw4XpNcbdg=="],
"file-type": ["file-type@16.5.4", "", { "dependencies": { "readable-web-to-node-stream": "^3.0.0", "strtok3": "^6.2.4", "token-types": "^4.1.1" } }, "sha512-/yFHK0aGjFEgDJjEKP0pWCplsPFPhwyfwevf/pVxiN0tmE4L9LmwWxWukdJSHdoCli4VgQLehjJtwQBnqmsKcw=="],
@@ -1924,6 +1940,8 @@
"finalhandler": ["finalhandler@2.1.0", "", { "dependencies": { "debug": "^4.4.0", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "on-finished": "^2.4.1", "parseurl": "^1.3.3", "statuses": "^2.0.1" } }, "sha512-/t88Ty3d5JWQbWYgaOGCCYfXRwV1+be02WqYYlL6h0lEiUAMPM8o8qKGO01YIkOHzka2up08wvgYD0mDiI+q3Q=="],
"fluent-ffmpeg": ["fluent-ffmpeg@2.1.3", "", { "dependencies": { "async": "^0.2.9", "which": "^1.1.1" } }, "sha512-Be3narBNt2s6bsaqP6Jzq91heDgOEaDCJAXcE3qcma/EJBSy5FB4cvO31XBInuAuKBx8Kptf8dkhjK0IOru39Q=="],
"follow-redirects": ["follow-redirects@1.15.11", "", {}, "sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ=="],
"foreground-child": ["foreground-child@3.3.1", "", { "dependencies": { "cross-spawn": "^7.0.6", "signal-exit": "^4.0.1" } }, "sha512-gIXjKqtFuWEgzFRJA9WCQeSJLZDjgJUOMCMzxtvFq/37KojM1BFGufqsCy0r4qSQmYLsZYMeyRqzIWOMup03sw=="],
@@ -2050,6 +2068,8 @@
"http-proxy-agent": ["http-proxy-agent@7.0.2", "", { "dependencies": { "agent-base": "^7.1.0", "debug": "^4.3.4" } }, "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig=="],
"http-response-object": ["http-response-object@3.0.2", "", { "dependencies": { "@types/node": "^10.0.3" } }, "sha512-bqX0XTF6fnXSQcEJ2Iuyr75yVakyjIDCqroJQ/aHfSdlM743Cwqoi2nDYMzLGWUcuTWGWy8AAvOKXTfiv6q9RA=="],
"https-proxy-agent": ["https-proxy-agent@5.0.1", "", { "dependencies": { "agent-base": "6", "debug": "4" } }, "sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA=="],
"human-signals": ["human-signals@5.0.0", "", {}, "sha512-AXcZb6vzzrFAUE61HnN4mpLqd/cSIwNQjtNWR0euPm6y0iqx3G4gOXaIDdtdDwZmhwe82LA6+zinmW4UBWVePQ=="],
@@ -2538,6 +2558,8 @@
"papaparse": ["papaparse@5.5.3", "", {}, "sha512-5QvjGxYVjxO59MGU2lHVYpRWBBtKHnlIAcSe1uNFCkkptUh63NFRj0FJQm7nR67puEruUci/ZkjmEFrjCAyP4A=="],
"parse-cache-control": ["parse-cache-control@1.0.1", "", {}, "sha512-60zvsJReQPX5/QP0Kzfd/VrpjScIQ7SHBW6bFCYfEP+fp0Eppr1SHhIO5nd1PjZtvclzSzES9D/p5nFJurwfWg=="],
"parse-css-color": ["parse-css-color@0.2.1", "", { "dependencies": { "color-name": "^1.1.4", "hex-rgb": "^4.1.0" } }, "sha512-bwS/GGIFV3b6KS4uwpzCFj4w297Yl3uqnSgIPsoQkx7GMLROXfMnWvxfNkL0oh8HVhZA4hvJoEoEIqonfJ3BWg=="],
"parse-entities": ["parse-entities@4.0.2", "", { "dependencies": { "@types/unist": "^2.0.0", "character-entities-legacy": "^3.0.0", "character-reference-invalid": "^2.0.0", "decode-named-character-reference": "^1.0.0", "is-alphanumerical": "^2.0.0", "is-decimal": "^2.0.0", "is-hexadecimal": "^2.0.0" } }, "sha512-GG2AQYWoLgL877gQIKeRPGO1xF9+eG1ujIb5soS5gPvLQ1y2o8FL90w2QWNdf9I361Mpp7726c+lj3U0qK1uGw=="],
@@ -2638,6 +2660,8 @@
"process-warning": ["process-warning@5.0.0", "", {}, "sha512-a39t9ApHNx2L4+HBnQKqxxHNs1r7KF+Intd8Q/g1bUh6q0WIp9voPXJ/x0j+ZL45KF1pJd9+q2jLIRMfvEshkA=="],
"progress": ["progress@2.0.3", "", {}, "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA=="],
"prom-client": ["prom-client@15.1.3", "", { "dependencies": { "@opentelemetry/api": "^1.4.0", "tdigest": "^0.1.1" } }, "sha512-6ZiOBfCywsD4k1BN9IX0uZhF+tJkV8q8llP64G5Hajs4JOeVLPCwpPVcpXy3BwYiUGgyJzsJJQeOIv7+hDSq8g=="],
"prompts": ["prompts@2.4.2", "", { "dependencies": { "kleur": "^3.0.3", "sisteransi": "^1.0.5" } }, "sha512-NxNv/kLguCA7p3jE8oL2aEBsrJWgAakBpgmgK6lpPWV+WuOmY6r2/zbAVnP+T8bQlA0nzHXSJSJW0Hq7ylaD2Q=="],
@@ -3140,7 +3164,7 @@
"whatwg-url": ["whatwg-url@14.2.0", "", { "dependencies": { "tr46": "^5.1.0", "webidl-conversions": "^7.0.0" } }, "sha512-De72GdQZzNTUBBChsXueQUnPKDkg/5A5zp7pFDuQAj5UFoENpiACU0wlCvzpAGnTkj++ihpKwKyYewn/XNUbKw=="],
"which": ["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="],
"which": ["which@1.3.1", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "which": "./bin/which" } }, "sha512-HxJdYWq1MTIQbJ3nw0cqssHoTNU267KlrDuGZ1WYlxDStUtKUhOaJmh112/TZmHxxUfuJqPXSOm7tDyas0OSIQ=="],
"why-is-node-running": ["why-is-node-running@2.3.0", "", { "dependencies": { "siginfo": "^2.0.0", "stackback": "0.0.2" }, "bin": { "why-is-node-running": "cli.js" } }, "sha512-hUrmaWBdVDcxvYqnyh09zunKzROWjbZTiNy8dBEjkS7ehEDQibXJ7XvlmtbwuTclUiIyN+CyXQD4Vmko8fNm8w=="],
@@ -3418,6 +3442,8 @@
"@types/cors/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="],
"@types/fluent-ffmpeg/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="],
"@types/jsdom/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="],
"@types/node-fetch/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="],
@@ -3454,6 +3480,8 @@
"content-disposition/safe-buffer": ["safe-buffer@5.2.1", "", {}, "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ=="],
"cross-spawn/which": ["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="],
"dom-serializer/entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="],
"ecdsa-sig-formatter/safe-buffer": ["safe-buffer@5.2.1", "", {}, "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ=="],
@@ -3508,6 +3536,8 @@
"http-proxy-agent/agent-base": ["agent-base@7.1.4", "", {}, "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ=="],
"http-response-object/@types/node": ["@types/node@10.17.60", "", {}, "sha512-F0KIgDJfy2nA3zMLmWGKxcH2ZVEtCZXHHdOQs2gSaQ27+lNeEfGxzkIw90aXswATX7AZ33tahPbzy6KAfUreVw=="],
"inquirer/ora": ["ora@5.4.1", "", { "dependencies": { "bl": "^4.1.0", "chalk": "^4.1.0", "cli-cursor": "^3.1.0", "cli-spinners": "^2.5.0", "is-interactive": "^1.0.0", "is-unicode-supported": "^0.1.0", "log-symbols": "^4.1.0", "strip-ansi": "^6.0.0", "wcwidth": "^1.0.1" } }, "sha512-5b6Y85tPxZZ7QytO+BQzysW31HJku27cRIlkbAXaNx+BdcVi+LlRFmVXzeF6a7JCwJpyw5c4b+YSVImQIrBpuQ=="],
"isomorphic-unfetch/node-fetch": ["node-fetch@2.7.0", "", { "dependencies": { "whatwg-url": "^5.0.0" }, "peerDependencies": { "encoding": "^0.1.0" }, "optionalPeers": ["encoding"] }, "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A=="],
@@ -3766,6 +3796,8 @@
"@types/cors/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="],
"@types/fluent-ffmpeg/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="],
"@types/jsdom/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="],
"@types/node-fetch/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="],

View File

@@ -78,7 +78,8 @@ FROM base AS runner
WORKDIR /app
# Install Python and dependencies for guardrails PII detection (cached separately)
-RUN apk add --no-cache python3 py3-pip bash
+# Also install ffmpeg for audio/video processing in STT
+RUN apk add --no-cache python3 py3-pip bash ffmpeg
ENV NODE_ENV=production

View File

@@ -39,8 +39,11 @@
"@t3-oss/env-nextjs": "0.13.4",
"@tanstack/react-query": "5.90.8",
"@tanstack/react-query-devtools": "5.90.2",
"@types/fluent-ffmpeg": "2.1.28",
"cronstrue": "3.3.0",
"drizzle-orm": "^0.44.5",
"ffmpeg-static": "5.3.0",
"fluent-ffmpeg": "2.1.3",
"mongodb": "6.19.0",
"neo4j-driver": "6.0.1",
"onedollarstats": "0.0.10",
@@ -63,5 +66,8 @@
"*.{js,jsx,ts,tsx,json,css,scss}": [
"biome check --write --no-errors-on-unmatched --files-ignore-unknown=true"
]
}
},
"trustedDependencies": [
"ffmpeg-static"
]
}
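
Taken together, the ffmpeg additions (the apk-installed binary in the runner image, plus `ffmpeg-static` and `fluent-ffmpeg` in both package manifests) point at extracting an audio track from uploaded video before transcription. A minimal sketch of that wiring; the helper name, the `FFMPEG_PATH` env var, and the 16 kHz mono WAV target are assumptions, not taken from this commit:

```ts
import ffmpegStatic from 'ffmpeg-static'
import ffmpeg from 'fluent-ffmpeg'

// Prefer a system ffmpeg (e.g. the apk-installed binary in the Docker image);
// fall back to the binary bundled by ffmpeg-static for local development.
ffmpeg.setFfmpegPath(process.env.FFMPEG_PATH ?? ffmpegStatic ?? 'ffmpeg')

// Hypothetical helper: strip the video track and downmix to 16 kHz mono WAV,
// a common least-surprise input format for STT providers.
function extractAudio(inputPath: string, outputPath: string): Promise<string> {
  return new Promise((resolve, reject) => {
    ffmpeg(inputPath)
      .noVideo()
      .audioChannels(1)
      .audioFrequency(16000)
      .format('wav')
      .on('end', () => resolve(outputPath))
      .on('error', reject)
      .save(outputPath)
  })
}

// Usage (illustrative paths):
// await extractAudio('/tmp/upload.mp4', '/tmp/upload.wav')
```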