From e64b1c9fcd0eb6ba0d7cf90814dc0d919668e41f Mon Sep 17 00:00:00 2001 From: Waleed Date: Wed, 19 Nov 2025 21:03:54 -0800 Subject: [PATCH] feat(tools): added speech to text with openai whisper, elevenlabs, and deepgram (#2068) * feat(tools): added speech to text with openai whisper, elevenlabs, and deepgram * added new file icons, implemented ffmpeg * updated docs * revert environment --- apps/docs/components/icons.tsx | 24 ++ apps/docs/components/ui/icon-mapping.ts | 2 + apps/docs/content/docs/en/tools/calendly.mdx | 14 + apps/docs/content/docs/en/tools/meta.json | 1 + apps/docs/content/docs/en/tools/stt.mdx | 122 ++++++ apps/sim/app/api/files/upload/route.ts | 24 +- apps/sim/app/api/proxy/stt/route.ts | 375 ++++++++++++++++++ .../components/icons/document-icons.tsx | 72 +++- .../components/file-upload/file-upload.tsx | 18 +- .../settings-modal/components/files/files.tsx | 17 +- apps/sim/blocks/blocks/stt.ts | 232 +++++++++++ apps/sim/blocks/registry.ts | 2 + apps/sim/components/icons.tsx | 24 ++ apps/sim/lib/audio/extractor.ts | 294 ++++++++++++++ apps/sim/lib/audio/types.ts | 22 + apps/sim/lib/uploads/utils/file-utils.ts | 72 +++- apps/sim/lib/uploads/utils/validation.ts | 101 +++++ apps/sim/next.config.ts | 2 +- apps/sim/tools/registry.ts | 4 + apps/sim/tools/stt/deepgram.ts | 125 ++++++ apps/sim/tools/stt/elevenlabs.ts | 118 ++++++ apps/sim/tools/stt/index.ts | 5 + apps/sim/tools/stt/types.ts | 62 +++ apps/sim/tools/stt/whisper.ts | 125 ++++++ bun.lock | 34 +- docker/app.Dockerfile | 3 +- package.json | 8 +- 27 files changed, 1884 insertions(+), 18 deletions(-) create mode 100644 apps/docs/content/docs/en/tools/stt.mdx create mode 100644 apps/sim/app/api/proxy/stt/route.ts create mode 100644 apps/sim/blocks/blocks/stt.ts create mode 100644 apps/sim/lib/audio/extractor.ts create mode 100644 apps/sim/lib/audio/types.ts create mode 100644 apps/sim/tools/stt/deepgram.ts create mode 100644 apps/sim/tools/stt/elevenlabs.ts create mode 100644 apps/sim/tools/stt/index.ts create mode 100644 apps/sim/tools/stt/types.ts create mode 100644 apps/sim/tools/stt/whisper.ts diff --git a/apps/docs/components/icons.tsx b/apps/docs/components/icons.tsx index 5b45022db..9562e82bc 100644 --- a/apps/docs/components/icons.tsx +++ b/apps/docs/components/icons.tsx @@ -4084,3 +4084,27 @@ export function CalendlyIcon(props: SVGProps) { ) } + +export function AudioWaveformIcon(props: SVGProps) { + return ( + + + + + + + + + ) +} diff --git a/apps/docs/components/ui/icon-mapping.ts b/apps/docs/components/ui/icon-mapping.ts index 50f5c8406..4d57eae8a 100644 --- a/apps/docs/components/ui/icon-mapping.ts +++ b/apps/docs/components/ui/icon-mapping.ts @@ -8,6 +8,7 @@ import { ApolloIcon, ArxivIcon, AsanaIcon, + AudioWaveformIcon, BrainIcon, BrowserUseIcon, CalendlyIcon, @@ -100,6 +101,7 @@ export const blockTypeToIconMap: Record = { telegram: TelegramIcon, tavily: TavilyIcon, supabase: SupabaseIcon, + stt: AudioWaveformIcon, stripe: StripeIcon, stagehand_agent: StagehandIcon, stagehand: StagehandIcon, diff --git a/apps/docs/content/docs/en/tools/calendly.mdx b/apps/docs/content/docs/en/tools/calendly.mdx index ad053ec11..d1772f84c 100644 --- a/apps/docs/content/docs/en/tools/calendly.mdx +++ b/apps/docs/content/docs/en/tools/calendly.mdx @@ -10,6 +10,20 @@ import { BlockInfoCard } from "@/components/ui/block-info-card" color="#FFFFFF" /> +{/* MANUAL-CONTENT-START:intro */} +[Calendly](https://calendly.com/) is a popular scheduling automation platform that helps you book meetings, events, and appointments with ease. 
With Calendly, teams and individuals can streamline scheduling, reduce back-and-forth emails, and automate tasks around events. + +With the Sim Calendly integration, your agents can: + +- **Retrieve information about your account and scheduled events**: Use tools to fetch user info, event types, and scheduled events for analysis or automation. +- **Manage event types and scheduling**: Access and list available event types for users or organizations, retrieve details about specific event types, and monitor scheduled meetings and invitee data. +- **Automate follow-ups and workflows**: When users schedule, reschedule, or cancel meetings, Sim agents can automatically trigger corresponding workflows—such as sending reminders, updating CRMs, or notifying participants. +- **Integrate easily using webhooks**: Set up Sim workflows to respond to real-time Calendly webhook events, including when invitees schedule, cancel, or interact with routing forms. + +Whether you want to automate meeting prep, manage invites, or run custom workflows in response to scheduling activity, the Calendly tools in Sim give you flexible and secure access. Unlock new automation by reacting instantly to scheduling changes—streamlining your team's operations and communications. +{/* MANUAL-CONTENT-END */} + + ## Usage Instructions Integrate Calendly into your workflow. Manage event types, scheduled events, invitees, and webhooks. Can also trigger workflows based on Calendly webhook events (invitee scheduled, invitee canceled, routing form submitted). Requires Personal Access Token. diff --git a/apps/docs/content/docs/en/tools/meta.json b/apps/docs/content/docs/en/tools/meta.json index ab3280ed6..437ad185c 100644 --- a/apps/docs/content/docs/en/tools/meta.json +++ b/apps/docs/content/docs/en/tools/meta.json @@ -61,6 +61,7 @@ "stagehand", "stagehand_agent", "stripe", + "stt", "supabase", "tavily", "telegram", diff --git a/apps/docs/content/docs/en/tools/stt.mdx b/apps/docs/content/docs/en/tools/stt.mdx new file mode 100644 index 000000000..2132b8c51 --- /dev/null +++ b/apps/docs/content/docs/en/tools/stt.mdx @@ -0,0 +1,122 @@ +--- +title: Speech-to-Text +description: Convert speech to text using AI +--- + +import { BlockInfoCard } from "@/components/ui/block-info-card" + + + +{/* MANUAL-CONTENT-START:intro */} +Transcribe speech to text using state-of-the-art AI models from leading providers. The Sim Speech-to-Text (STT) tools allow you to convert audio and video files into accurate transcripts, supporting multiple languages, timestamps, and optional translation. + +Supported providers: + +- **[OpenAI Whisper](https://platform.openai.com/docs/guides/speech-to-text/overview)**: Advanced open-source STT model from OpenAI. Supports models such as `whisper-1` and handles a wide variety of languages and audio formats. +- **[Deepgram](https://deepgram.com/)**: Real-time and batch STT API with deep learning models like `nova-3`, `nova-2`, and `whisper-large`. Offers features like diarization, intent recognition, and industry-specific tuning. +- **[ElevenLabs](https://elevenlabs.io/)**: Known for high-quality speech AI, ElevenLabs provides STT models focused on accuracy and natural language understanding for numerous languages and dialects. + +Choose the provider and model best suited to your task—whether fast, production-grade transcription (Deepgram), highly accurate multi-language capability (Whisper), or advanced understanding and language coverage (ElevenLabs). 
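+
+Under the hood, every provider is called through Sim's `/api/proxy/stt` endpoint, which accepts an uploaded file, a file reference from a previous block, or a publicly accessible URL. As a minimal sketch (the API key and audio URL below are placeholders), a direct transcription request looks like this:
+
+```typescript
+// Minimal sketch: transcribe a publicly accessible audio file via the STT proxy.
+// The API key and URL are placeholders; the STT block normally fills these in.
+const response = await fetch('/api/proxy/stt', {
+  method: 'POST',
+  headers: { 'Content-Type': 'application/json' },
+  body: JSON.stringify({
+    provider: 'whisper', // 'whisper' | 'deepgram' | 'elevenlabs'
+    apiKey: process.env.OPENAI_API_KEY,
+    model: 'whisper-1',
+    audioUrl: 'https://example.com/meeting.mp3',
+    language: 'auto', // or an ISO code such as 'en'
+    timestamps: 'sentence', // 'none' | 'sentence' | 'word'
+  }),
+})
+
+const { transcript, segments, language, duration } = await response.json()
+console.log(transcript)
+```
+
+Video files (`.mp4`, `.mov`, `.avi`, `.mkv`) are handled the same way: the proxy extracts the audio track with FFmpeg before sending it to the provider.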
+{/* MANUAL-CONTENT-END */} + + +## Usage Instructions + +Transcribe audio and video files to text using leading AI providers. Supports multiple languages, timestamps, and speaker diarization. + + + +## Tools + +### `stt_whisper` + +Transcribe audio to text using OpenAI Whisper + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `provider` | string | Yes | STT provider \(whisper\) | +| `apiKey` | string | Yes | OpenAI API key | +| `model` | string | No | Whisper model to use \(default: whisper-1\) | +| `audioFile` | file | No | Audio or video file to transcribe | +| `audioFileReference` | file | No | Reference to audio/video file from previous blocks | +| `audioUrl` | string | No | URL to audio or video file | +| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection | +| `timestamps` | string | No | Timestamp granularity: none, sentence, or word | +| `translateToEnglish` | boolean | No | Translate audio to English | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `transcript` | string | Full transcribed text | +| `segments` | array | Timestamped segments | +| `language` | string | Detected or specified language | +| `duration` | number | Audio duration in seconds | +| `confidence` | number | Overall confidence score | + +### `stt_deepgram` + +Transcribe audio to text using Deepgram + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `provider` | string | Yes | STT provider \(deepgram\) | +| `apiKey` | string | Yes | Deepgram API key | +| `model` | string | No | Deepgram model to use \(nova-3, nova-2, whisper-large, etc.\) | +| `audioFile` | file | No | Audio or video file to transcribe | +| `audioFileReference` | file | No | Reference to audio/video file from previous blocks | +| `audioUrl` | string | No | URL to audio or video file | +| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection | +| `timestamps` | string | No | Timestamp granularity: none, sentence, or word | +| `diarization` | boolean | No | Enable speaker diarization | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `transcript` | string | Full transcribed text | +| `segments` | array | Timestamped segments with speaker labels | +| `language` | string | Detected or specified language | +| `duration` | number | Audio duration in seconds | +| `confidence` | number | Overall confidence score | + +### `stt_elevenlabs` + +Transcribe audio to text using ElevenLabs + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `provider` | string | Yes | STT provider \(elevenlabs\) | +| `apiKey` | string | Yes | ElevenLabs API key | +| `model` | string | No | ElevenLabs model to use \(scribe_v1, scribe_v1_experimental\) | +| `audioFile` | file | No | Audio or video file to transcribe | +| `audioFileReference` | file | No | Reference to audio/video file from previous blocks | +| `audioUrl` | string | No | URL to audio or video file | +| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection | +| `timestamps` | string | No | Timestamp granularity: none, sentence, or word | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `transcript` | string | Full transcribed text | +| `segments` | array | Timestamped 
segments | +| `language` | string | Detected or specified language | +| `duration` | number | Audio duration in seconds | +| `confidence` | number | Overall confidence score | + + + +## Notes + +- Category: `tools` +- Type: `stt` diff --git a/apps/sim/app/api/files/upload/route.ts b/apps/sim/app/api/files/upload/route.ts index d9c099481..c2f629fa4 100644 --- a/apps/sim/app/api/files/upload/route.ts +++ b/apps/sim/app/api/files/upload/route.ts @@ -13,21 +13,37 @@ import { } from '@/app/api/files/utils' const ALLOWED_EXTENSIONS = new Set([ + // Documents 'pdf', 'doc', 'docx', 'txt', 'md', - 'png', - 'jpg', - 'jpeg', - 'gif', 'csv', 'xlsx', 'xls', 'json', 'yaml', 'yml', + // Images + 'png', + 'jpg', + 'jpeg', + 'gif', + // Audio + 'mp3', + 'm4a', + 'wav', + 'webm', + 'ogg', + 'flac', + 'aac', + 'opus', + // Video + 'mp4', + 'mov', + 'avi', + 'mkv', ]) function validateFileExtension(filename: string): boolean { diff --git a/apps/sim/app/api/proxy/stt/route.ts b/apps/sim/app/api/proxy/stt/route.ts new file mode 100644 index 000000000..7e30e7564 --- /dev/null +++ b/apps/sim/app/api/proxy/stt/route.ts @@ -0,0 +1,375 @@ +import { type NextRequest, NextResponse } from 'next/server' +import { extractAudioFromVideo, isVideoFile } from '@/lib/audio/extractor' +import { checkHybridAuth } from '@/lib/auth/hybrid' +import { createLogger } from '@/lib/logs/console/logger' +import { downloadFileFromStorage } from '@/lib/uploads/utils/file-utils.server' +import type { UserFile } from '@/executor/types' +import type { TranscriptSegment } from '@/tools/stt/types' + +const logger = createLogger('SttProxyAPI') + +export const dynamic = 'force-dynamic' +export const maxDuration = 300 // 5 minutes for large files + +interface SttRequestBody { + provider: 'whisper' | 'deepgram' | 'elevenlabs' + apiKey: string + model?: string + audioFile?: UserFile | UserFile[] + audioFileReference?: UserFile | UserFile[] + audioUrl?: string + language?: string + timestamps?: 'none' | 'sentence' | 'word' + diarization?: boolean + translateToEnglish?: boolean + workspaceId?: string + workflowId?: string + executionId?: string +} + +export async function POST(request: NextRequest) { + const requestId = crypto.randomUUID() + logger.info(`[${requestId}] STT transcription request started`) + + try { + const authResult = await checkHybridAuth(request, { requireWorkflowId: false }) + if (!authResult.success) { + return NextResponse.json({ error: 'Unauthorized' }, { status: 401 }) + } + + const body: SttRequestBody = await request.json() + const { provider, apiKey, model, language, timestamps, diarization, translateToEnglish } = body + + if (!provider || !apiKey) { + return NextResponse.json( + { error: 'Missing required fields: provider and apiKey' }, + { status: 400 } + ) + } + + let audioBuffer: Buffer + let audioFileName: string + let audioMimeType: string + + if (body.audioFile) { + const file = Array.isArray(body.audioFile) ? body.audioFile[0] : body.audioFile + logger.info(`[${requestId}] Processing uploaded file: ${file.name}`) + + audioBuffer = await downloadFileFromStorage(file, requestId, logger) + audioFileName = file.name + audioMimeType = file.type + } else if (body.audioFileReference) { + const file = Array.isArray(body.audioFileReference) + ? 
body.audioFileReference[0] + : body.audioFileReference + logger.info(`[${requestId}] Processing referenced file: ${file.name}`) + + audioBuffer = await downloadFileFromStorage(file, requestId, logger) + audioFileName = file.name + audioMimeType = file.type + } else if (body.audioUrl) { + logger.info(`[${requestId}] Downloading from URL: ${body.audioUrl}`) + + const response = await fetch(body.audioUrl) + if (!response.ok) { + throw new Error(`Failed to download audio from URL: ${response.statusText}`) + } + + const arrayBuffer = await response.arrayBuffer() + audioBuffer = Buffer.from(arrayBuffer) + audioFileName = body.audioUrl.split('/').pop() || 'audio_file' + audioMimeType = response.headers.get('content-type') || 'audio/mpeg' + } else { + return NextResponse.json( + { error: 'No audio source provided. Provide audioFile, audioFileReference, or audioUrl' }, + { status: 400 } + ) + } + + if (isVideoFile(audioMimeType)) { + logger.info(`[${requestId}] Extracting audio from video file`) + try { + const extracted = await extractAudioFromVideo(audioBuffer, audioMimeType, { + outputFormat: 'mp3', + sampleRate: 16000, + channels: 1, + }) + audioBuffer = extracted.buffer + audioMimeType = 'audio/mpeg' + audioFileName = audioFileName.replace(/\.[^.]+$/, '.mp3') + } catch (error) { + logger.error(`[${requestId}] Video extraction failed:`, error) + return NextResponse.json( + { + error: `Failed to extract audio from video: ${error instanceof Error ? error.message : 'Unknown error'}`, + }, + { status: 500 } + ) + } + } + + logger.info(`[${requestId}] Transcribing with ${provider}, file: ${audioFileName}`) + + let transcript: string + let segments: TranscriptSegment[] | undefined + let detectedLanguage: string | undefined + let duration: number | undefined + let confidence: number | undefined + + try { + if (provider === 'whisper') { + const result = await transcribeWithWhisper( + audioBuffer, + apiKey, + language, + timestamps, + translateToEnglish, + model + ) + transcript = result.transcript + segments = result.segments + detectedLanguage = result.language + duration = result.duration + } else if (provider === 'deepgram') { + const result = await transcribeWithDeepgram( + audioBuffer, + apiKey, + language, + timestamps, + diarization, + model + ) + transcript = result.transcript + segments = result.segments + detectedLanguage = result.language + duration = result.duration + confidence = result.confidence + } else if (provider === 'elevenlabs') { + const result = await transcribeWithElevenLabs( + audioBuffer, + apiKey, + language, + timestamps, + model + ) + transcript = result.transcript + segments = result.segments + detectedLanguage = result.language + duration = result.duration + } else { + return NextResponse.json({ error: `Unknown provider: ${provider}` }, { status: 400 }) + } + } catch (error) { + logger.error(`[${requestId}] Transcription failed:`, error) + const errorMessage = error instanceof Error ? error.message : 'Transcription failed' + return NextResponse.json({ error: errorMessage }, { status: 500 }) + } + + logger.info(`[${requestId}] Transcription completed successfully`) + + return NextResponse.json({ + transcript, + segments, + language: detectedLanguage, + duration, + confidence, + }) + } catch (error) { + logger.error(`[${requestId}] STT proxy error:`, error) + const errorMessage = error instanceof Error ? 
error.message : 'Unknown error' + return NextResponse.json({ error: errorMessage }, { status: 500 }) + } +} + +async function transcribeWithWhisper( + audioBuffer: Buffer, + apiKey: string, + language?: string, + timestamps?: 'none' | 'sentence' | 'word', + translate?: boolean, + model?: string +): Promise<{ + transcript: string + segments?: TranscriptSegment[] + language?: string + duration?: number +}> { + const formData = new FormData() + + const blob = new Blob([new Uint8Array(audioBuffer)], { type: 'audio/mpeg' }) + formData.append('file', blob, 'audio.mp3') + formData.append('model', model || 'whisper-1') + + if (language && language !== 'auto') { + formData.append('language', language) + } + + if (timestamps === 'word') { + formData.append('response_format', 'verbose_json') + formData.append('timestamp_granularities[]', 'word') + } else if (timestamps === 'sentence') { + formData.append('response_format', 'verbose_json') + formData.append('timestamp_granularities[]', 'segment') + } + + const endpoint = translate ? 'translations' : 'transcriptions' + const response = await fetch(`https://api.openai.com/v1/audio/${endpoint}`, { + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + }, + body: formData, + }) + + if (!response.ok) { + const error = await response.json() + const errorMessage = error.error?.message || error.message || JSON.stringify(error) + throw new Error(`Whisper API error: ${errorMessage}`) + } + + const data = await response.json() + + if (timestamps === 'none') { + return { + transcript: data.text, + language: data.language, + } + } + const segments: TranscriptSegment[] = (data.segments || data.words || []).map((seg: any) => ({ + text: seg.text, + start: seg.start, + end: seg.end, + })) + + return { + transcript: data.text, + segments, + language: data.language, + duration: data.duration, + } +} + +async function transcribeWithDeepgram( + audioBuffer: Buffer, + apiKey: string, + language?: string, + timestamps?: 'none' | 'sentence' | 'word', + diarization?: boolean, + model?: string +): Promise<{ + transcript: string + segments?: TranscriptSegment[] + language?: string + duration?: number + confidence?: number +}> { + const params = new URLSearchParams({ + model: model || 'nova-3', + smart_format: 'true', + punctuate: 'true', + }) + + if (language && language !== 'auto') { + params.append('language', language) + } + + if (timestamps !== 'none') { + params.append('utterances', 'true') + } + + if (diarization) { + params.append('diarize', 'true') + } + + const response = await fetch(`https://api.deepgram.com/v1/listen?${params.toString()}`, { + method: 'POST', + headers: { + Authorization: `Token ${apiKey}`, + 'Content-Type': 'audio/mpeg', + }, + body: new Uint8Array(audioBuffer), + }) + + if (!response.ok) { + const error = await response.json() + const errorMessage = error.err_msg || error.message || JSON.stringify(error) + throw new Error(`Deepgram API error: ${errorMessage}`) + } + + const data = await response.json() + const result = data.results?.channels?.[0]?.alternatives?.[0] + + if (!result) { + throw new Error('No transcription result from Deepgram') + } + + const transcript = result.transcript + const detectedLanguage = data.results?.channels?.[0]?.detected_language + const confidence = result.confidence + + let segments: TranscriptSegment[] | undefined + if (timestamps !== 'none' && result.words) { + segments = result.words.map((word: any) => ({ + text: word.word, + start: word.start, + end: word.end, + speaker: word.speaker !== undefined ? 
`Speaker ${word.speaker}` : undefined, + confidence: word.confidence, + })) + } + + return { + transcript, + segments, + language: detectedLanguage, + duration: data.metadata?.duration, + confidence, + } +} + +async function transcribeWithElevenLabs( + audioBuffer: Buffer, + apiKey: string, + language?: string, + timestamps?: 'none' | 'sentence' | 'word', + model?: string +): Promise<{ + transcript: string + segments?: TranscriptSegment[] + language?: string + duration?: number +}> { + const formData = new FormData() + const blob = new Blob([new Uint8Array(audioBuffer)], { type: 'audio/mpeg' }) + formData.append('file', blob, 'audio.mp3') + formData.append('model_id', model || 'scribe_v1') + + if (language && language !== 'auto') { + formData.append('language', language) + } + + const response = await fetch('https://api.elevenlabs.io/v1/speech-to-text', { + method: 'POST', + headers: { + 'xi-api-key': apiKey, + }, + body: formData, + }) + + if (!response.ok) { + const error = await response.json() + const errorMessage = + typeof error.detail === 'string' + ? error.detail + : error.detail?.message || error.message || JSON.stringify(error) + throw new Error(`ElevenLabs API error: ${errorMessage}`) + } + + const data = await response.json() + + return { + transcript: data.text || '', + language: data.language, + duration: data.duration, + } +} diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/components/icons/document-icons.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/components/icons/document-icons.tsx index 7d4fb162d..dbbfe9840 100644 --- a/apps/sim/app/workspace/[workspaceId]/knowledge/components/icons/document-icons.tsx +++ b/apps/sim/app/workspace/[workspaceId]/knowledge/components/icons/document-icons.tsx @@ -144,6 +144,62 @@ export const TxtIcon: React.FC = ({ className = 'w-6 h-6' }) => ( ) +export const AudioIcon: React.FC = ({ className = 'w-6 h-6' }) => ( + + + + + {/* Speaker icon */} + + {/* Sound waves */} + + +) + +export const VideoIcon: React.FC = ({ className = 'w-6 h-6' }) => ( + + + + + {/* Video screen */} + + {/* Play button */} + + +) + export const DefaultFileIcon: React.FC = ({ className = 'w-6 h-6' }) => ( = ({ className = 'w-6 h-6' }) ) -// Helper function to get the appropriate icon component export function getDocumentIcon(mimeType: string, filename: string): React.FC { const extension = filename.split('.').pop()?.toLowerCase() + const audioExtensions = ['mp3', 'm4a', 'wav', 'webm', 'ogg', 'flac', 'aac', 'opus'] + if (mimeType.startsWith('audio/') || (extension && audioExtensions.includes(extension))) { + return AudioIcon + } + + const videoExtensions = ['mp4', 'mov', 'avi', 'mkv'] + if (mimeType.startsWith('video/') || (extension && videoExtensions.includes(extension))) { + return VideoIcon + } + if (mimeType === 'application/pdf' || extension === 'pdf') { return PdfIcon } + if ( mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' || mimeType === 'application/msword' || @@ -179,6 +245,7 @@ export function getDocumentIcon(mimeType: string, filename: string): React.FC maxSizeInBytes) { - logger.error( - `Adding ${file.name} would exceed the maximum size limit of ${maxSize}MB`, - activeWorkflowId - ) + const errorMessage = `Adding ${file.name} would exceed the maximum size limit of ${maxSize}MB` + logger.error(errorMessage, activeWorkflowId) + if (!sizeExceededFile) { + sizeExceededFile = errorMessage + } } else { validFiles.push(file) totalNewSize += file.size } } - if (validFiles.length === 0) return + if 
(validFiles.length === 0) { + if (sizeExceededFile) { + setUploadError(sizeExceededFile) + setTimeout(() => setUploadError(null), 5000) + } + return + } const uploading = validFiles.map((file) => ({ id: `upload-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`, diff --git a/apps/sim/app/workspace/[workspaceId]/w/components/sidebar/components-new/settings-modal/components/files/files.tsx b/apps/sim/app/workspace/[workspaceId]/w/components/sidebar/components-new/settings-modal/components/files/files.tsx index 0e79ebf39..fd1acde0a 100644 --- a/apps/sim/app/workspace/[workspaceId]/w/components/sidebar/components-new/settings-modal/components/files/files.tsx +++ b/apps/sim/app/workspace/[workspaceId]/w/components/sidebar/components-new/settings-modal/components/files/files.tsx @@ -32,6 +32,7 @@ const logger = createLogger('FileUploadsSettings') const isBillingEnabled = isTruthy(getEnv('NEXT_PUBLIC_BILLING_ENABLED')) const SUPPORTED_EXTENSIONS = [ + // Documents 'pdf', 'csv', 'doc', @@ -47,9 +48,23 @@ const SUPPORTED_EXTENSIONS = [ 'json', 'yaml', 'yml', + // Audio formats + 'mp3', + 'm4a', + 'wav', + 'webm', + 'ogg', + 'flac', + 'aac', + 'opus', + // Video formats + 'mp4', + 'mov', + 'avi', + 'mkv', ] as const const ACCEPT_ATTR = - '.pdf,.csv,.doc,.docx,.txt,.md,.xlsx,.xls,.html,.htm,.pptx,.ppt,.json,.yaml,.yml' + '.pdf,.csv,.doc,.docx,.txt,.md,.xlsx,.xls,.html,.htm,.pptx,.ppt,.json,.yaml,.yml,.mp3,.m4a,.wav,.webm,.ogg,.flac,.aac,.opus,.mp4,.mov,.avi,.mkv' export function Files() { const params = useParams() diff --git a/apps/sim/blocks/blocks/stt.ts b/apps/sim/blocks/blocks/stt.ts new file mode 100644 index 000000000..98a53a3ab --- /dev/null +++ b/apps/sim/blocks/blocks/stt.ts @@ -0,0 +1,232 @@ +import { AudioWaveformIcon } from '@/components/icons' +import { AuthMode, type BlockConfig } from '@/blocks/types' +import type { SttBlockResponse } from '@/tools/stt/types' + +export const SttBlock: BlockConfig = { + type: 'stt', + name: 'Speech-to-Text', + description: 'Convert speech to text using AI', + authMode: AuthMode.ApiKey, + longDescription: + 'Transcribe audio and video files to text using leading AI providers. 
Supports multiple languages, timestamps, and speaker diarization.', + docsLink: 'https://docs.sim.ai/tools/stt', + category: 'tools', + bgColor: '#181C1E', + icon: AudioWaveformIcon, + + subBlocks: [ + // Provider selection + { + id: 'provider', + title: 'Provider', + type: 'dropdown', + options: [ + { label: 'OpenAI Whisper', id: 'whisper' }, + { label: 'Deepgram', id: 'deepgram' }, + { label: 'ElevenLabs', id: 'elevenlabs' }, + ], + value: () => 'whisper', + required: true, + }, + + // OpenAI Whisper model selection + { + id: 'model', + title: 'Model', + type: 'dropdown', + condition: { field: 'provider', value: 'whisper' }, + options: [{ label: 'Whisper-1', id: 'whisper-1' }], + value: () => 'whisper-1', + required: false, + }, + + // ElevenLabs model selection + { + id: 'model', + title: 'Model', + type: 'dropdown', + condition: { field: 'provider', value: 'elevenlabs' }, + options: [ + { label: 'Scribe v1', id: 'scribe_v1' }, + { label: 'Scribe v1 Experimental', id: 'scribe_v1_experimental' }, + ], + value: () => 'scribe_v1', + required: false, + }, + + // Deepgram model selection + { + id: 'model', + title: 'Model', + type: 'dropdown', + condition: { field: 'provider', value: 'deepgram' }, + options: [ + { label: 'Nova 3', id: 'nova-3' }, + { label: 'Nova 2', id: 'nova-2' }, + { label: 'Nova', id: 'nova' }, + { label: 'Whisper Large', id: 'whisper-large' }, + { label: 'Enhanced', id: 'enhanced' }, + { label: 'Base', id: 'base' }, + ], + value: () => 'nova-3', + required: false, + }, + + // Audio/Video file upload (basic mode) + { + id: 'audioFile', + title: 'Audio/Video File', + type: 'file-upload', + canonicalParamId: 'audioFile', + placeholder: 'Upload an audio or video file', + mode: 'basic', + multiple: false, + required: false, + acceptedTypes: '.mp3,.m4a,.wav,.webm,.ogg,.flac,.aac,.opus,.mp4,.mov,.avi,.mkv', + }, + + // Audio file reference (advanced mode) + { + id: 'audioFileReference', + title: 'Audio/Video File Reference', + type: 'short-input', + canonicalParamId: 'audioFile', + placeholder: 'Reference audio/video from previous blocks', + mode: 'advanced', + required: false, + }, + + // Audio URL (alternative) + { + id: 'audioUrl', + title: 'Audio/Video URL (alternative)', + type: 'short-input', + placeholder: 'Or enter publicly accessible audio/video URL', + required: false, + }, + + // Language selection + { + id: 'language', + title: 'Language', + type: 'dropdown', + options: [ + { label: 'Auto-detect', id: 'auto' }, + { label: 'English', id: 'en' }, + { label: 'Spanish', id: 'es' }, + { label: 'French', id: 'fr' }, + { label: 'German', id: 'de' }, + { label: 'Italian', id: 'it' }, + { label: 'Portuguese', id: 'pt' }, + { label: 'Dutch', id: 'nl' }, + { label: 'Russian', id: 'ru' }, + { label: 'Chinese', id: 'zh' }, + { label: 'Japanese', id: 'ja' }, + { label: 'Korean', id: 'ko' }, + { label: 'Arabic', id: 'ar' }, + { label: 'Hindi', id: 'hi' }, + { label: 'Polish', id: 'pl' }, + { label: 'Turkish', id: 'tr' }, + { label: 'Swedish', id: 'sv' }, + { label: 'Danish', id: 'da' }, + { label: 'Norwegian', id: 'no' }, + { label: 'Finnish', id: 'fi' }, + ], + value: () => 'auto', + }, + + // Timestamps (word-level, sentence-level, or none) + { + id: 'timestamps', + title: 'Timestamps', + type: 'dropdown', + options: [ + { label: 'None', id: 'none' }, + { label: 'Sentence-level', id: 'sentence' }, + { label: 'Word-level', id: 'word' }, + ], + value: () => 'none', + }, + + // Speaker diarization (Deepgram/AssemblyAI only) + { + id: 'diarization', + title: 'Speaker Diarization', 
+ type: 'switch', + condition: { field: 'provider', value: ['deepgram'] }, + }, + + // Translate to English (Whisper only) + { + id: 'translateToEnglish', + title: 'Translate to English', + type: 'switch', + condition: { field: 'provider', value: 'whisper' }, + }, + + // API Key + { + id: 'apiKey', + title: 'API Key', + type: 'short-input', + placeholder: 'Enter your API key', + password: true, + required: true, + }, + ], + + tools: { + access: ['stt_whisper', 'stt_deepgram', 'stt_elevenlabs'], + config: { + tool: (params) => { + // Select tool based on provider + switch (params.provider) { + case 'whisper': + return 'stt_whisper' + case 'deepgram': + return 'stt_deepgram' + case 'elevenlabs': + return 'stt_elevenlabs' + default: + return 'stt_whisper' + } + }, + params: (params) => ({ + provider: params.provider, + apiKey: params.apiKey, + model: params.model, + audioFile: params.audioFile, + audioFileReference: params.audioFileReference, + audioUrl: params.audioUrl, + language: params.language, + timestamps: params.timestamps, + diarization: params.diarization, + translateToEnglish: params.translateToEnglish, + }), + }, + }, + + inputs: { + provider: { type: 'string', description: 'STT provider (whisper, deepgram, elevenlabs)' }, + apiKey: { type: 'string', description: 'Provider API key' }, + model: { + type: 'string', + description: 'Provider-specific model (e.g., scribe_v1 for ElevenLabs, nova-3 for Deepgram)', + }, + audioFile: { type: 'json', description: 'Audio/video file (UserFile)' }, + audioFileReference: { type: 'json', description: 'Audio/video file reference' }, + audioUrl: { type: 'string', description: 'Audio/video URL' }, + language: { type: 'string', description: 'Language code or auto' }, + timestamps: { type: 'string', description: 'Timestamp granularity (none, sentence, word)' }, + diarization: { type: 'boolean', description: 'Enable speaker diarization' }, + translateToEnglish: { type: 'boolean', description: 'Translate to English (Whisper only)' }, + }, + + outputs: { + transcript: { type: 'string', description: 'Full transcribed text' }, + segments: { type: 'array', description: 'Timestamped segments with speaker labels' }, + language: { type: 'string', description: 'Detected or specified language' }, + duration: { type: 'number', description: 'Audio duration in seconds' }, + confidence: { type: 'number', description: 'Overall confidence score' }, + }, +} diff --git a/apps/sim/blocks/registry.ts b/apps/sim/blocks/registry.ts index f144ffdc7..99de2386d 100644 --- a/apps/sim/blocks/registry.ts +++ b/apps/sim/blocks/registry.ts @@ -77,6 +77,7 @@ import { StagehandAgentBlock } from '@/blocks/blocks/stagehand_agent' import { StartTriggerBlock } from '@/blocks/blocks/start_trigger' import { StarterBlock } from '@/blocks/blocks/starter' import { StripeBlock } from '@/blocks/blocks/stripe' +import { SttBlock } from '@/blocks/blocks/stt' import { SupabaseBlock } from '@/blocks/blocks/supabase' import { TavilyBlock } from '@/blocks/blocks/tavily' import { TelegramBlock } from '@/blocks/blocks/telegram' @@ -177,6 +178,7 @@ export const registry: Record = { stagehand_agent: StagehandAgentBlock, slack: SlackBlock, starter: StarterBlock, + stt: SttBlock, start_trigger: StartTriggerBlock, input_trigger: InputTriggerBlock, chat_trigger: ChatTriggerBlock, diff --git a/apps/sim/components/icons.tsx b/apps/sim/components/icons.tsx index 5b45022db..9562e82bc 100644 --- a/apps/sim/components/icons.tsx +++ b/apps/sim/components/icons.tsx @@ -4084,3 +4084,27 @@ export function 
CalendlyIcon(props: SVGProps) { ) } + +export function AudioWaveformIcon(props: SVGProps) { + return ( + + + + + + + + + ) +} diff --git a/apps/sim/lib/audio/extractor.ts b/apps/sim/lib/audio/extractor.ts new file mode 100644 index 000000000..e1e1ec7b2 --- /dev/null +++ b/apps/sim/lib/audio/extractor.ts @@ -0,0 +1,294 @@ +import { execSync } from 'node:child_process' +import fs from 'node:fs/promises' +import os from 'node:os' +import path from 'node:path' +import ffmpegStatic from 'ffmpeg-static' +import ffmpeg from 'fluent-ffmpeg' +import type { + AudioExtractionOptions, + AudioExtractionResult, + AudioMetadata, +} from '@/lib/audio/types' + +// Set ffmpeg binary path with fallback to system ffmpeg +try { + if (ffmpegStatic && typeof ffmpegStatic === 'string') { + ffmpeg.setFfmpegPath(ffmpegStatic) + } else { + // Try to find system ffmpeg + try { + const systemFfmpeg = execSync('which ffmpeg', { encoding: 'utf-8' }).trim() + if (systemFfmpeg) { + ffmpeg.setFfmpegPath(systemFfmpeg) + console.log('[FFmpeg] Using system ffmpeg:', systemFfmpeg) + } + } catch { + console.warn( + '[FFmpeg] ffmpeg-static not available and system ffmpeg not found. Please install ffmpeg: brew install ffmpeg (macOS) or apt-get install ffmpeg (Linux)' + ) + } + } +} catch (error) { + console.warn('[FFmpeg] Failed to set ffmpeg path:', error) +} + +/** + * Extract audio from video or convert audio format using FFmpeg + */ +export async function extractAudioFromVideo( + inputBuffer: Buffer, + mimeType: string, + options: AudioExtractionOptions = {} +): Promise { + const isVideo = mimeType.startsWith('video/') + const isAudio = mimeType.startsWith('audio/') + + // If it's already audio and no conversion needed, get metadata and return + if (isAudio && !options.outputFormat) { + try { + const metadata = await getAudioMetadata(inputBuffer, mimeType) + return { + buffer: inputBuffer, + format: mimeType.split('/')[1] || 'unknown', + duration: metadata.duration || 0, + size: inputBuffer.length, + } + } catch (error) { + // If metadata extraction fails, still return the buffer + return { + buffer: inputBuffer, + format: mimeType.split('/')[1] || 'unknown', + duration: 0, + size: inputBuffer.length, + } + } + } + + // For video or audio conversion, use ffmpeg + if (isVideo || options.outputFormat) { + return await convertAudioWithFFmpeg(inputBuffer, mimeType, options) + } + + // Fallback + return { + buffer: inputBuffer, + format: options.outputFormat || mimeType.split('/')[1] || 'unknown', + duration: 0, + size: inputBuffer.length, + } +} + +/** + * Convert audio/video using FFmpeg + */ +async function convertAudioWithFFmpeg( + inputBuffer: Buffer, + mimeType: string, + options: AudioExtractionOptions +): Promise { + // Create temporary files + const tempDir = os.tmpdir() + const inputExt = getExtensionFromMimeType(mimeType) + const outputFormat = options.outputFormat || 'mp3' + const inputFile = path.join(tempDir, `ffmpeg-input-${Date.now()}.${inputExt}`) + const outputFile = path.join(tempDir, `ffmpeg-output-${Date.now()}.${outputFormat}`) + + try { + // Write input buffer to temporary file + await fs.writeFile(inputFile, inputBuffer) + + // Get metadata for duration + let duration = 0 + try { + const metadata = await getAudioMetadataFromFile(inputFile) + duration = metadata.duration || 0 + } catch (error) { + // Metadata extraction failed, continue without duration + console.warn('Failed to extract metadata:', error) + } + + // Convert using FFmpeg + await new Promise((resolve, reject) => { + let command = 
ffmpeg(inputFile).toFormat(outputFormat).audioCodec(getAudioCodec(outputFormat)) + + // Apply audio options + if (options.channels) { + command = command.audioChannels(options.channels) + } + if (options.sampleRate) { + command = command.audioFrequency(options.sampleRate) + } + if (options.bitrate) { + command = command.audioBitrate(options.bitrate) + } + + command + .on('end', () => resolve()) + .on('error', (err) => reject(new Error(`FFmpeg error: ${err.message}`))) + .save(outputFile) + }) + + // Read output file + const outputBuffer = await fs.readFile(outputFile) + + return { + buffer: outputBuffer, + format: outputFormat, + duration, + size: outputBuffer.length, + } + } finally { + // Clean up temporary files + try { + await fs.unlink(inputFile).catch(() => {}) + await fs.unlink(outputFile).catch(() => {}) + } catch (error) { + // Ignore cleanup errors + } + } +} + +/** + * Get audio metadata using ffprobe + */ +export async function getAudioMetadata(buffer: Buffer, mimeType: string): Promise { + const tempDir = os.tmpdir() + const inputExt = getExtensionFromMimeType(mimeType) + const inputFile = path.join(tempDir, `ffprobe-input-${Date.now()}.${inputExt}`) + + try { + // Write buffer to temporary file + await fs.writeFile(inputFile, buffer) + + // Get metadata using ffprobe + return await getAudioMetadataFromFile(inputFile) + } finally { + // Clean up temporary file + try { + await fs.unlink(inputFile).catch(() => {}) + } catch (error) { + // Ignore cleanup errors + } + } +} + +/** + * Get audio metadata from a file path using ffprobe + */ +async function getAudioMetadataFromFile(filePath: string): Promise { + return new Promise((resolve, reject) => { + ffmpeg.ffprobe(filePath, (err, metadata) => { + if (err) { + reject(new Error(`FFprobe error: ${err.message}`)) + return + } + + const audioStream = metadata.streams.find((s) => s.codec_type === 'audio') + const format = metadata.format + + resolve({ + duration: format.duration || 0, + format: format.format_name || 'unknown', + codec: audioStream?.codec_name, + sampleRate: audioStream?.sample_rate, + channels: audioStream?.channels, + bitrate: format.bit_rate ? 
Number(format.bit_rate) : undefined, + }) + }) + }) +} + +/** + * Get file extension from MIME type + */ +function getExtensionFromMimeType(mimeType: string): string { + const mimeToExt: Record = { + // Video + 'video/mp4': 'mp4', + 'video/quicktime': 'mov', + 'video/x-msvideo': 'avi', + 'video/x-matroska': 'mkv', + 'video/webm': 'webm', + // Audio + 'audio/mpeg': 'mp3', + 'audio/mp4': 'm4a', + 'audio/wav': 'wav', + 'audio/webm': 'webm', + 'audio/ogg': 'ogg', + 'audio/flac': 'flac', + 'audio/aac': 'aac', + 'audio/opus': 'opus', + } + + return mimeToExt[mimeType] || mimeType.split('/')[1] || 'dat' +} + +/** + * Get appropriate audio codec for output format + */ +function getAudioCodec(format: string): string { + const codecMap: Record = { + mp3: 'libmp3lame', + wav: 'pcm_s16le', + flac: 'flac', + m4a: 'aac', + aac: 'aac', + ogg: 'libvorbis', + opus: 'libopus', + } + + return codecMap[format] || 'libmp3lame' +} + +/** + * Check if a file is a video file + */ +export function isVideoFile(mimeType: string): boolean { + return mimeType.startsWith('video/') +} + +/** + * Check if a file is an audio file + */ +export function isAudioFile(mimeType: string): boolean { + return mimeType.startsWith('audio/') +} + +/** + * Get optimal audio format for STT provider + */ +export function getOptimalFormat(provider: 'whisper' | 'deepgram' | 'elevenlabs'): { + format: 'mp3' | 'wav' | 'flac' + sampleRate: number + channels: 1 | 2 +} { + switch (provider) { + case 'whisper': + // Whisper prefers 16kHz mono + return { + format: 'mp3', + sampleRate: 16000, + channels: 1, + } + case 'deepgram': + // Deepgram works well with various formats + return { + format: 'mp3', + sampleRate: 16000, + channels: 1, + } + case 'elevenlabs': + // ElevenLabs format preferences + return { + format: 'mp3', + sampleRate: 16000, + channels: 1, + } + default: + return { + format: 'mp3', + sampleRate: 16000, + channels: 1, + } + } +} diff --git a/apps/sim/lib/audio/types.ts b/apps/sim/lib/audio/types.ts new file mode 100644 index 000000000..286b2f6ff --- /dev/null +++ b/apps/sim/lib/audio/types.ts @@ -0,0 +1,22 @@ +export interface AudioExtractionOptions { + outputFormat?: 'mp3' | 'wav' | 'flac' + sampleRate?: number + channels?: 1 | 2 + bitrate?: string +} + +export interface AudioExtractionResult { + buffer: Buffer + format: string + duration: number + size: number +} + +export interface AudioMetadata { + duration: number + format: string + codec?: string + sampleRate?: number + channels?: number + bitrate?: number +} diff --git a/apps/sim/lib/uploads/utils/file-utils.ts b/apps/sim/lib/uploads/utils/file-utils.ts index edeb4f048..0ca0b687a 100644 --- a/apps/sim/lib/uploads/utils/file-utils.ts +++ b/apps/sim/lib/uploads/utils/file-utils.ts @@ -12,7 +12,7 @@ export interface FileAttachment { } export interface MessageContent { - type: 'text' | 'image' | 'document' + type: 'text' | 'image' | 'document' | 'audio' | 'video' text?: string source?: { type: 'base64' @@ -24,7 +24,7 @@ export interface MessageContent { /** * Mapping of MIME types to content types */ -export const MIME_TYPE_MAPPING: Record = { +export const MIME_TYPE_MAPPING: Record = { // Images 'image/jpeg': 'image', 'image/jpg': 'image', @@ -49,12 +49,40 @@ export const MIME_TYPE_MAPPING: Record = { 'application/vnd.ms-powerpoint': 'document', // .ppt 'text/markdown': 'document', 'application/rtf': 'document', + + // Audio + 'audio/mpeg': 'audio', // .mp3 + 'audio/mp3': 'audio', + 'audio/mp4': 'audio', // .m4a + 'audio/x-m4a': 'audio', + 'audio/m4a': 'audio', + 
'audio/wav': 'audio', + 'audio/wave': 'audio', + 'audio/x-wav': 'audio', + 'audio/webm': 'audio', + 'audio/ogg': 'audio', + 'audio/vorbis': 'audio', + 'audio/flac': 'audio', + 'audio/x-flac': 'audio', + 'audio/aac': 'audio', + 'audio/x-aac': 'audio', + 'audio/opus': 'audio', + + // Video + 'video/mp4': 'video', + 'video/mpeg': 'video', + 'video/quicktime': 'video', // .mov + 'video/x-quicktime': 'video', + 'video/x-msvideo': 'video', // .avi + 'video/avi': 'video', + 'video/x-matroska': 'video', // .mkv + 'video/webm': 'video', } /** * Get the content type for a given MIME type */ -export function getContentType(mimeType: string): 'image' | 'document' | null { +export function getContentType(mimeType: string): 'image' | 'document' | 'audio' | 'video' | null { return MIME_TYPE_MAPPING[mimeType.toLowerCase()] || null } @@ -80,6 +108,28 @@ export function isImageFileType(mimeType: string): boolean { return imageTypes.includes(mimeType.toLowerCase()) } +/** + * Check if a MIME type is an audio type + */ +export function isAudioFileType(mimeType: string): boolean { + return getContentType(mimeType) === 'audio' +} + +/** + * Check if a MIME type is a video type + */ +export function isVideoFileType(mimeType: string): boolean { + return getContentType(mimeType) === 'video' +} + +/** + * Check if a MIME type is an audio or video type + */ +export function isMediaFileType(mimeType: string): boolean { + const contentType = getContentType(mimeType) + return contentType === 'audio' || contentType === 'video' +} + /** * Convert a file buffer to base64 */ @@ -143,6 +193,22 @@ export function getMimeTypeFromExtension(extension: string): string { ppt: 'application/vnd.ms-powerpoint', md: 'text/markdown', rtf: 'application/rtf', + + // Audio + mp3: 'audio/mpeg', + m4a: 'audio/mp4', + wav: 'audio/wav', + webm: 'audio/webm', + ogg: 'audio/ogg', + flac: 'audio/flac', + aac: 'audio/aac', + opus: 'audio/opus', + + // Video + mp4: 'video/mp4', + mov: 'video/quicktime', + avi: 'video/x-msvideo', + mkv: 'video/x-matroska', } return extensionMimeMap[extension.toLowerCase()] || 'application/octet-stream' diff --git a/apps/sim/lib/uploads/utils/validation.ts b/apps/sim/lib/uploads/utils/validation.ts index 056b13f3f..b3f87c99d 100644 --- a/apps/sim/lib/uploads/utils/validation.ts +++ b/apps/sim/lib/uploads/utils/validation.ts @@ -20,7 +20,26 @@ export const SUPPORTED_DOCUMENT_EXTENSIONS = [ 'yml', ] as const +export const SUPPORTED_AUDIO_EXTENSIONS = [ + 'mp3', + 'm4a', + 'wav', + 'webm', + 'ogg', + 'flac', + 'aac', + 'opus', +] as const + +export const SUPPORTED_VIDEO_EXTENSIONS = ['mp4', 'mov', 'avi', 'mkv', 'webm'] as const + export type SupportedDocumentExtension = (typeof SUPPORTED_DOCUMENT_EXTENSIONS)[number] +export type SupportedAudioExtension = (typeof SUPPORTED_AUDIO_EXTENSIONS)[number] +export type SupportedVideoExtension = (typeof SUPPORTED_VIDEO_EXTENSIONS)[number] +export type SupportedMediaExtension = + | SupportedDocumentExtension + | SupportedAudioExtension + | SupportedVideoExtension export const SUPPORTED_MIME_TYPES: Record = { pdf: ['application/pdf', 'application/x-pdf'], @@ -54,7 +73,33 @@ export const SUPPORTED_MIME_TYPES: Record yml: ['text/yaml', 'text/x-yaml', 'application/yaml', 'application/x-yaml'], } +export const SUPPORTED_AUDIO_MIME_TYPES: Record = { + mp3: ['audio/mpeg', 'audio/mp3'], + m4a: ['audio/mp4', 'audio/x-m4a', 'audio/m4a'], + wav: ['audio/wav', 'audio/wave', 'audio/x-wav'], + webm: ['audio/webm'], + ogg: ['audio/ogg', 'audio/vorbis'], + flac: ['audio/flac', 'audio/x-flac'], 
+ aac: ['audio/aac', 'audio/x-aac'], + opus: ['audio/opus'], +} + +export const SUPPORTED_VIDEO_MIME_TYPES: Record = { + mp4: ['video/mp4', 'video/mpeg'], + mov: ['video/quicktime', 'video/x-quicktime'], + avi: ['video/x-msvideo', 'video/avi'], + mkv: ['video/x-matroska'], + webm: ['video/webm'], +} + export const ACCEPTED_FILE_TYPES = Object.values(SUPPORTED_MIME_TYPES).flat() +export const ACCEPTED_AUDIO_TYPES = Object.values(SUPPORTED_AUDIO_MIME_TYPES).flat() +export const ACCEPTED_VIDEO_TYPES = Object.values(SUPPORTED_VIDEO_MIME_TYPES).flat() +export const ACCEPTED_MEDIA_TYPES = [ + ...ACCEPTED_FILE_TYPES, + ...ACCEPTED_AUDIO_TYPES, + ...ACCEPTED_VIDEO_TYPES, +] export const ACCEPTED_FILE_EXTENSIONS = SUPPORTED_DOCUMENT_EXTENSIONS.map((ext) => `.${ext}`) @@ -110,5 +155,61 @@ export function getSupportedMimeTypes(extension: string): string[] { if (isSupportedExtension(extension)) { return SUPPORTED_MIME_TYPES[extension as SupportedDocumentExtension] } + if (SUPPORTED_AUDIO_EXTENSIONS.includes(extension as SupportedAudioExtension)) { + return SUPPORTED_AUDIO_MIME_TYPES[extension as SupportedAudioExtension] + } + if (SUPPORTED_VIDEO_EXTENSIONS.includes(extension as SupportedVideoExtension)) { + return SUPPORTED_VIDEO_MIME_TYPES[extension as SupportedVideoExtension] + } return [] } + +/** + * Check if file extension is a supported audio extension + */ +export function isSupportedAudioExtension(extension: string): extension is SupportedAudioExtension { + return SUPPORTED_AUDIO_EXTENSIONS.includes(extension.toLowerCase() as SupportedAudioExtension) +} + +/** + * Check if file extension is a supported video extension + */ +export function isSupportedVideoExtension(extension: string): extension is SupportedVideoExtension { + return SUPPORTED_VIDEO_EXTENSIONS.includes(extension.toLowerCase() as SupportedVideoExtension) +} + +/** + * Validate if an audio/video file type is supported for STT processing + */ +export function validateMediaFileType( + fileName: string, + mimeType: string +): FileValidationError | null { + const extension = path.extname(fileName).toLowerCase().substring(1) + + const isAudio = SUPPORTED_AUDIO_EXTENSIONS.includes(extension as SupportedAudioExtension) + const isVideo = SUPPORTED_VIDEO_EXTENSIONS.includes(extension as SupportedVideoExtension) + + if (!isAudio && !isVideo) { + return { + code: 'UNSUPPORTED_FILE_TYPE', + message: `Unsupported media file type: ${extension}. Supported audio types: ${SUPPORTED_AUDIO_EXTENSIONS.join(', ')}. Supported video types: ${SUPPORTED_VIDEO_EXTENSIONS.join(', ')}`, + supportedTypes: [...SUPPORTED_AUDIO_EXTENSIONS, ...SUPPORTED_VIDEO_EXTENSIONS], + } + } + + const baseMimeType = mimeType.split(';')[0].trim() + const allowedMimeTypes = isAudio + ? SUPPORTED_AUDIO_MIME_TYPES[extension as SupportedAudioExtension] + : SUPPORTED_VIDEO_MIME_TYPES[extension as SupportedVideoExtension] + + if (!allowedMimeTypes.includes(baseMimeType)) { + return { + code: 'MIME_TYPE_MISMATCH', + message: `MIME type ${baseMimeType} does not match file extension ${extension}. 
Expected: ${allowedMimeTypes.join(', ')}`, + supportedTypes: allowedMimeTypes, + } + } + + return null +} diff --git a/apps/sim/next.config.ts b/apps/sim/next.config.ts index 10b80bc4e..628ee264f 100644 --- a/apps/sim/next.config.ts +++ b/apps/sim/next.config.ts @@ -75,7 +75,7 @@ const nextConfig: NextConfig = { turbopack: { resolveExtensions: ['.tsx', '.ts', '.jsx', '.js', '.mjs', '.json'], }, - serverExternalPackages: ['unpdf'], + serverExternalPackages: ['unpdf', 'ffmpeg-static', 'fluent-ffmpeg'], experimental: { optimizeCss: true, turbopackSourceMaps: false, diff --git a/apps/sim/tools/registry.ts b/apps/sim/tools/registry.ts index d9f0965fe..e8503436a 100644 --- a/apps/sim/tools/registry.ts +++ b/apps/sim/tools/registry.ts @@ -605,6 +605,7 @@ import { stripeUpdateSubscriptionTool, stripeVoidInvoiceTool, } from '@/tools/stripe' +import { deepgramSttTool, elevenLabsSttTool, whisperSttTool } from '@/tools/stt' import { supabaseCountTool, supabaseDeleteTool, @@ -1050,6 +1051,9 @@ export const tools: Record = { knowledge_upload_chunk: knowledgeUploadChunkTool, knowledge_create_document: knowledgeCreateDocumentTool, elevenlabs_tts: elevenLabsTtsTool, + stt_whisper: whisperSttTool, + stt_deepgram: deepgramSttTool, + stt_elevenlabs: elevenLabsSttTool, s3_get_object: s3GetObjectTool, s3_put_object: s3PutObjectTool, s3_list_objects: s3ListObjectsTool, diff --git a/apps/sim/tools/stt/deepgram.ts b/apps/sim/tools/stt/deepgram.ts new file mode 100644 index 000000000..ce33b49bb --- /dev/null +++ b/apps/sim/tools/stt/deepgram.ts @@ -0,0 +1,125 @@ +import type { SttParams, SttResponse } from '@/tools/stt/types' +import type { ToolConfig } from '@/tools/types' + +export const deepgramSttTool: ToolConfig = { + id: 'stt_deepgram', + name: 'Deepgram STT', + description: 'Transcribe audio to text using Deepgram', + version: '1.0.0', + + params: { + provider: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'STT provider (deepgram)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Deepgram API key', + }, + model: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Deepgram model to use (nova-3, nova-2, whisper-large, etc.)', + }, + audioFile: { + type: 'file', + required: false, + visibility: 'user-or-llm', + description: 'Audio or video file to transcribe', + }, + audioFileReference: { + type: 'file', + required: false, + visibility: 'user-or-llm', + description: 'Reference to audio/video file from previous blocks', + }, + audioUrl: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'URL to audio or video file', + }, + language: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Language code (e.g., "en", "es", "fr") or "auto" for auto-detection', + }, + timestamps: { + type: 'string', + required: false, + visibility: 'user-only', + description: 'Timestamp granularity: none, sentence, or word', + }, + diarization: { + type: 'boolean', + required: false, + visibility: 'user-only', + description: 'Enable speaker diarization', + }, + }, + + request: { + url: '/api/proxy/stt', + method: 'POST', + headers: () => ({ + 'Content-Type': 'application/json', + }), + body: ( + params: SttParams & { + _context?: { workspaceId?: string; workflowId?: string; executionId?: string } + } + ) => ({ + provider: 'deepgram', + apiKey: params.apiKey, + model: params.model, + audioFile: params.audioFile, + audioFileReference: params.audioFileReference, + 
audioUrl: params.audioUrl, + language: params.language || 'auto', + timestamps: params.timestamps || 'none', + diarization: params.diarization || false, + workspaceId: params._context?.workspaceId, + workflowId: params._context?.workflowId, + executionId: params._context?.executionId, + }), + }, + + transformResponse: async (response: Response) => { + const data = await response.json() + + if (!response.ok || data.error) { + return { + success: false, + error: data.error || 'Transcription failed', + output: { + transcript: '', + }, + } + } + + return { + success: true, + output: { + transcript: data.transcript, + segments: data.segments, + language: data.language, + duration: data.duration, + confidence: data.confidence, + }, + } + }, + + outputs: { + transcript: { type: 'string', description: 'Full transcribed text' }, + segments: { type: 'array', description: 'Timestamped segments with speaker labels' }, + language: { type: 'string', description: 'Detected or specified language' }, + duration: { type: 'number', description: 'Audio duration in seconds' }, + confidence: { type: 'number', description: 'Overall confidence score' }, + }, +} diff --git a/apps/sim/tools/stt/elevenlabs.ts b/apps/sim/tools/stt/elevenlabs.ts new file mode 100644 index 000000000..9cf601f0d --- /dev/null +++ b/apps/sim/tools/stt/elevenlabs.ts @@ -0,0 +1,118 @@ +import type { SttParams, SttResponse } from '@/tools/stt/types' +import type { ToolConfig } from '@/tools/types' + +export const elevenLabsSttTool: ToolConfig = { + id: 'stt_elevenlabs', + name: 'ElevenLabs STT', + description: 'Transcribe audio to text using ElevenLabs', + version: '1.0.0', + + params: { + provider: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'STT provider (elevenlabs)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'ElevenLabs API key', + }, + model: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'ElevenLabs model to use (scribe_v1, scribe_v1_experimental)', + }, + audioFile: { + type: 'file', + required: false, + visibility: 'user-or-llm', + description: 'Audio or video file to transcribe', + }, + audioFileReference: { + type: 'file', + required: false, + visibility: 'user-or-llm', + description: 'Reference to audio/video file from previous blocks', + }, + audioUrl: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'URL to audio or video file', + }, + language: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Language code (e.g., "en", "es", "fr") or "auto" for auto-detection', + }, + timestamps: { + type: 'string', + required: false, + visibility: 'user-only', + description: 'Timestamp granularity: none, sentence, or word', + }, + }, + + request: { + url: '/api/proxy/stt', + method: 'POST', + headers: () => ({ + 'Content-Type': 'application/json', + }), + body: ( + params: SttParams & { + _context?: { workspaceId?: string; workflowId?: string; executionId?: string } + } + ) => ({ + provider: 'elevenlabs', + apiKey: params.apiKey, + model: params.model, + audioFile: params.audioFile, + audioFileReference: params.audioFileReference, + audioUrl: params.audioUrl, + language: params.language || 'auto', + timestamps: params.timestamps || 'none', + workspaceId: params._context?.workspaceId, + workflowId: params._context?.workflowId, + executionId: params._context?.executionId, + }), + }, + + transformResponse: async (response: Response) => { + const data = 
await response.json() + + if (!response.ok || data.error) { + return { + success: false, + error: data.error || 'Transcription failed', + output: { + transcript: '', + }, + } + } + + return { + success: true, + output: { + transcript: data.transcript, + segments: data.segments, + language: data.language, + duration: data.duration, + confidence: data.confidence, + }, + } + }, + + outputs: { + transcript: { type: 'string', description: 'Full transcribed text' }, + segments: { type: 'array', description: 'Timestamped segments' }, + language: { type: 'string', description: 'Detected or specified language' }, + duration: { type: 'number', description: 'Audio duration in seconds' }, + confidence: { type: 'number', description: 'Overall confidence score' }, + }, +} diff --git a/apps/sim/tools/stt/index.ts b/apps/sim/tools/stt/index.ts new file mode 100644 index 000000000..e96e41cc9 --- /dev/null +++ b/apps/sim/tools/stt/index.ts @@ -0,0 +1,5 @@ +import { deepgramSttTool } from '@/tools/stt/deepgram' +import { elevenLabsSttTool } from '@/tools/stt/elevenlabs' +import { whisperSttTool } from '@/tools/stt/whisper' + +export { whisperSttTool, deepgramSttTool, elevenLabsSttTool } diff --git a/apps/sim/tools/stt/types.ts b/apps/sim/tools/stt/types.ts new file mode 100644 index 000000000..c652c3819 --- /dev/null +++ b/apps/sim/tools/stt/types.ts @@ -0,0 +1,62 @@ +import type { UserFile } from '@/executor/types' +import type { ToolResponse } from '@/tools/types' + +export interface SttParams { + provider: 'whisper' | 'deepgram' | 'elevenlabs' + apiKey: string + model?: string + audioFile?: UserFile | UserFile[] + audioFileReference?: UserFile | UserFile[] + audioUrl?: string + language?: string + timestamps?: 'none' | 'sentence' | 'word' + diarization?: boolean + translateToEnglish?: boolean +} + +export interface TranscriptSegment { + text: string + start: number + end: number + speaker?: string + confidence?: number +} + +export interface SttResponse extends ToolResponse { + output: { + transcript: string + segments?: TranscriptSegment[] + language?: string + duration?: number + confidence?: number + } +} + +export interface SttBlockResponse extends ToolResponse { + output: { + transcript: string + segments?: TranscriptSegment[] + language?: string + duration?: number + confidence?: number + } +} + +// Provider-specific types + +export interface WhisperParams extends Omit { + model?: string + responseFormat?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt' + temperature?: number +} + +export interface DeepgramParams extends Omit { + model?: string + punctuate?: boolean + paragraphs?: boolean + utterances?: boolean +} + +export interface ElevenLabsSttParams extends Omit { + model?: string +} diff --git a/apps/sim/tools/stt/whisper.ts b/apps/sim/tools/stt/whisper.ts new file mode 100644 index 000000000..a47729b56 --- /dev/null +++ b/apps/sim/tools/stt/whisper.ts @@ -0,0 +1,125 @@ +import type { SttParams, SttResponse } from '@/tools/stt/types' +import type { ToolConfig } from '@/tools/types' + +export const whisperSttTool: ToolConfig = { + id: 'stt_whisper', + name: 'OpenAI Whisper STT', + description: 'Transcribe audio to text using OpenAI Whisper', + version: '1.0.0', + + params: { + provider: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'STT provider (whisper)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'OpenAI API key', + }, + model: { + type: 'string', + required: false, + visibility: 'user-or-llm', + 
+      description: 'Whisper model to use (default: whisper-1)',
+    },
+    audioFile: {
+      type: 'file',
+      required: false,
+      visibility: 'user-or-llm',
+      description: 'Audio or video file to transcribe',
+    },
+    audioFileReference: {
+      type: 'file',
+      required: false,
+      visibility: 'user-or-llm',
+      description: 'Reference to audio/video file from previous blocks',
+    },
+    audioUrl: {
+      type: 'string',
+      required: false,
+      visibility: 'user-or-llm',
+      description: 'URL to audio or video file',
+    },
+    language: {
+      type: 'string',
+      required: false,
+      visibility: 'user-or-llm',
+      description: 'Language code (e.g., "en", "es", "fr") or "auto" for auto-detection',
+    },
+    timestamps: {
+      type: 'string',
+      required: false,
+      visibility: 'user-only',
+      description: 'Timestamp granularity: none, sentence, or word',
+    },
+    translateToEnglish: {
+      type: 'boolean',
+      required: false,
+      visibility: 'user-only',
+      description: 'Translate audio to English',
+    },
+  },
+
+  request: {
+    url: '/api/proxy/stt',
+    method: 'POST',
+    headers: () => ({
+      'Content-Type': 'application/json',
+    }),
+    body: (
+      params: SttParams & {
+        _context?: { workspaceId?: string; workflowId?: string; executionId?: string }
+      }
+    ) => ({
+      provider: 'whisper',
+      apiKey: params.apiKey,
+      model: params.model,
+      audioFile: params.audioFile,
+      audioFileReference: params.audioFileReference,
+      audioUrl: params.audioUrl,
+      language: params.language || 'auto',
+      timestamps: params.timestamps || 'none',
+      translateToEnglish: params.translateToEnglish || false,
+      workspaceId: params._context?.workspaceId,
+      workflowId: params._context?.workflowId,
+      executionId: params._context?.executionId,
+    }),
+  },
+
+  transformResponse: async (response: Response) => {
+    const data = await response.json()
+
+    if (!response.ok || data.error) {
+      return {
+        success: false,
+        error: data.error || 'Transcription failed',
+        output: {
+          transcript: '',
+        },
+      }
+    }
+
+    return {
+      success: true,
+      output: {
+        transcript: data.transcript,
+        segments: data.segments,
+        language: data.language,
+        duration: data.duration,
+        confidence: data.confidence,
+      },
+    }
+  },
+
+  outputs: {
+    transcript: { type: 'string', description: 'Full transcribed text' },
+    segments: { type: 'array', description: 'Timestamped segments' },
+    language: { type: 'string', description: 'Detected or specified language' },
+    duration: { type: 'number', description: 'Audio duration in seconds' },
+    confidence: { type: 'number', description: 'Overall confidence score' },
+  },
+}
diff --git a/bun.lock b/bun.lock
index 014a8b39d..6a6053836 100644
--- a/bun.lock
+++ b/bun.lock
@@ -9,8 +9,11 @@
       "@t3-oss/env-nextjs": "0.13.4",
       "@tanstack/react-query": "5.90.8",
       "@tanstack/react-query-devtools": "5.90.2",
+      "@types/fluent-ffmpeg": "2.1.28",
       "cronstrue": "3.3.0",
       "drizzle-orm": "^0.44.5",
+      "ffmpeg-static": "5.3.0",
+      "fluent-ffmpeg": "2.1.3",
       "mongodb": "6.19.0",
       "neo4j-driver": "6.0.1",
       "onedollarstats": "0.0.10",
@@ -235,6 +238,7 @@
     },
   },
   "trustedDependencies": [
+    "ffmpeg-static",
     "sharp",
   ],
   "overrides": {
@@ -496,6 +500,8 @@
     "@csstools/css-tokenizer": ["@csstools/css-tokenizer@3.0.4", "", {}, "sha512-Vd/9EVDiu6PPJt9yAh6roZP6El1xHrdvIVGjyBsHR0RYwNHgL7FJPyIIW4fANJNG6FtyZfvlRPpFI4ZM/lubvw=="],

+    "@derhuerst/http-basic": ["@derhuerst/http-basic@8.2.4", "", { "dependencies": { "caseless": "^0.12.0", "concat-stream": "^2.0.0", "http-response-object": "^3.0.1", "parse-cache-control": "^1.0.1" } }, "sha512-F9rL9k9Xjf5blCz8HsJRO4diy111cayL2vkY2XE4r4t3n0yPXVYy3KD3nJ1qbrSn9743UWSXH4IwuCa/HWlGFw=="],
+
     "@dimforge/rapier3d-compat": ["@dimforge/rapier3d-compat@0.12.0", "", {}, "sha512-uekIGetywIgopfD97oDL5PfeezkFpNhwlzlaEYNOA0N6ghdsOvh/HYjSMek5Q2O1PYvRSDFcqFVJl4r4ZBwOow=="],

     "@drizzle-team/brocli": ["@drizzle-team/brocli@0.10.2", "", {}, "sha512-z33Il7l5dKjUgGULTqBsQBQwckHh5AbIuxhdsIxDDiZAzBOrZO6q9ogcWC65kU382AfynTfgNumVcNIjuIua6w=="],

@@ -1336,6 +1342,8 @@
     "@types/estree-jsx": ["@types/estree-jsx@1.0.5", "", { "dependencies": { "@types/estree": "*" } }, "sha512-52CcUVNFyfb1A2ALocQw/Dd1BQFNmSdkuC3BkZ6iqhdMfQz7JWOFRuJFloOzjk+6WijU56m9oKXFAXc7o3Towg=="],

+    "@types/fluent-ffmpeg": ["@types/fluent-ffmpeg@2.1.28", "", { "dependencies": { "@types/node": "*" } }, "sha512-5ovxsDwBcPfJ+eYs1I/ZpcYCnkce7pvH9AHSvrZllAp1ZPpTRDZAFjF3TRFbukxSgIYTTNYePbS0rKUmaxVbXw=="],
+
     "@types/geojson": ["@types/geojson@7946.0.16", "", {}, "sha512-6C8nqWur3j98U6+lXDfTUWIfgvZU+EumvpHKcYjujKH7woYyLj2sUmff0tRhrqM7BohUw7Pz3ZB1jj2gW9Fvmg=="],

     "@types/hast": ["@types/hast@3.0.4", "", { "dependencies": { "@types/unist": "*" } }, "sha512-WPs+bbQw5aCj+x6laNGWLH3wviHtoCv/P3+otBhbOhJgG8qtpdAMlTCxLtsTWA7LH1Oh/bFCHsBn0TPS5m30EQ=="],

@@ -1470,6 +1478,8 @@
     "astring": ["astring@1.9.0", "", { "bin": { "astring": "bin/astring" } }, "sha512-LElXdjswlqjWrPpJFg1Fx4wpkOCxj1TDHlSV4PlaRxHGWko024xICaa97ZkMfs6DRKlCguiAI+rbXv5GWwXIkg=="],

+    "async": ["async@0.2.10", "", {}, "sha512-eAkdoKxU6/LkKDBzLpT+t6Ff5EtfSF4wx1WfJiPEEV7WNLnDaRXk0oVysiEPm262roaachGexwUv94WhSgN5TQ=="],
+
     "asynckit": ["asynckit@0.4.0", "", {}, "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="],

     "atomic-sleep": ["atomic-sleep@1.0.0", "", {}, "sha512-kNOjDqAh7px0XWNI+4QbzoiR/nTkHAWNud2uvnJquD1/x5a7EQZMJT0AczqK0Qn67oY/TTQ1LbUKajZpp3I9tQ=="],

@@ -1550,6 +1560,8 @@
     "caniuse-lite": ["caniuse-lite@1.0.30001745", "", {}, "sha512-ywt6i8FzvdgrrrGbr1jZVObnVv6adj+0if2/omv9cmR2oiZs30zL4DIyaptKcbOrBdOIc74QTMoJvSE2QHh5UQ=="],

+    "caseless": ["caseless@0.12.0", "", {}, "sha512-4tYFyifaFfGacoiObjJegolkwSU4xQNGbVgUiNYVUxbQ2x2lUsFvY4hVgVzGiIe6WLOPqycWXA40l+PWsxthUw=="],
+
     "ccount": ["ccount@2.0.1", "", {}, "sha512-eyrF0jiFpY+3drT6383f1qhkbGsLSifNAjA61IUjZjmLCWjItY6LB9ft9YhoDgwfmclB2zhu51Lc7+95b8NRAg=="],

     "cfb": ["cfb@1.2.2", "", { "dependencies": { "adler-32": "~1.3.0", "crc-32": "~1.2.0" } }, "sha512-KfdUZsSOw19/ObEWasvBP/Ac4reZvAGauZhs6S/gqNhXhI7cKwvlH7ulj+dOEYnca4bm4SGo8C1bTAQvnTjgQA=="],

@@ -1818,6 +1830,8 @@
     "entities": ["entities@6.0.1", "", {}, "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g=="],

+    "env-paths": ["env-paths@2.2.1", "", {}, "sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A=="],
+
     "environment": ["environment@1.1.0", "", {}, "sha512-xUtoPkMggbz0MPyPiIWr1Kp4aeWJjDZ6SMvURhimjdZgsRuDplF5/s9hcgGhyXMhs+6vpnuoiZ2kFiu3FMnS8Q=="],

     "error": ["error@7.0.2", "", { "dependencies": { "string-template": "~0.2.1", "xtend": "~4.0.0" } }, "sha512-UtVv4l5MhijsYUxPJo4390gzfZvAnTHreNnDjnTZaKIiZ/SemXxAhBkYSKtWa5RtBXbLP8tMgn/n0RUa/H7jXw=="],

@@ -1916,6 +1930,8 @@
     "fflate": ["fflate@0.8.2", "", {}, "sha512-cPJU47OaAoCbg0pBvzsgpTPhmhqI5eJjh/JIu8tPj5q+T7iLvW/JAYUqmE7KOB4R1ZyEhzBaIQpQpardBF5z8A=="],

+    "ffmpeg-static": ["ffmpeg-static@5.3.0", "", { "dependencies": { "@derhuerst/http-basic": "^8.2.0", "env-paths": "^2.2.0", "https-proxy-agent": "^5.0.0", "progress": "^2.0.3" } }, "sha512-H+K6sW6TiIX6VGend0KQwthe+kaceeH/luE8dIZyOP35ik7ahYojDuqlTV1bOrtEwl01sy2HFNGQfi5IDJvotg=="],
+
     "figures": ["figures@3.2.0", "", { "dependencies": { "escape-string-regexp": "^1.0.5" } }, "sha512-yaduQFRKLXYOGgEn6AZau90j3ggSOyiqXU0F9JZfeXYhNa+Jk4X+s45A2zg5jns87GAFa34BBm2kXw4XpNcbdg=="],

     "file-type": ["file-type@16.5.4", "", { "dependencies": { "readable-web-to-node-stream": "^3.0.0", "strtok3": "^6.2.4", "token-types": "^4.1.1" } }, "sha512-/yFHK0aGjFEgDJjEKP0pWCplsPFPhwyfwevf/pVxiN0tmE4L9LmwWxWukdJSHdoCli4VgQLehjJtwQBnqmsKcw=="],

@@ -1924,6 +1940,8 @@
     "finalhandler": ["finalhandler@2.1.0", "", { "dependencies": { "debug": "^4.4.0", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "on-finished": "^2.4.1", "parseurl": "^1.3.3", "statuses": "^2.0.1" } }, "sha512-/t88Ty3d5JWQbWYgaOGCCYfXRwV1+be02WqYYlL6h0lEiUAMPM8o8qKGO01YIkOHzka2up08wvgYD0mDiI+q3Q=="],

+    "fluent-ffmpeg": ["fluent-ffmpeg@2.1.3", "", { "dependencies": { "async": "^0.2.9", "which": "^1.1.1" } }, "sha512-Be3narBNt2s6bsaqP6Jzq91heDgOEaDCJAXcE3qcma/EJBSy5FB4cvO31XBInuAuKBx8Kptf8dkhjK0IOru39Q=="],
+
     "follow-redirects": ["follow-redirects@1.15.11", "", {}, "sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ=="],

     "foreground-child": ["foreground-child@3.3.1", "", { "dependencies": { "cross-spawn": "^7.0.6", "signal-exit": "^4.0.1" } }, "sha512-gIXjKqtFuWEgzFRJA9WCQeSJLZDjgJUOMCMzxtvFq/37KojM1BFGufqsCy0r4qSQmYLsZYMeyRqzIWOMup03sw=="],

@@ -2050,6 +2068,8 @@
     "http-proxy-agent": ["http-proxy-agent@7.0.2", "", { "dependencies": { "agent-base": "^7.1.0", "debug": "^4.3.4" } }, "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig=="],

+    "http-response-object": ["http-response-object@3.0.2", "", { "dependencies": { "@types/node": "^10.0.3" } }, "sha512-bqX0XTF6fnXSQcEJ2Iuyr75yVakyjIDCqroJQ/aHfSdlM743Cwqoi2nDYMzLGWUcuTWGWy8AAvOKXTfiv6q9RA=="],
+
     "https-proxy-agent": ["https-proxy-agent@5.0.1", "", { "dependencies": { "agent-base": "6", "debug": "4" } }, "sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA=="],

     "human-signals": ["human-signals@5.0.0", "", {}, "sha512-AXcZb6vzzrFAUE61HnN4mpLqd/cSIwNQjtNWR0euPm6y0iqx3G4gOXaIDdtdDwZmhwe82LA6+zinmW4UBWVePQ=="],

@@ -2538,6 +2558,8 @@
     "papaparse": ["papaparse@5.5.3", "", {}, "sha512-5QvjGxYVjxO59MGU2lHVYpRWBBtKHnlIAcSe1uNFCkkptUh63NFRj0FJQm7nR67puEruUci/ZkjmEFrjCAyP4A=="],

+    "parse-cache-control": ["parse-cache-control@1.0.1", "", {}, "sha512-60zvsJReQPX5/QP0Kzfd/VrpjScIQ7SHBW6bFCYfEP+fp0Eppr1SHhIO5nd1PjZtvclzSzES9D/p5nFJurwfWg=="],
+
     "parse-css-color": ["parse-css-color@0.2.1", "", { "dependencies": { "color-name": "^1.1.4", "hex-rgb": "^4.1.0" } }, "sha512-bwS/GGIFV3b6KS4uwpzCFj4w297Yl3uqnSgIPsoQkx7GMLROXfMnWvxfNkL0oh8HVhZA4hvJoEoEIqonfJ3BWg=="],

     "parse-entities": ["parse-entities@4.0.2", "", { "dependencies": { "@types/unist": "^2.0.0", "character-entities-legacy": "^3.0.0", "character-reference-invalid": "^2.0.0", "decode-named-character-reference": "^1.0.0", "is-alphanumerical": "^2.0.0", "is-decimal": "^2.0.0", "is-hexadecimal": "^2.0.0" } }, "sha512-GG2AQYWoLgL877gQIKeRPGO1xF9+eG1ujIb5soS5gPvLQ1y2o8FL90w2QWNdf9I361Mpp7726c+lj3U0qK1uGw=="],

@@ -2638,6 +2660,8 @@
     "process-warning": ["process-warning@5.0.0", "", {}, "sha512-a39t9ApHNx2L4+HBnQKqxxHNs1r7KF+Intd8Q/g1bUh6q0WIp9voPXJ/x0j+ZL45KF1pJd9+q2jLIRMfvEshkA=="],

+    "progress": ["progress@2.0.3", "", {}, "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA=="],
+
     "prom-client": ["prom-client@15.1.3", "", { "dependencies": { "@opentelemetry/api": "^1.4.0", "tdigest": "^0.1.1" } }, "sha512-6ZiOBfCywsD4k1BN9IX0uZhF+tJkV8q8llP64G5Hajs4JOeVLPCwpPVcpXy3BwYiUGgyJzsJJQeOIv7+hDSq8g=="],

     "prompts": ["prompts@2.4.2", "", { "dependencies": { "kleur": "^3.0.3", "sisteransi": "^1.0.5" } }, "sha512-NxNv/kLguCA7p3jE8oL2aEBsrJWgAakBpgmgK6lpPWV+WuOmY6r2/zbAVnP+T8bQlA0nzHXSJSJW0Hq7ylaD2Q=="],

@@ -3140,7 +3164,7 @@
     "whatwg-url": ["whatwg-url@14.2.0", "", { "dependencies": { "tr46": "^5.1.0", "webidl-conversions": "^7.0.0" } }, "sha512-De72GdQZzNTUBBChsXueQUnPKDkg/5A5zp7pFDuQAj5UFoENpiACU0wlCvzpAGnTkj++ihpKwKyYewn/XNUbKw=="],

-    "which": ["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="],
+    "which": ["which@1.3.1", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "which": "./bin/which" } }, "sha512-HxJdYWq1MTIQbJ3nw0cqssHoTNU267KlrDuGZ1WYlxDStUtKUhOaJmh112/TZmHxxUfuJqPXSOm7tDyas0OSIQ=="],

     "why-is-node-running": ["why-is-node-running@2.3.0", "", { "dependencies": { "siginfo": "^2.0.0", "stackback": "0.0.2" }, "bin": { "why-is-node-running": "cli.js" } }, "sha512-hUrmaWBdVDcxvYqnyh09zunKzROWjbZTiNy8dBEjkS7ehEDQibXJ7XvlmtbwuTclUiIyN+CyXQD4Vmko8fNm8w=="],

@@ -3418,6 +3442,8 @@
     "@types/cors/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="],

+    "@types/fluent-ffmpeg/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="],
+
     "@types/jsdom/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="],

     "@types/node-fetch/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="],

@@ -3454,6 +3480,8 @@
     "content-disposition/safe-buffer": ["safe-buffer@5.2.1", "", {}, "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ=="],

+    "cross-spawn/which": ["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="],
+
     "dom-serializer/entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="],

     "ecdsa-sig-formatter/safe-buffer": ["safe-buffer@5.2.1", "", {}, "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ=="],

@@ -3508,6 +3536,8 @@
     "http-proxy-agent/agent-base": ["agent-base@7.1.4", "", {}, "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ=="],

+    "http-response-object/@types/node": ["@types/node@10.17.60", "", {}, "sha512-F0KIgDJfy2nA3zMLmWGKxcH2ZVEtCZXHHdOQs2gSaQ27+lNeEfGxzkIw90aXswATX7AZ33tahPbzy6KAfUreVw=="],
+
     "inquirer/ora": ["ora@5.4.1", "", { "dependencies": { "bl": "^4.1.0", "chalk": "^4.1.0", "cli-cursor": "^3.1.0", "cli-spinners": "^2.5.0", "is-interactive": "^1.0.0", "is-unicode-supported": "^0.1.0", "log-symbols": "^4.1.0", "strip-ansi": "^6.0.0", "wcwidth": "^1.0.1" } }, "sha512-5b6Y85tPxZZ7QytO+BQzysW31HJku27cRIlkbAXaNx+BdcVi+LlRFmVXzeF6a7JCwJpyw5c4b+YSVImQIrBpuQ=="],

     "isomorphic-unfetch/node-fetch": ["node-fetch@2.7.0", "", { "dependencies": { "whatwg-url": "^5.0.0" }, "peerDependencies": { "encoding": "^0.1.0" }, "optionalPeers": ["encoding"] }, "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A=="],

@@ -3766,6 +3796,8 @@
     "@types/cors/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="],

+    "@types/fluent-ffmpeg/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="],
+
     "@types/jsdom/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="],

     "@types/node-fetch/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="],

diff --git a/docker/app.Dockerfile b/docker/app.Dockerfile
index b55a3953b..f8ea3bece 100644
--- a/docker/app.Dockerfile
+++ b/docker/app.Dockerfile
@@ -78,7 +78,8 @@ FROM base AS runner
 WORKDIR /app

 # Install Python and dependencies for guardrails PII detection (cached separately)
-RUN apk add --no-cache python3 py3-pip bash
+# Also install ffmpeg for audio/video processing in STT
+RUN apk add --no-cache python3 py3-pip bash ffmpeg

 ENV NODE_ENV=production

diff --git a/package.json b/package.json
index 282e2f0f0..60219b778 100644
--- a/package.json
+++ b/package.json
@@ -39,8 +39,11 @@
     "@t3-oss/env-nextjs": "0.13.4",
     "@tanstack/react-query": "5.90.8",
     "@tanstack/react-query-devtools": "5.90.2",
+    "@types/fluent-ffmpeg": "2.1.28",
     "cronstrue": "3.3.0",
     "drizzle-orm": "^0.44.5",
+    "ffmpeg-static": "5.3.0",
+    "fluent-ffmpeg": "2.1.3",
     "mongodb": "6.19.0",
     "neo4j-driver": "6.0.1",
     "onedollarstats": "0.0.10",
@@ -63,5 +66,8 @@
     "*.{js,jsx,ts,tsx,json,css,scss}": [
       "biome check --write --no-errors-on-unmatched --files-ignore-unknown=true"
     ]
-  }
+  },
+  "trustedDependencies": [
+    "ffmpeg-static"
+  ]
 }