improvement(voice): interrupt UI + mute mic while agent is talking (#705)

* improvement(voice): interrupt UI + live transcription

* cleanup logs

* remove close (X) button
Authored by Vikhyath Mondreti on 2025-07-16 13:38:51 -07:00; committed by GitHub
parent bdfe7e9b99
commit 55a9adfdda
2 changed files with 350 additions and 263 deletions
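
The gist of the change, before the diff: the voice UI now keeps a small 'idle' | 'listening' | 'agent_speaking' state, disables the microphone's audio tracks and aborts speech recognition whenever agent audio starts playing, and restores both when playback ends or the user interrupts. Below is a minimal, illustrative sketch of that mute-while-speaking pattern, assuming the Web Speech API typings are available; the function and parameter names are mine, not the component's.

```ts
// Illustrative sketch only: mute the mic and pause recognition while the agent's
// audio is playing, then restore both when playback ends.
function setMicForAgentSpeech(
  agentSpeaking: boolean,
  mediaStream: MediaStream | null,
  recognition: SpeechRecognition | null
): void {
  // Disable (rather than stop) the audio tracks so they can be re-enabled instantly.
  mediaStream?.getAudioTracks().forEach((track) => {
    track.enabled = !agentSpeaking
  })

  if (agentSpeaking) {
    // abort() discards pending results; the resulting 'aborted' error is expected.
    try {
      recognition?.abort()
    } catch {
      // ignore: recognition may already be stopped
    }
  } else {
    try {
      recognition?.start()
    } catch {
      // ignore: recognition may already be running
    }
  }
}
```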


@@ -269,6 +269,8 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
const messageToSend = messageParam ?? inputValue
if (!messageToSend.trim() || isLoading) return
+ logger.info('Sending message:', { messageToSend, isVoiceInput, conversationId })
// Reset userHasScrolled when sending a new message
setUserHasScrolled(false)
@@ -305,6 +307,8 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
conversationId,
}
+ logger.info('API payload:', payload)
const response = await fetch(`/api/chat/${subdomain}`, {
method: 'POST',
headers: {
@@ -321,6 +325,7 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
if (!response.ok) {
const errorData = await response.json()
+ logger.error('API error response:', errorData)
throw new Error(errorData.error || 'Failed to get response')
}
@@ -334,6 +339,8 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
? createAudioStreamHandler(streamTextToAudio, DEFAULT_VOICE_SETTINGS.voiceId)
: undefined
+ logger.info('Starting to handle streamed response:', { shouldPlayAudio })
await handleStreamedResponse(
response,
setMessages,
@@ -405,6 +412,7 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
// Handle voice transcript from voice-first interface
const handleVoiceTranscript = useCallback(
(transcript: string) => {
+ logger.info('Received voice transcript:', transcript)
handleSendMessage(transcript, true)
},
[handleSendMessage]


@@ -1,7 +1,7 @@
'use client'
import { type RefObject, useCallback, useEffect, useRef, useState } from 'react'
- import { Mic, MicOff, Phone, X } from 'lucide-react'
+ import { Mic, MicOff, Phone } from 'lucide-react'
import { Button } from '@/components/ui/button'
import { createLogger } from '@/lib/logs/console-logger'
import { cn } from '@/lib/utils'
@@ -68,132 +68,136 @@ export function VoiceInterface({
messages = [],
className,
}: VoiceInterfaceProps) {
- const [isListening, setIsListening] = useState(false)
+ // Simple state machine
+ const [state, setState] = useState<'idle' | 'listening' | 'agent_speaking'>('idle')
+ const [isInitialized, setIsInitialized] = useState(false)
const [isMuted, setIsMuted] = useState(false)
const [audioLevels, setAudioLevels] = useState<number[]>(new Array(200).fill(0))
- const [permissionStatus, setPermissionStatus] = useState<'granted' | 'denied' | 'prompt'>(
+ const [permissionStatus, setPermissionStatus] = useState<'prompt' | 'granted' | 'denied'>(
'prompt'
)
- const [isInitialized, setIsInitialized] = useState(false)
+ // Current turn transcript (subtitle)
+ const [currentTranscript, setCurrentTranscript] = useState('')
+ // State tracking
+ const currentStateRef = useRef<'idle' | 'listening' | 'agent_speaking'>('idle')
+ useEffect(() => {
+ currentStateRef.current = state
+ }, [state])
const recognitionRef = useRef<SpeechRecognition | null>(null)
- const localAudioContextRef = useRef<AudioContext | null>(null)
- const audioContextRef = sharedAudioContextRef || localAudioContextRef
- const analyserRef = useRef<AnalyserNode | null>(null)
const mediaStreamRef = useRef<MediaStream | null>(null)
+ const audioContextRef = useRef<AudioContext | null>(null)
+ const analyserRef = useRef<AnalyserNode | null>(null)
const animationFrameRef = useRef<number | null>(null)
const isStartingRef = useRef(false)
const isMutedRef = useRef(false)
- const compressorRef = useRef<DynamicsCompressorNode | null>(null)
- const gainNodeRef = useRef<GainNode | null>(null)
+ const responseTimeoutRef = useRef<NodeJS.Timeout | null>(null)
const isSupported =
typeof window !== 'undefined' && !!(window.SpeechRecognition || window.webkitSpeechRecognition)
// Update muted ref
useEffect(() => {
isMutedRef.current = isMuted
}, [isMuted])
- const cleanup = useCallback(() => {
- if (animationFrameRef.current) {
- cancelAnimationFrame(animationFrameRef.current)
- animationFrameRef.current = null
- }
- if (mediaStreamRef.current) {
- mediaStreamRef.current.getTracks().forEach((track) => track.stop())
- mediaStreamRef.current = null
- }
- if (audioContextRef.current && audioContextRef.current.state !== 'closed') {
- audioContextRef.current.close()
- audioContextRef.current = null
- }
- if (recognitionRef.current) {
- try {
- recognitionRef.current.stop()
- } catch (e) {
- // Ignore errors during cleanup
- }
- recognitionRef.current = null
- }
- analyserRef.current = null
- setAudioLevels(new Array(200).fill(0))
- setIsListening(false)
- }, [])
+ // Timeout to handle cases where agent doesn't provide audio response
+ const setResponseTimeout = useCallback(() => {
+ if (responseTimeoutRef.current) {
+ clearTimeout(responseTimeoutRef.current)
+ }
+ responseTimeoutRef.current = setTimeout(() => {
+ if (currentStateRef.current === 'listening') {
+ setState('idle')
+ }
+ }, 5000) // 5 second timeout (increased from 3)
+ }, [])
+ const clearResponseTimeout = useCallback(() => {
+ if (responseTimeoutRef.current) {
+ clearTimeout(responseTimeoutRef.current)
+ responseTimeoutRef.current = null
+ }
+ }, [])
+ // Sync with external state
+ useEffect(() => {
+ if (isPlayingAudio && state !== 'agent_speaking') {
+ clearResponseTimeout() // Clear timeout since agent is responding
+ setState('agent_speaking')
+ setCurrentTranscript('')
+ // Mute microphone immediately
+ setIsMuted(true)
+ if (mediaStreamRef.current) {
+ mediaStreamRef.current.getAudioTracks().forEach((track) => {
+ track.enabled = false
+ })
+ }
+ // Stop speech recognition completely
+ if (recognitionRef.current) {
+ try {
+ recognitionRef.current.abort()
+ } catch (error) {
+ logger.debug('Error aborting speech recognition:', error)
+ }
+ }
+ } else if (!isPlayingAudio && state === 'agent_speaking') {
+ setState('idle')
+ setCurrentTranscript('')
+ // Re-enable microphone
+ setIsMuted(false)
+ if (mediaStreamRef.current) {
+ mediaStreamRef.current.getAudioTracks().forEach((track) => {
+ track.enabled = true
+ })
+ }
+ }
+ }, [isPlayingAudio, state, clearResponseTimeout])
- const setupAudioVisualization = useCallback(async () => {
+ // Audio setup
+ const setupAudio = useCallback(async () => {
try {
const stream = await navigator.mediaDevices.getUserMedia({
audio: {
echoCancellation: true,
noiseSuppression: true,
autoGainControl: true,
sampleRate: 44100,
channelCount: 1,
// Enhanced echo cancellation settings to prevent picking up speaker output
suppressLocalAudioPlayback: true, // Modern browsers
googEchoCancellation: true, // Chrome-specific
googAutoGainControl: true,
googNoiseSuppression: true,
googHighpassFilter: true,
googTypingNoiseDetection: true,
} as any, // Type assertion for experimental properties
},
})
setPermissionStatus('granted')
mediaStreamRef.current = stream
// Setup audio context for visualization
if (!audioContextRef.current) {
- const AudioContextConstructor = window.AudioContext || window.webkitAudioContext
- if (!AudioContextConstructor) {
- throw new Error('AudioContext is not supported in this browser')
- }
- audioContextRef.current = new AudioContextConstructor()
+ const AudioContext = window.AudioContext || window.webkitAudioContext
+ audioContextRef.current = new AudioContext()
}
const audioContext = audioContextRef.current
if (audioContext.state === 'suspended') {
await audioContext.resume()
}
const source = audioContext.createMediaStreamSource(stream)
- const gainNode = audioContext.createGain()
- gainNode.gain.setValueAtTime(1, audioContext.currentTime)
- const compressor = audioContext.createDynamicsCompressor()
- compressor.threshold.setValueAtTime(-50, audioContext.currentTime)
- compressor.knee.setValueAtTime(40, audioContext.currentTime)
- compressor.ratio.setValueAtTime(12, audioContext.currentTime)
- compressor.attack.setValueAtTime(0, audioContext.currentTime)
- compressor.release.setValueAtTime(0.25, audioContext.currentTime)
const analyser = audioContext.createAnalyser()
analyser.fftSize = 256
- analyser.smoothingTimeConstant = 0.5
+ analyser.smoothingTimeConstant = 0.8
- source.connect(gainNode)
- gainNode.connect(compressor)
- compressor.connect(analyser)
- audioContextRef.current = audioContext
+ source.connect(analyser)
analyserRef.current = analyser
- compressorRef.current = compressor
- gainNodeRef.current = gainNode
- // Start visualization loop
+ // Start visualization
const updateVisualization = () => {
if (!analyserRef.current) return
if (isMutedRef.current) {
setAudioLevels(new Array(200).fill(0))
animationFrameRef.current = requestAnimationFrame(updateVisualization)
return
}
const bufferLength = analyserRef.current.frequencyBinCount
const dataArray = new Uint8Array(bufferLength)
analyserRef.current.getByteFrequencyData(dataArray)
@@ -210,280 +214,354 @@ export function VoiceInterface({
}
updateVisualization()
setIsInitialized(true)
return true
} catch (error) {
logger.error('Error setting up audio:', error)
setPermissionStatus('denied')
return false
}
- }, [isMuted])
+ }, [])
- // Start listening
- const startListening = useCallback(async () => {
- if (
- !isSupported ||
- !recognitionRef.current ||
- isListening ||
- isMuted ||
- isStartingRef.current
- ) {
- return
- }
- try {
- isStartingRef.current = true
- if (!mediaStreamRef.current) {
- await setupAudioVisualization()
- }
- recognitionRef.current.start()
- } catch (error) {
- isStartingRef.current = false
- logger.error('Error starting voice input:', error)
- setIsListening(false)
- }
- }, [isSupported, isListening, setupAudioVisualization, isMuted])
- const initializeSpeechRecognition = useCallback(() => {
- if (!isSupported || recognitionRef.current) return
+ // Speech recognition setup
+ const setupSpeechRecognition = useCallback(() => {
+ if (!isSupported) return
const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition
if (!SpeechRecognition) return
const recognition = new SpeechRecognition()
recognition.continuous = true
recognition.interimResults = true
recognition.lang = 'en-US'
- recognition.onstart = () => {
- isStartingRef.current = false
- setIsListening(true)
- onVoiceStart?.()
- }
+ recognition.onstart = () => {}
recognition.onresult = (event: SpeechRecognitionEvent) => {
- // Don't process results if muted
- if (isMutedRef.current) {
+ const currentState = currentStateRef.current
+ if (isMutedRef.current || currentState !== 'listening') {
return
}
let finalTranscript = ''
+ let interimTranscript = ''
for (let i = event.resultIndex; i < event.results.length; i++) {
const result = event.results[i]
+ const transcript = result[0].transcript
if (result.isFinal) {
- finalTranscript += result[0].transcript
+ finalTranscript += transcript
+ } else {
+ interimTranscript += transcript
}
}
- if (finalTranscript) {
- if (isPlayingAudio) {
- const cleanTranscript = finalTranscript.trim().toLowerCase()
- const isSubstantialSpeech = cleanTranscript.length >= 10
- const hasMultipleWords = cleanTranscript.split(/\s+/).length >= 3
- if (isSubstantialSpeech && hasMultipleWords) {
- onInterrupt?.()
- onVoiceTranscript?.(finalTranscript)
- }
- } else {
- onVoiceTranscript?.(finalTranscript)
- }
- }
+ // Update live transcript
+ setCurrentTranscript(interimTranscript || finalTranscript)
+ // Send final transcript (but keep listening state until agent responds)
+ if (finalTranscript.trim()) {
+ setCurrentTranscript('') // Clear transcript
+ // Stop recognition to avoid interference while waiting for response
+ if (recognitionRef.current) {
+ try {
+ recognitionRef.current.stop()
+ } catch (error) {
+ // Ignore
+ }
+ }
+ // Start timeout in case agent doesn't provide audio response
+ setResponseTimeout()
+ onVoiceTranscript?.(finalTranscript)
+ }
}
- recognition.onerror = (event: SpeechRecognitionErrorEvent) => {
- isStartingRef.current = false
- logger.error('Speech recognition error:', event.error)
- if (event.error === 'not-allowed') {
- setPermissionStatus('denied')
- setIsListening(false)
- onVoiceEnd?.()
- return
- }
- if (!isMutedRef.current && !isStartingRef.current) {
- setTimeout(() => {
- if (recognitionRef.current && !isMutedRef.current && !isStartingRef.current) {
- startListening()
- }
- }, 500)
- }
- }
recognition.onend = () => {
- isStartingRef.current = false
- setIsListening(false)
- onVoiceEnd?.()
- if (!isMutedRef.current && !isStartingRef.current) {
- setTimeout(() => {
- if (recognitionRef.current && !isMutedRef.current && !isStartingRef.current) {
- startListening()
- }
- }, 200)
- }
+ const currentState = currentStateRef.current
+ // Only restart recognition if we're in listening state and not muted
+ if (currentState === 'listening' && !isMutedRef.current) {
+ // Add a delay to avoid immediate restart after sending transcript
+ setTimeout(() => {
+ // Double-check state hasn't changed during delay
+ if (
+ recognitionRef.current &&
+ currentStateRef.current === 'listening' &&
+ !isMutedRef.current
+ ) {
+ try {
+ recognitionRef.current.start()
+ } catch (error) {
+ logger.debug('Error restarting speech recognition:', error)
+ }
+ }
+ }, 1000) // Longer delay to give agent time to respond
+ }
}
+ recognition.onerror = (event: SpeechRecognitionErrorEvent) => {
+ // Filter out "aborted" errors - these are expected when we intentionally stop recognition
+ if (event.error === 'aborted') {
+ // Ignore
+ return
+ }
+ if (event.error === 'not-allowed') {
+ setPermissionStatus('denied')
+ }
+ }
recognitionRef.current = recognition
setIsInitialized(true)
- }, [
- isSupported,
- isPlayingAudio,
- isMuted,
- onVoiceStart,
- onVoiceEnd,
- onVoiceTranscript,
- onInterrupt,
- startListening,
- ])
+ }, [isSupported, onVoiceTranscript, setResponseTimeout])
- const toggleMute = useCallback(() => {
- const newMutedState = !isMuted
- if (newMutedState) {
- if (recognitionRef.current) {
- try {
- recognitionRef.current.stop()
- } catch (e) {
- // Ignore errors
- }
- }
- if (mediaStreamRef.current) {
- mediaStreamRef.current.getAudioTracks().forEach((track) => {
- track.enabled = false
- })
- }
- setIsListening(false)
- } else {
- if (mediaStreamRef.current) {
- mediaStreamRef.current.getAudioTracks().forEach((track) => {
- track.enabled = true
- })
- }
- setTimeout(() => {
- if (!isMutedRef.current) {
- startListening()
- }
- }, 200)
- }
- setIsMuted(newMutedState)
- }, [isMuted, isListening, startListening])
- const handleEndCall = useCallback(() => {
- cleanup()
- onCallEnd?.()
- }, [cleanup, onCallEnd])
+ // Start/stop listening
+ const startListening = useCallback(() => {
+ if (!isInitialized || isMuted || state !== 'idle') {
+ return
+ }
+ isStartingRef.current = false
+ setState('listening')
+ setCurrentTranscript('')
+ if (recognitionRef.current) {
+ try {
+ recognitionRef.current.start()
+ } catch (error) {
+ logger.error('Error starting recognition:', error)
+ }
+ }
+ }, [isInitialized, isMuted, state])
+ const stopListening = useCallback(() => {
+ setState('idle')
+ setCurrentTranscript('')
+ if (recognitionRef.current) {
+ try {
+ recognitionRef.current.stop()
+ } catch (error) {
+ // Ignore
+ }
+ }
+ }, [])
+ // Handle interrupt
+ const handleInterrupt = useCallback(() => {
+ if (state === 'agent_speaking') {
+ // Clear any subtitle timeouts and text
+ // (No longer needed after removing subtitle system)
+ onInterrupt?.()
+ setState('listening')
+ setCurrentTranscript('')
+ // Unmute microphone for user input
+ setIsMuted(false)
+ if (mediaStreamRef.current) {
+ mediaStreamRef.current.getAudioTracks().forEach((track) => {
+ track.enabled = true
+ })
+ }
+ // Start listening immediately
+ if (recognitionRef.current) {
+ try {
+ recognitionRef.current.start()
+ } catch (error) {
+ logger.error('Could not start recognition after interrupt:', error)
+ }
+ }
+ }
+ }, [state, onInterrupt])
+ // Handle call end with proper cleanup
+ const handleCallEnd = useCallback(() => {
+ // Stop everything immediately
+ setState('idle')
+ setCurrentTranscript('')
+ setIsMuted(false)
+ // Stop speech recognition
+ if (recognitionRef.current) {
+ try {
+ recognitionRef.current.abort()
+ } catch (error) {
+ logger.error('Error stopping speech recognition:', error)
+ }
+ }
+ // Clear timeouts
+ clearResponseTimeout()
+ // Stop audio playback and streaming immediately
+ onInterrupt?.()
+ // Call the original onCallEnd
+ onCallEnd?.()
+ }, [onCallEnd, onInterrupt, clearResponseTimeout])
- const getStatusText = () => {
- if (isStreaming) return 'Thinking...'
- if (isPlayingAudio) return 'Speaking...'
- if (isListening) return 'Listening...'
- return 'Ready'
- }
// Keyboard handler
useEffect(() => {
const handleKeyDown = (event: KeyboardEvent) => {
if (event.code === 'Space') {
event.preventDefault()
handleInterrupt()
}
}
document.addEventListener('keydown', handleKeyDown)
return () => document.removeEventListener('keydown', handleKeyDown)
}, [handleInterrupt])
// Mute toggle
const toggleMute = useCallback(() => {
if (state === 'agent_speaking') {
handleInterrupt()
return
}
const newMutedState = !isMuted
setIsMuted(newMutedState)
if (mediaStreamRef.current) {
mediaStreamRef.current.getAudioTracks().forEach((track) => {
track.enabled = !newMutedState
})
}
if (newMutedState) {
stopListening()
} else if (state === 'idle') {
startListening()
}
}, [isMuted, state, handleInterrupt, stopListening, startListening])
// Initialize
useEffect(() => {
if (isSupported) {
- initializeSpeechRecognition()
+ setupSpeechRecognition()
+ setupAudio()
}
- }, [isSupported, initializeSpeechRecognition])
+ }, [isSupported, setupSpeechRecognition, setupAudio])
// Auto-start listening when ready
useEffect(() => {
- if (isInitialized && !isMuted && !isListening) {
- const startAudio = async () => {
- try {
- if (!mediaStreamRef.current) {
- const success = await setupAudioVisualization()
- if (!success) {
- logger.error('Failed to setup audio visualization')
- return
- }
- }
- setTimeout(() => {
- if (!isListening && !isMuted && !isStartingRef.current) {
- startListening()
- }
- }, 300)
- } catch (error) {
- logger.error('Error setting up audio:', error)
- }
- }
- startAudio()
- }
- }, [isInitialized, isMuted, isListening, setupAudioVisualization, startListening])
+ if (isInitialized && !isMuted && state === 'idle') {
+ startListening()
+ }
+ }, [isInitialized, isMuted, state, startListening])
- // Gain ducking during audio playback
- useEffect(() => {
- if (gainNodeRef.current && audioContextRef.current) {
- const gainNode = gainNodeRef.current
- const audioContext = audioContextRef.current
- if (isPlayingAudio) {
- gainNode.gain.setTargetAtTime(0.1, audioContext.currentTime, 0.1)
- } else {
- gainNode.gain.setTargetAtTime(1, audioContext.currentTime, 0.2)
- }
- }
- }, [isPlayingAudio])
// Cleanup when call ends or component unmounts
useEffect(() => {
return () => {
- cleanup()
// Stop speech recognition
if (recognitionRef.current) {
try {
recognitionRef.current.abort()
} catch (error) {
// Ignore
}
recognitionRef.current = null
}
// Stop media stream
if (mediaStreamRef.current) {
mediaStreamRef.current.getTracks().forEach((track) => {
track.stop()
})
mediaStreamRef.current = null
}
// Stop audio context
if (audioContextRef.current) {
audioContextRef.current.close()
audioContextRef.current = null
}
// Cancel animation frame
if (animationFrameRef.current) {
cancelAnimationFrame(animationFrameRef.current)
animationFrameRef.current = null
}
// Clear timeouts
if (responseTimeoutRef.current) {
clearTimeout(responseTimeoutRef.current)
responseTimeoutRef.current = null
}
}
- }, [cleanup])
+ }, [])
// Get status text
const getStatusText = () => {
switch (state) {
case 'listening':
return 'Listening...'
case 'agent_speaking':
return 'Press Space or tap to interrupt'
default:
return isInitialized ? 'Ready' : 'Initializing...'
}
}
// Get button content
const getButtonContent = () => {
if (state === 'agent_speaking') {
return (
<svg className='w-6 h-6' viewBox='0 0 24 24' fill='currentColor'>
<rect x='6' y='6' width='12' height='12' rx='2' />
</svg>
)
}
return isMuted ? <MicOff className='h-6 w-6' /> : <Mic className='h-6 w-6' />
}
return (
<div className={cn('fixed inset-0 z-[100] flex flex-col bg-white text-gray-900', className)}>
- {/* Header with close button */}
- <div className='flex justify-end p-4'>
- <Button
- variant='ghost'
- size='icon'
- onClick={handleEndCall}
- className='h-10 w-10 rounded-full hover:bg-gray-100'
- >
- <X className='h-5 w-5' />
- </Button>
- </div>
- {/* Main content area */}
+ {/* Main content */}
<div className='flex flex-1 flex-col items-center justify-center px-8'>
{/* Voice visualization */}
<div className='relative mb-16'>
<ParticlesVisualization
audioLevels={audioLevels}
- isListening={isListening}
- isPlayingAudio={isPlayingAudio}
+ isListening={state === 'listening'}
+ isPlayingAudio={state === 'agent_speaking'}
isStreaming={isStreaming}
isMuted={isMuted}
isProcessingInterruption={false}
className='w-80 h-80 md:w-96 md:h-96'
/>
</div>
- {/* Status text */}
- <div className='mb-8 text-center'>
- <p className='font-light text-gray-600 text-lg'>
- {getStatusText()}
- {isMuted && <span className='ml-2 text-gray-400 text-sm'>(Muted)</span>}
- </p>
- </div>
+ {/* Live transcript - subtitle style */}
+ <div className='mb-16 h-24 flex items-center justify-center'>
+ {currentTranscript && (
+ <div className='max-w-2xl px-8'>
+ <p className='text-xl text-gray-700 text-center leading-relaxed overflow-hidden'>
+ {currentTranscript}
+ </p>
+ </div>
+ )}
+ </div>
+ {/* Status */}
+ <p className='text-lg text-gray-600 mb-8 text-center'>
+ {getStatusText()}
+ {isMuted && <span className='ml-2 text-gray-400 text-sm'>(Muted)</span>}
+ </p>
</div>
- {/* Bottom controls */}
+ {/* Controls */}
<div className='px-8 pb-12'>
<div className='flex items-center justify-center space-x-12'>
- {/* End call button */}
+ {/* End call */}
<Button
- onClick={handleEndCall}
+ onClick={handleCallEnd}
variant='outline'
size='icon'
className='h-14 w-14 rounded-full border-gray-300 hover:bg-gray-50'
@@ -491,17 +569,18 @@ export function VoiceInterface({
<Phone className='h-6 w-6 rotate-[135deg]' />
</Button>
- {/* Mute/unmute button */}
+ {/* Mic/Stop button */}
<Button
onClick={toggleMute}
variant='outline'
size='icon'
+ disabled={!isInitialized}
className={cn(
- 'h-14 w-14 rounded-full border-gray-300 bg-transparent text-gray-600 hover:bg-gray-50',
- isMuted && 'text-gray-400'
+ 'h-14 w-14 rounded-full border-gray-300 bg-transparent hover:bg-gray-50',
+ isMuted ? 'text-gray-400' : 'text-gray-600'
)}
>
- {isMuted ? <MicOff className='h-6 w-6' /> : <Mic className='h-6 w-6' />}
+ {getButtonContent()}
</Button>
</div>
</div>
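
For orientation, the turn-taking behavior the reworked VoiceInterface implements can be summarized as a small state machine. The sketch below is illustrative only: the component drives these transitions with React state, refs, and effects rather than a reducer, and the event names are mine.

```ts
type VoiceState = 'idle' | 'listening' | 'agent_speaking'

type VoiceEvent =
  | { type: 'START_LISTENING' }     // auto-start when ready, or user unmutes
  | { type: 'FINAL_TRANSCRIPT' }    // transcript sent; keep listening, arm a ~5s fallback
  | { type: 'AGENT_AUDIO_STARTED' } // isPlayingAudio -> true: mute mic, abort recognition
  | { type: 'AGENT_AUDIO_ENDED' }   // isPlayingAudio -> false: unmute, return to idle
  | { type: 'INTERRUPT' }           // Space key or the mic/stop button while agent speaks
  | { type: 'RESPONSE_TIMEOUT' }    // no agent audio arrived within the fallback window

function nextState(state: VoiceState, event: VoiceEvent): VoiceState {
  switch (event.type) {
    case 'START_LISTENING':
      return state === 'idle' ? 'listening' : state
    case 'FINAL_TRANSCRIPT':
      return state // stays 'listening' until audio starts or the timeout fires
    case 'AGENT_AUDIO_STARTED':
      return 'agent_speaking'
    case 'AGENT_AUDIO_ENDED':
      return state === 'agent_speaking' ? 'idle' : state
    case 'INTERRUPT':
      return state === 'agent_speaking' ? 'listening' : state
    case 'RESPONSE_TIMEOUT':
      return state === 'listening' ? 'idle' : state
  }
}
```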