From b94c83aacc9093ce480aa270971ac5baa177f311 Mon Sep 17 00:00:00 2001
From: Ubbe
Date: Thu, 29 Jan 2026 17:46:36 +0700
Subject: [PATCH] feat(frontend): Copilot speech to text via Whisper model (#11871)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Changes 🏗️

https://github.com/user-attachments/assets/d9c12ac0-625c-4b38-8834-e494b5eda9c0

Add a "speech to text" feature in the Chat input box of Copilot, similar to what you have in ChatGPT.

## Checklist 📋

### For code changes:

- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan:
  - [x] Run locally and try the speech to text feature as part of the chat input box

### For configuration changes:

We need to add `OPENAI_API_KEY=` to Vercel (used in the frontend), both in Dev and Prod.

- [x] `.env.default` is updated or already compatible with my changes

---------

Co-authored-by: Claude Opus 4.5
---
 AGENTS.md                                     |  24 +-
 autogpt_platform/CLAUDE.md                    |  16 +-
 autogpt_platform/frontend/.env.default        |   3 +
 .../SessionsList/useSessionsPagination.ts     |   4 +-
 .../frontend/src/app/api/transcribe/route.ts  |  77 ++++++
 .../Chat/components/ChatInput/ChatInput.tsx   | 157 +++++++++---
 .../ChatInput/components/AudioWaveform.tsx    | 142 +++++++++++
 .../components/RecordingIndicator.tsx         |  26 ++
 .../Chat/components/ChatInput/helpers.ts      |   6 +
 .../Chat/components/ChatInput/useChatInput.ts |   4 +-
 .../components/ChatInput/useVoiceRecording.ts | 240 ++++++++++++++++++
 11 files changed, 626 insertions(+), 73 deletions(-)
 create mode 100644 autogpt_platform/frontend/src/app/api/transcribe/route.ts
 create mode 100644 autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/components/AudioWaveform.tsx
 create mode 100644 autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/components/RecordingIndicator.tsx
 create mode 100644 autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/helpers.ts
 create mode 100644 autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/useVoiceRecording.ts

diff --git a/AGENTS.md b/AGENTS.md
index cd176f8a2d..202c4c6e02 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -16,7 +16,6 @@ See `docs/content/platform/getting-started.md` for setup instructions.
 - Format Python code with `poetry run format`.
 - Format frontend code using `pnpm format`.
-
 ## Frontend guidelines:
 
 See `/frontend/CONTRIBUTING.md` for complete patterns. Quick reference:
@@ -33,14 +32,17 @@ See `/frontend/CONTRIBUTING.md` for complete patterns. Quick reference:
 4. **Styling**: Tailwind CSS only, use design tokens, Phosphor Icons only
 5. **Testing**: Add Storybook stories for new components, Playwright for E2E
 6. **Code conventions**: Function declarations (not arrow functions) for components/handlers
+  - Component props should be `interface Props { ...
}` (not exported) unless the interface needs to be used outside the component - Separate render logic from business logic (component.tsx + useComponent.ts + helpers.ts) - Colocate state when possible and avoid creating large components, use sub-components ( local `/components` folder next to the parent component ) when sensible - Avoid large hooks, abstract logic into `helpers.ts` files when sensible - Use function declarations for components, arrow functions only for callbacks - No barrel files or `index.ts` re-exports -- Do not use `useCallback` or `useMemo` unless strictly needed - Avoid comments at all times unless the code is very complex +- Do not use `useCallback` or `useMemo` unless asked to optimise a given function +- Do not type hook returns, let Typescript infer as much as possible +- Never type with `any`, if not types available use `unknown` ## Testing @@ -49,22 +51,8 @@ See `/frontend/CONTRIBUTING.md` for complete patterns. Quick reference: Always run the relevant linters and tests before committing. Use conventional commit messages for all commits (e.g. `feat(backend): add API`). - Types: - - feat - - fix - - refactor - - ci - - dx (developer experience) - Scopes: - - platform - - platform/library - - platform/marketplace - - backend - - backend/executor - - frontend - - frontend/library - - frontend/marketplace - - blocks +Types: - feat - fix - refactor - ci - dx (developer experience) +Scopes: - platform - platform/library - platform/marketplace - backend - backend/executor - frontend - frontend/library - frontend/marketplace - blocks ## Pull requests diff --git a/autogpt_platform/CLAUDE.md b/autogpt_platform/CLAUDE.md index 9690178587..a5a588b667 100644 --- a/autogpt_platform/CLAUDE.md +++ b/autogpt_platform/CLAUDE.md @@ -85,17 +85,6 @@ pnpm format pnpm types ``` -**📖 Complete Guide**: See `/frontend/CONTRIBUTING.md` and `/frontend/.cursorrules` for comprehensive frontend patterns. - -**Key Frontend Conventions:** - -- Separate render logic from data/behavior in components -- Use generated API hooks from `@/app/api/__generated__/endpoints/` -- Use function declarations (not arrow functions) for components/handlers -- Use design system components from `src/components/` (atoms, molecules, organisms) -- Only use Phosphor Icons -- Never use `src/components/__legacy__/*` or deprecated `BackendAPI` - ## Architecture Overview ### Backend Architecture @@ -261,14 +250,17 @@ See `/frontend/CONTRIBUTING.md` for complete patterns. Quick reference: 4. **Styling**: Tailwind CSS only, use design tokens, Phosphor Icons only 5. **Testing**: Add Storybook stories for new components, Playwright for E2E 6. **Code conventions**: Function declarations (not arrow functions) for components/handlers + - Component props should be `interface Props { ... 
}` (not exported) unless the interface needs to be used outside the component - Separate render logic from business logic (component.tsx + useComponent.ts + helpers.ts) - Colocate state when possible and avoid creating large components, use sub-components ( local `/components` folder next to the parent component ) when sensible - Avoid large hooks, abstract logic into `helpers.ts` files when sensible - Use function declarations for components, arrow functions only for callbacks - No barrel files or `index.ts` re-exports -- Do not use `useCallback` or `useMemo` unless strictly needed +- Do not use `useCallback` or `useMemo` unless asked to optimise a given function - Avoid comments at all times unless the code is very complex +- Do not type hook returns, let Typescript infer as much as possible +- Never type with `any`, if not types available use `unknown` ### Security Implementation diff --git a/autogpt_platform/frontend/.env.default b/autogpt_platform/frontend/.env.default index af250fb8bf..7a9d81e39e 100644 --- a/autogpt_platform/frontend/.env.default +++ b/autogpt_platform/frontend/.env.default @@ -34,3 +34,6 @@ NEXT_PUBLIC_PREVIEW_STEALING_DEV= # PostHog Analytics NEXT_PUBLIC_POSTHOG_KEY= NEXT_PUBLIC_POSTHOG_HOST=https://eu.i.posthog.com + +# OpenAI (for voice transcription) +OPENAI_API_KEY= diff --git a/autogpt_platform/frontend/src/app/(platform)/copilot/components/CopilotShell/components/SessionsList/useSessionsPagination.ts b/autogpt_platform/frontend/src/app/(platform)/copilot/components/CopilotShell/components/SessionsList/useSessionsPagination.ts index 11ddd937af..61e3e6f37f 100644 --- a/autogpt_platform/frontend/src/app/(platform)/copilot/components/CopilotShell/components/SessionsList/useSessionsPagination.ts +++ b/autogpt_platform/frontend/src/app/(platform)/copilot/components/CopilotShell/components/SessionsList/useSessionsPagination.ts @@ -73,9 +73,9 @@ export function useSessionsPagination({ enabled }: UseSessionsPaginationArgs) { }; const reset = () => { + // Only reset the offset - keep existing sessions visible during refetch + // The effect will replace sessions when new data arrives at offset 0 setOffset(0); - setAccumulatedSessions([]); - setTotalCount(null); }; return { diff --git a/autogpt_platform/frontend/src/app/api/transcribe/route.ts b/autogpt_platform/frontend/src/app/api/transcribe/route.ts new file mode 100644 index 0000000000..10c182cdfa --- /dev/null +++ b/autogpt_platform/frontend/src/app/api/transcribe/route.ts @@ -0,0 +1,77 @@ +import { getServerAuthToken } from "@/lib/autogpt-server-api/helpers"; +import { NextRequest, NextResponse } from "next/server"; + +const WHISPER_API_URL = "https://api.openai.com/v1/audio/transcriptions"; +const MAX_FILE_SIZE = 25 * 1024 * 1024; // 25MB - Whisper's limit + +function getExtensionFromMimeType(mimeType: string): string { + const subtype = mimeType.split("/")[1]?.split(";")[0]; + return subtype || "webm"; +} + +export async function POST(request: NextRequest) { + const token = await getServerAuthToken(); + + if (!token || token === "no-token-found") { + return NextResponse.json({ error: "Unauthorized" }, { status: 401 }); + } + + const apiKey = process.env.OPENAI_API_KEY; + + if (!apiKey) { + return NextResponse.json( + { error: "OpenAI API key not configured" }, + { status: 401 }, + ); + } + + try { + const formData = await request.formData(); + const audioFile = formData.get("audio"); + + if (!audioFile || !(audioFile instanceof Blob)) { + return NextResponse.json( + { error: "No audio file provided" }, + { 
status: 400 }, + ); + } + + if (audioFile.size > MAX_FILE_SIZE) { + return NextResponse.json( + { error: "File too large. Maximum size is 25MB." }, + { status: 413 }, + ); + } + + const ext = getExtensionFromMimeType(audioFile.type); + const whisperFormData = new FormData(); + whisperFormData.append("file", audioFile, `recording.${ext}`); + whisperFormData.append("model", "whisper-1"); + + const response = await fetch(WHISPER_API_URL, { + method: "POST", + headers: { + Authorization: `Bearer ${apiKey}`, + }, + body: whisperFormData, + }); + + if (!response.ok) { + const errorData = await response.json().catch(() => ({})); + console.error("Whisper API error:", errorData); + return NextResponse.json( + { error: errorData.error?.message || "Transcription failed" }, + { status: response.status }, + ); + } + + const result = await response.json(); + return NextResponse.json({ text: result.text }); + } catch (error) { + console.error("Transcription error:", error); + return NextResponse.json( + { error: "Failed to process audio" }, + { status: 500 }, + ); + } +} diff --git a/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/ChatInput.tsx b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/ChatInput.tsx index c45e8dc250..521f6f6320 100644 --- a/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/ChatInput.tsx +++ b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/ChatInput.tsx @@ -1,7 +1,14 @@ import { Button } from "@/components/atoms/Button/Button"; import { cn } from "@/lib/utils"; -import { ArrowUpIcon, StopIcon } from "@phosphor-icons/react"; +import { + ArrowUpIcon, + CircleNotchIcon, + MicrophoneIcon, + StopIcon, +} from "@phosphor-icons/react"; +import { RecordingIndicator } from "./components/RecordingIndicator"; import { useChatInput } from "./useChatInput"; +import { useVoiceRecording } from "./useVoiceRecording"; export interface Props { onSend: (message: string) => void; @@ -21,13 +28,36 @@ export function ChatInput({ className, }: Props) { const inputId = "chat-input"; - const { value, handleKeyDown, handleSubmit, handleChange, hasMultipleLines } = - useChatInput({ - onSend, - disabled: disabled || isStreaming, - maxRows: 4, - inputId, - }); + const { + value, + setValue, + handleKeyDown: baseHandleKeyDown, + handleSubmit, + handleChange, + hasMultipleLines, + } = useChatInput({ + onSend, + disabled: disabled || isStreaming, + maxRows: 4, + inputId, + }); + + const { + isRecording, + isTranscribing, + elapsedTime, + toggleRecording, + handleKeyDown, + showMicButton, + isInputDisabled, + audioStream, + } = useVoiceRecording({ + setValue, + disabled: disabled || isStreaming, + isStreaming, + value, + baseHandleKeyDown, + }); return (
@@ -35,8 +65,11 @@ export function ChatInput({
@@ -46,48 +79,94 @@ export function ChatInput({ value={value} onChange={handleChange} onKeyDown={handleKeyDown} - placeholder={placeholder} - disabled={disabled || isStreaming} + placeholder={ + isTranscribing + ? "Transcribing..." + : isRecording + ? "" + : placeholder + } + disabled={isInputDisabled} rows={1} className={cn( "w-full resize-none overflow-y-auto border-0 bg-transparent text-[1rem] leading-6 text-black", "placeholder:text-zinc-400", "focus:outline-none focus:ring-0", "disabled:text-zinc-500", - hasMultipleLines ? "pb-6 pl-4 pr-4 pt-2" : "pb-4 pl-4 pr-14 pt-4", + hasMultipleLines + ? "pb-6 pl-4 pr-4 pt-2" + : showMicButton + ? "pb-4 pl-14 pr-14 pt-4" + : "pb-4 pl-4 pr-14 pt-4", )} /> + {isRecording && !value && ( +
+ +
+ )}
- Press Enter to send, Shift+Enter for new line + Press Enter to send, Shift+Enter for new line, Space to record voice - {isStreaming ? ( - - ) : ( - + {showMicButton && ( +
+ +
)} + +
+ {isStreaming ? ( + + ) : ( + + )} +
); diff --git a/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/components/AudioWaveform.tsx b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/components/AudioWaveform.tsx new file mode 100644 index 0000000000..10cbb3fc9f --- /dev/null +++ b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/components/AudioWaveform.tsx @@ -0,0 +1,142 @@ +"use client"; + +import { useEffect, useRef, useState } from "react"; + +interface Props { + stream: MediaStream | null; + barCount?: number; + barWidth?: number; + barGap?: number; + barColor?: string; + minBarHeight?: number; + maxBarHeight?: number; +} + +export function AudioWaveform({ + stream, + barCount = 24, + barWidth = 3, + barGap = 2, + barColor = "#ef4444", // red-500 + minBarHeight = 4, + maxBarHeight = 32, +}: Props) { + const [bars, setBars] = useState(() => + Array(barCount).fill(minBarHeight), + ); + const analyserRef = useRef(null); + const audioContextRef = useRef(null); + const sourceRef = useRef(null); + const animationRef = useRef(null); + + useEffect(() => { + if (!stream) { + setBars(Array(barCount).fill(minBarHeight)); + return; + } + + // Create audio context and analyser + const audioContext = new AudioContext(); + const analyser = audioContext.createAnalyser(); + analyser.fftSize = 512; + analyser.smoothingTimeConstant = 0.8; + + // Connect the stream to the analyser + const source = audioContext.createMediaStreamSource(stream); + source.connect(analyser); + + audioContextRef.current = audioContext; + analyserRef.current = analyser; + sourceRef.current = source; + + const timeData = new Uint8Array(analyser.frequencyBinCount); + + const updateBars = () => { + if (!analyserRef.current) return; + + analyserRef.current.getByteTimeDomainData(timeData); + + // Distribute time-domain data across bars + // This shows waveform amplitude, making all bars respond to audio + const newBars: number[] = []; + const samplesPerBar = timeData.length / barCount; + + for (let i = 0; i < barCount; i++) { + // Sample waveform data for this bar + let maxAmplitude = 0; + const startIdx = Math.floor(i * samplesPerBar); + const endIdx = Math.floor((i + 1) * samplesPerBar); + + for (let j = startIdx; j < endIdx && j < timeData.length; j++) { + // Convert to amplitude (distance from center 128) + const amplitude = Math.abs(timeData[j] - 128); + maxAmplitude = Math.max(maxAmplitude, amplitude); + } + + // Map amplitude (0-128) to bar height + const normalized = (maxAmplitude / 128) * 255; + const height = + minBarHeight + (normalized / 255) * (maxBarHeight - minBarHeight); + newBars.push(height); + } + + setBars(newBars); + animationRef.current = requestAnimationFrame(updateBars); + }; + + updateBars(); + + return () => { + if (animationRef.current) { + cancelAnimationFrame(animationRef.current); + } + if (sourceRef.current) { + sourceRef.current.disconnect(); + } + if (audioContextRef.current) { + audioContextRef.current.close(); + } + analyserRef.current = null; + audioContextRef.current = null; + sourceRef.current = null; + }; + }, [stream, barCount, minBarHeight, maxBarHeight]); + + const totalWidth = barCount * barWidth + (barCount - 1) * barGap; + + return ( +
+ {bars.map((height, i) => { + const barHeight = Math.max(minBarHeight, height); + return ( +
+
+
+ ); + })} +
+ ); +} diff --git a/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/components/RecordingIndicator.tsx b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/components/RecordingIndicator.tsx new file mode 100644 index 0000000000..0be0d069bb --- /dev/null +++ b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/components/RecordingIndicator.tsx @@ -0,0 +1,26 @@ +import { formatElapsedTime } from "../helpers"; +import { AudioWaveform } from "./AudioWaveform"; + +type Props = { + elapsedTime: number; + audioStream: MediaStream | null; +}; + +export function RecordingIndicator({ elapsedTime, audioStream }: Props) { + return ( +
+ + + {formatElapsedTime(elapsedTime)} + +
+ ); +} diff --git a/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/helpers.ts b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/helpers.ts new file mode 100644 index 0000000000..26bae8c9d9 --- /dev/null +++ b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/helpers.ts @@ -0,0 +1,6 @@ +export function formatElapsedTime(ms: number): string { + const seconds = Math.floor(ms / 1000); + const minutes = Math.floor(seconds / 60); + const remainingSeconds = seconds % 60; + return `${minutes}:${remainingSeconds.toString().padStart(2, "0")}`; +} diff --git a/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/useChatInput.ts b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/useChatInput.ts index 6fa8e7252b..a053e6080f 100644 --- a/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/useChatInput.ts +++ b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/useChatInput.ts @@ -6,7 +6,7 @@ import { useState, } from "react"; -interface UseChatInputArgs { +interface Args { onSend: (message: string) => void; disabled?: boolean; maxRows?: number; @@ -18,7 +18,7 @@ export function useChatInput({ disabled = false, maxRows = 5, inputId = "chat-input", -}: UseChatInputArgs) { +}: Args) { const [value, setValue] = useState(""); const [hasMultipleLines, setHasMultipleLines] = useState(false); diff --git a/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/useVoiceRecording.ts b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/useVoiceRecording.ts new file mode 100644 index 0000000000..13b625e69c --- /dev/null +++ b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/useVoiceRecording.ts @@ -0,0 +1,240 @@ +import { useToast } from "@/components/molecules/Toast/use-toast"; +import React, { + KeyboardEvent, + useCallback, + useEffect, + useRef, + useState, +} from "react"; + +const MAX_RECORDING_DURATION = 2 * 60 * 1000; // 2 minutes in ms + +interface Args { + setValue: React.Dispatch>; + disabled?: boolean; + isStreaming?: boolean; + value: string; + baseHandleKeyDown: (event: KeyboardEvent) => void; +} + +export function useVoiceRecording({ + setValue, + disabled = false, + isStreaming = false, + value, + baseHandleKeyDown, +}: Args) { + const [isRecording, setIsRecording] = useState(false); + const [isTranscribing, setIsTranscribing] = useState(false); + const [error, setError] = useState(null); + const [elapsedTime, setElapsedTime] = useState(0); + + const mediaRecorderRef = useRef(null); + const chunksRef = useRef([]); + const timerRef = useRef(null); + const startTimeRef = useRef(0); + const streamRef = useRef(null); + const isRecordingRef = useRef(false); + + const isSupported = + typeof window !== "undefined" && + !!(navigator.mediaDevices && navigator.mediaDevices.getUserMedia); + + const clearTimer = useCallback(() => { + if (timerRef.current) { + clearInterval(timerRef.current); + timerRef.current = null; + } + }, []); + + const cleanup = useCallback(() => { + clearTimer(); + if (streamRef.current) { + streamRef.current.getTracks().forEach((track) => track.stop()); + streamRef.current = null; + } + mediaRecorderRef.current = null; + chunksRef.current = []; + setElapsedTime(0); + }, [clearTimer]); + + const handleTranscription = useCallback( + (text: string) => { + setValue((prev) => { + const trimmedPrev = prev.trim(); + if 
(trimmedPrev) { + return `${trimmedPrev} ${text}`; + } + return text; + }); + }, + [setValue], + ); + + const transcribeAudio = useCallback( + async (audioBlob: Blob) => { + setIsTranscribing(true); + setError(null); + + try { + const formData = new FormData(); + formData.append("audio", audioBlob); + + const response = await fetch("/api/transcribe", { + method: "POST", + body: formData, + }); + + if (!response.ok) { + const data = await response.json().catch(() => ({})); + throw new Error(data.error || "Transcription failed"); + } + + const data = await response.json(); + if (data.text) { + handleTranscription(data.text); + } + } catch (err) { + const message = + err instanceof Error ? err.message : "Transcription failed"; + setError(message); + console.error("Transcription error:", err); + } finally { + setIsTranscribing(false); + } + }, + [handleTranscription], + ); + + const stopRecording = useCallback(() => { + if (mediaRecorderRef.current && isRecordingRef.current) { + mediaRecorderRef.current.stop(); + isRecordingRef.current = false; + setIsRecording(false); + clearTimer(); + } + }, [clearTimer]); + + const startRecording = useCallback(async () => { + if (disabled || isRecordingRef.current || isTranscribing) return; + + setError(null); + chunksRef.current = []; + + try { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + streamRef.current = stream; + + const mediaRecorder = new MediaRecorder(stream, { + mimeType: MediaRecorder.isTypeSupported("audio/webm") + ? "audio/webm" + : "audio/mp4", + }); + + mediaRecorderRef.current = mediaRecorder; + + mediaRecorder.ondataavailable = (event) => { + if (event.data.size > 0) { + chunksRef.current.push(event.data); + } + }; + + mediaRecorder.onstop = async () => { + const audioBlob = new Blob(chunksRef.current, { + type: mediaRecorder.mimeType, + }); + + // Cleanup stream + if (streamRef.current) { + streamRef.current.getTracks().forEach((track) => track.stop()); + streamRef.current = null; + } + + if (audioBlob.size > 0) { + await transcribeAudio(audioBlob); + } + }; + + mediaRecorder.start(1000); // Collect data every second + isRecordingRef.current = true; + setIsRecording(true); + startTimeRef.current = Date.now(); + + // Start elapsed time timer + timerRef.current = setInterval(() => { + const elapsed = Date.now() - startTimeRef.current; + setElapsedTime(elapsed); + + // Auto-stop at max duration + if (elapsed >= MAX_RECORDING_DURATION) { + stopRecording(); + } + }, 100); + } catch (err) { + console.error("Failed to start recording:", err); + if (err instanceof DOMException && err.name === "NotAllowedError") { + setError("Microphone permission denied"); + } else { + setError("Failed to access microphone"); + } + cleanup(); + } + }, [disabled, isTranscribing, stopRecording, transcribeAudio, cleanup]); + + const toggleRecording = useCallback(() => { + if (isRecording) { + stopRecording(); + } else { + startRecording(); + } + }, [isRecording, startRecording, stopRecording]); + + const { toast } = useToast(); + + useEffect(() => { + if (error) { + toast({ + title: "Voice recording failed", + description: error, + variant: "destructive", + }); + } + }, [error, toast]); + + const handleKeyDown = useCallback( + (event: KeyboardEvent) => { + if (event.key === " " && !value.trim() && !isTranscribing) { + event.preventDefault(); + toggleRecording(); + return; + } + baseHandleKeyDown(event); + }, + [value, isTranscribing, toggleRecording, baseHandleKeyDown], + ); + + const showMicButton = isSupported && !isStreaming; + 
const isInputDisabled = disabled || isStreaming || isTranscribing; + + // Cleanup on unmount + useEffect(() => { + return () => { + cleanup(); + }; + }, [cleanup]); + + return { + isRecording, + isTranscribing, + error, + elapsedTime, + startRecording, + stopRecording, + toggleRecording, + isSupported, + handleKeyDown, + showMicButton, + isInputDisabled, + audioStream: streamRef.current, + }; +}
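
---

For manual testing outside the Copilot UI, the request/response contract of the new `/api/transcribe` route can be exercised with a small sketch like the one below. It mirrors what `transcribeAudio` in `useVoiceRecording` does; `audioBlob` is assumed to be a recorded `audio/webm` (or `audio/mp4`) Blob under the 25MB Whisper limit, sent from an authenticated browser session (the route returns 401 otherwise).

```ts
// Minimal sketch of the /api/transcribe contract: multipart POST with an
// "audio" field, JSON response of { text } on success or { error } on failure.
async function transcribe(audioBlob: Blob): Promise<string> {
  const formData = new FormData();
  formData.append("audio", audioBlob); // field name expected by the route

  const response = await fetch("/api/transcribe", {
    method: "POST",
    body: formData,
  });

  if (!response.ok) {
    // The route returns { error } with 400 / 401 / 413 / 500 depending on the failure
    const data = await response.json().catch(() => ({ error: "Transcription failed" }));
    throw new Error(data.error ?? "Transcription failed");
  }

  const data = await response.json();
  return data.text as string; // { text } is the success payload
}
```

On failure the route forwards the Whisper error message where available, so surfacing `data.error` directly (as the toast in `useVoiceRecording` does) gives the user the most specific reason.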
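The new `formatElapsedTime` helper is pure, so it is easy to cover with a unit test. A minimal sketch, assuming a Jest/Vitest-style runner is available in the frontend package (the `vitest` import and the test file location next to `helpers.ts` are assumptions, not part of this PR):

```ts
// Hypothetical helpers.test.ts colocated with helpers.ts; the vitest import is an assumption.
import { describe, expect, it } from "vitest";

import { formatElapsedTime } from "./helpers";

describe("formatElapsedTime", () => {
  it("formats elapsed milliseconds as m:ss", () => {
    expect(formatElapsedTime(0)).toBe("0:00");
    expect(formatElapsedTime(5_000)).toBe("0:05");
    expect(formatElapsedTime(65_000)).toBe("1:05");
    // 2 minutes is the MAX_RECORDING_DURATION enforced by useVoiceRecording
    expect(formatElapsedTime(2 * 60 * 1000)).toBe("2:00");
  });
});
```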