From 4994de9d0bd3ea15179de36a4c8bcc7dfeb726f5 Mon Sep 17 00:00:00 2001
From: Lluis Agusti
Date: Wed, 28 Jan 2026 23:46:39 +0700
Subject: [PATCH] chore: improvements

---
 autogpt_platform/frontend/.env.default        |   3 +
 .../SessionsList/useSessionsPagination.ts     |   4 +-
 .../frontend/src/app/api/transcribe/route.ts  |  64 ++++++
 .../Chat/components/ChatInput/ChatInput.tsx   | 191 +++++++++++++----
 .../components/ChatInput/useVoiceRecording.ts | 198 ++++++++++++++++++
 5 files changed, 419 insertions(+), 41 deletions(-)
 create mode 100644 autogpt_platform/frontend/src/app/api/transcribe/route.ts
 create mode 100644 autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/useVoiceRecording.ts

diff --git a/autogpt_platform/frontend/.env.default b/autogpt_platform/frontend/.env.default
index af250fb8bf..7a9d81e39e 100644
--- a/autogpt_platform/frontend/.env.default
+++ b/autogpt_platform/frontend/.env.default
@@ -34,3 +34,6 @@ NEXT_PUBLIC_PREVIEW_STEALING_DEV=
 # PostHog Analytics
 NEXT_PUBLIC_POSTHOG_KEY=
 NEXT_PUBLIC_POSTHOG_HOST=https://eu.i.posthog.com
+
+# OpenAI (for voice transcription)
+OPENAI_API_KEY=
diff --git a/autogpt_platform/frontend/src/app/(platform)/copilot/components/CopilotShell/components/SessionsList/useSessionsPagination.ts b/autogpt_platform/frontend/src/app/(platform)/copilot/components/CopilotShell/components/SessionsList/useSessionsPagination.ts
index 11ddd937af..61e3e6f37f 100644
--- a/autogpt_platform/frontend/src/app/(platform)/copilot/components/CopilotShell/components/SessionsList/useSessionsPagination.ts
+++ b/autogpt_platform/frontend/src/app/(platform)/copilot/components/CopilotShell/components/SessionsList/useSessionsPagination.ts
@@ -73,9 +73,9 @@ export function useSessionsPagination({ enabled }: UseSessionsPaginationArgs) {
   };
 
   const reset = () => {
+    // Only reset the offset - keep existing sessions visible during refetch
+    // The effect will replace sessions when new data arrives at offset 0
     setOffset(0);
-    setAccumulatedSessions([]);
-    setTotalCount(null);
   };
 
   return {
diff --git a/autogpt_platform/frontend/src/app/api/transcribe/route.ts b/autogpt_platform/frontend/src/app/api/transcribe/route.ts
new file mode 100644
index 0000000000..8cba1f5e40
--- /dev/null
+++ b/autogpt_platform/frontend/src/app/api/transcribe/route.ts
@@ -0,0 +1,64 @@
+import { NextRequest, NextResponse } from "next/server";
+
+const WHISPER_API_URL = "https://api.openai.com/v1/audio/transcriptions";
+const MAX_FILE_SIZE = 25 * 1024 * 1024; // 25MB - Whisper's limit
+
+export async function POST(request: NextRequest) {
+  const apiKey = process.env.OPENAI_API_KEY;
+
+  if (!apiKey) {
+    // Missing server configuration, not a client auth failure
+    return NextResponse.json(
+      { error: "OpenAI API key not configured" },
+      { status: 500 },
+    );
+  }
+
+  try {
+    const formData = await request.formData();
+    const audioFile = formData.get("audio");
+
+    if (!audioFile || !(audioFile instanceof Blob)) {
+      return NextResponse.json(
+        { error: "No audio file provided" },
+        { status: 400 },
+      );
+    }
+
+    if (audioFile.size > MAX_FILE_SIZE) {
+      return NextResponse.json(
+        { error: "File too large. Maximum size is 25MB." },
+        { status: 413 },
+      );
+    }
+
+    const whisperFormData = new FormData();
+    whisperFormData.append("file", audioFile, "recording.webm");
+    whisperFormData.append("model", "whisper-1");
+
+    const response = await fetch(WHISPER_API_URL, {
+      method: "POST",
+      headers: {
+        Authorization: `Bearer ${apiKey}`,
+      },
+      body: whisperFormData,
+    });
+
+    if (!response.ok) {
+      const errorData = await response.json().catch(() => ({}));
+      console.error("Whisper API error:", errorData);
+      return NextResponse.json(
+        { error: errorData.error?.message || "Transcription failed" },
+        { status: response.status },
+      );
+    }
+
+    const result = await response.json();
+    return NextResponse.json({ text: result.text });
+  } catch (error) {
+    console.error("Transcription error:", error);
+    return NextResponse.json(
+      { error: "Failed to process audio" },
+      { status: 500 },
+    );
+  }
+}
diff --git a/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/ChatInput.tsx b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/ChatInput.tsx
index c45e8dc250..c37839797e 100644
--- a/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/ChatInput.tsx
+++ b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/ChatInput.tsx
@@ -1,7 +1,21 @@
 import { Button } from "@/components/atoms/Button/Button";
 import { cn } from "@/lib/utils";
-import { ArrowUpIcon, StopIcon } from "@phosphor-icons/react";
+import {
+  ArrowUpIcon,
+  CircleNotchIcon,
+  MicrophoneIcon,
+  StopIcon,
+} from "@phosphor-icons/react";
+import { KeyboardEvent, useCallback } from "react";
 import { useChatInput } from "./useChatInput";
+import { useVoiceRecording } from "./useVoiceRecording";
+
+function formatElapsedTime(ms: number): string {
+  const seconds = Math.floor(ms / 1000);
+  const minutes = Math.floor(seconds / 60);
+  const remainingSeconds = seconds % 60;
+  return `${minutes}:${remainingSeconds.toString().padStart(2, "0")}`;
+}
 
 export interface Props {
   onSend: (message: string) => void;
@@ -21,13 +35,60 @@ export function ChatInput({
   className,
 }: Props) {
   const inputId = "chat-input";
-  const { value, handleKeyDown, handleSubmit, handleChange, hasMultipleLines } =
-    useChatInput({
-      onSend,
-      disabled: disabled || isStreaming,
-      maxRows: 4,
-      inputId,
-    });
+  const {
+    value,
+    setValue,
+    handleKeyDown: baseHandleKeyDown,
+    handleSubmit,
+    handleChange,
+    hasMultipleLines,
+  } = useChatInput({
+    onSend,
+    disabled: disabled || isStreaming,
+    maxRows: 4,
+    inputId,
+  });
+
+  const handleTranscription = useCallback(
+    (text: string) => {
+      setValue((prev) => {
+        const trimmedPrev = prev.trim();
+        if (trimmedPrev) {
+          return `${trimmedPrev} ${text}`;
+        }
+        return text;
+      });
+    },
+    [setValue],
+  );
+
+  const {
+    isRecording,
+    isTranscribing,
+    error: voiceError,
+    elapsedTime,
+    toggleRecording,
+    isSupported: isVoiceSupported,
+  } = useVoiceRecording({
+    onTranscription: handleTranscription,
+    disabled: disabled || isStreaming,
+  });
+
+  const handleKeyDown = useCallback(
+    (event: KeyboardEvent) => {
+      // Space key toggles recording when input is empty
+      if (event.key === " " && !value.trim() && !isTranscribing) {
+        event.preventDefault();
+        toggleRecording();
+        return;
+      }
+      baseHandleKeyDown(event);
+    },
+    [value, isTranscribing, toggleRecording, baseHandleKeyDown],
+  );
+
+  const showMicButton = isVoiceSupported && !isStreaming;
+  const isInputDisabled = disabled || isStreaming || isTranscribing;
 
   return (
@@ -35,8 +96,11 @@ export function ChatInput({
@@ -46,48 +110,97 @@ export function ChatInput({
         <textarea
           value={value}
           onChange={handleChange}
           onKeyDown={handleKeyDown}
-          placeholder={placeholder}
-          disabled={disabled || isStreaming}
+          placeholder={
+            isTranscribing
+              ? "Transcribing..."
+              : isRecording
+                ? "Recording... Press Space or click mic to stop"
+                : placeholder
+          }
+          disabled={isInputDisabled}
           rows={1}
           className={cn(
             "w-full resize-none overflow-y-auto border-0 bg-transparent text-[1rem] leading-6 text-black",
             "placeholder:text-zinc-400",
             "focus:outline-none focus:ring-0",
             "disabled:text-zinc-500",
-            hasMultipleLines ? "pb-6 pl-4 pr-4 pt-2" : "pb-4 pl-4 pr-14 pt-4",
+            hasMultipleLines
+              ? "pb-6 pl-4 pr-4 pt-2"
+              : showMicButton
+                ? "pb-4 pl-14 pr-14 pt-4"
+                : "pb-4 pl-4 pr-14 pt-4",
           )}
         />
         <p>
-          Press Enter to send, Shift+Enter for new line
+          Press Enter to send, Shift+Enter for new line, Space to record voice
         </p>
 
-        {isStreaming ? (
-          <Button>
-            <StopIcon />
-          </Button>
-        ) : (
-          <Button onClick={handleSubmit}>
-            <ArrowUpIcon />
-          </Button>
-        )}
+        {voiceError && (
+          <p>{voiceError}</p>
+        )}
+
+        {showMicButton && (
+          <div>
+            <Button onClick={toggleRecording} disabled={disabled || isTranscribing}>
+              {/* spinner placement while a transcription request is in flight */}
+              {isTranscribing ? <CircleNotchIcon /> : <MicrophoneIcon />}
+            </Button>
+            {isRecording && (
+              <span>{formatElapsedTime(elapsedTime)}</span>
+            )}
+          </div>
+        )}
+
+        <div>
+          {isStreaming ? (
+            <Button>
+              <StopIcon />
+            </Button>
+          ) : (
+            <Button onClick={handleSubmit} disabled={isInputDisabled}>
+              <ArrowUpIcon />
+            </Button>
+          )}
+        </div>
  );
 }
diff --git a/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/useVoiceRecording.ts b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/useVoiceRecording.ts
new file mode 100644
index 0000000000..8c72fb5bc8
--- /dev/null
+++ b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/useVoiceRecording.ts
@@ -0,0 +1,198 @@
+import { useCallback, useEffect, useRef, useState } from "react";
+
+const MAX_RECORDING_DURATION = 2 * 60 * 1000; // 2 minutes in ms
+
+interface UseVoiceRecordingArgs {
+  onTranscription: (text: string) => void;
+  disabled?: boolean;
+}
+
+interface UseVoiceRecordingReturn {
+  isRecording: boolean;
+  isTranscribing: boolean;
+  error: string | null;
+  elapsedTime: number;
+  startRecording: () => Promise<void>;
+  stopRecording: () => void;
+  toggleRecording: () => void;
+  isSupported: boolean;
+}
+
+export function useVoiceRecording({
+  onTranscription,
+  disabled = false,
+}: UseVoiceRecordingArgs): UseVoiceRecordingReturn {
+  const [isRecording, setIsRecording] = useState(false);
+  const [isTranscribing, setIsTranscribing] = useState(false);
+  const [error, setError] = useState<string | null>(null);
+  const [elapsedTime, setElapsedTime] = useState(0);
+
+  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
+  const chunksRef = useRef<Blob[]>([]);
+  const timerRef = useRef<ReturnType<typeof setInterval> | null>(null);
+  const startTimeRef = useRef(0);
+  const streamRef = useRef<MediaStream | null>(null);
+
+  const isSupported =
+    typeof window !== "undefined" &&
+    typeof MediaRecorder !== "undefined" &&
+    !!(navigator.mediaDevices && navigator.mediaDevices.getUserMedia);
+
+  const clearTimer = useCallback(() => {
+    if (timerRef.current) {
+      clearInterval(timerRef.current);
+      timerRef.current = null;
+    }
+  }, []);
+
+  const cleanup = useCallback(() => {
+    clearTimer();
+    if (streamRef.current) {
+      streamRef.current.getTracks().forEach((track) => track.stop());
+      streamRef.current = null;
+    }
+    mediaRecorderRef.current = null;
+    chunksRef.current = [];
+    setElapsedTime(0);
+  }, [clearTimer]);
+
+  const transcribeAudio = useCallback(
+    async (audioBlob: Blob) => {
+      setIsTranscribing(true);
+      setError(null);
+
+      try {
+        const formData = new FormData();
+        formData.append("audio", audioBlob);
+
+        const response = await fetch("/api/transcribe", {
+          method: "POST",
+          body: formData,
+        });
+
+        if (!response.ok) {
+          const data = await response.json().catch(() => ({}));
+          throw new Error(data.error || "Transcription failed");
+        }
+
+        const data = await response.json();
+        if (data.text) {
+          onTranscription(data.text);
+        }
+      } catch (err) {
+        const message =
+          err instanceof Error ? err.message : "Transcription failed";
+        setError(message);
+        console.error("Transcription error:", err);
+      } finally {
+        setIsTranscribing(false);
+      }
+    },
+    [onTranscription],
+  );
+
+  const stopRecording = useCallback(() => {
+    const recorder = mediaRecorderRef.current;
+    // Check the recorder's own state rather than the isRecording state value:
+    // the auto-stop interval below holds a closure in which isRecording is
+    // still false, so a state-based guard would make auto-stop a no-op.
+    if (recorder && recorder.state === "recording") {
+      recorder.stop();
+      setIsRecording(false);
+      clearTimer();
+    }
+  }, [clearTimer]);
+
+  const startRecording = useCallback(async () => {
+    if (disabled || isRecording || isTranscribing) return;
+
+    setError(null);
+    chunksRef.current = [];
+
+    try {
+      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+      streamRef.current = stream;
+
+      const mediaRecorder = new MediaRecorder(stream, {
+        mimeType: MediaRecorder.isTypeSupported("audio/webm")
+          ? "audio/webm"
+          : "audio/mp4",
+      });
+
+      mediaRecorderRef.current = mediaRecorder;
+
+      mediaRecorder.ondataavailable = (event) => {
+        if (event.data.size > 0) {
+          chunksRef.current.push(event.data);
+        }
+      };
+
+      mediaRecorder.onstop = async () => {
+        const audioBlob = new Blob(chunksRef.current, {
+          type: mediaRecorder.mimeType,
+        });
+
+        // Cleanup stream
+        if (streamRef.current) {
+          streamRef.current.getTracks().forEach((track) => track.stop());
+          streamRef.current = null;
+        }
+
+        if (audioBlob.size > 0) {
+          await transcribeAudio(audioBlob);
+        }
+      };
+
+      mediaRecorder.start(1000); // Collect data every second
+      setIsRecording(true);
+      startTimeRef.current = Date.now();
+
+      // Start elapsed time timer
+      timerRef.current = setInterval(() => {
+        const elapsed = Date.now() - startTimeRef.current;
+        setElapsedTime(elapsed);
+
+        // Auto-stop at max duration
+        if (elapsed >= MAX_RECORDING_DURATION) {
+          stopRecording();
+        }
+      }, 100);
+    } catch (err) {
+      console.error("Failed to start recording:", err);
+      if (err instanceof DOMException && err.name === "NotAllowedError") {
+        setError("Microphone permission denied");
+      } else {
+        setError("Failed to access microphone");
+      }
+      cleanup();
+    }
+  }, [
+    disabled,
+    isRecording,
+    isTranscribing,
+    stopRecording,
+    transcribeAudio,
+    cleanup,
+  ]);
+
+  const toggleRecording = useCallback(() => {
+    if (isRecording) {
+      stopRecording();
+    } else {
+      startRecording();
+    }
+  }, [isRecording, startRecording, stopRecording]);
+
+  // Cleanup on unmount
+  useEffect(() => {
+    return () => {
+      cleanup();
+    };
+  }, [cleanup]);
+
+  return {
+    isRecording,
+    isTranscribing,
+    error,
+    elapsedTime,
+    startRecording,
+    stopRecording,
+    toggleRecording,
+    isSupported,
+  };
+}