From b94c83aacc9093ce480aa270971ac5baa177f311 Mon Sep 17 00:00:00 2001
From: Ubbe
Date: Thu, 29 Jan 2026 17:46:36 +0700
Subject: [PATCH] feat(frontend): Copilot speech to text via Whisper model (#11871)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Changes 🏗️

https://github.com/user-attachments/assets/d9c12ac0-625c-4b38-8834-e494b5eda9c0

Add a "speech to text" feature in the Chat input box of Copilot, similar to what you have in ChatGPT.

## Checklist 📋

### For code changes:

- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan:
  - [x] Run locally and try the speech to text feature as part of the chat input box

### For configuration changes:

We need to add `OPENAI_API_KEY=` to Vercel (used in the frontend), both in Dev and Prod.

- [x] `.env.default` is updated or already compatible with my changes

---------

Co-authored-by: Claude Opus 4.5
---
 AGENTS.md                                     |  24 +-
 autogpt_platform/CLAUDE.md                    |  16 +-
 autogpt_platform/frontend/.env.default        |   3 +
 .../SessionsList/useSessionsPagination.ts     |   4 +-
 .../frontend/src/app/api/transcribe/route.ts  |  77 ++++++
 .../Chat/components/ChatInput/ChatInput.tsx   | 157 +++++++++---
 .../ChatInput/components/AudioWaveform.tsx    | 142 +++++++++++
 .../components/RecordingIndicator.tsx         |  26 ++
 .../Chat/components/ChatInput/helpers.ts      |   6 +
 .../Chat/components/ChatInput/useChatInput.ts |   4 +-
 .../components/ChatInput/useVoiceRecording.ts | 240 ++++++++++++++++++
 11 files changed, 626 insertions(+), 73 deletions(-)
 create mode 100644 autogpt_platform/frontend/src/app/api/transcribe/route.ts
 create mode 100644 autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/components/AudioWaveform.tsx
 create mode 100644 autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/components/RecordingIndicator.tsx
 create mode 100644 autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/helpers.ts
 create mode 100644 autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/useVoiceRecording.ts

diff --git a/AGENTS.md b/AGENTS.md
index cd176f8a2d..202c4c6e02 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -16,7 +16,6 @@ See `docs/content/platform/getting-started.md` for setup instructions.
 - Format Python code with `poetry run format`.
 - Format frontend code using `pnpm format`.
-
 ## Frontend guidelines:
 
 See `/frontend/CONTRIBUTING.md` for complete patterns. Quick reference:
@@ -33,14 +32,17 @@ See `/frontend/CONTRIBUTING.md` for complete patterns. Quick reference:
 4. **Styling**: Tailwind CSS only, use design tokens, Phosphor Icons only
 5. **Testing**: Add Storybook stories for new components, Playwright for E2E
 6. **Code conventions**: Function declarations (not arrow functions) for components/handlers
+  - Component props should be `interface Props { ...
}` (not exported) unless the interface needs to be used outside the component - Separate render logic from business logic (component.tsx + useComponent.ts + helpers.ts) - Colocate state when possible and avoid creating large components, use sub-components ( local `/components` folder next to the parent component ) when sensible - Avoid large hooks, abstract logic into `helpers.ts` files when sensible - Use function declarations for components, arrow functions only for callbacks - No barrel files or `index.ts` re-exports -- Do not use `useCallback` or `useMemo` unless strictly needed - Avoid comments at all times unless the code is very complex +- Do not use `useCallback` or `useMemo` unless asked to optimise a given function +- Do not type hook returns, let Typescript infer as much as possible +- Never type with `any`, if not types available use `unknown` ## Testing @@ -49,22 +51,8 @@ See `/frontend/CONTRIBUTING.md` for complete patterns. Quick reference: Always run the relevant linters and tests before committing. Use conventional commit messages for all commits (e.g. `feat(backend): add API`). - Types: - - feat - - fix - - refactor - - ci - - dx (developer experience) - Scopes: - - platform - - platform/library - - platform/marketplace - - backend - - backend/executor - - frontend - - frontend/library - - frontend/marketplace - - blocks +Types: - feat - fix - refactor - ci - dx (developer experience) +Scopes: - platform - platform/library - platform/marketplace - backend - backend/executor - frontend - frontend/library - frontend/marketplace - blocks ## Pull requests diff --git a/autogpt_platform/CLAUDE.md b/autogpt_platform/CLAUDE.md index 9690178587..a5a588b667 100644 --- a/autogpt_platform/CLAUDE.md +++ b/autogpt_platform/CLAUDE.md @@ -85,17 +85,6 @@ pnpm format pnpm types ``` -**📖 Complete Guide**: See `/frontend/CONTRIBUTING.md` and `/frontend/.cursorrules` for comprehensive frontend patterns. - -**Key Frontend Conventions:** - -- Separate render logic from data/behavior in components -- Use generated API hooks from `@/app/api/__generated__/endpoints/` -- Use function declarations (not arrow functions) for components/handlers -- Use design system components from `src/components/` (atoms, molecules, organisms) -- Only use Phosphor Icons -- Never use `src/components/__legacy__/*` or deprecated `BackendAPI` - ## Architecture Overview ### Backend Architecture @@ -261,14 +250,17 @@ See `/frontend/CONTRIBUTING.md` for complete patterns. Quick reference: 4. **Styling**: Tailwind CSS only, use design tokens, Phosphor Icons only 5. **Testing**: Add Storybook stories for new components, Playwright for E2E 6. **Code conventions**: Function declarations (not arrow functions) for components/handlers + - Component props should be `interface Props { ... 
}` (not exported) unless the interface needs to be used outside the component - Separate render logic from business logic (component.tsx + useComponent.ts + helpers.ts) - Colocate state when possible and avoid creating large components, use sub-components ( local `/components` folder next to the parent component ) when sensible - Avoid large hooks, abstract logic into `helpers.ts` files when sensible - Use function declarations for components, arrow functions only for callbacks - No barrel files or `index.ts` re-exports -- Do not use `useCallback` or `useMemo` unless strictly needed +- Do not use `useCallback` or `useMemo` unless asked to optimise a given function - Avoid comments at all times unless the code is very complex +- Do not type hook returns, let Typescript infer as much as possible +- Never type with `any`, if not types available use `unknown` ### Security Implementation diff --git a/autogpt_platform/frontend/.env.default b/autogpt_platform/frontend/.env.default index af250fb8bf..7a9d81e39e 100644 --- a/autogpt_platform/frontend/.env.default +++ b/autogpt_platform/frontend/.env.default @@ -34,3 +34,6 @@ NEXT_PUBLIC_PREVIEW_STEALING_DEV= # PostHog Analytics NEXT_PUBLIC_POSTHOG_KEY= NEXT_PUBLIC_POSTHOG_HOST=https://eu.i.posthog.com + +# OpenAI (for voice transcription) +OPENAI_API_KEY= diff --git a/autogpt_platform/frontend/src/app/(platform)/copilot/components/CopilotShell/components/SessionsList/useSessionsPagination.ts b/autogpt_platform/frontend/src/app/(platform)/copilot/components/CopilotShell/components/SessionsList/useSessionsPagination.ts index 11ddd937af..61e3e6f37f 100644 --- a/autogpt_platform/frontend/src/app/(platform)/copilot/components/CopilotShell/components/SessionsList/useSessionsPagination.ts +++ b/autogpt_platform/frontend/src/app/(platform)/copilot/components/CopilotShell/components/SessionsList/useSessionsPagination.ts @@ -73,9 +73,9 @@ export function useSessionsPagination({ enabled }: UseSessionsPaginationArgs) { }; const reset = () => { + // Only reset the offset - keep existing sessions visible during refetch + // The effect will replace sessions when new data arrives at offset 0 setOffset(0); - setAccumulatedSessions([]); - setTotalCount(null); }; return { diff --git a/autogpt_platform/frontend/src/app/api/transcribe/route.ts b/autogpt_platform/frontend/src/app/api/transcribe/route.ts new file mode 100644 index 0000000000..10c182cdfa --- /dev/null +++ b/autogpt_platform/frontend/src/app/api/transcribe/route.ts @@ -0,0 +1,77 @@ +import { getServerAuthToken } from "@/lib/autogpt-server-api/helpers"; +import { NextRequest, NextResponse } from "next/server"; + +const WHISPER_API_URL = "https://api.openai.com/v1/audio/transcriptions"; +const MAX_FILE_SIZE = 25 * 1024 * 1024; // 25MB - Whisper's limit + +function getExtensionFromMimeType(mimeType: string): string { + const subtype = mimeType.split("/")[1]?.split(";")[0]; + return subtype || "webm"; +} + +export async function POST(request: NextRequest) { + const token = await getServerAuthToken(); + + if (!token || token === "no-token-found") { + return NextResponse.json({ error: "Unauthorized" }, { status: 401 }); + } + + const apiKey = process.env.OPENAI_API_KEY; + + if (!apiKey) { + return NextResponse.json( + { error: "OpenAI API key not configured" }, + { status: 401 }, + ); + } + + try { + const formData = await request.formData(); + const audioFile = formData.get("audio"); + + if (!audioFile || !(audioFile instanceof Blob)) { + return NextResponse.json( + { error: "No audio file provided" }, + { 
status: 400 }, + ); + } + + if (audioFile.size > MAX_FILE_SIZE) { + return NextResponse.json( + { error: "File too large. Maximum size is 25MB." }, + { status: 413 }, + ); + } + + const ext = getExtensionFromMimeType(audioFile.type); + const whisperFormData = new FormData(); + whisperFormData.append("file", audioFile, `recording.${ext}`); + whisperFormData.append("model", "whisper-1"); + + const response = await fetch(WHISPER_API_URL, { + method: "POST", + headers: { + Authorization: `Bearer ${apiKey}`, + }, + body: whisperFormData, + }); + + if (!response.ok) { + const errorData = await response.json().catch(() => ({})); + console.error("Whisper API error:", errorData); + return NextResponse.json( + { error: errorData.error?.message || "Transcription failed" }, + { status: response.status }, + ); + } + + const result = await response.json(); + return NextResponse.json({ text: result.text }); + } catch (error) { + console.error("Transcription error:", error); + return NextResponse.json( + { error: "Failed to process audio" }, + { status: 500 }, + ); + } +} diff --git a/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/ChatInput.tsx b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/ChatInput.tsx index c45e8dc250..521f6f6320 100644 --- a/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/ChatInput.tsx +++ b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/ChatInput.tsx @@ -1,7 +1,14 @@ import { Button } from "@/components/atoms/Button/Button"; import { cn } from "@/lib/utils"; -import { ArrowUpIcon, StopIcon } from "@phosphor-icons/react"; +import { + ArrowUpIcon, + CircleNotchIcon, + MicrophoneIcon, + StopIcon, +} from "@phosphor-icons/react"; +import { RecordingIndicator } from "./components/RecordingIndicator"; import { useChatInput } from "./useChatInput"; +import { useVoiceRecording } from "./useVoiceRecording"; export interface Props { onSend: (message: string) => void; @@ -21,13 +28,36 @@ export function ChatInput({ className, }: Props) { const inputId = "chat-input"; - const { value, handleKeyDown, handleSubmit, handleChange, hasMultipleLines } = - useChatInput({ - onSend, - disabled: disabled || isStreaming, - maxRows: 4, - inputId, - }); + const { + value, + setValue, + handleKeyDown: baseHandleKeyDown, + handleSubmit, + handleChange, + hasMultipleLines, + } = useChatInput({ + onSend, + disabled: disabled || isStreaming, + maxRows: 4, + inputId, + }); + + const { + isRecording, + isTranscribing, + elapsedTime, + toggleRecording, + handleKeyDown, + showMicButton, + isInputDisabled, + audioStream, + } = useVoiceRecording({ + setValue, + disabled: disabled || isStreaming, + isStreaming, + value, + baseHandleKeyDown, + }); return (
@@ -35,8 +65,11 @@ export function ChatInput({
@@ -46,48 +79,94 @@ export function ChatInput({ value={value} onChange={handleChange} onKeyDown={handleKeyDown} - placeholder={placeholder} - disabled={disabled || isStreaming} + placeholder={ + isTranscribing + ? "Transcribing..." + : isRecording + ? "" + : placeholder + } + disabled={isInputDisabled} rows={1} className={cn( "w-full resize-none overflow-y-auto border-0 bg-transparent text-[1rem] leading-6 text-black", "placeholder:text-zinc-400", "focus:outline-none focus:ring-0", "disabled:text-zinc-500", - hasMultipleLines ? "pb-6 pl-4 pr-4 pt-2" : "pb-4 pl-4 pr-14 pt-4", + hasMultipleLines + ? "pb-6 pl-4 pr-4 pt-2" + : showMicButton + ? "pb-4 pl-14 pr-14 pt-4" + : "pb-4 pl-4 pr-14 pt-4", )} /> + {isRecording && !value && ( +
+ +
+ )}
- Press Enter to send, Shift+Enter for new line + Press Enter to send, Shift+Enter for new line, Space to record voice - {isStreaming ? ( - - ) : ( - + {showMicButton && ( +
+ +
)} + +
+ {isStreaming ? ( + + ) : ( + + )} +
); diff --git a/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/components/AudioWaveform.tsx b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/components/AudioWaveform.tsx new file mode 100644 index 0000000000..10cbb3fc9f --- /dev/null +++ b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/components/AudioWaveform.tsx @@ -0,0 +1,142 @@ +"use client"; + +import { useEffect, useRef, useState } from "react"; + +interface Props { + stream: MediaStream | null; + barCount?: number; + barWidth?: number; + barGap?: number; + barColor?: string; + minBarHeight?: number; + maxBarHeight?: number; +} + +export function AudioWaveform({ + stream, + barCount = 24, + barWidth = 3, + barGap = 2, + barColor = "#ef4444", // red-500 + minBarHeight = 4, + maxBarHeight = 32, +}: Props) { + const [bars, setBars] = useState(() => + Array(barCount).fill(minBarHeight), + ); + const analyserRef = useRef(null); + const audioContextRef = useRef(null); + const sourceRef = useRef(null); + const animationRef = useRef(null); + + useEffect(() => { + if (!stream) { + setBars(Array(barCount).fill(minBarHeight)); + return; + } + + // Create audio context and analyser + const audioContext = new AudioContext(); + const analyser = audioContext.createAnalyser(); + analyser.fftSize = 512; + analyser.smoothingTimeConstant = 0.8; + + // Connect the stream to the analyser + const source = audioContext.createMediaStreamSource(stream); + source.connect(analyser); + + audioContextRef.current = audioContext; + analyserRef.current = analyser; + sourceRef.current = source; + + const timeData = new Uint8Array(analyser.frequencyBinCount); + + const updateBars = () => { + if (!analyserRef.current) return; + + analyserRef.current.getByteTimeDomainData(timeData); + + // Distribute time-domain data across bars + // This shows waveform amplitude, making all bars respond to audio + const newBars: number[] = []; + const samplesPerBar = timeData.length / barCount; + + for (let i = 0; i < barCount; i++) { + // Sample waveform data for this bar + let maxAmplitude = 0; + const startIdx = Math.floor(i * samplesPerBar); + const endIdx = Math.floor((i + 1) * samplesPerBar); + + for (let j = startIdx; j < endIdx && j < timeData.length; j++) { + // Convert to amplitude (distance from center 128) + const amplitude = Math.abs(timeData[j] - 128); + maxAmplitude = Math.max(maxAmplitude, amplitude); + } + + // Map amplitude (0-128) to bar height + const normalized = (maxAmplitude / 128) * 255; + const height = + minBarHeight + (normalized / 255) * (maxBarHeight - minBarHeight); + newBars.push(height); + } + + setBars(newBars); + animationRef.current = requestAnimationFrame(updateBars); + }; + + updateBars(); + + return () => { + if (animationRef.current) { + cancelAnimationFrame(animationRef.current); + } + if (sourceRef.current) { + sourceRef.current.disconnect(); + } + if (audioContextRef.current) { + audioContextRef.current.close(); + } + analyserRef.current = null; + audioContextRef.current = null; + sourceRef.current = null; + }; + }, [stream, barCount, minBarHeight, maxBarHeight]); + + const totalWidth = barCount * barWidth + (barCount - 1) * barGap; + + return ( +
+ {bars.map((height, i) => { + const barHeight = Math.max(minBarHeight, height); + return ( +
+
+
+ ); + })} +
+ ); +} diff --git a/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/components/RecordingIndicator.tsx b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/components/RecordingIndicator.tsx new file mode 100644 index 0000000000..0be0d069bb --- /dev/null +++ b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/components/RecordingIndicator.tsx @@ -0,0 +1,26 @@ +import { formatElapsedTime } from "../helpers"; +import { AudioWaveform } from "./AudioWaveform"; + +type Props = { + elapsedTime: number; + audioStream: MediaStream | null; +}; + +export function RecordingIndicator({ elapsedTime, audioStream }: Props) { + return ( +
+ + + {formatElapsedTime(elapsedTime)} + +
+ ); +} diff --git a/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/helpers.ts b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/helpers.ts new file mode 100644 index 0000000000..26bae8c9d9 --- /dev/null +++ b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/helpers.ts @@ -0,0 +1,6 @@ +export function formatElapsedTime(ms: number): string { + const seconds = Math.floor(ms / 1000); + const minutes = Math.floor(seconds / 60); + const remainingSeconds = seconds % 60; + return `${minutes}:${remainingSeconds.toString().padStart(2, "0")}`; +} diff --git a/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/useChatInput.ts b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/useChatInput.ts index 6fa8e7252b..a053e6080f 100644 --- a/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/useChatInput.ts +++ b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/useChatInput.ts @@ -6,7 +6,7 @@ import { useState, } from "react"; -interface UseChatInputArgs { +interface Args { onSend: (message: string) => void; disabled?: boolean; maxRows?: number; @@ -18,7 +18,7 @@ export function useChatInput({ disabled = false, maxRows = 5, inputId = "chat-input", -}: UseChatInputArgs) { +}: Args) { const [value, setValue] = useState(""); const [hasMultipleLines, setHasMultipleLines] = useState(false); diff --git a/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/useVoiceRecording.ts b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/useVoiceRecording.ts new file mode 100644 index 0000000000..13b625e69c --- /dev/null +++ b/autogpt_platform/frontend/src/components/contextual/Chat/components/ChatInput/useVoiceRecording.ts @@ -0,0 +1,240 @@ +import { useToast } from "@/components/molecules/Toast/use-toast"; +import React, { + KeyboardEvent, + useCallback, + useEffect, + useRef, + useState, +} from "react"; + +const MAX_RECORDING_DURATION = 2 * 60 * 1000; // 2 minutes in ms + +interface Args { + setValue: React.Dispatch>; + disabled?: boolean; + isStreaming?: boolean; + value: string; + baseHandleKeyDown: (event: KeyboardEvent) => void; +} + +export function useVoiceRecording({ + setValue, + disabled = false, + isStreaming = false, + value, + baseHandleKeyDown, +}: Args) { + const [isRecording, setIsRecording] = useState(false); + const [isTranscribing, setIsTranscribing] = useState(false); + const [error, setError] = useState(null); + const [elapsedTime, setElapsedTime] = useState(0); + + const mediaRecorderRef = useRef(null); + const chunksRef = useRef([]); + const timerRef = useRef(null); + const startTimeRef = useRef(0); + const streamRef = useRef(null); + const isRecordingRef = useRef(false); + + const isSupported = + typeof window !== "undefined" && + !!(navigator.mediaDevices && navigator.mediaDevices.getUserMedia); + + const clearTimer = useCallback(() => { + if (timerRef.current) { + clearInterval(timerRef.current); + timerRef.current = null; + } + }, []); + + const cleanup = useCallback(() => { + clearTimer(); + if (streamRef.current) { + streamRef.current.getTracks().forEach((track) => track.stop()); + streamRef.current = null; + } + mediaRecorderRef.current = null; + chunksRef.current = []; + setElapsedTime(0); + }, [clearTimer]); + + const handleTranscription = useCallback( + (text: string) => { + setValue((prev) => { + const trimmedPrev = prev.trim(); + if 
(trimmedPrev) { + return `${trimmedPrev} ${text}`; + } + return text; + }); + }, + [setValue], + ); + + const transcribeAudio = useCallback( + async (audioBlob: Blob) => { + setIsTranscribing(true); + setError(null); + + try { + const formData = new FormData(); + formData.append("audio", audioBlob); + + const response = await fetch("/api/transcribe", { + method: "POST", + body: formData, + }); + + if (!response.ok) { + const data = await response.json().catch(() => ({})); + throw new Error(data.error || "Transcription failed"); + } + + const data = await response.json(); + if (data.text) { + handleTranscription(data.text); + } + } catch (err) { + const message = + err instanceof Error ? err.message : "Transcription failed"; + setError(message); + console.error("Transcription error:", err); + } finally { + setIsTranscribing(false); + } + }, + [handleTranscription], + ); + + const stopRecording = useCallback(() => { + if (mediaRecorderRef.current && isRecordingRef.current) { + mediaRecorderRef.current.stop(); + isRecordingRef.current = false; + setIsRecording(false); + clearTimer(); + } + }, [clearTimer]); + + const startRecording = useCallback(async () => { + if (disabled || isRecordingRef.current || isTranscribing) return; + + setError(null); + chunksRef.current = []; + + try { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + streamRef.current = stream; + + const mediaRecorder = new MediaRecorder(stream, { + mimeType: MediaRecorder.isTypeSupported("audio/webm") + ? "audio/webm" + : "audio/mp4", + }); + + mediaRecorderRef.current = mediaRecorder; + + mediaRecorder.ondataavailable = (event) => { + if (event.data.size > 0) { + chunksRef.current.push(event.data); + } + }; + + mediaRecorder.onstop = async () => { + const audioBlob = new Blob(chunksRef.current, { + type: mediaRecorder.mimeType, + }); + + // Cleanup stream + if (streamRef.current) { + streamRef.current.getTracks().forEach((track) => track.stop()); + streamRef.current = null; + } + + if (audioBlob.size > 0) { + await transcribeAudio(audioBlob); + } + }; + + mediaRecorder.start(1000); // Collect data every second + isRecordingRef.current = true; + setIsRecording(true); + startTimeRef.current = Date.now(); + + // Start elapsed time timer + timerRef.current = setInterval(() => { + const elapsed = Date.now() - startTimeRef.current; + setElapsedTime(elapsed); + + // Auto-stop at max duration + if (elapsed >= MAX_RECORDING_DURATION) { + stopRecording(); + } + }, 100); + } catch (err) { + console.error("Failed to start recording:", err); + if (err instanceof DOMException && err.name === "NotAllowedError") { + setError("Microphone permission denied"); + } else { + setError("Failed to access microphone"); + } + cleanup(); + } + }, [disabled, isTranscribing, stopRecording, transcribeAudio, cleanup]); + + const toggleRecording = useCallback(() => { + if (isRecording) { + stopRecording(); + } else { + startRecording(); + } + }, [isRecording, startRecording, stopRecording]); + + const { toast } = useToast(); + + useEffect(() => { + if (error) { + toast({ + title: "Voice recording failed", + description: error, + variant: "destructive", + }); + } + }, [error, toast]); + + const handleKeyDown = useCallback( + (event: KeyboardEvent) => { + if (event.key === " " && !value.trim() && !isTranscribing) { + event.preventDefault(); + toggleRecording(); + return; + } + baseHandleKeyDown(event); + }, + [value, isTranscribing, toggleRecording, baseHandleKeyDown], + ); + + const showMicButton = isSupported && !isStreaming; + 
const isInputDisabled = disabled || isStreaming || isTranscribing; + + // Cleanup on unmount + useEffect(() => { + return () => { + cleanup(); + }; + }, [cleanup]); + + return { + isRecording, + isTranscribing, + error, + elapsedTime, + startRecording, + stopRecording, + toggleRecording, + isSupported, + handleKeyDown, + showMicButton, + isInputDisabled, + audioStream: streamRef.current, + }; +}
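
---

For manual testing outside the Copilot UI, the request/response contract of the new `/api/transcribe` route can be exercised with a small sketch like the one below. It mirrors what `transcribeAudio` in `useVoiceRecording` does; `audioBlob` is assumed to be a recorded `audio/webm` (or `audio/mp4`) Blob under the 25MB Whisper limit, sent from an authenticated browser session (the route returns 401 otherwise).

```ts
// Minimal sketch of the /api/transcribe contract: multipart POST with an
// "audio" field, JSON response of { text } on success or { error } on failure.
async function transcribe(audioBlob: Blob): Promise<string> {
  const formData = new FormData();
  formData.append("audio", audioBlob); // field name expected by the route

  const response = await fetch("/api/transcribe", {
    method: "POST",
    body: formData,
  });

  if (!response.ok) {
    // The route returns { error } with 400 / 401 / 413 / 500 depending on the failure
    const data = await response.json().catch(() => ({ error: "Transcription failed" }));
    throw new Error(data.error ?? "Transcription failed");
  }

  const data = await response.json();
  return data.text as string; // { text } is the success payload
}
```

On failure the route forwards the Whisper error message where available, so surfacing `data.error` directly (as the toast in `useVoiceRecording` does) gives the user the most specific reason.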
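The new `formatElapsedTime` helper is pure, so it is easy to cover with a unit test. A minimal sketch, assuming a Jest/Vitest-style runner is available in the frontend package (the `vitest` import and the test file location next to `helpers.ts` are assumptions, not part of this PR):

```ts
// Hypothetical helpers.test.ts colocated with helpers.ts; the vitest import is an assumption.
import { describe, expect, it } from "vitest";

import { formatElapsedTime } from "./helpers";

describe("formatElapsedTime", () => {
  it("formats elapsed milliseconds as m:ss", () => {
    expect(formatElapsedTime(0)).toBe("0:00");
    expect(formatElapsedTime(5_000)).toBe("0:05");
    expect(formatElapsedTime(65_000)).toBe("1:05");
    // 2 minutes is the MAX_RECORDING_DURATION enforced by useVoiceRecording
    expect(formatElapsedTime(2 * 60 * 1000)).toBe("2:00");
  });
});
```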