Compare commits

...

10 Commits

Author SHA1 Message Date
Ubbe
a8bd93dd54 Merge branch 'dev' into feat/copilot-microphone 2026-01-29 11:28:20 +07:00
Lluis Agusti
6bd4318374 chore: fix 2026-01-29 00:56:45 +07:00
Lluis Agusti
b9c6871a8d chore: lint 2026-01-29 00:45:54 +07:00
Lluis Agusti
c99579085b chore: more 2026-01-29 00:35:52 +07:00
Lluis Agusti
acdf92463d refactor: review suggestions (1) 2026-01-29 00:33:29 +07:00
Lluis Agusti
52ad474df3 chore: refinements 2026-01-29 00:30:00 +07:00
Lluis Agusti
951bfbdb71 chore: suggestions 2026-01-29 00:21:03 +07:00
Lluis Agusti
7f9c0d7e65 chore: refine 2026-01-29 00:14:56 +07:00
Lluis Agusti
4994de9d0b chore: improvements 2026-01-28 23:46:39 +07:00
Lluis Agusti
4ff0a7c17f docs: add voice-to-text chat input design
Design document for adding microphone button with Whisper API
transcription to the ChatInput component.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-28 23:13:32 +07:00
10 changed files with 494 additions and 73 deletions

View File

@@ -16,7 +16,6 @@ See `docs/content/platform/getting-started.md` for setup instructions.
- Format Python code with `poetry run format`.
- Format frontend code using `pnpm format`.
## Frontend guidelines:
See `/frontend/CONTRIBUTING.md` for complete patterns. Quick reference:
@@ -33,14 +32,17 @@ See `/frontend/CONTRIBUTING.md` for complete patterns. Quick reference:
4. **Styling**: Tailwind CSS only, use design tokens, Phosphor Icons only
5. **Testing**: Add Storybook stories for new components, Playwright for E2E
6. **Code conventions**: Function declarations (not arrow functions) for components/handlers
- Component props should be `interface Props { ... }` (not exported) unless the interface needs to be used outside the component
- Separate render logic from business logic (component.tsx + useComponent.ts + helpers.ts)
- Colocate state when possible and avoid creating large components; use sub-components (in a local `/components` folder next to the parent component) when sensible
- Avoid large hooks, abstract logic into `helpers.ts` files when sensible
- Use function declarations for components, arrow functions only for callbacks
- No barrel files or `index.ts` re-exports
- Do not use `useCallback` or `useMemo` unless strictly needed
- Avoid comments at all times unless the code is very complex
- Do not use `useCallback` or `useMemo` unless asked to optimise a given function
- Do not type hook returns; let TypeScript infer as much as possible
- Never type with `any`; if no types are available, use `unknown`
## Testing
@@ -49,22 +51,8 @@ See `/frontend/CONTRIBUTING.md` for complete patterns. Quick reference:
Always run the relevant linters and tests before committing.
Use conventional commit messages for all commits (e.g. `feat(backend): add API`).
Types:
- feat
- fix
- refactor
- ci
- dx (developer experience)
Scopes:
- platform
- platform/library
- platform/marketplace
- backend
- backend/executor
- frontend
- frontend/library
- frontend/marketplace
- blocks
Types: - feat - fix - refactor - ci - dx (developer experience)
Scopes: - platform - platform/library - platform/marketplace - backend - backend/executor - frontend - frontend/library - frontend/marketplace - blocks
## Pull requests

View File

@@ -85,17 +85,6 @@ pnpm format
pnpm types
```
**📖 Complete Guide**: See `/frontend/CONTRIBUTING.md` and `/frontend/.cursorrules` for comprehensive frontend patterns.
**Key Frontend Conventions:**
- Separate render logic from data/behavior in components
- Use generated API hooks from `@/app/api/__generated__/endpoints/`
- Use function declarations (not arrow functions) for components/handlers
- Use design system components from `src/components/` (atoms, molecules, organisms)
- Only use Phosphor Icons
- Never use `src/components/__legacy__/*` or deprecated `BackendAPI`
## Architecture Overview
### Backend Architecture
@@ -217,14 +206,17 @@ See `/frontend/CONTRIBUTING.md` for complete patterns. Quick reference:
4. **Styling**: Tailwind CSS only, use design tokens, Phosphor Icons only
5. **Testing**: Add Storybook stories for new components, Playwright for E2E
6. **Code conventions**: Function declarations (not arrow functions) for components/handlers
- Component props should be `interface Props { ... }` (not exported) unless the interface needs to be used outside the component
- Separate render logic from business logic (component.tsx + useComponent.ts + helpers.ts)
- Colocate state when possible and avoid creating large components; use sub-components (in a local `/components` folder next to the parent component) when sensible
- Avoid large hooks, abstract logic into `helpers.ts` files when sensible
- Use function declarations for components, arrow functions only for callbacks
- No barrel files or `index.ts` re-exports
- Do not use `useCallback` or `useMemo` unless strictly needed
- Do not use `useCallback` or `useMemo` unless asked to optimise a given function
- Avoid comments at all times unless the code is very complex
- Do not type hook returns; let TypeScript infer as much as possible
- Never type with `any`; if no types are available, use `unknown`
### Security Implementation

View File

@@ -34,3 +34,6 @@ NEXT_PUBLIC_PREVIEW_STEALING_DEV=
# PostHog Analytics
NEXT_PUBLIC_POSTHOG_KEY=
NEXT_PUBLIC_POSTHOG_HOST=https://eu.i.posthog.com
# OpenAI (for voice transcription)
OPENAI_API_KEY=

View File

@@ -73,9 +73,9 @@ export function useSessionsPagination({ enabled }: UseSessionsPaginationArgs) {
};
const reset = () => {
// Only reset the offset - keep existing sessions visible during refetch
// The effect will replace sessions when new data arrives at offset 0
setOffset(0);
setAccumulatedSessions([]);
setTotalCount(null);
};
return {

View File

@@ -0,0 +1,77 @@
import { getServerAuthToken } from "@/lib/autogpt-server-api/helpers";
import { NextRequest, NextResponse } from "next/server";
// Endpoint the POST handler forwards the uploaded audio to for transcription.
const WHISPER_API_URL = "https://api.openai.com/v1/audio/transcriptions";
// Largest upload the POST handler accepts; bigger files are rejected with 413.
const MAX_FILE_SIZE = 25 * 1024 * 1024; // 25MB - Whisper's limit
/**
 * Derives the file extension to use for an uploaded recording from its MIME type.
 *
 * The extension ends up in the filename sent to the transcription API, so the
 * subtype is normalised (parameters dropped, trimmed, lower-cased), common
 * non-canonical subtypes are mapped to their usual extension, and anything
 * that is empty or contains unexpected characters falls back to `webm`.
 *
 * @param mimeType - MIME type reported for the blob, e.g. `audio/webm;codecs=opus`.
 * @returns A bare extension without the leading dot, e.g. `webm`.
 */
function getExtensionFromMimeType(mimeType: string): string {
  const fallback = "webm";
  // Subtypes whose name is not the extension audio tooling expects.
  const aliases = new Map<string, string>([
    ["x-m4a", "m4a"],
    ["x-wav", "wav"],
    ["wave", "wav"],
    ["vnd.wave", "wav"],
    ["x-flac", "flac"],
  ]);

  const essence = (mimeType.split(";")[0] ?? "").trim().toLowerCase();
  const subtype = (essence.split("/")[1] ?? "").trim();

  // The MIME type comes from the client; never let odd characters reach the filename.
  if (!/^[a-z0-9.+-]+$/.test(subtype)) {
    return fallback;
  }

  return aliases.get(subtype) ?? subtype;
}
/**
 * Route handler that transcribes an uploaded audio recording.
 *
 * Expects multipart form data with the recording under the `audio` field,
 * forwards it to the Whisper transcription endpoint using the server-side
 * `OPENAI_API_KEY`, and responds with `{ text }` on success or `{ error }`
 * with an appropriate status on failure.
 *
 * Status codes:
 * - 401 when no auth token is available for the caller
 * - 400 when the `audio` field is missing or not a file
 * - 413 when the recording exceeds `MAX_FILE_SIZE`
 * - 500 when the server has no API key or the request cannot be processed
 * - 502 when the transcription service rejects the server's own credentials
 * - otherwise the transcription service's status is passed through
 *
 * NOTE(review): this relies on `getServerAuthToken` returning a falsy value
 * for unauthenticated callers — confirm against its implementation.
 */
export async function POST(request: NextRequest) {
  const token = await getServerAuthToken();
  if (!token) {
    return NextResponse.json({ error: "Unauthorized" }, { status: 401 });
  }

  const apiKey = process.env.OPENAI_API_KEY;
  if (!apiKey) {
    // A missing key is a server misconfiguration, not an authentication
    // failure of the caller, so it must not be reported as 401.
    return NextResponse.json(
      { error: "OpenAI API key not configured" },
      { status: 500 },
    );
  }

  try {
    const formData = await request.formData();
    const audioFile = formData.get("audio");

    if (!audioFile || !(audioFile instanceof Blob)) {
      return NextResponse.json(
        { error: "No audio file provided" },
        { status: 400 },
      );
    }

    if (audioFile.size > MAX_FILE_SIZE) {
      return NextResponse.json(
        { error: "File too large. Maximum size is 25MB." },
        { status: 413 },
      );
    }

    // The upstream API needs a filename with a recognisable extension.
    const ext = getExtensionFromMimeType(audioFile.type);
    const whisperFormData = new FormData();
    whisperFormData.append("file", audioFile, `recording.${ext}`);
    whisperFormData.append("model", "whisper-1");

    const response = await fetch(WHISPER_API_URL, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${apiKey}`,
      },
      body: whisperFormData,
    });

    if (!response.ok) {
      const errorData = await response.json().catch(() => ({}));
      console.error("Whisper API error:", errorData);

      // An upstream 401/403 means the server's key was rejected. Forwarding it
      // would look like the user is logged out and could expose details about
      // the key in the upstream message, so answer with a generic 502 instead.
      if (response.status === 401 || response.status === 403) {
        return NextResponse.json(
          { error: "Transcription failed" },
          { status: 502 },
        );
      }

      return NextResponse.json(
        { error: errorData.error?.message || "Transcription failed" },
        { status: response.status },
      );
    }

    const result = await response.json();
    return NextResponse.json({ text: result.text });
  } catch (error) {
    console.error("Transcription error:", error);
    return NextResponse.json(
      { error: "Failed to process audio" },
      { status: 500 },
    );
  }
}

View File

@@ -1,7 +1,14 @@
import { Button } from "@/components/atoms/Button/Button";
import { cn } from "@/lib/utils";
import { ArrowUpIcon, StopIcon } from "@phosphor-icons/react";
import {
ArrowUpIcon,
CircleNotchIcon,
MicrophoneIcon,
StopIcon,
} from "@phosphor-icons/react";
import { RecordingIndicator } from "./components/RecordingIndicator";
import { useChatInput } from "./useChatInput";
import { useVoiceRecording } from "./useVoiceRecording";
export interface Props {
onSend: (message: string) => void;
@@ -21,13 +28,35 @@ export function ChatInput({
className,
}: Props) {
const inputId = "chat-input";
const { value, handleKeyDown, handleSubmit, handleChange, hasMultipleLines } =
useChatInput({
onSend,
disabled: disabled || isStreaming,
maxRows: 4,
inputId,
});
const {
value,
setValue,
handleKeyDown: baseHandleKeyDown,
handleSubmit,
handleChange,
hasMultipleLines,
} = useChatInput({
onSend,
disabled: disabled || isStreaming,
maxRows: 4,
inputId,
});
const {
isRecording,
isTranscribing,
elapsedTime,
toggleRecording,
handleKeyDown,
showMicButton,
isInputDisabled,
} = useVoiceRecording({
setValue,
disabled: disabled || isStreaming,
isStreaming,
value,
baseHandleKeyDown,
});
return (
<form onSubmit={handleSubmit} className={cn("relative flex-1", className)}>
@@ -35,8 +64,11 @@ export function ChatInput({
<div
id={`${inputId}-wrapper`}
className={cn(
"relative overflow-hidden border border-neutral-200 bg-white shadow-sm",
"focus-within:border-zinc-400 focus-within:ring-1 focus-within:ring-zinc-400",
"relative overflow-hidden border bg-white shadow-sm",
"focus-within:ring-1",
isRecording
? "border-red-400 focus-within:border-red-400 focus-within:ring-red-400"
: "border-neutral-200 focus-within:border-zinc-400 focus-within:ring-zinc-400",
hasMultipleLines ? "rounded-xlarge" : "rounded-full",
)}
>
@@ -46,48 +78,91 @@ export function ChatInput({
value={value}
onChange={handleChange}
onKeyDown={handleKeyDown}
placeholder={placeholder}
disabled={disabled || isStreaming}
placeholder={
isTranscribing
? "Transcribing..."
: isRecording
? ""
: placeholder
}
disabled={isInputDisabled}
rows={1}
className={cn(
"w-full resize-none overflow-y-auto border-0 bg-transparent text-[1rem] leading-6 text-black",
"placeholder:text-zinc-400",
"focus:outline-none focus:ring-0",
"disabled:text-zinc-500",
hasMultipleLines ? "pb-6 pl-4 pr-4 pt-2" : "pb-4 pl-4 pr-14 pt-4",
hasMultipleLines
? "pb-6 pl-4 pr-4 pt-2"
: showMicButton
? "pb-4 pl-14 pr-14 pt-4"
: "pb-4 pl-4 pr-14 pt-4",
)}
/>
{isRecording && !value && (
<div className="pointer-events-none absolute inset-0 flex items-center justify-center">
<RecordingIndicator elapsedTime={elapsedTime} />
</div>
)}
</div>
<span id="chat-input-hint" className="sr-only">
Press Enter to send, Shift+Enter for new line
Press Enter to send, Shift+Enter for new line, Space to record voice
</span>
{isStreaming ? (
<Button
type="button"
variant="icon"
size="icon"
aria-label="Stop generating"
onClick={onStop}
className="absolute bottom-[7px] right-2 border-red-600 bg-red-600 text-white hover:border-red-800 hover:bg-red-800"
>
<StopIcon className="h-4 w-4" weight="bold" />
</Button>
) : (
<Button
type="submit"
variant="icon"
size="icon"
aria-label="Send message"
className={cn(
"absolute bottom-[7px] right-2 border-zinc-800 bg-zinc-800 text-white hover:border-zinc-900 hover:bg-zinc-900",
(disabled || !value.trim()) && "opacity-20",
)}
disabled={disabled || !value.trim()}
>
<ArrowUpIcon className="h-4 w-4" weight="bold" />
</Button>
{showMicButton && (
<div className="absolute bottom-[7px] left-2 flex items-center gap-1">
<Button
type="button"
variant="icon"
size="icon"
aria-label={isRecording ? "Stop recording" : "Start recording"}
onClick={toggleRecording}
disabled={disabled || isTranscribing}
className={cn(
isRecording
? "animate-pulse border-red-500 bg-red-500 text-white hover:border-red-600 hover:bg-red-600"
: isTranscribing
? "border-zinc-300 bg-zinc-100 text-zinc-400"
: "border-zinc-300 bg-white text-zinc-500 hover:border-zinc-400 hover:bg-zinc-50 hover:text-zinc-700",
)}
>
{isTranscribing ? (
<CircleNotchIcon className="h-4 w-4 animate-spin" />
) : (
<MicrophoneIcon className="h-4 w-4" weight="bold" />
)}
</Button>
</div>
)}
<div className="absolute bottom-[7px] right-2 flex items-center gap-1">
{isStreaming ? (
<Button
type="button"
variant="icon"
size="icon"
aria-label="Stop generating"
onClick={onStop}
className="border-red-600 bg-red-600 text-white hover:border-red-800 hover:bg-red-800"
>
<StopIcon className="h-4 w-4" weight="bold" />
</Button>
) : (
<Button
type="submit"
variant="icon"
size="icon"
aria-label="Send message"
className={cn(
"border-zinc-800 bg-zinc-800 text-white hover:border-zinc-900 hover:bg-zinc-900",
(disabled || !value.trim() || isRecording) && "opacity-20",
)}
disabled={disabled || !value.trim() || isRecording}
>
<ArrowUpIcon className="h-4 w-4" weight="bold" />
</Button>
)}
</div>
</div>
</form>
);

View File

@@ -0,0 +1,41 @@
import { formatElapsedTime } from "../helpers";
interface Props {
  elapsedTime: number;
}

// One entry per animated bar; the value staggers each bar's animation start.
const WAVEFORM_BARS = [0, 1, 2, 3, 4];

/**
 * Visual feedback shown while voice recording is active: five pulsing red
 * bars followed by the elapsed recording time.
 *
 * @param elapsedTime - Elapsed recording time in milliseconds, rendered via `formatElapsedTime`.
 */
export function RecordingIndicator({ elapsedTime }: Props) {
  const timeLabel = formatElapsedTime(elapsedTime);

  const bars = WAVEFORM_BARS.map((barIndex) => (
    <div
      key={barIndex}
      className="w-[3px] rounded-full bg-red-500"
      style={{
        animation: "waveform 1s ease-in-out infinite",
        animationDelay: `${barIndex * 0.1}s`,
        height: "16px",
      }}
    />
  ));

  return (
    <div className="flex items-center gap-3">
      <div className="flex items-center gap-[3px]">{bars}</div>
      <span className="min-w-[3ch] text-sm font-medium text-red-500">
        {timeLabel}
      </span>
      <style jsx>{`
        @keyframes waveform {
          0%,
          100% {
            transform: scaleY(0.3);
            opacity: 0.5;
          }
          50% {
            transform: scaleY(1);
            opacity: 1;
          }
        }
      `}</style>
    </div>
  );
}

View File

@@ -0,0 +1,6 @@
/**
 * Formats a duration as `m:ss` (minutes are not capped or padded).
 *
 * Negative or non-finite input — e.g. an elapsed time computed across a
 * system clock adjustment — is treated as zero so the label never renders
 * as `-1:-1` or `NaN:NaN`.
 *
 * @param ms - Duration in milliseconds.
 * @returns The duration as minutes and zero-padded seconds, e.g. `1:05`.
 */
export function formatElapsedTime(ms: number): string {
  const safeMs = Number.isFinite(ms) && ms > 0 ? ms : 0;
  const totalSeconds = Math.floor(safeMs / 1000);
  const minutes = Math.floor(totalSeconds / 60);
  const seconds = totalSeconds % 60;
  return `${minutes}:${seconds.toString().padStart(2, "0")}`;
}

View File

@@ -6,7 +6,7 @@ import {
useState,
} from "react";
interface UseChatInputArgs {
interface Args {
onSend: (message: string) => void;
disabled?: boolean;
maxRows?: number;
@@ -18,7 +18,7 @@ export function useChatInput({
disabled = false,
maxRows = 5,
inputId = "chat-input",
}: UseChatInputArgs) {
}: Args) {
const [value, setValue] = useState("");
const [hasMultipleLines, setHasMultipleLines] = useState(false);

View File

@@ -0,0 +1,239 @@
import { useToast } from "@/components/molecules/Toast/use-toast";
import React, {
KeyboardEvent,
useCallback,
useEffect,
useRef,
useState,
} from "react";
// A recording is stopped automatically once it has run for this long.
const MAX_RECORDING_DURATION = 2 * 60 * 1000; // 2 minutes in ms
/** Arguments accepted by `useVoiceRecording`. */
interface Args {
/** State setter for the input text; transcribed text is appended to the current value through it. */
setValue: React.Dispatch<React.SetStateAction<string>>;
/** When true, a new recording cannot be started and the input is reported as disabled. Defaults to false. */
disabled?: boolean;
/** When true, the mic button is hidden and the input is reported as disabled. Defaults to false. */
isStreaming?: boolean;
/** Current input text; Space toggles recording only while this is empty after trimming. */
value: string;
/** Key handler that receives every key event not consumed by the voice shortcut. */
baseHandleKeyDown: (event: KeyboardEvent<HTMLTextAreaElement>) => void;
}
/**
 * Microphone recording and speech-to-text state for the chat input.
 *
 * Records audio with `MediaRecorder`, uploads the result to `/api/transcribe`
 * and appends the returned text to the input through `setValue`. A recording
 * is stopped automatically after `MAX_RECORDING_DURATION`, and failures are
 * surfaced through a destructive toast.
 *
 * @param setValue - State setter for the input text.
 * @param disabled - Prevents starting a recording and disables the input.
 * @param isStreaming - Hides the mic button and disables the input.
 * @param value - Current input text; Space toggles recording only while it is blank.
 * @param baseHandleKeyDown - Handler for key events the voice shortcut does not consume.
 * @returns Recording/transcription flags, the elapsed recording time in ms,
 * start/stop/toggle controls, a key handler wrapping `baseHandleKeyDown`, and
 * the derived `showMicButton` / `isInputDisabled` flags.
 */
export function useVoiceRecording({
  setValue,
  disabled = false,
  isStreaming = false,
  value,
  baseHandleKeyDown,
}: Args) {
  const [isRecording, setIsRecording] = useState(false);
  const [isTranscribing, setIsTranscribing] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [elapsedTime, setElapsedTime] = useState(0);

  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
  const chunksRef = useRef<Blob[]>([]);
  const timerRef = useRef<ReturnType<typeof setInterval> | null>(null);
  const startTimeRef = useRef<number>(0);
  const streamRef = useRef<MediaStream | null>(null);
  // Mirrors `isRecording` so callbacks can read the latest value synchronously.
  const isRecordingRef = useRef(false);
  // True while the microphone permission/stream request is still pending.
  const isStartingRef = useRef(false);
  // Lets async callbacks detect that the component has gone away.
  const isMountedRef = useRef(true);

  const isSupported =
    typeof window !== "undefined" &&
    typeof MediaRecorder !== "undefined" &&
    !!(navigator.mediaDevices && navigator.mediaDevices.getUserMedia);

  const clearTimer = useCallback(() => {
    if (timerRef.current) {
      clearInterval(timerRef.current);
      timerRef.current = null;
    }
  }, []);

  const cleanup = useCallback(() => {
    clearTimer();
    if (streamRef.current) {
      streamRef.current.getTracks().forEach((track) => track.stop());
      streamRef.current = null;
    }
    mediaRecorderRef.current = null;
    chunksRef.current = [];
    setElapsedTime(0);
  }, [clearTimer]);

  const handleTranscription = useCallback(
    (text: string) => {
      // Append to whatever is already typed, separated by a single space.
      setValue((prev) => {
        const trimmedPrev = prev.trim();
        if (trimmedPrev) {
          return `${trimmedPrev} ${text}`;
        }
        return text;
      });
    },
    [setValue],
  );

  const transcribeAudio = useCallback(
    async (audioBlob: Blob) => {
      setIsTranscribing(true);
      setError(null);

      try {
        const formData = new FormData();
        formData.append("audio", audioBlob);

        const response = await fetch("/api/transcribe", {
          method: "POST",
          body: formData,
        });

        if (!response.ok) {
          const data = await response.json().catch(() => ({}));
          throw new Error(data.error || "Transcription failed");
        }

        const data = await response.json();
        if (data.text) {
          handleTranscription(data.text);
        }
      } catch (err) {
        const message =
          err instanceof Error ? err.message : "Transcription failed";
        setError(message);
        console.error("Transcription error:", err);
      } finally {
        setIsTranscribing(false);
      }
    },
    [handleTranscription],
  );

  const stopRecording = useCallback(() => {
    const recorder = mediaRecorderRef.current;
    if (!recorder || !isRecordingRef.current) return;

    isRecordingRef.current = false;
    setIsRecording(false);
    clearTimer();

    // The recorder may already have stopped on its own; stop() would throw then.
    if (recorder.state !== "inactive") {
      recorder.stop();
    }
  }, [clearTimer]);

  const startRecording = useCallback(async () => {
    if (
      disabled ||
      isRecordingRef.current ||
      isStartingRef.current ||
      isTranscribing
    ) {
      return;
    }

    // Block re-entry while the permission prompt is open; otherwise a second
    // call would acquire another stream and leak the first one.
    isStartingRef.current = true;
    setError(null);
    setElapsedTime(0);
    chunksRef.current = [];

    try {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

      if (!isMountedRef.current) {
        // Unmounted while waiting for permission: release the microphone.
        stream.getTracks().forEach((track) => track.stop());
        return;
      }

      streamRef.current = stream;

      const mediaRecorder = new MediaRecorder(stream, {
        mimeType: MediaRecorder.isTypeSupported("audio/webm")
          ? "audio/webm"
          : "audio/mp4",
      });
      mediaRecorderRef.current = mediaRecorder;

      mediaRecorder.ondataavailable = (event) => {
        if (event.data.size > 0) {
          chunksRef.current.push(event.data);
        }
      };

      mediaRecorder.onstop = async () => {
        const audioBlob = new Blob(chunksRef.current, {
          type: mediaRecorder.mimeType,
        });

        // The recorder can stop without stopRecording() having been called
        // (e.g. the input device disappears); bring the UI state back in sync.
        if (mediaRecorderRef.current === mediaRecorder) {
          mediaRecorderRef.current = null;
          if (isRecordingRef.current) {
            isRecordingRef.current = false;
            clearTimer();
            if (isMountedRef.current) {
              setIsRecording(false);
            }
          }
        }

        // Release the microphone that belonged to this recording.
        stream.getTracks().forEach((track) => track.stop());
        if (streamRef.current === stream) {
          streamRef.current = null;
        }

        // Do not upload audio for an input that no longer exists.
        if (!isMountedRef.current) return;

        if (audioBlob.size > 0) {
          await transcribeAudio(audioBlob);
        }
      };

      mediaRecorder.start(1000); // Collect data every second
      isRecordingRef.current = true;
      setIsRecording(true);
      startTimeRef.current = Date.now();

      // Start elapsed time timer
      timerRef.current = setInterval(() => {
        const elapsed = Date.now() - startTimeRef.current;
        setElapsedTime(elapsed);

        // Auto-stop at max duration
        if (elapsed >= MAX_RECORDING_DURATION) {
          stopRecording();
        }
      }, 100);
    } catch (err) {
      console.error("Failed to start recording:", err);
      if (err instanceof DOMException && err.name === "NotAllowedError") {
        setError("Microphone permission denied");
      } else {
        setError("Failed to access microphone");
      }
      cleanup();
    } finally {
      isStartingRef.current = false;
    }
  }, [
    disabled,
    isTranscribing,
    stopRecording,
    transcribeAudio,
    cleanup,
    clearTimer,
  ]);

  const toggleRecording = useCallback(() => {
    if (isRecording) {
      stopRecording();
    } else {
      startRecording();
    }
  }, [isRecording, startRecording, stopRecording]);

  const { toast } = useToast();

  useEffect(() => {
    if (error) {
      toast({
        title: "Voice recording failed",
        description: error,
        variant: "destructive",
      });
    }
  }, [error, toast]);

  const handleKeyDown = useCallback(
    (event: KeyboardEvent<HTMLTextAreaElement>) => {
      // Space on an empty input is the recording shortcut, but only where
      // recording can actually work; otherwise the key must type normally.
      if (
        isSupported &&
        event.key === " " &&
        !value.trim() &&
        !isTranscribing
      ) {
        event.preventDefault();
        toggleRecording();
        return;
      }
      baseHandleKeyDown(event);
    },
    [isSupported, value, isTranscribing, toggleRecording, baseHandleKeyDown],
  );

  const showMicButton = isSupported && !isStreaming;
  const isInputDisabled = disabled || isStreaming || isTranscribing;

  // Track mount state and release the timer/microphone on unmount.
  useEffect(() => {
    isMountedRef.current = true;
    return () => {
      isMountedRef.current = false;
      cleanup();
    };
  }, [cleanup]);

  return {
    isRecording,
    isTranscribing,
    error,
    elapsedTime,
    startRecording,
    stopRecording,
    toggleRecording,
    isSupported,
    handleKeyDown,
    showMicButton,
    isInputDisabled,
  };
}