Compare commits


8 Commits

Author SHA1 Message Date
Waleed
d7da35ba0b v0.6.30: slack trigger enhancements, connectors performance improvements, secrets performance, polling refactors, drag resources in mothership 2026-04-08 01:00:43 -07:00
Theodore Li
d6ec115348 v0.6.29: login improvements, posthog telemetry (#4026)
* feat(posthog): Add tracking on mothership abort (#4023)

Co-authored-by: Theodore Li <theo@sim.ai>

* fix(login): fix captcha headers for manual login  (#4025)

* fix(signup): fix turnstile key loading

* fix(login): fix captcha header passing

* Catch user already exists, remove login form captcha
2026-04-07 19:11:31 -04:00
Waleed
3f508e445f v0.6.28: new docs, delete confirmation standardization, dagster integration, signup method feature flags, SSO improvements 2026-04-07 14:26:42 -07:00
Waleed
316bc8cdcc v0.6.27: new triggers, mothership improvements, files archive, queueing improvements, posthog, secrets mutations 2026-04-06 22:15:29 -07:00
Waleed
d889f32697 v0.6.26: ui improvements, multiple response blocks, docx previews, ollama fix 2026-04-05 12:33:24 -07:00
Waleed
28af223a9f v0.6.25: cloudwatch, cloudformation, live kb sync, linear fixes, posthog upgrade 2026-04-04 18:39:28 -07:00
Waleed
a54dcbe949 v0.6.24: copilot feedback wiring, captcha fixes 2026-04-04 12:52:05 -07:00
Waleed
0b9019d9a2 v0.6.23: MCP fixes, remove local state in favor of server state, mothership workflow edits via sockets, ui improvements 2026-04-03 23:30:26 -07:00
32 changed files with 483 additions and 15760 deletions

View File

@@ -135,21 +135,6 @@ Use your own API keys for AI model providers instead of Sim's hosted keys to pay
When configured, workflows use your key instead of Sim's hosted keys. If removed, workflows automatically fall back to hosted keys with the multiplier.
## Voice Input
Voice input uses ElevenLabs Scribe v2 Realtime for speech-to-text transcription. It is available in the Mothership chat and in deployed chat voice mode.
| Context | Cost per session | Max duration |
|---------|-----------------|--------------|
| Mothership (workspace) | ~5 credits ($0.024) | 3 minutes |
| Deployed chat (voice mode) | ~2 credits ($0.008) | 1 minute |
Each voice session is billed when it starts. In deployed chat voice mode, each conversation turn (speak → agent responds → speak again) is a separate session. Multi-turn conversations are billed per turn.
<Callout type="info">
Voice input requires `ELEVENLABS_API_KEY` to be configured. When the key is not set, voice input controls are hidden.
</Callout>
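As a rough sketch of the session pricing above (illustrative only; the constant name mirrors `VOICE_SESSION_COST_PER_MIN` from the speech token route removed later in this diff):

```ts
// Illustrative only: per-session cost at $0.008 per transcription minute.
const VOICE_SESSION_COST_PER_MIN = 0.008

const mothershipSession = VOICE_SESSION_COST_PER_MIN * 3 // 3-minute cap -> $0.024 (~5 credits)
const deployedChatTurn = VOICE_SESSION_COST_PER_MIN * 1 // 1-minute cap -> $0.008 (~2 credits)

// A deployed-chat conversation with three turns is billed as three separate sessions:
const threeTurnConversation = deployedChatTurn * 3 // $0.024
```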
## Plans
Sim has two paid plan tiers — **Pro** and **Max**. Either can be used individually or with a team. Team plans pool credits across all seats in the organization.

View File

@@ -15,7 +15,6 @@ import {
import { type AuthResult, AuthType, checkHybridAuth } from '@/lib/auth/hybrid'
import { acquireLock, getRedisClient, releaseLock } from '@/lib/core/config/redis'
import { validateUrlWithDNS } from '@/lib/core/security/input-validation.server'
import { getClientIp } from '@/lib/core/utils/request'
import { SSE_HEADERS } from '@/lib/core/utils/sse'
import { getBaseUrl } from '@/lib/core/utils/urls'
import { generateId } from '@/lib/core/utils/uuid'
@@ -53,9 +52,10 @@ function getCallerFingerprint(request: NextRequest, userId?: string | null): str
return `user:${userId}`
}
const clientIp = getClientIp(request)
const forwardedFor = request.headers.get('x-forwarded-for')?.split(',')[0]?.trim()
const realIp = request.headers.get('x-real-ip')?.trim()
const userAgent = request.headers.get('user-agent')?.trim() || 'unknown'
return `public:${clientIp}:${userAgent}`
return `public:${forwardedFor || realIp || 'unknown'}:${userAgent}`
}
function hasCallerAccessToTask(

View File

@@ -3,7 +3,7 @@ import { type NextRequest, NextResponse } from 'next/server'
import { env } from '@/lib/core/config/env'
import type { TokenBucketConfig } from '@/lib/core/rate-limiter'
import { RateLimiter } from '@/lib/core/rate-limiter'
import { generateRequestId, getClientIp } from '@/lib/core/utils/request'
import { generateRequestId } from '@/lib/core/utils/request'
import { getEmailDomain } from '@/lib/core/utils/urls'
import { sendEmail } from '@/lib/messaging/email/mailer'
import { getFromEmailAddress } from '@/lib/messaging/email/utils'
@@ -25,7 +25,7 @@ export async function POST(req: NextRequest) {
const requestId = generateRequestId()
try {
const ip = getClientIp(req)
const ip = req.headers.get('x-forwarded-for')?.split(',')[0]?.trim() ?? 'unknown'
const storageKey = `public:demo-request:${ip}`
const { allowed, remaining, resetAt } = await rateLimiter.checkRateLimitDirect(

View File

@@ -4,7 +4,7 @@ import { z } from 'zod'
import { env } from '@/lib/core/config/env'
import type { TokenBucketConfig } from '@/lib/core/rate-limiter'
import { RateLimiter } from '@/lib/core/rate-limiter'
import { generateRequestId, getClientIp } from '@/lib/core/utils/request'
import { generateRequestId } from '@/lib/core/utils/request'
import { getEmailDomain } from '@/lib/core/utils/urls'
import { sendEmail } from '@/lib/messaging/email/mailer'
import {
@@ -37,7 +37,7 @@ export async function POST(req: NextRequest) {
const requestId = generateRequestId()
try {
const ip = getClientIp(req)
const ip = req.headers.get('x-forwarded-for')?.split(',')[0]?.trim() ?? 'unknown'
const storageKey = `public:integration-request:${ip}`
const { allowed, remaining, resetAt } = await rateLimiter.checkRateLimitDirect(

View File

@@ -222,13 +222,6 @@ export async function PATCH(request: NextRequest, { params }: RouteParams) {
}
if (parsed.data.status !== undefined) {
updates.status = parsed.data.status
if (parsed.data.status === 'active') {
updates.consecutiveFailures = 0
updates.lastSyncError = null
if (updates.nextSyncAt === undefined) {
updates.nextSyncAt = new Date()
}
}
}
await db

View File

@@ -1,11 +0,0 @@
import { NextResponse } from 'next/server'
import { hasSTTService } from '@/lib/speech/config'
/**
* Returns whether server-side STT is configured.
* Unauthenticated — the response is a single boolean,
* not sensitive data, and deployed chat visitors need it.
*/
export async function GET() {
return NextResponse.json({ sttAvailable: hasSTTService() })
}

View File

@@ -1,171 +0,0 @@
import { db } from '@sim/db'
import { chat } from '@sim/db/schema'
import { createLogger } from '@sim/logger'
import { eq } from 'drizzle-orm'
import { type NextRequest, NextResponse } from 'next/server'
import { getSession } from '@/lib/auth'
import { hasExceededCostLimit } from '@/lib/billing/core/subscription'
import { recordUsage } from '@/lib/billing/core/usage-log'
import { env } from '@/lib/core/config/env'
import { getCostMultiplier, isBillingEnabled } from '@/lib/core/config/feature-flags'
import { RateLimiter } from '@/lib/core/rate-limiter'
import { validateAuthToken } from '@/lib/core/security/deployment'
import { getClientIp } from '@/lib/core/utils/request'
const logger = createLogger('SpeechTokenAPI')
export const dynamic = 'force-dynamic'
const ELEVENLABS_TOKEN_URL = 'https://api.elevenlabs.io/v1/single-use-token/realtime_scribe'
const VOICE_SESSION_COST_PER_MIN = 0.008
const WORKSPACE_SESSION_MAX_MINUTES = 3
const CHAT_SESSION_MAX_MINUTES = 1
const STT_TOKEN_RATE_LIMIT = {
maxTokens: 30,
refillRate: 3,
refillIntervalMs: 72 * 1000,
} as const
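// Illustrative note: refilling 3 tokens every 72,000 ms is a sustained rate of
// roughly 2.5 token requests per minute (150 per hour), with bursts of up to
// maxTokens = 30 before throttling.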
const rateLimiter = new RateLimiter()
async function validateChatAuth(
request: NextRequest,
chatId: string
): Promise<{ valid: boolean; ownerId?: string }> {
try {
const chatResult = await db
.select({
id: chat.id,
userId: chat.userId,
isActive: chat.isActive,
authType: chat.authType,
password: chat.password,
})
.from(chat)
.where(eq(chat.id, chatId))
.limit(1)
if (chatResult.length === 0 || !chatResult[0].isActive) {
return { valid: false }
}
const chatData = chatResult[0]
if (chatData.authType === 'public') {
return { valid: true, ownerId: chatData.userId }
}
const cookieName = `chat_auth_${chatId}`
const authCookie = request.cookies.get(cookieName)
if (authCookie && validateAuthToken(authCookie.value, chatId, chatData.password)) {
return { valid: true, ownerId: chatData.userId }
}
return { valid: false }
} catch (error) {
logger.error('Error validating chat auth for STT:', error)
return { valid: false }
}
}
export async function POST(request: NextRequest) {
try {
const body = await request.json().catch(() => ({}))
const chatId = body?.chatId as string | undefined
let billingUserId: string | undefined
if (chatId) {
const chatAuth = await validateChatAuth(request, chatId)
if (!chatAuth.valid) {
return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
}
billingUserId = chatAuth.ownerId
} else {
const session = await getSession()
if (!session?.user?.id) {
return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
}
billingUserId = session.user.id
}
if (isBillingEnabled) {
const rateLimitKey = chatId
? `stt-token:chat:${chatId}:${getClientIp(request)}`
: `stt-token:user:${billingUserId}`
const rateCheck = await rateLimiter.checkRateLimitDirect(rateLimitKey, STT_TOKEN_RATE_LIMIT)
if (!rateCheck.allowed) {
return NextResponse.json(
{ error: 'Voice input rate limit exceeded. Please try again later.' },
{
status: 429,
headers: {
'Retry-After': String(Math.ceil((rateCheck.retryAfterMs ?? 60000) / 1000)),
},
}
)
}
}
if (billingUserId && isBillingEnabled) {
const exceeded = await hasExceededCostLimit(billingUserId)
if (exceeded) {
return NextResponse.json(
{ error: 'Usage limit exceeded. Please upgrade your plan to continue.' },
{ status: 402 }
)
}
}
const apiKey = env.ELEVENLABS_API_KEY
if (!apiKey?.trim()) {
return NextResponse.json(
{ error: 'Speech-to-text service is not configured' },
{ status: 503 }
)
}
const response = await fetch(ELEVENLABS_TOKEN_URL, {
method: 'POST',
headers: { 'xi-api-key': apiKey },
})
if (!response.ok) {
const errBody = await response.json().catch(() => ({}))
const message =
errBody.detail || errBody.message || `Token request failed (${response.status})`
logger.error('ElevenLabs token request failed', { status: response.status, message })
return NextResponse.json({ error: message }, { status: 502 })
}
const data = await response.json()
if (billingUserId) {
const maxMinutes = chatId ? CHAT_SESSION_MAX_MINUTES : WORKSPACE_SESSION_MAX_MINUTES
const sessionCost = VOICE_SESSION_COST_PER_MIN * maxMinutes
await recordUsage({
userId: billingUserId,
entries: [
{
category: 'fixed',
source: 'voice-input',
description: `Voice input session (${maxMinutes} min)`,
cost: sessionCost * getCostMultiplier(),
},
],
}).catch((err) => {
logger.warn('Failed to record voice input usage, continuing:', err)
})
}
return NextResponse.json({ token: data.token })
} catch (error) {
const message = error instanceof Error ? error.message : 'Failed to generate speech token'
logger.error('Speech token error:', error)
return NextResponse.json({ error: message }, { status: 500 })
}
}

View File

@@ -127,14 +127,6 @@ export default function ChatClient({ identifier }: { identifier: string }) {
const [authRequired, setAuthRequired] = useState<'password' | 'email' | 'sso' | null>(null)
const [isVoiceFirstMode, setIsVoiceFirstMode] = useState(false)
const [sttAvailable, setSttAvailable] = useState(false)
useEffect(() => {
fetch('/api/settings/voice')
.then((r) => (r.ok ? r.json() : { sttAvailable: false }))
.then((data) => setSttAvailable(data.sttAvailable === true))
.catch(() => setSttAvailable(false))
}, [])
const { isStreamingResponse, abortControllerRef, stopStreaming, handleStreamedResponse } =
useChatStreaming()
const audioContextRef = useRef<AudioContext | null>(null)
@@ -451,9 +443,8 @@ export default function ChatClient({ identifier }: { identifier: string }) {
}, [isStreamingResponse, stopStreaming, setMessages, stopAudio])
const handleVoiceStart = useCallback(() => {
if (!sttAvailable) return
setIsVoiceFirstMode(true)
}, [sttAvailable])
}, [])
const handleExitVoiceMode = useCallback(() => {
setIsVoiceFirstMode(false)
@@ -503,7 +494,6 @@ export default function ChatClient({ identifier }: { identifier: string }) {
isStreaming={isStreamingResponse}
isPlayingAudio={isPlayingAudio}
audioContextRef={audioContextRef}
chatId={chatConfig?.id}
messages={messages.map((msg) => ({
content: typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content),
type: msg.type,
@@ -539,7 +529,6 @@ export default function ChatClient({ identifier }: { identifier: string }) {
isStreaming={isStreamingResponse}
onStopStreaming={() => stopStreaming(setMessages)}
onVoiceStart={handleVoiceStart}
sttAvailable={sttAvailable}
/>
</div>
</div>

View File

@@ -14,6 +14,14 @@ const logger = createLogger('ChatInput')
const MAX_TEXTAREA_HEIGHT = 200
const IS_STT_AVAILABLE =
typeof window !== 'undefined' &&
!!(
(window as Window & { SpeechRecognition?: unknown; webkitSpeechRecognition?: unknown })
.SpeechRecognition ||
(window as Window & { webkitSpeechRecognition?: unknown }).webkitSpeechRecognition
)
interface AttachedFile {
id: string
name: string
@@ -29,15 +37,7 @@ export const ChatInput: React.FC<{
onStopStreaming?: () => void
onVoiceStart?: () => void
voiceOnly?: boolean
sttAvailable?: boolean
}> = ({
onSubmit,
isStreaming = false,
onStopStreaming,
onVoiceStart,
voiceOnly = false,
sttAvailable = false,
}) => {
}> = ({ onSubmit, isStreaming = false, onStopStreaming, onVoiceStart, voiceOnly = false }) => {
const fileInputRef = useRef<HTMLInputElement>(null)
const textareaRef = useRef<HTMLTextAreaElement>(null)
const [inputValue, setInputValue] = useState('')
@@ -142,7 +142,7 @@ export const ChatInput: React.FC<{
return (
<Tooltip.Provider>
<div className='flex items-center justify-center'>
{sttAvailable && (
{IS_STT_AVAILABLE && (
<Tooltip.Root>
<Tooltip.Trigger asChild>
<div>
@@ -295,7 +295,7 @@ export const ChatInput: React.FC<{
{/* Right: mic + send */}
<div className='flex items-center gap-1.5'>
{sttAvailable && (
{IS_STT_AVAILABLE && (
<Tooltip.Root>
<Tooltip.Trigger asChild>
<button

View File

@@ -1,9 +1,41 @@
'use client'
import { useCallback } from 'react'
import { useCallback, useEffect, useState } from 'react'
import { motion } from 'framer-motion'
import { Mic } from 'lucide-react'
interface SpeechRecognitionEvent extends Event {
resultIndex: number
results: SpeechRecognitionResultList
}
interface SpeechRecognitionErrorEvent extends Event {
error: string
message?: string
}
interface SpeechRecognition extends EventTarget {
continuous: boolean
interimResults: boolean
lang: string
start(): void
stop(): void
abort(): void
onstart: ((this: SpeechRecognition, ev: Event) => any) | null
onend: ((this: SpeechRecognition, ev: Event) => any) | null
onresult: ((this: SpeechRecognition, ev: SpeechRecognitionEvent) => any) | null
onerror: ((this: SpeechRecognition, ev: SpeechRecognitionErrorEvent) => any) | null
}
interface SpeechRecognitionStatic {
new (): SpeechRecognition
}
type WindowWithSpeech = Window & {
SpeechRecognition?: SpeechRecognitionStatic
webkitSpeechRecognition?: SpeechRecognitionStatic
}
interface VoiceInputProps {
onVoiceStart: () => void
isListening?: boolean
@@ -19,11 +51,24 @@ export function VoiceInput({
large = false,
minimal = false,
}: VoiceInputProps) {
const [isSupported, setIsSupported] = useState(false)
// Check if speech recognition is supported
useEffect(() => {
const w = window as WindowWithSpeech
const SpeechRecognitionCtor = w.SpeechRecognition || w.webkitSpeechRecognition
setIsSupported(!!SpeechRecognitionCtor)
}, [])
const handleVoiceClick = useCallback(() => {
if (disabled) return
onVoiceStart()
}, [disabled, onVoiceStart])
if (!isSupported) {
return null
}
if (minimal) {
return (
<button
@@ -43,6 +88,7 @@ export function VoiceInput({
if (large) {
return (
<div className='flex flex-col items-center'>
{/* Large Voice Button */}
<motion.button
type='button'
onClick={handleVoiceClick}
@@ -64,6 +110,7 @@ export function VoiceInput({
return (
<div className='flex items-center'>
{/* Voice Button - Now matches send button styling */}
<motion.button
type='button'
onClick={handleVoiceClick}

View File

@@ -6,13 +6,6 @@ import { Mic, MicOff, Phone } from 'lucide-react'
import dynamic from 'next/dynamic'
import { Button } from '@/components/ui/button'
import { cn } from '@/lib/core/utils/cn'
import { arrayBufferToBase64, floatTo16BitPCM } from '@/lib/speech/audio'
import {
CHUNK_SEND_INTERVAL_MS,
ELEVENLABS_WS_URL,
MAX_CHAT_SESSION_MS,
SAMPLE_RATE,
} from '@/lib/speech/config'
const ParticlesVisualization = dynamic(
() =>
@@ -24,6 +17,38 @@ const ParticlesVisualization = dynamic(
const logger = createLogger('VoiceInterface')
interface SpeechRecognitionEvent extends Event {
resultIndex: number
results: SpeechRecognitionResultList
}
interface SpeechRecognitionErrorEvent extends Event {
error: string
message?: string
}
interface SpeechRecognition extends EventTarget {
continuous: boolean
interimResults: boolean
lang: string
start(): void
stop(): void
abort(): void
onstart: ((this: SpeechRecognition, ev: Event) => any) | null
onend: ((this: SpeechRecognition, ev: Event) => any) | null
onresult: ((this: SpeechRecognition, ev: SpeechRecognitionEvent) => any) | null
onerror: ((this: SpeechRecognition, ev: SpeechRecognitionErrorEvent) => any) | null
}
interface SpeechRecognitionStatic {
new (): SpeechRecognition
}
type WindowWithSpeech = Window & {
SpeechRecognition?: SpeechRecognitionStatic
webkitSpeechRecognition?: SpeechRecognitionStatic
}
interface VoiceInterfaceProps {
onCallEnd?: () => void
onVoiceTranscript?: (transcript: string) => void
@@ -35,7 +60,6 @@ interface VoiceInterfaceProps {
audioContextRef?: RefObject<AudioContext | null>
messages?: Array<{ content: string; type: 'user' | 'assistant' }>
className?: string
chatId?: string
}
export function VoiceInterface({
@@ -49,7 +73,6 @@ export function VoiceInterface({
audioContextRef: sharedAudioContextRef,
messages = [],
className,
chatId,
}: VoiceInterfaceProps) {
const [state, setState] = useState<'idle' | 'listening' | 'agent_speaking'>('idle')
const [isInitialized, setIsInitialized] = useState(false)
@@ -68,177 +91,79 @@ export function VoiceInterface({
currentStateRef.current = next
}, [])
const recognitionRef = useRef<SpeechRecognition | null>(null)
const mediaStreamRef = useRef<MediaStream | null>(null)
const audioContextRef = useRef<AudioContext | null>(null)
const analyserRef = useRef<AnalyserNode | null>(null)
const animationFrameRef = useRef<number | null>(null)
const isMutedRef = useRef(false)
const responseTimeoutRef = useRef<NodeJS.Timeout | null>(null)
const wsRef = useRef<WebSocket | null>(null)
const processorRef = useRef<ScriptProcessorNode | null>(null)
const pcmBufferRef = useRef<Float32Array[]>([])
const sendIntervalRef = useRef<ReturnType<typeof setInterval> | null>(null)
const sessionTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null)
const committedTextRef = useRef('')
const lastPartialRef = useRef('')
const onVoiceTranscriptRef = useRef(onVoiceTranscript)
onVoiceTranscriptRef.current = onVoiceTranscript
const isSupported =
typeof window !== 'undefined' &&
!!(
(window as WindowWithSpeech).SpeechRecognition ||
(window as WindowWithSpeech).webkitSpeechRecognition
)
const updateIsMuted = useCallback((next: boolean) => {
setIsMuted(next)
isMutedRef.current = next
}, [])
const stopSendingAudio = useCallback(() => {
if (sessionTimerRef.current) {
clearTimeout(sessionTimerRef.current)
sessionTimerRef.current = null
const setResponseTimeout = useCallback(() => {
if (responseTimeoutRef.current) {
clearTimeout(responseTimeoutRef.current)
}
if (sendIntervalRef.current) {
clearInterval(sendIntervalRef.current)
sendIntervalRef.current = null
}
pcmBufferRef.current = []
responseTimeoutRef.current = setTimeout(() => {
if (currentStateRef.current === 'listening') {
updateState('idle')
}
}, 5000)
}, [])
const flushAudioBuffer = useCallback(() => {
const ws = wsRef.current
if (!ws || ws.readyState !== WebSocket.OPEN) return
const chunks = pcmBufferRef.current
if (chunks.length === 0) return
pcmBufferRef.current = []
let totalLength = 0
for (const chunk of chunks) totalLength += chunk.length
const merged = new Float32Array(totalLength)
let offset = 0
for (const chunk of chunks) {
merged.set(chunk, offset)
offset += chunk.length
const clearResponseTimeout = useCallback(() => {
if (responseTimeoutRef.current) {
clearTimeout(responseTimeoutRef.current)
responseTimeoutRef.current = null
}
const pcm16 = floatTo16BitPCM(merged)
ws.send(
JSON.stringify({
message_type: 'input_audio_chunk',
audio_base_64: arrayBufferToBase64(pcm16),
sample_rate: SAMPLE_RATE,
commit: false,
})
)
}, [])
const startSendingAudio = useCallback(() => {
if (sendIntervalRef.current) return
pcmBufferRef.current = []
sendIntervalRef.current = setInterval(flushAudioBuffer, CHUNK_SEND_INTERVAL_MS)
}, [flushAudioBuffer])
useEffect(() => {
if (isPlayingAudio && state !== 'agent_speaking') {
clearResponseTimeout()
updateState('agent_speaking')
setCurrentTranscript('')
const closeWebSocket = useCallback(() => {
stopSendingAudio()
if (wsRef.current) {
if (
wsRef.current.readyState === WebSocket.OPEN ||
wsRef.current.readyState === WebSocket.CONNECTING
) {
wsRef.current.close()
}
wsRef.current = null
}
}, [stopSendingAudio])
const connectWebSocket = useCallback(async (): Promise<boolean> => {
try {
const body: Record<string, string> = {}
if (chatId) body.chatId = chatId
const tokenResponse = await fetch('/api/speech/token', {
method: 'POST',
credentials: 'include',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(body),
})
if (!tokenResponse.ok) {
logger.error('Failed to get STT token', { status: tokenResponse.status })
return false
updateIsMuted(true)
if (mediaStreamRef.current) {
mediaStreamRef.current.getAudioTracks().forEach((track) => {
track.enabled = false
})
}
const { token } = await tokenResponse.json()
const params = new URLSearchParams({
token,
model_id: 'scribe_v2_realtime',
audio_format: 'pcm_16000',
commit_strategy: 'vad',
vad_silence_threshold_secs: '1.0',
})
const ws = new WebSocket(`${ELEVENLABS_WS_URL}?${params.toString()}`)
wsRef.current = ws
committedTextRef.current = ''
return new Promise<boolean>((resolve) => {
ws.onopen = () => resolve(true)
ws.onerror = () => {
logger.error('STT WebSocket connection error')
resolve(false)
if (recognitionRef.current) {
try {
recognitionRef.current.abort()
} catch (error) {
logger.debug('Error aborting speech recognition:', error)
}
}
} else if (!isPlayingAudio && state === 'agent_speaking') {
updateState('idle')
setCurrentTranscript('')
ws.onmessage = (event) => {
if (isCallEndedRef.current) return
try {
const msg = JSON.parse(event.data)
if (msg.message_type === 'partial_transcript') {
if (msg.text) {
lastPartialRef.current = msg.text
setCurrentTranscript(msg.text)
}
} else if (
msg.message_type === 'committed_transcript' ||
msg.message_type === 'committed_transcript_with_timestamps'
) {
const finalText = msg.text || lastPartialRef.current
lastPartialRef.current = ''
if (finalText) {
committedTextRef.current = committedTextRef.current
? `${committedTextRef.current} ${finalText}`
: finalText
setCurrentTranscript('')
onVoiceTranscriptRef.current?.(finalText)
}
} else if (
msg.message_type === 'error' ||
msg.message_type === 'auth_error' ||
msg.message_type === 'quota_exceeded'
) {
logger.error('ElevenLabs STT error', { type: msg.message_type, error: msg.error })
}
} catch {
// Ignore non-JSON messages
}
}
ws.onclose = () => {
wsRef.current = null
if (currentStateRef.current === 'listening' && !isCallEndedRef.current) {
stopSendingAudio()
updateState('idle')
}
}
})
} catch (error) {
logger.error('Failed to connect STT WebSocket', error)
return false
updateIsMuted(false)
if (mediaStreamRef.current) {
mediaStreamRef.current.getAudioTracks().forEach((track) => {
track.enabled = true
})
}
}
}, [chatId])
}, [isPlayingAudio, state, clearResponseTimeout, updateState, updateIsMuted])
const setupAudioPipeline = useCallback(async () => {
const setupAudio = useCallback(async () => {
try {
const stream = await navigator.mediaDevices.getUserMedia({
audio: {
@@ -246,40 +171,33 @@ export function VoiceInterface({
noiseSuppression: true,
autoGainControl: true,
channelCount: 1,
sampleRate: SAMPLE_RATE,
},
})
setPermissionStatus('granted')
mediaStreamRef.current = stream
const ac = new AudioContext({ sampleRate: SAMPLE_RATE })
audioContextRef.current = ac
if (ac.state === 'suspended') {
await ac.resume()
if (!audioContextRef.current) {
const AudioContext = window.AudioContext || window.webkitAudioContext
audioContextRef.current = new AudioContext()
}
const source = ac.createMediaStreamSource(stream)
const audioContext = audioContextRef.current
if (audioContext.state === 'suspended') {
await audioContext.resume()
}
const analyser = ac.createAnalyser()
const source = audioContext.createMediaStreamSource(stream)
const analyser = audioContext.createAnalyser()
analyser.fftSize = 256
analyser.smoothingTimeConstant = 0.8
source.connect(analyser)
analyserRef.current = analyser
const processor = ac.createScriptProcessor(4096, 1, 1)
processor.onaudioprocess = (e) => {
if (!isMutedRef.current && currentStateRef.current === 'listening') {
pcmBufferRef.current.push(new Float32Array(e.inputBuffer.getChannelData(0)))
}
}
source.connect(processor)
processor.connect(ac.destination)
processorRef.current = processor
const updateVisualization = () => {
if (!analyserRef.current) return
const bufferLength = analyserRef.current.frequencyBinCount
const dataArray = new Uint8Array(bufferLength)
analyserRef.current.getByteFrequencyData(dataArray)
@@ -294,71 +212,144 @@ export function VoiceInterface({
setAudioLevels(levels)
animationFrameRef.current = requestAnimationFrame(updateVisualization)
}
updateVisualization()
updateVisualization()
setIsInitialized(true)
return true
} catch (error) {
logger.error('Error setting up audio pipeline:', error)
logger.error('Error setting up audio:', error)
setPermissionStatus('denied')
return false
}
}, [])
const startListening = useCallback(async () => {
if (currentStateRef.current !== 'idle' || isMutedRef.current || isCallEndedRef.current) return
const setupSpeechRecognition = useCallback(() => {
if (!isSupported) return
if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) {
const connected = await connectWebSocket()
if (!connected || isCallEndedRef.current) return
const w = window as WindowWithSpeech
const SpeechRecognition = w.SpeechRecognition || w.webkitSpeechRecognition
if (!SpeechRecognition) return
const recognition = new SpeechRecognition()
recognition.continuous = true
recognition.interimResults = true
recognition.lang = 'en-US'
recognition.onstart = () => {}
recognition.onresult = (event: SpeechRecognitionEvent) => {
const currentState = currentStateRef.current
if (isMutedRef.current || currentState !== 'listening') {
return
}
let finalTranscript = ''
let interimTranscript = ''
for (let i = event.resultIndex; i < event.results.length; i++) {
const result = event.results[i]
const transcript = result[0].transcript
if (result.isFinal) {
finalTranscript += transcript
} else {
interimTranscript += transcript
}
}
setCurrentTranscript(interimTranscript || finalTranscript)
if (finalTranscript.trim()) {
setCurrentTranscript('')
if (recognitionRef.current) {
try {
recognitionRef.current.stop()
} catch (error) {
// Ignore
}
}
setResponseTimeout()
onVoiceTranscript?.(finalTranscript)
}
}
recognition.onend = () => {
if (isCallEndedRef.current) return
const currentState = currentStateRef.current
if (currentState === 'listening' && !isMutedRef.current) {
setTimeout(() => {
if (isCallEndedRef.current) return
if (
recognitionRef.current &&
currentStateRef.current === 'listening' &&
!isMutedRef.current
) {
try {
recognitionRef.current.start()
} catch (error) {
logger.debug('Error restarting speech recognition:', error)
}
}
}, 1000)
}
}
recognition.onerror = (event: SpeechRecognitionErrorEvent) => {
if (event.error === 'aborted') {
return
}
if (event.error === 'not-allowed') {
setPermissionStatus('denied')
}
}
recognitionRef.current = recognition
}, [isSupported, onVoiceTranscript, setResponseTimeout])
const startListening = useCallback(() => {
if (!isInitialized || isMuted || state !== 'idle') {
return
}
updateState('listening')
setCurrentTranscript('')
startSendingAudio()
sessionTimerRef.current = setTimeout(() => {
logger.info('Voice session reached max duration, stopping')
stopSendingAudio()
closeWebSocket()
updateState('idle')
}, MAX_CHAT_SESSION_MS)
}, [connectWebSocket, updateState, startSendingAudio, stopSendingAudio, closeWebSocket])
const stopListening = useCallback(() => {
stopSendingAudio()
updateState('idle')
setCurrentTranscript('')
}, [updateState, stopSendingAudio])
useEffect(() => {
if (isPlayingAudio && state === 'listening') {
stopSendingAudio()
closeWebSocket()
updateState('agent_speaking')
setCurrentTranscript('')
updateIsMuted(true)
if (mediaStreamRef.current) {
mediaStreamRef.current.getAudioTracks().forEach((track) => {
track.enabled = false
})
}
} else if (!isPlayingAudio && state === 'agent_speaking') {
updateState('idle')
setCurrentTranscript('')
updateIsMuted(false)
if (mediaStreamRef.current) {
mediaStreamRef.current.getAudioTracks().forEach((track) => {
track.enabled = true
})
if (recognitionRef.current) {
try {
recognitionRef.current.start()
} catch (error) {
logger.error('Error starting recognition:', error)
}
}
}, [isPlayingAudio, state, updateState, updateIsMuted, stopSendingAudio, closeWebSocket])
}, [isInitialized, isMuted, state, updateState])
const stopListening = useCallback(() => {
updateState('idle')
setCurrentTranscript('')
if (recognitionRef.current) {
try {
recognitionRef.current.stop()
} catch (error) {
// Ignore
}
}
}, [updateState])
const handleInterrupt = useCallback(() => {
if (state === 'agent_speaking') {
onInterrupt?.()
updateState('listening')
setCurrentTranscript('')
updateIsMuted(false)
if (mediaStreamRef.current) {
@@ -367,43 +358,35 @@ export function VoiceInterface({
})
}
updateState('idle')
setCurrentTranscript('')
if (recognitionRef.current) {
try {
recognitionRef.current.start()
} catch (error) {
logger.error('Could not start recognition after interrupt:', error)
}
}
}
}, [state, onInterrupt, updateState, updateIsMuted])
const handleCallEnd = useCallback(() => {
isCallEndedRef.current = true
stopSendingAudio()
closeWebSocket()
updateState('idle')
setCurrentTranscript('')
updateIsMuted(false)
if (processorRef.current) {
processorRef.current.disconnect()
processorRef.current = null
}
if (mediaStreamRef.current) {
mediaStreamRef.current.getTracks().forEach((track) => track.stop())
mediaStreamRef.current = null
}
if (audioContextRef.current && audioContextRef.current.state !== 'closed') {
audioContextRef.current.close().catch(() => {})
audioContextRef.current = null
}
if (animationFrameRef.current) {
cancelAnimationFrame(animationFrameRef.current)
animationFrameRef.current = null
if (recognitionRef.current) {
try {
recognitionRef.current.abort()
} catch (error) {
logger.error('Error stopping speech recognition:', error)
}
}
clearResponseTimeout()
onInterrupt?.()
onCallEnd?.()
}, [onCallEnd, onInterrupt, updateState, updateIsMuted, stopSendingAudio, closeWebSocket])
}, [onCallEnd, onInterrupt, clearResponseTimeout, updateState, updateIsMuted])
useEffect(() => {
const handleKeyDown = (event: KeyboardEvent) => {
@@ -440,25 +423,11 @@ export function VoiceInterface({
}, [isMuted, state, handleInterrupt, stopListening, startListening, updateIsMuted])
useEffect(() => {
isCallEndedRef.current = false
let cancelled = false
async function init() {
const audioOk = await setupAudioPipeline()
if (!audioOk || cancelled) return
const wsOk = await connectWebSocket()
if (!wsOk || cancelled) return
setIsInitialized(true)
if (isSupported) {
setupSpeechRecognition()
setupAudio()
}
init()
return () => {
cancelled = true
}
}, [setupAudioPipeline, connectWebSocket])
}, [isSupported, setupSpeechRecognition, setupAudio])
useEffect(() => {
if (isInitialized && !isMuted && state === 'idle') {
@@ -470,16 +439,13 @@ export function VoiceInterface({
return () => {
isCallEndedRef.current = true
stopSendingAudio()
if (wsRef.current) {
wsRef.current.close()
wsRef.current = null
}
if (processorRef.current) {
processorRef.current.disconnect()
processorRef.current = null
if (recognitionRef.current) {
try {
recognitionRef.current.abort()
} catch (_e) {
// Ignore
}
recognitionRef.current = null
}
if (mediaStreamRef.current) {
@@ -496,8 +462,13 @@ export function VoiceInterface({
cancelAnimationFrame(animationFrameRef.current)
animationFrameRef.current = null
}
if (responseTimeoutRef.current) {
clearTimeout(responseTimeoutRef.current)
responseTimeoutRef.current = null
}
}
}, [stopSendingAudio])
}, [])
const getStatusText = () => {
switch (state) {

View File

@@ -9,7 +9,13 @@ import { cn } from '@/lib/core/utils/cn'
import { CHAT_ACCEPT_ATTRIBUTE } from '@/lib/uploads/utils/validation'
import { ContextMentionIcon } from '@/app/workspace/[workspaceId]/home/components/context-mention-icon'
import { useAvailableResources } from '@/app/workspace/[workspaceId]/home/components/mothership-view/components/add-resource-dropdown'
import type { PlusMenuHandle } from '@/app/workspace/[workspaceId]/home/components/user-input/components'
import type {
PlusMenuHandle,
SpeechRecognitionErrorEvent,
SpeechRecognitionEvent,
SpeechRecognitionInstance,
WindowWithSpeech,
} from '@/app/workspace/[workspaceId]/home/components/user-input/components'
import {
AnimatedPlaceholderEffect,
AttachedFilesList,
@@ -21,6 +27,7 @@ import {
OVERLAY_CLASSES,
PlusMenuDropdown,
SendButton,
SPEECH_RECOGNITION_LANG,
TEXTAREA_BASE_CLASSES,
} from '@/app/workspace/[workspaceId]/home/components/user-input/components'
import type {
@@ -39,7 +46,6 @@ import {
extractContextTokens,
} from '@/app/workspace/[workspaceId]/w/[workflowId]/components/panel/components/copilot/components/user-input/utils'
import { useWorkflowMap } from '@/hooks/queries/workflows'
import { useSpeechToText } from '@/hooks/use-speech-to-text'
import type { ChatContext } from '@/stores/panel'
export type { FileAttachmentForApi } from '@/app/workspace/[workspaceId]/home/types'
@@ -229,29 +235,10 @@ export function UserInput({
const canSubmit = (value.trim().length > 0 || hasFiles) && !isSending
const [isListening, setIsListening] = useState(false)
const recognitionRef = useRef<SpeechRecognitionInstance | null>(null)
const prefixRef = useRef('')
const valueRef = useRef(value)
const sttPrefixRef = useRef('')
const handleTranscript = useCallback((text: string) => {
const prefix = sttPrefixRef.current
const newVal = prefix ? `${prefix} ${text}` : text
setValue(newVal)
valueRef.current = newVal
}, [])
const {
isListening,
isSupported: isSttSupported,
toggleListening: rawToggle,
resetTranscript,
} = useSpeechToText({ onTranscript: handleTranscript })
const toggleListening = useCallback(() => {
if (!isListening) {
sttPrefixRef.current = valueRef.current
}
rawToggle()
}, [isListening, rawToggle])
const filesRef = useRef(files)
filesRef.current = files
@@ -262,6 +249,12 @@ export function UserInput({
const isSendingRef = useRef(isSending)
isSendingRef.current = isSending
useEffect(() => {
return () => {
recognitionRef.current?.abort()
}
}, [])
useEffect(() => {
valueRef.current = value
}, [value])
@@ -411,6 +404,84 @@ export function UserInput({
[textareaRef]
)
const startRecognition = useCallback((): boolean => {
const w = window as WindowWithSpeech
const SpeechRecognitionAPI = w.SpeechRecognition || w.webkitSpeechRecognition
if (!SpeechRecognitionAPI) return false
const recognition = new SpeechRecognitionAPI()
recognition.continuous = true
recognition.interimResults = true
recognition.lang = SPEECH_RECOGNITION_LANG
recognition.onresult = (event: SpeechRecognitionEvent) => {
let transcript = ''
for (let i = 0; i < event.results.length; i++) {
transcript += event.results[i][0].transcript
}
const prefix = prefixRef.current
const newVal = prefix ? `${prefix} ${transcript}` : transcript
setValue(newVal)
valueRef.current = newVal
}
recognition.onend = () => {
if (recognitionRef.current === recognition) {
prefixRef.current = valueRef.current
try {
recognition.start()
} catch {
recognitionRef.current = null
setIsListening(false)
}
}
}
recognition.onerror = (e: SpeechRecognitionErrorEvent) => {
if (recognitionRef.current !== recognition) return
if (e.error === 'aborted' || e.error === 'not-allowed') {
recognitionRef.current = null
setIsListening(false)
}
}
recognitionRef.current = recognition
try {
recognition.start()
return true
} catch {
recognitionRef.current = null
return false
}
}, [])
const restartRecognition = useCallback(
(newPrefix: string) => {
if (!recognitionRef.current) return
prefixRef.current = newPrefix
recognitionRef.current.abort()
recognitionRef.current = null
if (!startRecognition()) {
setIsListening(false)
}
},
[startRecognition]
)
const toggleListening = useCallback(() => {
if (isListening) {
recognitionRef.current?.stop()
recognitionRef.current = null
setIsListening(false)
return
}
prefixRef.current = valueRef.current
if (startRecognition()) {
setIsListening(true)
}
}, [isListening, startRecognition])
const handleSubmit = useCallback(() => {
const currentFiles = filesRef.current
const currentContext = contextRef.current
@@ -432,16 +503,14 @@ export function UserInput({
currentContext.selectedContexts.length > 0 ? currentContext.selectedContexts : undefined
)
setValue('')
valueRef.current = ''
sttPrefixRef.current = ''
resetTranscript()
restartRecognition('')
currentFiles.clearAttachedFiles()
currentContext.clearContexts()
if (textareaRef.current) {
textareaRef.current.style.height = 'auto'
}
}, [onSubmit, textareaRef, resetTranscript])
}, [onSubmit, restartRecognition, textareaRef])
const handleKeyDown = useCallback(
(e: React.KeyboardEvent<HTMLTextAreaElement>) => {
@@ -520,27 +589,32 @@ export function UserInput({
[handleSubmit, mentionTokensWithContext, value, textareaRef]
)
const handleInputChange = useCallback((e: React.ChangeEvent<HTMLTextAreaElement>) => {
const newValue = e.target.value
const caret = e.target.selectionStart ?? newValue.length
const handleInputChange = useCallback(
(e: React.ChangeEvent<HTMLTextAreaElement>) => {
const newValue = e.target.value
const caret = e.target.selectionStart ?? newValue.length
if (
caret > 0 &&
newValue.charAt(caret - 1) === '@' &&
(caret === 1 || /\s/.test(newValue.charAt(caret - 2)))
) {
const before = newValue.slice(0, caret - 1)
const after = newValue.slice(caret)
const adjusted = `${before}${after}`
setValue(adjusted)
atInsertPosRef.current = caret - 1
const anchor = getCaretAnchor(e.target, caret - 1)
plusMenuRef.current?.open(anchor)
return
}
if (
caret > 0 &&
newValue.charAt(caret - 1) === '@' &&
(caret === 1 || /\s/.test(newValue.charAt(caret - 2)))
) {
const before = newValue.slice(0, caret - 1)
const after = newValue.slice(caret)
const adjusted = `${before}${after}`
setValue(adjusted)
atInsertPosRef.current = caret - 1
const anchor = getCaretAnchor(e.target, caret - 1)
plusMenuRef.current?.open(anchor)
restartRecognition(adjusted)
return
}
setValue(newValue)
}, [])
setValue(newValue)
restartRecognition(newValue)
},
[restartRecognition]
)
const handleSelectAdjust = useCallback(() => {
const textarea = textareaRef.current
@@ -729,7 +803,7 @@ export function UserInput({
/>
</div>
<div className='flex items-center gap-1.5'>
{isSttSupported && <MicButton isListening={isListening} onToggle={toggleListening} />}
<MicButton isListening={isListening} onToggle={toggleListening} />
<SendButton
isSending={isSending}
canSubmit={canSubmit}

View File

@@ -5,7 +5,6 @@ import { createLogger } from '@sim/logger'
import { format, formatDistanceToNow, isPast } from 'date-fns'
import {
AlertCircle,
AlertTriangle,
CheckCircle2,
ChevronDown,
Loader2,
@@ -67,7 +66,6 @@ const STATUS_CONFIG = {
syncing: { label: 'Syncing', variant: 'amber' as const },
error: { label: 'Error', variant: 'red' as const },
paused: { label: 'Paused', variant: 'gray' as const },
disabled: { label: 'Disabled', variant: 'amber' as const },
} as const
export function ConnectorsSection({
@@ -161,10 +159,7 @@ export function ConnectorsSection({
knowledgeBaseId,
connectorId: connector.id,
updates: {
status:
connector.status === 'paused' || connector.status === 'disabled'
? 'active'
: 'paused',
status: connector.status === 'paused' ? 'active' : 'paused',
},
},
{
@@ -357,12 +352,7 @@ function ConnectorCard({
<div className='rounded-lg border border-[var(--border-1)]'>
<div className='flex items-center justify-between px-3 py-2.5'>
<div className='flex items-center gap-2.5'>
<div className='relative flex-shrink-0'>
{Icon && <Icon className='h-5 w-5' />}
{connector.status === 'disabled' && (
<AlertTriangle className='-right-1 -top-1 absolute h-3 w-3 text-amber-500' />
)}
</div>
{Icon && <Icon className='h-5 w-5 flex-shrink-0' />}
<div className='flex flex-col gap-0.5'>
<div className='flex items-center gap-2'>
<span className='flex items-center gap-1.5 font-medium text-[var(--text-primary)] text-small'>
@@ -417,12 +407,7 @@ function ConnectorCard({
variant='ghost'
className='h-7 w-7 p-0'
onClick={onSync}
disabled={
connector.status === 'syncing' ||
connector.status === 'disabled' ||
isSyncPending ||
syncCooldown
}
disabled={connector.status === 'syncing' || isSyncPending || syncCooldown}
>
<RefreshCw
className={cn(
@@ -456,7 +441,7 @@ function ConnectorCard({
>
{isUpdating ? (
<Loader2 className='h-3.5 w-3.5 animate-spin' />
) : connector.status === 'paused' || connector.status === 'disabled' ? (
) : connector.status === 'paused' ? (
<Play className='h-3.5 w-3.5' />
) : (
<Pause className='h-3.5 w-3.5' />
@@ -464,9 +449,7 @@ function ConnectorCard({
</Button>
</Tooltip.Trigger>
<Tooltip.Content>
{connector.status === 'paused' || connector.status === 'disabled'
? 'Resume'
: 'Pause'}
{connector.status === 'paused' ? 'Resume' : 'Pause'}
</Tooltip.Content>
</Tooltip.Root>
@@ -498,46 +481,7 @@ function ConnectorCard({
</div>
</div>
{connector.status === 'disabled' && (
<div className='border-[var(--border-1)] border-t px-3 py-2'>
<div className='flex flex-col gap-1 rounded-sm border border-amber-200 bg-amber-50 px-2 py-1.5 dark:border-amber-900 dark:bg-amber-950'>
<div className='flex items-center gap-1.5 font-medium text-amber-800 text-caption dark:text-amber-200'>
<AlertTriangle className='h-3 w-3 flex-shrink-0' />
Connector disabled after repeated sync failures
</div>
<p className='text-amber-700 text-micro dark:text-amber-300'>
Syncing has been paused due to {connector.consecutiveFailures} consecutive failures.
{serviceId
? ' Reconnect your account to resume syncing.'
: ' Use the resume button to re-enable syncing.'}
</p>
{canEdit && serviceId && providerId && (
<Button
variant='active'
onClick={() => {
if (connector.credentialId) {
writeOAuthReturnContext({
origin: 'kb-connectors',
knowledgeBaseId,
displayName: connectorDef?.name ?? connector.connectorType,
providerId: providerId!,
preCount: credentials?.length ?? 0,
workspaceId,
requestedAt: Date.now(),
})
}
setShowOAuthModal(true)
}}
className='w-full px-2 py-1 font-medium text-caption'
>
Reconnect
</Button>
)}
</div>
</div>
)}
{missingScopes.length > 0 && connector.status !== 'disabled' && (
{missingScopes.length > 0 && (
<div className='border-[var(--border-1)] border-t px-3 py-2'>
<div className='flex flex-col gap-1 rounded-sm border bg-[var(--surface-2)] px-2 py-1.5'>
<div className='flex items-center font-medium text-caption'>

View File

@@ -127,8 +127,6 @@ export const KnowledgeBlock: BlockConfig = {
title: 'Document',
type: 'document-selector',
canonicalParamId: 'documentId',
serviceId: 'knowledge',
selectorKey: 'knowledge.documents',
placeholder: 'Select document',
dependsOn: ['knowledgeBaseSelector'],
required: true,

View File

@@ -12,7 +12,7 @@ export interface ConnectorData {
sourceConfig: Record<string, unknown>
syncMode: string
syncIntervalMinutes: number
status: 'active' | 'paused' | 'syncing' | 'error' | 'disabled'
status: 'active' | 'paused' | 'syncing' | 'error'
lastSyncAt: string | null
lastSyncError: string | null
lastSyncDocCount: number | null

View File

@@ -1,384 +0,0 @@
'use client'
import { useCallback, useEffect, useRef, useState } from 'react'
import { createLogger } from '@sim/logger'
import { arrayBufferToBase64, floatTo16BitPCM } from '@/lib/speech/audio'
import {
CHUNK_SEND_INTERVAL_MS,
ELEVENLABS_WS_URL,
MAX_SESSION_MS,
SAMPLE_RATE,
} from '@/lib/speech/config'
const logger = createLogger('useSpeechToText')
export { MAX_SESSION_MS } from '@/lib/speech/config'
export type PermissionState = 'prompt' | 'granted' | 'denied'
interface UseSpeechToTextProps {
onTranscript: (text: string) => void
language?: string
}
interface UseSpeechToTextReturn {
isListening: boolean
isSupported: boolean
permissionState: PermissionState
toggleListening: () => void
resetTranscript: () => void
}
export function useSpeechToText({
onTranscript,
language,
}: UseSpeechToTextProps): UseSpeechToTextReturn {
const [isListening, setIsListening] = useState(false)
const [isSupported, setIsSupported] = useState(false)
const [permissionState, setPermissionState] = useState<PermissionState>('prompt')
const onTranscriptRef = useRef(onTranscript)
const languageRef = useRef(language)
const mountedRef = useRef(true)
const startingRef = useRef(false)
const wsRef = useRef<WebSocket | null>(null)
const streamRef = useRef<MediaStream | null>(null)
const audioContextRef = useRef<AudioContext | null>(null)
const processorRef = useRef<ScriptProcessorNode | null>(null)
const pcmBufferRef = useRef<Float32Array[]>([])
const sendIntervalRef = useRef<ReturnType<typeof setInterval> | null>(null)
const sessionTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null)
const stopStreamingRef = useRef<() => void>(() => {})
const isFirstChunkRef = useRef(true)
const committedTextRef = useRef('')
onTranscriptRef.current = onTranscript
languageRef.current = language
useEffect(() => {
const browserOk =
typeof window !== 'undefined' &&
typeof AudioContext !== 'undefined' &&
typeof WebSocket !== 'undefined' &&
typeof navigator?.mediaDevices?.getUserMedia === 'function'
if (!browserOk) {
setIsSupported(false)
return
}
fetch('/api/settings/voice', { credentials: 'include' })
.then((r) => (r.ok ? r.json() : { sttAvailable: false }))
.then((data) => {
if (mountedRef.current) setIsSupported(data.sttAvailable === true)
})
.catch(() => {
if (mountedRef.current) setIsSupported(false)
})
}, [])
const flushAudioBuffer = useCallback(() => {
const ws = wsRef.current
if (!ws || ws.readyState !== WebSocket.OPEN) return
const chunks = pcmBufferRef.current
if (chunks.length === 0) return
pcmBufferRef.current = []
let totalLength = 0
for (const chunk of chunks) totalLength += chunk.length
const merged = new Float32Array(totalLength)
let offset = 0
for (const chunk of chunks) {
merged.set(chunk, offset)
offset += chunk.length
}
const pcm16 = floatTo16BitPCM(merged)
const message: Record<string, unknown> = {
message_type: 'input_audio_chunk',
audio_base_64: arrayBufferToBase64(pcm16),
sample_rate: SAMPLE_RATE,
commit: false,
}
if (isFirstChunkRef.current) {
isFirstChunkRef.current = false
if (committedTextRef.current) {
message.previous_text = committedTextRef.current
}
}
ws.send(JSON.stringify(message))
}, [])
const cleanup = useCallback(() => {
if (sessionTimerRef.current) {
clearTimeout(sessionTimerRef.current)
sessionTimerRef.current = null
}
if (sendIntervalRef.current) {
clearInterval(sendIntervalRef.current)
sendIntervalRef.current = null
}
if (processorRef.current) {
processorRef.current.disconnect()
processorRef.current = null
}
if (audioContextRef.current && audioContextRef.current.state !== 'closed') {
audioContextRef.current.close().catch(() => {})
audioContextRef.current = null
}
if (streamRef.current) {
streamRef.current.getTracks().forEach((track) => track.stop())
streamRef.current = null
}
if (wsRef.current) {
if (
wsRef.current.readyState === WebSocket.OPEN ||
wsRef.current.readyState === WebSocket.CONNECTING
) {
wsRef.current.close()
}
wsRef.current = null
}
pcmBufferRef.current = []
isFirstChunkRef.current = true
}, [])
const startStreaming = useCallback(async () => {
if (startingRef.current) return false
startingRef.current = true
try {
const tokenResponse = await fetch('/api/speech/token', {
method: 'POST',
credentials: 'include',
})
if (!tokenResponse.ok) {
const body = await tokenResponse.json().catch(() => ({}))
throw new Error(body.error || 'Failed to get speech token')
}
const { token } = await tokenResponse.json()
if (!mountedRef.current) return false
const stream = await navigator.mediaDevices.getUserMedia({
audio: {
echoCancellation: true,
noiseSuppression: true,
autoGainControl: true,
channelCount: 1,
sampleRate: SAMPLE_RATE,
},
})
if (!mountedRef.current) {
stream.getTracks().forEach((track) => track.stop())
return false
}
setPermissionState('granted')
streamRef.current = stream
const params = new URLSearchParams({
token,
model_id: 'scribe_v2_realtime',
audio_format: 'pcm_16000',
commit_strategy: 'vad',
vad_silence_threshold_secs: '1.0',
})
if (languageRef.current) {
params.set('language_code', languageRef.current)
}
const ws = new WebSocket(`${ELEVENLABS_WS_URL}?${params.toString()}`)
wsRef.current = ws
committedTextRef.current = ''
isFirstChunkRef.current = true
await new Promise<void>((resolve, reject) => {
ws.onopen = () => resolve()
ws.onerror = () => reject(new Error('WebSocket connection failed'))
ws.onmessage = (event) => {
try {
const msg = JSON.parse(event.data)
if (msg.message_type === 'partial_transcript' && msg.text) {
if (mountedRef.current) {
const full = committedTextRef.current
? `${committedTextRef.current} ${msg.text}`
: msg.text
onTranscriptRef.current(full)
}
} else if (
(msg.message_type === 'committed_transcript' ||
msg.message_type === 'committed_transcript_with_timestamps') &&
msg.text
) {
committedTextRef.current = committedTextRef.current
? `${committedTextRef.current} ${msg.text}`
: msg.text
if (mountedRef.current) {
onTranscriptRef.current(committedTextRef.current)
}
} else if (
msg.message_type === 'error' ||
msg.message_type === 'auth_error' ||
msg.message_type === 'quota_exceeded'
) {
logger.error('ElevenLabs STT error', { type: msg.message_type, error: msg.error })
}
} catch {
// Ignore non-JSON messages
}
}
ws.onclose = () => {
if (mountedRef.current) {
setIsListening(false)
}
cleanup()
}
})
if (!mountedRef.current) {
ws.close()
stream.getTracks().forEach((track) => track.stop())
return false
}
const audioContext = new AudioContext({ sampleRate: SAMPLE_RATE })
audioContextRef.current = audioContext
const source = audioContext.createMediaStreamSource(stream)
const processor = audioContext.createScriptProcessor(4096, 1, 1)
processorRef.current = processor
processor.onaudioprocess = (e) => {
const input = e.inputBuffer.getChannelData(0)
pcmBufferRef.current.push(new Float32Array(input))
}
source.connect(processor)
processor.connect(audioContext.destination)
sendIntervalRef.current = setInterval(flushAudioBuffer, CHUNK_SEND_INTERVAL_MS)
setIsListening(true)
sessionTimerRef.current = setTimeout(() => {
logger.info('Voice input session reached max duration, stopping')
stopStreamingRef.current()
}, MAX_SESSION_MS)
return true
} catch (error) {
logger.error('Failed to start speech streaming', error)
cleanup()
if (error instanceof DOMException && error.name === 'NotAllowedError') {
setPermissionState('denied')
}
return false
} finally {
startingRef.current = false
}
}, [cleanup, flushAudioBuffer])
const stopStreaming = useCallback(() => {
if (sessionTimerRef.current) {
clearTimeout(sessionTimerRef.current)
sessionTimerRef.current = null
}
flushAudioBuffer()
const ws = wsRef.current
if (ws && ws.readyState === WebSocket.OPEN) {
ws.send(
JSON.stringify({
message_type: 'input_audio_chunk',
audio_base_64: '',
sample_rate: SAMPLE_RATE,
commit: true,
})
)
}
if (sendIntervalRef.current) {
clearInterval(sendIntervalRef.current)
sendIntervalRef.current = null
}
if (processorRef.current) {
processorRef.current.disconnect()
processorRef.current = null
}
if (audioContextRef.current && audioContextRef.current.state !== 'closed') {
audioContextRef.current.close().catch(() => {})
audioContextRef.current = null
}
if (streamRef.current) {
streamRef.current.getTracks().forEach((track) => track.stop())
streamRef.current = null
}
const wsToClose = wsRef.current
wsRef.current = null
if (wsToClose) {
setTimeout(() => {
if (
wsToClose.readyState === WebSocket.OPEN ||
wsToClose.readyState === WebSocket.CONNECTING
) {
wsToClose.close()
}
}, 2000)
}
setIsListening(false)
}, [flushAudioBuffer])
stopStreamingRef.current = stopStreaming
const resetTranscript = useCallback(() => {
committedTextRef.current = ''
isFirstChunkRef.current = true
}, [])
const toggleListening = useCallback(() => {
if (isListening) {
stopStreaming()
} else {
startStreaming()
}
}, [isListening, startStreaming, stopStreaming])
useEffect(() => {
mountedRef.current = true
return () => {
mountedRef.current = false
cleanup()
}
}, [cleanup])
return {
isListening,
isSupported,
permissionState,
toggleListening,
resetTranscript,
}
}

View File

@@ -8,7 +8,6 @@
import { createLogger } from '@sim/logger'
import { env } from '@/lib/core/config/env'
import { isHosted } from '@/lib/core/config/feature-flags'
import { getClientIp } from '@/lib/core/utils/request'
import { getBaseDomain } from '@/lib/core/utils/urls'
const logger = createLogger('ProfoundAnalytics')
@@ -102,10 +101,10 @@ export function sendToProfound(request: Request, statusCode: number): void {
host: getBaseDomain(),
path: url.pathname,
status_code: statusCode,
ip: (() => {
const resolved = getClientIp(request)
return resolved === 'unknown' ? '0.0.0.0' : resolved
})(),
ip:
request.headers.get('x-forwarded-for')?.split(',')[0]?.trim() ||
request.headers.get('x-real-ip') ||
'0.0.0.0',
user_agent: request.headers.get('user-agent') || '',
...(Object.keys(queryParams).length > 0 && { query_params: queryParams }),
...(request.headers.get('referer') && { referer: request.headers.get('referer')! }),

View File

@@ -2,7 +2,6 @@ import { auditLog, db } from '@sim/db'
import { user } from '@sim/db/schema'
import { createLogger } from '@sim/logger'
import { eq } from 'drizzle-orm'
import { getClientIp } from '@/lib/core/utils/request'
import { generateShortId } from '@/lib/core/utils/uuid'
const logger = createLogger('AuditLog')
@@ -237,7 +236,10 @@ export function recordAudit(params: AuditLogParams): void {
}
async function insertAuditLog(params: AuditLogParams): Promise<void> {
const ipAddress = params.request ? getClientIp(params.request) : undefined
const ipAddress =
params.request?.headers.get('x-forwarded-for')?.split(',')[0].trim() ??
params.request?.headers.get('x-real-ip') ??
undefined
const userAgent = params.request?.headers.get('user-agent') ?? undefined
let { actorName, actorEmail } = params

View File

@@ -3,7 +3,6 @@ import { jwtVerify, SignJWT } from 'jose'
import { type NextRequest, NextResponse } from 'next/server'
import { env } from '@/lib/core/config/env'
import { safeCompare } from '@/lib/core/security/encryption'
import { getClientIp } from '@/lib/core/utils/request'
const logger = createLogger('CronAuth')
@@ -74,7 +73,7 @@ export function verifyCronAuth(request: NextRequest, context?: string): NextResp
if (!env.CRON_SECRET) {
const contextInfo = context ? ` for ${context}` : ''
logger.warn(`CRON endpoint accessed but CRON_SECRET is not configured${contextInfo}`, {
ip: getClientIp(request),
ip: request.headers.get('x-forwarded-for') ?? request.headers.get('x-real-ip') ?? 'unknown',
userAgent: request.headers.get('user-agent') ?? 'unknown',
context,
})
@@ -88,7 +87,7 @@ export function verifyCronAuth(request: NextRequest, context?: string): NextResp
const contextInfo = context ? ` for ${context}` : ''
logger.warn(`Unauthorized CRON access attempt${contextInfo}`, {
providedAuth: authHeader,
ip: getClientIp(request),
ip: request.headers.get('x-forwarded-for') ?? request.headers.get('x-real-ip') ?? 'unknown',
userAgent: request.headers.get('user-agent') ?? 'unknown',
context,
})

View File

@@ -23,7 +23,6 @@ export type UsageLogSource =
| 'mcp_copilot'
| 'mothership_block'
| 'knowledge-base'
| 'voice-input'
/**
* Metadata for 'model' category charges

View File

@@ -95,8 +95,6 @@ export const buildTimeCSPDirectives: CSPDirectives = {
? ['http://localhost:3002', 'ws://localhost:3002']
: []),
'https://api.browser-use.com',
'https://api.elevenlabs.io',
'wss://api.elevenlabs.io',
'https://api.exa.ai',
'https://api.firecrawl.dev',
'https://*.googleapis.com',
@@ -199,7 +197,7 @@ export function generateRuntimeCSP(): string {
img-src 'self' data: blob: https://*.googleusercontent.com https://*.google.com https://*.atlassian.com https://cdn.discordapp.com https://*.githubusercontent.com https://*.s3.amazonaws.com https://s3.amazonaws.com https://*.amazonaws.com https://*.blob.core.windows.net https://github.com/* https://collector.onedollarstats.com ${gtmImg} ${brandLogoDomain} ${brandFaviconDomain};
media-src 'self' blob:;
font-src 'self' https://fonts.gstatic.com;
connect-src 'self' ${appUrl} ${ollamaUrl} ${socketUrl} ${socketWsUrl} https://api.browser-use.com https://api.elevenlabs.io wss://api.elevenlabs.io https://api.exa.ai https://api.firecrawl.dev https://*.googleapis.com https://*.amazonaws.com https://*.s3.amazonaws.com https://*.blob.core.windows.net https://api.github.com https://github.com/* https://*.atlassian.com https://*.supabase.co https://challenges.cloudflare.com https://collector.onedollarstats.com ${gtmConnect} ${dynamicDomainsStr};
connect-src 'self' ${appUrl} ${ollamaUrl} ${socketUrl} ${socketWsUrl} https://api.browser-use.com https://api.exa.ai https://api.firecrawl.dev https://*.googleapis.com https://*.amazonaws.com https://*.s3.amazonaws.com https://*.blob.core.windows.net https://api.github.com https://github.com/* https://*.atlassian.com https://*.supabase.co https://challenges.cloudflare.com https://collector.onedollarstats.com ${gtmConnect} ${dynamicDomainsStr};
frame-src 'self' https://challenges.cloudflare.com https://drive.google.com https://docs.google.com https://*.google.com ${gtmFrame};
frame-ancestors 'self';
form-action 'self';

View File

@@ -6,17 +6,6 @@ export function generateRequestId(): string {
return generateId().slice(0, 8)
}
/**
* Extract the client IP from a request, checking `x-forwarded-for` then `x-real-ip`.
*/
export function getClientIp(request: { headers: { get(name: string): string | null } }): string {
return (
request.headers.get('x-forwarded-for')?.split(',')[0]?.trim() ||
request.headers.get('x-real-ip')?.trim() ||
'unknown'
)
}
/**
* No-operation function for use as default callback
*/

View File

@@ -46,7 +46,6 @@ const MAX_PAGES = 500
const MAX_SAFE_TITLE_LENGTH = 200
const STALE_PROCESSING_MINUTES = 45
const RETRY_WINDOW_DAYS = 7
const MAX_CONSECUTIVE_FAILURES = 10
/** Sanitizes a document title for use in S3 storage keys. */
function sanitizeStorageTitle(title: string): string {
@@ -231,7 +230,7 @@ async function resolveAccessToken(
connector: { credentialId: string | null; encryptedApiKey: string | null },
connectorConfig: { auth: ConnectorAuthConfig },
userId: string
): Promise<string> {
): Promise<string | null> {
if (connectorConfig.auth.mode === 'apiKey') {
if (!connector.encryptedApiKey) {
throw new Error('API key connector is missing encrypted API key')
@@ -244,22 +243,11 @@ async function resolveAccessToken(
throw new Error('OAuth connector is missing credential ID')
}
const requestId = `sync-${connector.credentialId}`
const token = await refreshAccessTokenIfNeeded(connector.credentialId, userId, requestId)
if (!token) {
logger.error(`[${requestId}] refreshAccessTokenIfNeeded returned null`, {
credentialId: connector.credentialId,
userId,
authMode: connectorConfig.auth.mode,
authProvider: connectorConfig.auth.provider,
})
throw new Error(
`Failed to obtain access token for credential ${connector.credentialId} (provider: ${connectorConfig.auth.provider})`
)
}
return token
return refreshAccessTokenIfNeeded(
connector.credentialId,
userId,
`sync-${connector.credentialId}`
)
}
/**
@@ -317,6 +305,12 @@ export async function executeSync(
const userId = kbRows[0].userId
const sourceConfig = connector.sourceConfig as Record<string, unknown>
let accessToken = await resolveAccessToken(connector, connectorConfig, userId)
if (!accessToken) {
throw new Error('Failed to obtain access token')
}
const lockResult = await db
.update(knowledgeConnector)
.set({ status: 'syncing', updatedAt: new Date() })
@@ -347,8 +341,6 @@ export async function executeSync(
let syncExitedCleanly = false
try {
let accessToken = await resolveAccessToken(connector, connectorConfig, userId)
const externalDocs: ExternalDocument[] = []
let cursor: string | undefined
let hasMore = true
@@ -365,7 +357,8 @@ export async function executeSync(
for (let pageNum = 0; hasMore && pageNum < MAX_PAGES; pageNum++) {
if (pageNum > 0 && connectorConfig.auth.mode === 'oauth') {
accessToken = await resolveAccessToken(connector, connectorConfig, userId)
const refreshed = await resolveAccessToken(connector, connectorConfig, userId)
if (refreshed) accessToken = refreshed
}
const page = await connectorConfig.listDocuments(
@@ -503,7 +496,8 @@ export async function executeSync(
if (deferredOps.length > 0) {
if (connectorConfig.auth.mode === 'oauth') {
accessToken = await resolveAccessToken(connector, connectorConfig, userId)
const refreshed = await resolveAccessToken(connector, connectorConfig, userId)
if (refreshed) accessToken = refreshed
}
const hydrated = await Promise.allSettled(
@@ -795,25 +789,15 @@ export async function executeSync(
const now = new Date()
const failures = (connector.consecutiveFailures ?? 0) + 1
const disabled = failures >= MAX_CONSECUTIVE_FAILURES
const backoffMinutes = Math.min(failures * 30, 1440)
const nextSync = disabled ? null : new Date(now.getTime() + backoffMinutes * 60 * 1000)
if (disabled) {
logger.warn('Connector disabled after repeated failures', {
connectorId,
consecutiveFailures: failures,
})
}
const nextSync = new Date(now.getTime() + backoffMinutes * 60 * 1000)
await db
.update(knowledgeConnector)
.set({
status: disabled ? 'disabled' : 'error',
status: 'error',
lastSyncAt: now,
lastSyncError: disabled
? 'Connector disabled after repeated sync failures. Please reconnect.'
: errorMessage,
lastSyncError: errorMessage,
nextSyncAt: nextSync,
consecutiveFailures: failures,
updatedAt: now,
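On the error path above, with the `MAX_CONSECUTIVE_FAILURES` disable path removed, the connector stays in `error` and keeps rescheduling; the retry delay grows linearly with the consecutive-failure count and is capped at a day. A worked sketch of that arithmetic — the expression is copied from the hunk, the standalone wrapper is illustrative:

```ts
// Backoff expression from the hunk above, isolated for illustration.
function backoffMinutes(consecutiveFailures: number): number {
  return Math.min(consecutiveFailures * 30, 1440)
}

backoffMinutes(1)  // 30   -> next sync in half an hour
backoffMinutes(4)  // 120  -> two hours
backoffMinutes(48) // 1440 -> capped at 24 hours (48 * 30 hits the cap exactly)
```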

View File

@@ -1,25 +0,0 @@
/**
* Convert Float32 PCM samples to 16-bit signed integer PCM.
* Required for ElevenLabs realtime STT (pcm_16000 format).
*/
export function floatTo16BitPCM(float32Array: Float32Array): ArrayBuffer {
const buffer = new ArrayBuffer(float32Array.length * 2)
const view = new DataView(buffer)
for (let i = 0; i < float32Array.length; i++) {
const s = Math.max(-1, Math.min(1, float32Array[i]))
view.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7fff, true)
}
return buffer
}
/**
* Encode an ArrayBuffer as a base64 string for WebSocket transport.
*/
export function arrayBufferToBase64(buffer: ArrayBuffer): string {
const bytes = new Uint8Array(buffer)
let binary = ''
for (let i = 0; i < bytes.length; i++) {
binary += String.fromCharCode(bytes[i])
}
return btoa(binary)
}
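The deleted helper above scales each clamped float sample asymmetrically: negatives by 0x8000 (so -1.0 maps to -32768) and non-negatives by 0x7fff (so +1.0 maps to 32767). A self-contained restatement of that mapping, for illustration only — the real helper writes the values into a DataView as shown above:

```ts
// Standalone restatement of the per-sample scaling from floatTo16BitPCM above.
const clamp = (s: number) => Math.max(-1, Math.min(1, s))
const toInt16 = (s: number) => (clamp(s) < 0 ? clamp(s) * 0x8000 : clamp(s) * 0x7fff)

toInt16(-1)  // -32768
toInt16(1)   //  32767
toInt16(0.5) //  16383.5 -> DataView.setInt16 truncates toward zero when writing
```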

View File

@@ -1,16 +0,0 @@
import { env } from '@/lib/core/config/env'
export const ELEVENLABS_WS_URL = 'wss://api.elevenlabs.io/v1/speech-to-text/realtime'
export const SAMPLE_RATE = 16000
export const CHUNK_SEND_INTERVAL_MS = 250
export const MAX_SESSION_MS = 3 * 60 * 1000
export const MAX_CHAT_SESSION_MS = 1 * 60 * 1000
/**
* Whether a speech-to-text provider is configured.
* Currently checks for `ELEVENLABS_API_KEY`.
* To add a new provider: add its env check here.
*/
export function hasSTTService(): boolean {
return !!env.ELEVENLABS_API_KEY?.trim()
}

View File

@@ -1,6 +1,5 @@
import { createLogger } from '@sim/logger'
import { NextResponse } from 'next/server'
import { getClientIp } from '@/lib/core/utils/request'
import type {
AuthContext,
EventFilterContext,
@@ -31,7 +30,10 @@ export const genericHandler: WebhookProviderHandler = {
const allowedIps = providerConfig.allowedIps
if (allowedIps && Array.isArray(allowedIps) && allowedIps.length > 0) {
const clientIp = getClientIp(request)
const clientIp =
request.headers.get('x-forwarded-for')?.split(',')[0].trim() ||
request.headers.get('x-real-ip') ||
'unknown'
if (clientIp === 'unknown' || !allowedIps.includes(clientIp)) {
logger.warn(`[${requestId}] Forbidden webhook access attempt - IP not allowed: ${clientIp}`)
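The allowlist gate above rejects a request when no usable forwarding header is present (the `'unknown'` sentinel) or when the resolved IP is not in the configured list. Reduced to a pure predicate for illustration — not actual Sim code:

```ts
// Illustrative predicate mirroring the allowedIps check above.
function isIpAllowed(clientIp: string, allowedIps: string[]): boolean {
  return clientIp !== 'unknown' && allowedIps.includes(clientIp)
}

isIpAllowed('203.0.113.7', ['203.0.113.7', '198.51.100.4']) // true
isIpAllowed('unknown', ['203.0.113.7'])                     // false -> logged and rejected
```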

View File

@@ -4,7 +4,6 @@ import { type NextRequest, NextResponse } from 'next/server'
import { sendToProfound } from './lib/analytics/profound'
import { isAuthDisabled, isHosted } from './lib/core/config/feature-flags'
import { generateRuntimeCSP } from './lib/core/security/csp'
import { getClientIp } from './lib/core/utils/request'
const logger = createLogger('Proxy')
@@ -115,7 +114,7 @@ function handleSecurityFiltering(request: NextRequest): NextResponse | null {
if (isSuspicious && !isWebhookEndpoint && !isMcpEndpoint && !isMcpOauthDiscoveryEndpoint) {
logger.warn('Blocked suspicious request', {
userAgent,
ip: getClientIp(request),
ip: request.headers.get('x-forwarded-for') || 'unknown',
url: request.url,
method: request.method,
pattern: SUSPICIOUS_UA_PATTERNS.find((pattern) => pattern.test(userAgent))?.toString(),

View File

@@ -1 +0,0 @@
ALTER TYPE "public"."usage_log_source" ADD VALUE 'voice-input';

File diff suppressed because it is too large

View File

@@ -1303,13 +1303,6 @@
"when": 1775525922688,
"tag": "0186_greedy_jocasta",
"breakpoints": true
},
{
"idx": 187,
"version": "7",
"when": 1775630332078,
"tag": "0187_fuzzy_wendell_vaughn",
"breakpoints": true
}
]
}

View File

@@ -2276,7 +2276,6 @@ export const usageLogSourceEnum = pgEnum('usage_log_source', [
'mcp_copilot',
'mothership_block',
'knowledge-base',
'voice-input',
])
export const usageLog = pgTable(

View File

@@ -1434,8 +1434,13 @@ function parseConstFieldContent(
* 1. Const reference (e.g., `outputs: GIT_REF_OUTPUT_PROPERTIES,`)
* 2. Inline object (e.g., `outputs: { id: { type: 'string', ... } }`)
*/
function extractOutputsFromToolContent(content: string, toolPrefix: string): Record<string, any> {
const constMatch = content.match(/(?<![a-zA-Z_])outputs\s*:\s*([A-Z][A-Z_0-9]+)\s*(?:,|\}|$)/)
function extractOutputsFromToolContent(
content: string,
toolPrefix: string
): Record<string, any> {
const constMatch = content.match(
/(?<![a-zA-Z_])outputs\s*:\s*([A-Z][A-Z_0-9]+)\s*(?:,|\}|$)/
)
if (constMatch) {
const resolved = resolveConstReference(constMatch[1], toolPrefix)
if (resolved && typeof resolved === 'object') {
@@ -1644,7 +1649,8 @@ function extractToolInfo(
lastExportMatch = m
}
if (lastExportMatch && lastExportMatch.index !== undefined) {
const bracePos = lastExportMatch.index + lastExportMatch[0].length - 1
const bracePos =
lastExportMatch.index + lastExportMatch[0].length - 1
const ep = findMatchingClose(fileContent, bracePos)
if (ep !== -1) {
fullToolBlock = fileContent.substring(bracePos, ep)
@@ -1659,7 +1665,8 @@ function extractToolInfo(
)
const baseToolMatch = fileContent.match(baseToolRegex)
if (baseToolMatch && baseToolMatch.index !== undefined) {
const baseStart = baseToolMatch.index + baseToolMatch[0].length - 1
const baseStart =
baseToolMatch.index + baseToolMatch[0].length - 1
const endIdx = findMatchingClose(fileContent, baseStart)
if (endIdx !== -1) {
const baseToolContent = fileContent.substring(baseStart, endIdx)