diff --git a/docs/nodes/audio.md b/docs/nodes/audio.md index 00711cd8a6..4d6208f245 100644 --- a/docs/nodes/audio.md +++ b/docs/nodes/audio.md @@ -107,8 +107,27 @@ Note: Binary detection is best-effort across macOS/Linux/Windows; ensure the CLI - Transcript is available to templates as `{{Transcript}}`. - CLI stdout is capped (5MB); keep CLI output concise. +## Mention Detection in Groups + +When `requireMention: true` is set for a group chat, OpenClaw now transcribes audio **before** checking for mentions. This allows audio-only voice notes to pass the mention gate when the mention is spoken in the audio rather than typed as text. + +**How it works:** + +1. If a voice message has no text body and the group requires mentions, OpenClaw performs a "preflight" transcription. +2. The transcript is checked for mention patterns (e.g., `@BotName`, emoji triggers). +3. If a mention is found, the message proceeds through the full reply pipeline. +4. If no mention is found in either the text or the transcript, the normal `requireMention` gating applies and the message is skipped. + +**Fallback behavior:** + +- If transcription fails during preflight (timeout, API error, etc.), the message falls back to text-only mention detection; an audio-only message whose mention exists only in speech may then be skipped. +- Mixed messages (text + audio) are unaffected, because preflight only runs when the message has no text body — their text is still checked for mentions. + +**Example:** A user sends a voice note saying "Hey @Claude, what's the weather?" in a Telegram group with `requireMention: true`. The voice note is transcribed, the mention is detected, and the agent replies. + ## Gotchas - Scope rules use first-match wins. `chatType` is normalized to `direct`, `group`, or `room`. - Ensure your CLI exits 0 and prints plain text; JSON needs to be massaged via `jq -r .text`. - Keep timeouts reasonable (`timeoutSeconds`, default 60s) to avoid blocking the reply queue. +- Preflight transcription only processes the **first** audio attachment for mention detection. Additional audio is processed during the main media understanding phase. 
diff --git a/src/auto-reply/reply/mentions.ts b/src/auto-reply/reply/mentions.ts index d0a6c253d0..2997aa9b1c 100644 --- a/src/auto-reply/reply/mentions.ts +++ b/src/auto-reply/reply/mentions.ts @@ -90,18 +90,24 @@ export function matchesMentionWithExplicit(params: { text: string; mentionRegexes: RegExp[]; explicit?: ExplicitMentionSignal; + transcript?: string; }): boolean { const cleaned = normalizeMentionText(params.text ?? ""); const explicit = params.explicit?.isExplicitlyMentioned === true; const explicitAvailable = params.explicit?.canResolveExplicit === true; const hasAnyMention = params.explicit?.hasAnyMention === true; + + // Check transcript if text is empty and transcript is provided + const transcriptCleaned = params.transcript ? normalizeMentionText(params.transcript) : ""; + const textToCheck = cleaned || transcriptCleaned; + if (hasAnyMention && explicitAvailable) { - return explicit || params.mentionRegexes.some((re) => re.test(cleaned)); + return explicit || params.mentionRegexes.some((re) => re.test(textToCheck)); } - if (!cleaned) { + if (!textToCheck) { return explicit; } - return explicit || params.mentionRegexes.some((re) => re.test(cleaned)); + return explicit || params.mentionRegexes.some((re) => re.test(textToCheck)); } export function stripStructuralPrefixes(text: string): string { diff --git a/src/discord/monitor/message-handler.preflight.ts b/src/discord/monitor/message-handler.preflight.ts index 38126a050e..0ef2eac186 100644 --- a/src/discord/monitor/message-handler.preflight.ts +++ b/src/discord/monitor/message-handler.preflight.ts @@ -242,28 +242,6 @@ export async function preflightDiscordMessage( (message.mentionedUsers?.length ?? 0) > 0 || (message.mentionedRoles?.length ?? 
0) > 0), ); - const wasMentioned = - !isDirectMessage && - matchesMentionWithExplicit({ - text: baseText, - mentionRegexes, - explicit: { - hasAnyMention, - isExplicitlyMentioned: explicitlyMentioned, - canResolveExplicit: Boolean(botId), - }, - }); - const implicitMention = Boolean( - !isDirectMessage && - botId && - message.referencedMessage?.author?.id && - message.referencedMessage.author.id === botId, - ); - if (shouldLogVerbose()) { - logVerbose( - `discord: inbound id=${message.id} guild=${message.guild?.id ?? "dm"} channel=${message.channelId} mention=${wasMentioned ? "yes" : "no"} type=${isDirectMessage ? "dm" : isGroupDm ? "group-dm" : "guild"} content=${messageText ? "yes" : "no"}`, - ); - } if ( isGuildMessage && @@ -400,6 +378,74 @@ export async function preflightDiscordMessage( channelConfig, guildInfo, }); + + // Preflight audio transcription for mention detection in guilds + // This allows voice notes to be checked for mentions before being dropped + let preflightTranscript: string | undefined; + const hasAudioAttachment = message.attachments?.some((att: { contentType?: string }) => + att.contentType?.startsWith("audio/"), + ); + const needsPreflightTranscription = + !isDirectMessage && + shouldRequireMention && + hasAudioAttachment && + !baseText && + mentionRegexes.length > 0; + + if (needsPreflightTranscription) { + try { + const { transcribeFirstAudio } = await import("../../media-understanding/audio-preflight.js"); + const audioPaths = + message.attachments + ?.filter((att: { contentType?: string; url: string }) => + att.contentType?.startsWith("audio/"), + ) + .map((att: { url: string }) => att.url) ?? 
[]; + if (audioPaths.length > 0) { + const tempCtx = { + MediaUrls: audioPaths, + MediaTypes: message.attachments + ?.filter((att: { contentType?: string; url: string }) => + att.contentType?.startsWith("audio/"), + ) + .map((att: { contentType?: string }) => att.contentType) + .filter(Boolean) as string[], + }; + preflightTranscript = await transcribeFirstAudio({ + ctx: tempCtx, + cfg: params.cfg, + agentDir: undefined, + }); + } + } catch (err) { + logVerbose(`discord: audio preflight transcription failed: ${String(err)}`); + } + } + + const wasMentioned = + !isDirectMessage && + matchesMentionWithExplicit({ + text: baseText, + mentionRegexes, + explicit: { + hasAnyMention, + isExplicitlyMentioned: explicitlyMentioned, + canResolveExplicit: Boolean(botId), + }, + transcript: preflightTranscript, + }); + const implicitMention = Boolean( + !isDirectMessage && + botId && + message.referencedMessage?.author?.id && + message.referencedMessage.author.id === botId, + ); + if (shouldLogVerbose()) { + logVerbose( + `discord: inbound id=${message.id} guild=${message.guild?.id ?? "dm"} channel=${message.channelId} mention=${wasMentioned ? "yes" : "no"} type=${isDirectMessage ? "dm" : isGroupDm ? "group-dm" : "guild"} content=${messageText ? 
"yes" : "no"}`, + ); + } + const allowTextCommands = shouldHandleTextCommands({ cfg: params.cfg, surface: "discord", diff --git a/src/media-understanding/attachments.ts b/src/media-understanding/attachments.ts index 0c2449208f..939a55f96d 100644 --- a/src/media-understanding/attachments.ts +++ b/src/media-understanding/attachments.ts @@ -182,6 +182,10 @@ export function selectAttachments(params: { }): MediaAttachment[] { const { capability, attachments, policy } = params; const matches = attachments.filter((item) => { + // Skip already-transcribed audio attachments from preflight + if (capability === "audio" && item.alreadyTranscribed) { + return false; + } if (capability === "image") { return isImageAttachment(item); } diff --git a/src/media-understanding/audio-preflight.ts b/src/media-understanding/audio-preflight.ts new file mode 100644 index 0000000000..0db4a22821 --- /dev/null +++ b/src/media-understanding/audio-preflight.ts @@ -0,0 +1,97 @@ +import type { MsgContext } from "../auto-reply/templating.js"; +import type { OpenClawConfig } from "../config/config.js"; +import type { MediaUnderstandingProvider } from "./types.js"; +import { logVerbose, shouldLogVerbose } from "../globals.js"; +import { isAudioAttachment } from "./attachments.js"; +import { + type ActiveMediaModel, + buildProviderRegistry, + createMediaAttachmentCache, + normalizeMediaAttachments, + runCapability, +} from "./runner.js"; + +/** + * Transcribes the first audio attachment BEFORE mention checking. + * This allows voice notes to be processed in group chats with requireMention: true. + * Returns the transcript or undefined if transcription fails or no audio is found. 
+ */ +export async function transcribeFirstAudio(params: { + ctx: MsgContext; + cfg: OpenClawConfig; + agentDir?: string; + providers?: Record<string, MediaUnderstandingProvider>; + activeModel?: ActiveMediaModel; +}): Promise<string | undefined> { + const { ctx, cfg } = params; + + // Check if audio transcription is enabled in config + const audioConfig = cfg.tools?.media?.audio; + if (!audioConfig || audioConfig.enabled === false) { + return undefined; + } + + const attachments = normalizeMediaAttachments(ctx); + if (!attachments || attachments.length === 0) { + return undefined; + } + + // Find first audio attachment + const firstAudio = attachments.find( + (att) => att && isAudioAttachment(att) && !att.alreadyTranscribed, + ); + + if (!firstAudio) { + return undefined; + } + + if (shouldLogVerbose()) { + logVerbose(`audio-preflight: transcribing attachment ${firstAudio.index} for mention check`); + } + + const providerRegistry = buildProviderRegistry(params.providers); + const cache = createMediaAttachmentCache(attachments); + + try { + const result = await runCapability({ + capability: "audio", + cfg, + ctx, + attachments: cache, + media: attachments, + agentDir: params.agentDir, + providerRegistry, + config: audioConfig, + activeModel: params.activeModel, + }); + + if (!result || result.outputs.length === 0) { + return undefined; + } + + // Extract transcript from first audio output + const audioOutput = result.outputs.find((output) => output.kind === "audio.transcription"); + if (!audioOutput || !audioOutput.text) { + return undefined; + } + + // Mark this attachment as transcribed to avoid double-processing + firstAudio.alreadyTranscribed = true; + + if (shouldLogVerbose()) { + logVerbose( + `audio-preflight: transcribed ${audioOutput.text.length} chars from attachment ${firstAudio.index}`, + ); + } + + return audioOutput.text; + } catch (err) { + // Log but don't throw - let the message proceed with text-only mention check + if (shouldLogVerbose()) { + logVerbose(`audio-preflight: transcription failed: 
${String(err)}`); + } + return undefined; + } finally { + await cache.cleanup(); + } +} diff --git a/src/media-understanding/types.ts b/src/media-understanding/types.ts index 252559a7a4..60c425626d 100644 --- a/src/media-understanding/types.ts +++ b/src/media-understanding/types.ts @@ -10,6 +10,7 @@ export type MediaAttachment = { url?: string; mime?: string; index: number; + alreadyTranscribed?: boolean; }; export type MediaUnderstandingOutput = { diff --git a/src/telegram/bot-message-context.ts b/src/telegram/bot-message-context.ts index 710b38ed5a..041c93eab9 100644 --- a/src/telegram/bot-message-context.ts +++ b/src/telegram/bot-message-context.ts @@ -1,4 +1,5 @@ import type { Bot } from "grammy"; +import type { MsgContext } from "../auto-reply/templating.js"; import type { OpenClawConfig } from "../config/config.js"; import type { DmPolicy, TelegramGroupConfig, TelegramTopicConfig } from "../config/types.js"; import type { StickerMetadata, TelegramContext } from "./bot/types.js"; @@ -203,6 +204,21 @@ export const buildTelegramMessageContext = async ({ return null; } + // Compute requireMention early for preflight transcription gating + const activationOverride = resolveGroupActivation({ + chatId, + messageThreadId: resolvedThreadId, + sessionKey: sessionKey, + agentId: route.agentId, + }); + const baseRequireMention = resolveGroupRequireMention(chatId); + const requireMention = firstDefined( + activationOverride, + topicConfig?.requireMention, + groupConfig?.requireMention, + baseRequireMention, + ); + const sendTyping = async () => { await withTelegramApiErrorLogging({ operation: "sendChatAction", @@ -370,6 +386,7 @@ export const buildTelegramMessageContext = async ({ const locationText = locationData ? formatLocationText(locationData) : undefined; const rawTextSource = msg.text ?? msg.caption ?? ""; const rawText = expandTextLinks(rawTextSource, msg.entities ?? 
msg.caption_entities).trim(); + const hasUserText = Boolean(rawText || locationText); let rawBody = [rawText, locationText].filter(Boolean).join("\n").trim(); if (!rawBody) { rawBody = placeholder; @@ -386,6 +403,35 @@ export const buildTelegramMessageContext = async ({ (ent) => ent.type === "mention", ); const explicitlyMentioned = botUsername ? hasBotMention(msg, botUsername) : false; + + // Preflight audio transcription for mention detection in groups + // This allows voice notes to be checked for mentions before being dropped + let preflightTranscript: string | undefined; + const hasAudio = allMedia.some((media) => media.contentType?.startsWith("audio/")); + const needsPreflightTranscription = + isGroup && requireMention && hasAudio && !hasUserText && mentionRegexes.length > 0; + + if (needsPreflightTranscription) { + try { + const { transcribeFirstAudio } = await import("../media-understanding/audio-preflight.js"); + // Build a minimal context for transcription + const tempCtx: MsgContext = { + MediaPaths: allMedia.length > 0 ? allMedia.map((m) => m.path) : undefined, + MediaTypes: + allMedia.length > 0 + ? (allMedia.map((m) => m.contentType).filter(Boolean) as string[]) + : undefined, + }; + preflightTranscript = await transcribeFirstAudio({ + ctx: tempCtx, + cfg, + agentDir: undefined, + }); + } catch (err) { + logVerbose(`telegram: audio preflight transcription failed: ${String(err)}`); + } + } + const computedWasMentioned = matchesMentionWithExplicit({ text: msg.text ?? msg.caption ?? "", mentionRegexes, @@ -394,6 +440,7 @@ export const buildTelegramMessageContext = async ({ isExplicitlyMentioned: explicitlyMentioned, canResolveExplicit: Boolean(botUsername), }, + transcript: preflightTranscript, }); const wasMentioned = options?.forceWasMentioned === true ? 
true : computedWasMentioned; if (isGroup && commandGate.shouldBlock) { @@ -405,19 +452,6 @@ export const buildTelegramMessageContext = async ({ }); return null; } - const activationOverride = resolveGroupActivation({ - chatId, - messageThreadId: resolvedThreadId, - sessionKey: sessionKey, - agentId: route.agentId, - }); - const baseRequireMention = resolveGroupRequireMention(chatId); - const requireMention = firstDefined( - activationOverride, - topicConfig?.requireMention, - groupConfig?.requireMention, - baseRequireMention, - ); // Reply-chain detection: replying to a bot message acts like an implicit mention. const botId = primaryCtx.me?.id; const replyFromId = msg.reply_to_message?.from?.id;