fix: transcribe audio before mention check in groups with requireMention (openclaw#9973) thanks @mcinteerj

Verified:
- pnpm install --frozen-lockfile
- pnpm build
- pnpm check
- pnpm test

Co-authored-by: mcinteerj <3613653+mcinteerj@users.noreply.github.com>
This commit is contained in:
Jake
2026-02-13 04:58:01 +13:00
committed by GitHub
parent a5ab9fac0c
commit a2ddcdadeb
7 changed files with 245 additions and 38 deletions

View File

@@ -107,8 +107,27 @@ Note: Binary detection is best-effort across macOS/Linux/Windows; ensure the CLI
- Transcript is available to templates as `{{Transcript}}`.
- CLI stdout is capped (5MB); keep CLI output concise.
## Mention Detection in Groups
When `requireMention: true` is set for a group chat, OpenClaw now transcribes audio **before** checking for mentions. This allows voice notes to be processed even when they contain mentions.
**How it works:**
1. If a voice message has no text body and the group requires mentions, OpenClaw performs a "preflight" transcription.
2. The transcript is checked for mention patterns (e.g., `@BotName`, emoji triggers).
3. If a mention is found, the message proceeds through the full reply pipeline.
4. The transcribed attachment is marked as already processed, so it is not transcribed a second time during the main media-understanding phase.
**Fallback behavior:**
- If transcription fails during preflight (timeout, API error, etc.), mention detection falls back to the message's text content alone.
- Preflight only runs for messages with no text body, so mixed messages (text + audio) are always gated on their text and are unaffected by transcription failures.
**Example:** A user sends a voice note saying "Hey @Claude, what's the weather?" in a Telegram group with `requireMention: true`. The voice note is transcribed, the mention is detected, and the agent replies.
## Gotchas
- Scope rules use first-match wins. `chatType` is normalized to `direct`, `group`, or `room`.
- Ensure your CLI exits 0 and prints plain text; JSON needs to be massaged via `jq -r .text`.
- Keep timeouts reasonable (`timeoutSeconds`, default 60s) to avoid blocking the reply queue.
- Preflight transcription only processes the **first** audio attachment for mention detection. Additional audio is processed during the main media understanding phase.

View File

@@ -90,18 +90,24 @@ export function matchesMentionWithExplicit(params: {
text: string;
mentionRegexes: RegExp[];
explicit?: ExplicitMentionSignal;
transcript?: string;
}): boolean {
const cleaned = normalizeMentionText(params.text ?? "");
const explicit = params.explicit?.isExplicitlyMentioned === true;
const explicitAvailable = params.explicit?.canResolveExplicit === true;
const hasAnyMention = params.explicit?.hasAnyMention === true;
// Check transcript if text is empty and transcript is provided
const transcriptCleaned = params.transcript ? normalizeMentionText(params.transcript) : "";
const textToCheck = cleaned || transcriptCleaned;
if (hasAnyMention && explicitAvailable) {
return explicit || params.mentionRegexes.some((re) => re.test(cleaned));
return explicit || params.mentionRegexes.some((re) => re.test(textToCheck));
}
if (!cleaned) {
if (!textToCheck) {
return explicit;
}
return explicit || params.mentionRegexes.some((re) => re.test(cleaned));
return explicit || params.mentionRegexes.some((re) => re.test(textToCheck));
}
export function stripStructuralPrefixes(text: string): string {

View File

@@ -242,28 +242,6 @@ export async function preflightDiscordMessage(
(message.mentionedUsers?.length ?? 0) > 0 ||
(message.mentionedRoles?.length ?? 0) > 0),
);
const wasMentioned =
!isDirectMessage &&
matchesMentionWithExplicit({
text: baseText,
mentionRegexes,
explicit: {
hasAnyMention,
isExplicitlyMentioned: explicitlyMentioned,
canResolveExplicit: Boolean(botId),
},
});
const implicitMention = Boolean(
!isDirectMessage &&
botId &&
message.referencedMessage?.author?.id &&
message.referencedMessage.author.id === botId,
);
if (shouldLogVerbose()) {
logVerbose(
`discord: inbound id=${message.id} guild=${message.guild?.id ?? "dm"} channel=${message.channelId} mention=${wasMentioned ? "yes" : "no"} type=${isDirectMessage ? "dm" : isGroupDm ? "group-dm" : "guild"} content=${messageText ? "yes" : "no"}`,
);
}
if (
isGuildMessage &&
@@ -400,6 +378,74 @@ export async function preflightDiscordMessage(
channelConfig,
guildInfo,
});
// Preflight audio transcription for mention detection in guilds
// This allows voice notes to be checked for mentions before being dropped
let preflightTranscript: string | undefined;
const hasAudioAttachment = message.attachments?.some((att: { contentType?: string }) =>
att.contentType?.startsWith("audio/"),
);
const needsPreflightTranscription =
!isDirectMessage &&
shouldRequireMention &&
hasAudioAttachment &&
!baseText &&
mentionRegexes.length > 0;
if (needsPreflightTranscription) {
try {
const { transcribeFirstAudio } = await import("../../media-understanding/audio-preflight.js");
const audioPaths =
message.attachments
?.filter((att: { contentType?: string; url: string }) =>
att.contentType?.startsWith("audio/"),
)
.map((att: { url: string }) => att.url) ?? [];
if (audioPaths.length > 0) {
const tempCtx = {
MediaUrls: audioPaths,
MediaTypes: message.attachments
?.filter((att: { contentType?: string; url: string }) =>
att.contentType?.startsWith("audio/"),
)
.map((att: { contentType?: string }) => att.contentType)
.filter(Boolean) as string[],
};
preflightTranscript = await transcribeFirstAudio({
ctx: tempCtx,
cfg: params.cfg,
agentDir: undefined,
});
}
} catch (err) {
logVerbose(`discord: audio preflight transcription failed: ${String(err)}`);
}
}
const wasMentioned =
!isDirectMessage &&
matchesMentionWithExplicit({
text: baseText,
mentionRegexes,
explicit: {
hasAnyMention,
isExplicitlyMentioned: explicitlyMentioned,
canResolveExplicit: Boolean(botId),
},
transcript: preflightTranscript,
});
const implicitMention = Boolean(
!isDirectMessage &&
botId &&
message.referencedMessage?.author?.id &&
message.referencedMessage.author.id === botId,
);
if (shouldLogVerbose()) {
logVerbose(
`discord: inbound id=${message.id} guild=${message.guild?.id ?? "dm"} channel=${message.channelId} mention=${wasMentioned ? "yes" : "no"} type=${isDirectMessage ? "dm" : isGroupDm ? "group-dm" : "guild"} content=${messageText ? "yes" : "no"}`,
);
}
const allowTextCommands = shouldHandleTextCommands({
cfg: params.cfg,
surface: "discord",

View File

@@ -182,6 +182,10 @@ export function selectAttachments(params: {
}): MediaAttachment[] {
const { capability, attachments, policy } = params;
const matches = attachments.filter((item) => {
// Skip already-transcribed audio attachments from preflight
if (capability === "audio" && item.alreadyTranscribed) {
return false;
}
if (capability === "image") {
return isImageAttachment(item);
}

View File

@@ -0,0 +1,97 @@
import type { MsgContext } from "../auto-reply/templating.js";
import type { OpenClawConfig } from "../config/config.js";
import type { MediaUnderstandingProvider } from "./types.js";
import { logVerbose, shouldLogVerbose } from "../globals.js";
import { isAudioAttachment } from "./attachments.js";
import {
type ActiveMediaModel,
buildProviderRegistry,
createMediaAttachmentCache,
normalizeMediaAttachments,
runCapability,
} from "./runner.js";
/**
* Transcribes the first audio attachment BEFORE mention checking.
* This allows voice notes to be processed in group chats with requireMention: true.
* Returns the transcript or undefined if transcription fails or no audio is found.
*/
export async function transcribeFirstAudio(params: {
ctx: MsgContext;
cfg: OpenClawConfig;
agentDir?: string;
providers?: Record<string, MediaUnderstandingProvider>;
activeModel?: ActiveMediaModel;
}): Promise<string | undefined> {
const { ctx, cfg } = params;
// Check if audio transcription is enabled in config
const audioConfig = cfg.tools?.media?.audio;
if (!audioConfig || audioConfig.enabled === false) {
return undefined;
}
const attachments = normalizeMediaAttachments(ctx);
if (!attachments || attachments.length === 0) {
return undefined;
}
// Find first audio attachment
const firstAudio = attachments.find(
(att) => att && isAudioAttachment(att) && !att.alreadyTranscribed,
);
if (!firstAudio) {
return undefined;
}
if (shouldLogVerbose()) {
logVerbose(`audio-preflight: transcribing attachment ${firstAudio.index} for mention check`);
}
const providerRegistry = buildProviderRegistry(params.providers);
const cache = createMediaAttachmentCache(attachments);
try {
const result = await runCapability({
capability: "audio",
cfg,
ctx,
attachments: cache,
media: attachments,
agentDir: params.agentDir,
providerRegistry,
config: audioConfig,
activeModel: params.activeModel,
});
if (!result || result.outputs.length === 0) {
return undefined;
}
// Extract transcript from first audio output
const audioOutput = result.outputs.find((output) => output.kind === "audio.transcription");
if (!audioOutput || !audioOutput.text) {
return undefined;
}
// Mark this attachment as transcribed to avoid double-processing
firstAudio.alreadyTranscribed = true;
if (shouldLogVerbose()) {
logVerbose(
`audio-preflight: transcribed ${audioOutput.text.length} chars from attachment ${firstAudio.index}`,
);
}
return audioOutput.text;
} catch (err) {
// Log but don't throw - let the message proceed with text-only mention check
if (shouldLogVerbose()) {
logVerbose(`audio-preflight: transcription failed: ${String(err)}`);
}
return undefined;
} finally {
await cache.cleanup();
}
}

View File

@@ -10,6 +10,7 @@ export type MediaAttachment = {
url?: string;
mime?: string;
index: number;
alreadyTranscribed?: boolean;
};
export type MediaUnderstandingOutput = {

View File

@@ -1,4 +1,5 @@
import type { Bot } from "grammy";
import type { MsgContext } from "../auto-reply/templating.js";
import type { OpenClawConfig } from "../config/config.js";
import type { DmPolicy, TelegramGroupConfig, TelegramTopicConfig } from "../config/types.js";
import type { StickerMetadata, TelegramContext } from "./bot/types.js";
@@ -203,6 +204,21 @@ export const buildTelegramMessageContext = async ({
return null;
}
// Compute requireMention early for preflight transcription gating
const activationOverride = resolveGroupActivation({
chatId,
messageThreadId: resolvedThreadId,
sessionKey: sessionKey,
agentId: route.agentId,
});
const baseRequireMention = resolveGroupRequireMention(chatId);
const requireMention = firstDefined(
activationOverride,
topicConfig?.requireMention,
groupConfig?.requireMention,
baseRequireMention,
);
const sendTyping = async () => {
await withTelegramApiErrorLogging({
operation: "sendChatAction",
@@ -370,6 +386,7 @@ export const buildTelegramMessageContext = async ({
const locationText = locationData ? formatLocationText(locationData) : undefined;
const rawTextSource = msg.text ?? msg.caption ?? "";
const rawText = expandTextLinks(rawTextSource, msg.entities ?? msg.caption_entities).trim();
const hasUserText = Boolean(rawText || locationText);
let rawBody = [rawText, locationText].filter(Boolean).join("\n").trim();
if (!rawBody) {
rawBody = placeholder;
@@ -386,6 +403,35 @@ export const buildTelegramMessageContext = async ({
(ent) => ent.type === "mention",
);
const explicitlyMentioned = botUsername ? hasBotMention(msg, botUsername) : false;
// Preflight audio transcription for mention detection in groups
// This allows voice notes to be checked for mentions before being dropped
let preflightTranscript: string | undefined;
const hasAudio = allMedia.some((media) => media.contentType?.startsWith("audio/"));
const needsPreflightTranscription =
isGroup && requireMention && hasAudio && !hasUserText && mentionRegexes.length > 0;
if (needsPreflightTranscription) {
try {
const { transcribeFirstAudio } = await import("../media-understanding/audio-preflight.js");
// Build a minimal context for transcription
const tempCtx: MsgContext = {
MediaPaths: allMedia.length > 0 ? allMedia.map((m) => m.path) : undefined,
MediaTypes:
allMedia.length > 0
? (allMedia.map((m) => m.contentType).filter(Boolean) as string[])
: undefined,
};
preflightTranscript = await transcribeFirstAudio({
ctx: tempCtx,
cfg,
agentDir: undefined,
});
} catch (err) {
logVerbose(`telegram: audio preflight transcription failed: ${String(err)}`);
}
}
const computedWasMentioned = matchesMentionWithExplicit({
text: msg.text ?? msg.caption ?? "",
mentionRegexes,
@@ -394,6 +440,7 @@ export const buildTelegramMessageContext = async ({
isExplicitlyMentioned: explicitlyMentioned,
canResolveExplicit: Boolean(botUsername),
},
transcript: preflightTranscript,
});
const wasMentioned = options?.forceWasMentioned === true ? true : computedWasMentioned;
if (isGroup && commandGate.shouldBlock) {
@@ -405,19 +452,6 @@ export const buildTelegramMessageContext = async ({
});
return null;
}
const activationOverride = resolveGroupActivation({
chatId,
messageThreadId: resolvedThreadId,
sessionKey: sessionKey,
agentId: route.agentId,
});
const baseRequireMention = resolveGroupRequireMention(chatId);
const requireMention = firstDefined(
activationOverride,
topicConfig?.requireMention,
groupConfig?.requireMention,
baseRequireMention,
);
// Reply-chain detection: replying to a bot message acts like an implicit mention.
const botId = primaryCtx.me?.id;
const replyFromId = msg.reply_to_message?.from?.id;