Discord: add voice docs and TTS overrides

2026-02-19 18:39:20 -05:00 · 2026-02-16 19:06:18 -06:00
parent 3d24b92d85
commit b78b165c38
8 changed files with 120 additions and 5 deletions
--- a/docs/channels/discord.md
+++ b/docs/channels/discord.md
@@ -603,6 +603,47 @@ Example:
 }
 ```

+## Voice channels
+
+OpenClaw can join Discord voice channels for realtime, continuous conversations. This is separate from voice message attachments.
+
+Requirements:
+
+- Enable native commands (`commands.native` or `channels.discord.commands.native`).
+- Configure `channels.discord.voice`.
+- The bot needs Connect + Speak permissions in the target voice channel.
+
+Use the Discord-only native command `/vc join|leave|status` to control sessions. The command uses the account default agent and follows the same allowlist and group policy rules as other Discord commands.
+
+Auto-join example:
+
+```json5
+{
+  channels: {
+    discord: {
+      voice: {
+        enabled: true,
+        autoJoin: [
+          {
+            guildId: "123456789012345678",
+            channelId: "234567890123456789",
+          },
+        ],
+        tts: {
+          provider: "openai",
+          openai: { voice: "alloy" },
+        },
+      },
+    },
+  },
+}
+```
+
+Notes:
+
+- `voice.tts` is merged on top of `messages.tts` (only the fields you set are replaced) and applies to voice playback only.
+- Omit the `voice` block to keep voice support disabled for the account.
+
 ## Voice messages

 Discord voice messages show a waveform preview and require OGG/Opus audio plus metadata. OpenClaw generates the waveform automatically, but it needs `ffmpeg` and `ffprobe` available on the gateway host to inspect and convert audio files.
--- a/docs/gateway/configuration-reference.md
+++ b/docs/gateway/configuration-reference.md
@@ -216,6 +216,19 @@ WhatsApp runs through the gateway's web channel (Baileys Web). It starts automat
          accentColor: "#5865F2",
        },
      },
+      voice: {
+        enabled: true,
+        autoJoin: [
+          {
+            guildId: "123456789012345678",
+            channelId: "234567890123456789",
+          },
+        ],
+        tts: {
+          provider: "openai",
+          openai: { voice: "alloy" },
+        },
+      },
      retry: {
        attempts: 3,
        minDelayMs: 500,
@@ -233,6 +246,7 @@ WhatsApp runs through the gateway's web channel (Baileys Web). It starts automat
 - Bot-authored messages are ignored by default. `allowBots: true` enables them (own messages still filtered).
 - `maxLinesPerMessage` (default 17) splits tall messages even when under 2000 chars.
 - `channels.discord.ui.components.accentColor` sets the accent color for Discord components v2 containers.
+- `channels.discord.voice` enables Discord voice channel conversations, with optional auto-join and TTS overrides.

 **Reaction notification modes:** `off` (none), `own` (bot's messages, default), `all` (all messages), `allowlist` (from `guilds.<id>.users` on all messages).

--- a/docs/tools/slash-commands.md
+++ b/docs/tools/slash-commands.md
@@ -118,6 +118,7 @@ Notes:
 - `/allowlist add|remove` requires `commands.config=true` and honors channel `configWrites`.
 - `/usage` controls the per-response usage footer; `/usage cost` prints a local cost summary from OpenClaw session logs.
 - `/restart` is disabled by default; set `commands.restart: true` to enable it.
+- Discord-only native command: `/vc join|leave|status` controls voice channels (requires `channels.discord.voice` and native commands; not available as text).
 - `/verbose` is meant for debugging and extra visibility; keep it **off** in normal use.
 - `/reasoning` (and `/verbose`) are risky in group settings: they may reveal internal reasoning or tool output you did not intend to expose. Prefer leaving them off, especially in group chats.
 - **Fast path:** command-only messages from allowlisted senders are handled immediately (bypass queue + model).
--- a/src/config/schema.labels.ts
+++ b/src/config/schema.labels.ts
@@ -265,6 +265,8 @@ export const FIELD_LABELS: Record<string, string> = {
  "channels.discord.ui.components.accentColor": "Discord Component Accent Color",
  "channels.discord.intents.presence": "Discord Presence Intent",
  "channels.discord.intents.guildMembers": "Discord Guild Members Intent",
+  "channels.discord.voice.enabled": "Discord Voice Enabled",
+  "channels.discord.voice.autoJoin": "Discord Voice Auto-Join",
  "channels.discord.pluralkit.enabled": "Discord PluralKit Enabled",
  "channels.discord.pluralkit.token": "Discord PluralKit Token",
  "channels.discord.activity": "Discord Presence Activity",
--- a/src/config/types.discord.ts
+++ b/src/config/types.discord.ts
@@ -10,6 +10,7 @@ import type {
 import type { ChannelHeartbeatVisibilityConfig } from "./types.channels.js";
 import type { DmConfig, ProviderCommandsConfig } from "./types.messages.js";
 import type { GroupToolPolicyBySenderConfig, GroupToolPolicyConfig } from "./types.tools.js";
+import type { TtsConfig } from "./types.tts.js";

 export type DiscordDmConfig = {
  /** If false, ignore all incoming Discord DMs. Default: true. */
@@ -103,6 +104,8 @@ export type DiscordVoiceConfig = {
  enabled?: boolean;
  /** Voice channels to auto-join on startup. */
  autoJoin?: DiscordVoiceAutoJoinConfig[];
+  /** Optional TTS overrides for Discord voice output. */
+  tts?: TtsConfig;
 };

 export type DiscordExecApprovalConfig = {
--- a/src/config/zod-schema.providers-core.ts
+++ b/src/config/zod-schema.providers-core.ts
@@ -19,6 +19,7 @@ import {
  ProviderCommandsSchema,
  ReplyToModeSchema,
  RetryConfigSchema,
+  TtsConfigSchema,
  requireOpenAllowFrom,
 } from "./zod-schema.core.js";
 import { sensitive } from "./zod-schema.sensitive.js";
@@ -279,6 +280,7 @@ const DiscordVoiceSchema = z
  .object({
    enabled: z.boolean().optional(),
    autoJoin: z.array(DiscordVoiceAutoJoinSchema).optional(),
+    tts: TtsConfigSchema,
  })
  .strict()
  .optional();
--- a/src/discord/monitor/provider.ts
+++ b/src/discord/monitor/provider.ts
@@ -564,7 +564,6 @@ export async function monitorDiscordProvider(opts: MonitorDiscordOpts = {}) {
    });
    voiceManagerRef.current = voiceManager;
    registerDiscordListener(client.listeners, new DiscordVoiceReadyListener(voiceManager));
-    void voiceManager.autoJoin();
  }

  const messageHandler = createDiscordMessageHandler({
--- a/src/discord/voice/manager.ts
+++ b/src/discord/voice/manager.ts
@@ -18,7 +18,7 @@ import fs from "node:fs/promises";
 import path from "node:path";
 import type { MsgContext } from "../../auto-reply/templating.js";
 import type { OpenClawConfig } from "../../config/config.js";
-import type { DiscordAccountConfig } from "../../config/types.js";
+import type { DiscordAccountConfig, TtsConfig } from "../../config/types.js";
 import type { RuntimeEnv } from "../../runtime.js";
 import { resolveAgentDir } from "../../agents/agent-scope.js";
 import { agentCommand } from "../../commands/agent.js";
@@ -33,7 +33,7 @@ import {
 } from "../../media-understanding/runner.js";
 import { resolveAgentRoute } from "../../routing/resolve-route.js";
 import { parseTtsDirectives } from "../../tts/tts-core.js";
-import { textToSpeech, resolveTtsConfig } from "../../tts/tts.js";
+import { resolveTtsConfig, textToSpeech, type ResolvedTtsConfig } from "../../tts/tts.js";

 const SAMPLE_RATE = 48_000;
 const CHANNELS = 2;
@@ -64,6 +64,56 @@ type VoiceSessionEntry = {
  stop: () => void;
 };

+function mergeTtsConfig(base: TtsConfig, override?: TtsConfig): TtsConfig { // Layers `override` on top of `base` and returns the combined TtsConfig.
+  if (!override) {
+    return base; // No override: `base` is returned as-is (same reference, not a copy).
+  }
+  return {
+    ...base,
+    ...override, // Top-level keys: `override` wins wherever it defines a key.
+    modelOverrides: { // The nested blocks below are re-merged key by key, so a partial override keeps the remaining base keys.
+      ...base.modelOverrides,
+      ...override.modelOverrides,
+    },
+    elevenlabs: { // Note: modelOverrides/elevenlabs/openai/edge are always present in the result (`{}` when neither side sets them).
+      ...base.elevenlabs,
+      ...override.elevenlabs,
+      voiceSettings: { // One level deeper: voiceSettings is merged per key as well.
+        ...base.elevenlabs?.voiceSettings,
+        ...override.elevenlabs?.voiceSettings,
+      },
+    },
+    openai: {
+      ...base.openai,
+      ...override.openai,
+    },
+    edge: {
+      ...base.edge,
+      ...override.edge,
+    },
+  };
+}
+
+function resolveVoiceTtsConfig(params: { cfg: OpenClawConfig; override?: TtsConfig }): { // Resolves TTS settings, optionally layering `override` over `cfg.messages.tts`.
+  cfg: OpenClawConfig; // The input cfg, or a copy whose `messages.tts` holds the merged config.
+  resolved: ResolvedTtsConfig; // Result of resolveTtsConfig() for that same cfg.
+} {
+  if (!params.override) {
+    return { cfg: params.cfg, resolved: resolveTtsConfig(params.cfg) }; // No override: cfg is passed through unchanged.
+  }
+  const base = params.cfg.messages?.tts ?? {}; // A missing messages.tts is treated as an empty config.
+  const merged = mergeTtsConfig(base, params.override);
+  const messages = params.cfg.messages ?? {};
+  const cfg = { // Shallow copy: params.cfg is not mutated; only `messages.tts` differs in the copy.
+    ...params.cfg,
+    messages: {
+      ...messages,
+      tts: merged,
+    },
+  };
+  return { cfg, resolved: resolveTtsConfig(cfg) }; // Resolve from the copy so `cfg` and `resolved` agree.
+}
+
 function buildWavBuffer(pcm: Buffer): Buffer {
  const blockAlign = (CHANNELS * BIT_DEPTH) / 8;
  const byteRate = SAMPLE_RATE * blockAlign;
@@ -461,7 +511,10 @@ export class DiscordVoiceManager {
      return;
    }

-    const ttsConfig = resolveTtsConfig(this.params.cfg);
+    const { cfg: ttsCfg, resolved: ttsConfig } = resolveVoiceTtsConfig({
+      cfg: this.params.cfg,
+      override: this.params.discordConfig.voice?.tts,
+    });
    const directive = parseTtsDirectives(replyText, ttsConfig.modelOverrides);
    const speakText = directive.overrides.ttsText ?? directive.cleanedText.trim();
    if (!speakText) {
@@ -470,7 +523,7 @@ export class DiscordVoiceManager {

    const ttsResult = await textToSpeech({
      text: speakText,
-      cfg: this.params.cfg,
+      cfg: ttsCfg,
      channel: "discord",
      overrides: directive.overrides,
    });