diff --git a/docs/channels/discord.md b/docs/channels/discord.md index cfe020b4e1..427443ffaf 100644 --- a/docs/channels/discord.md +++ b/docs/channels/discord.md @@ -603,6 +603,47 @@ Example: } ``` +## Voice channels + +OpenClaw can join Discord voice channels for realtime, continuous conversations. This is separate from voice message attachments. + +Requirements: + +- Enable native commands (`commands.native` or `channels.discord.commands.native`). +- Configure `channels.discord.voice`. +- The bot needs Connect + Speak permissions in the target voice channel. + +Use the Discord-only native command `/vc join|leave|status` to control sessions. The command uses the account default agent and follows the same allowlist and group policy rules as other Discord commands. + +Auto-join example: + +```json5 +{ + channels: { + discord: { + voice: { + enabled: true, + autoJoin: [ + { + guildId: "123456789012345678", + channelId: "234567890123456789", + }, + ], + tts: { + provider: "openai", + openai: { voice: "alloy" }, + }, + }, + }, + }, +} +``` + +Notes: + +- `voice.tts` overrides `messages.tts` for voice playback only. +- Omit the `voice` block to keep voice support disabled for the account. + ## Voice messages Discord voice messages show a waveform preview and require OGG/Opus audio plus metadata. OpenClaw generates the waveform automatically, but it needs `ffmpeg` and `ffprobe` available on the gateway host to inspect and convert audio files. diff --git a/docs/gateway/configuration-reference.md b/docs/gateway/configuration-reference.md index dcdb86b48c..89e06f2ccd 100644 --- a/docs/gateway/configuration-reference.md +++ b/docs/gateway/configuration-reference.md @@ -216,6 +216,19 @@ WhatsApp runs through the gateway's web channel (Baileys Web). 
It starts automat accentColor: "#5865F2", }, }, + voice: { + enabled: true, + autoJoin: [ + { + guildId: "123456789012345678", + channelId: "234567890123456789", + }, + ], + tts: { + provider: "openai", + openai: { voice: "alloy" }, + }, + }, retry: { attempts: 3, minDelayMs: 500, @@ -233,6 +246,7 @@ WhatsApp runs through the gateway's web channel (Baileys Web). It starts automat - Bot-authored messages are ignored by default. `allowBots: true` enables them (own messages still filtered). - `maxLinesPerMessage` (default 17) splits tall messages even when under 2000 chars. - `channels.discord.ui.components.accentColor` sets the accent color for Discord components v2 containers. +- `channels.discord.voice` enables Discord voice channel conversations and optional auto-join + TTS overrides. **Reaction notification modes:** `off` (none), `own` (bot's messages, default), `all` (all messages), `allowlist` (from `guilds.<id>.users` on all messages). diff --git a/docs/tools/slash-commands.md b/docs/tools/slash-commands.md index 081e4933b6..c0b1ab2490 100644 --- a/docs/tools/slash-commands.md +++ b/docs/tools/slash-commands.md @@ -118,6 +118,7 @@ Notes: - `/allowlist add|remove` requires `commands.config=true` and honors channel `configWrites`. - `/usage` controls the per-response usage footer; `/usage cost` prints a local cost summary from OpenClaw session logs. - `/restart` is disabled by default; set `commands.restart: true` to enable it. +- Discord-only native command: `/vc join|leave|status` controls voice channels (requires `channels.discord.voice` and native commands; not available as text). - `/verbose` is meant for debugging and extra visibility; keep it **off** in normal use. - `/reasoning` (and `/verbose`) are risky in group settings: they may reveal internal reasoning or tool output you did not intend to expose. Prefer leaving them off, especially in group chats. 
- **Fast path:** command-only messages from allowlisted senders are handled immediately (bypass queue + model). diff --git a/src/config/schema.labels.ts b/src/config/schema.labels.ts index e7fc90854c..5cd97e07d7 100644 --- a/src/config/schema.labels.ts +++ b/src/config/schema.labels.ts @@ -265,6 +265,8 @@ export const FIELD_LABELS: Record = { "channels.discord.ui.components.accentColor": "Discord Component Accent Color", "channels.discord.intents.presence": "Discord Presence Intent", "channels.discord.intents.guildMembers": "Discord Guild Members Intent", + "channels.discord.voice.enabled": "Discord Voice Enabled", + "channels.discord.voice.autoJoin": "Discord Voice Auto-Join", "channels.discord.pluralkit.enabled": "Discord PluralKit Enabled", "channels.discord.pluralkit.token": "Discord PluralKit Token", "channels.discord.activity": "Discord Presence Activity", diff --git a/src/config/types.discord.ts b/src/config/types.discord.ts index c4fb68f43f..a4cfe26ad7 100644 --- a/src/config/types.discord.ts +++ b/src/config/types.discord.ts @@ -10,6 +10,7 @@ import type { import type { ChannelHeartbeatVisibilityConfig } from "./types.channels.js"; import type { DmConfig, ProviderCommandsConfig } from "./types.messages.js"; import type { GroupToolPolicyBySenderConfig, GroupToolPolicyConfig } from "./types.tools.js"; +import type { TtsConfig } from "./types.tts.js"; export type DiscordDmConfig = { /** If false, ignore all incoming Discord DMs. Default: true. */ @@ -103,6 +104,8 @@ export type DiscordVoiceConfig = { enabled?: boolean; /** Voice channels to auto-join on startup. */ autoJoin?: DiscordVoiceAutoJoinConfig[]; + /** Optional TTS overrides for Discord voice output. 
*/ + tts?: TtsConfig; }; export type DiscordExecApprovalConfig = { diff --git a/src/config/zod-schema.providers-core.ts b/src/config/zod-schema.providers-core.ts index e5b1346358..9362edd996 100644 --- a/src/config/zod-schema.providers-core.ts +++ b/src/config/zod-schema.providers-core.ts @@ -19,6 +19,7 @@ import { ProviderCommandsSchema, ReplyToModeSchema, RetryConfigSchema, + TtsConfigSchema, requireOpenAllowFrom, } from "./zod-schema.core.js"; import { sensitive } from "./zod-schema.sensitive.js"; @@ -279,6 +280,7 @@ const DiscordVoiceSchema = z .object({ enabled: z.boolean().optional(), autoJoin: z.array(DiscordVoiceAutoJoinSchema).optional(), + tts: TtsConfigSchema, }) .strict() .optional(); diff --git a/src/discord/monitor/provider.ts b/src/discord/monitor/provider.ts index deab66f8d8..95bd6cc632 100644 --- a/src/discord/monitor/provider.ts +++ b/src/discord/monitor/provider.ts @@ -564,7 +564,6 @@ export async function monitorDiscordProvider(opts: MonitorDiscordOpts = {}) { }); voiceManagerRef.current = voiceManager; registerDiscordListener(client.listeners, new DiscordVoiceReadyListener(voiceManager)); - void voiceManager.autoJoin(); } const messageHandler = createDiscordMessageHandler({ diff --git a/src/discord/voice/manager.ts b/src/discord/voice/manager.ts index c2e28f283b..98ee844603 100644 --- a/src/discord/voice/manager.ts +++ b/src/discord/voice/manager.ts @@ -18,7 +18,7 @@ import fs from "node:fs/promises"; import path from "node:path"; import type { MsgContext } from "../../auto-reply/templating.js"; import type { OpenClawConfig } from "../../config/config.js"; -import type { DiscordAccountConfig } from "../../config/types.js"; +import type { DiscordAccountConfig, TtsConfig } from "../../config/types.js"; import type { RuntimeEnv } from "../../runtime.js"; import { resolveAgentDir } from "../../agents/agent-scope.js"; import { agentCommand } from "../../commands/agent.js"; @@ -33,7 +33,7 @@ import { } from "../../media-understanding/runner.js"; 
import { resolveAgentRoute } from "../../routing/resolve-route.js"; import { parseTtsDirectives } from "../../tts/tts-core.js"; -import { textToSpeech, resolveTtsConfig } from "../../tts/tts.js"; +import { resolveTtsConfig, textToSpeech, type ResolvedTtsConfig } from "../../tts/tts.js"; const SAMPLE_RATE = 48_000; const CHANNELS = 2; @@ -64,6 +64,56 @@ type VoiceSessionEntry = { stop: () => void; }; +function mergeTtsConfig(base: TtsConfig, override?: TtsConfig): TtsConfig { + if (!override) { + return base; + } + return { + ...base, + ...override, + modelOverrides: { + ...base.modelOverrides, + ...override.modelOverrides, + }, + elevenlabs: { + ...base.elevenlabs, + ...override.elevenlabs, + voiceSettings: { + ...base.elevenlabs?.voiceSettings, + ...override.elevenlabs?.voiceSettings, + }, + }, + openai: { + ...base.openai, + ...override.openai, + }, + edge: { + ...base.edge, + ...override.edge, + }, + }; +} + +function resolveVoiceTtsConfig(params: { cfg: OpenClawConfig; override?: TtsConfig }): { + cfg: OpenClawConfig; + resolved: ResolvedTtsConfig; +} { + if (!params.override) { + return { cfg: params.cfg, resolved: resolveTtsConfig(params.cfg) }; + } + const base = params.cfg.messages?.tts ?? {}; + const merged = mergeTtsConfig(base, params.override); + const messages = params.cfg.messages ?? {}; + const cfg = { + ...params.cfg, + messages: { + ...messages, + tts: merged, + }, + }; + return { cfg, resolved: resolveTtsConfig(cfg) }; +} + function buildWavBuffer(pcm: Buffer): Buffer { const blockAlign = (CHANNELS * BIT_DEPTH) / 8; const byteRate = SAMPLE_RATE * blockAlign; @@ -461,7 +511,10 @@ export class DiscordVoiceManager { return; } - const ttsConfig = resolveTtsConfig(this.params.cfg); + const { cfg: ttsCfg, resolved: ttsConfig } = resolveVoiceTtsConfig({ + cfg: this.params.cfg, + override: this.params.discordConfig.voice?.tts, + }); const directive = parseTtsDirectives(replyText, ttsConfig.modelOverrides); const speakText = directive.overrides.ttsText ?? 
directive.cleanedText.trim(); if (!speakText) { @@ -470,7 +523,7 @@ export class DiscordVoiceManager { const ttsResult = await textToSpeech({ text: speakText, - cfg: this.params.cfg, + cfg: ttsCfg, channel: "discord", overrides: directive.overrides, });