diff --git a/src/docker-setup.test.ts b/src/docker-setup.test.ts
index 3201c9a822..c3b9f19dd6 100644
--- a/src/docker-setup.test.ts
+++ b/src/docker-setup.test.ts
@@ -140,7 +140,9 @@ describe("docker-setup.sh", () => {
     const assocCheck = spawnSync(systemBash, ["-c", "declare -A _t=()"], {
       encoding: "utf8",
     });
-    if (assocCheck.status === null || assocCheck.status === 0) {
+    if (assocCheck.status === 0 || assocCheck.status === null) {
+      // Skip runtime check when system bash supports associative arrays
+      // (not Bash 3.2) or when /bin/bash is unavailable (e.g. Windows).
       return;
     }
 
diff --git a/src/tts/prepare-text.test.ts b/src/tts/prepare-text.test.ts
new file mode 100644
index 0000000000..fc538e4cd6
--- /dev/null
+++ b/src/tts/prepare-text.test.ts
@@ -0,0 +1,67 @@
+import { describe, expect, it } from "vitest";
+import { stripMarkdown } from "../line/markdown-to-line.js";
+
+/**
+ * Tests that stripMarkdown (used in the TTS pipeline via maybeApplyTtsToPayload)
+ * produces clean text suitable for speech synthesis.
+ *
+ * The TTS pipeline calls stripMarkdown() before sending text to TTS engines
+ * (OpenAI, ElevenLabs, Edge) so that formatting symbols are not read aloud
+ * (e.g. "hashtag hashtag hashtag" for ### headers).
+ */
+describe("TTS text preparation – stripMarkdown", () => {
+  it("strips markdown headers before TTS", () => {
+    expect(stripMarkdown("### System Design Basics")).toBe("System Design Basics");
+    expect(stripMarkdown("## Heading\nSome text")).toBe("Heading\nSome text");
+  });
+
+  it("strips bold and italic markers before TTS", () => {
+    expect(stripMarkdown("This is **important** and *useful*")).toBe(
+      "This is important and useful",
+    );
+  });
+
+  it("strips inline code markers before TTS", () => {
+    expect(stripMarkdown("Use `consistent hashing` for distribution")).toBe(
+      "Use consistent hashing for distribution",
+    );
+  });
+
+  it("handles a typical LLM reply with mixed markdown", () => {
+    const input = `## Heading with **bold** and *italic*
+
+> A blockquote with \`code\`
+
+Some ~~deleted~~ content.`;
+
+    const result = stripMarkdown(input);
+
+    expect(result).toBe(`Heading with bold and italic
+
+A blockquote with code
+
+Some deleted content.`);
+  });
+
+  it("handles markdown-heavy system design explanation", () => {
+    const input = `### B-tree vs LSM-tree
+
+**B-tree** uses _in-place updates_ while **LSM-tree** uses _append-only writes_.
+
+> Key insight: LSM-tree optimizes for write-heavy workloads.
+
+---
+
+Use \`B-tree\` for read-heavy, \`LSM-tree\` for write-heavy.`;
+
+    const result = stripMarkdown(input);
+
+    expect(result).not.toContain("#");
+    expect(result).not.toContain("**");
+    expect(result).not.toContain("`");
+    expect(result).not.toContain(">");
+    expect(result).not.toContain("---");
+    expect(result).toContain("B-tree vs LSM-tree");
+    expect(result).toContain("B-tree uses in-place updates");
+  });
+});
diff --git a/src/tts/tts.test.ts b/src/tts/tts.test.ts
index 0e94d5d8c1..c759298577 100644
--- a/src/tts/tts.test.ts
+++ b/src/tts/tts.test.ts
@@ -471,6 +471,40 @@ describe("tts", () => {
       process.env.OPENCLAW_TTS_PREFS = prevPrefs;
     });
 
+    it("skips auto-TTS when markdown stripping leaves text too short", async () => {
+      const prevPrefs = process.env.OPENCLAW_TTS_PREFS;
+      process.env.OPENCLAW_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`;
+      const originalFetch = globalThis.fetch;
+      const fetchMock = vi.fn(async () => ({
+        ok: true,
+        arrayBuffer: async () => new ArrayBuffer(1),
+      }));
+      globalThis.fetch = fetchMock as unknown as typeof fetch;
+
+      try {
+        // "### **bold**" strips down to "bold", which is under the 10-character minimum.
+        const payload = { text: "### **bold**" };
+        const result = await maybeApplyTtsToPayload({
+          payload,
+          cfg: baseCfg,
+          kind: "final",
+          inboundAudio: true,
+        });
+
+        expect(result).toBe(payload);
+        expect(fetchMock).not.toHaveBeenCalled();
+      } finally {
+        // Restore globals even when an assertion throws so later tests are not polluted.
+        globalThis.fetch = originalFetch;
+        if (prevPrefs === undefined) {
+          // Assigning undefined would store the literal string "undefined".
+          delete process.env.OPENCLAW_TTS_PREFS;
+        } else {
+          process.env.OPENCLAW_TTS_PREFS = prevPrefs;
+        }
+      }
+    });
+
     it("attempts auto-TTS when inbound audio gating is on and the message is audio", async () => {
       const prevPrefs = process.env.OPENCLAW_TTS_PREFS;
       process.env.OPENCLAW_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`;
diff --git a/src/tts/tts.ts b/src/tts/tts.ts
index 0f47c02a97..800ef9b743 100644
--- a/src/tts/tts.ts
+++ b/src/tts/tts.ts
@@ -32,6 +32,7 @@ import {
 import { resolveModel } from "../agents/pi-embedded-runner/model.js";
 import { normalizeChannelId } from "../channels/plugins/index.js";
 import { logVerbose } from "../globals.js";
+import { stripMarkdown } from "../line/markdown-to-line.js";
 import { isVoiceCompatibleAudio } from "../media/audio.js";
 import { CONFIG_DIR, resolveUserPath } from "../utils.js";
 
@@ -1492,13 +1493,11 @@ export async function maybeApplyTtsToPayload(params: {
 
   if (textForAudio.length > maxLength) {
     if (!isSummarizationEnabled(prefsPath)) {
-      // Truncate text when summarization is disabled
       logVerbose(
         `TTS: truncating long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
       );
       textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`;
     } else {
-      // Summarize text when enabled
       try {
         const summary = await summarizeText({
           text: textForAudio,
@@ -1523,6 +1522,15 @@ export async function maybeApplyTtsToPayload(params: {
     }
   }
 
+  // Strip markdown so engines do not read formatting symbols aloud (e.g. "###" as "hashtag").
+  // NOTE(review): this runs after the maxLength truncation/summarization above, so markup
+  // still counts toward the length cap - confirm that ordering is intended.
+  textForAudio = stripMarkdown(textForAudio).trim();
+  if (textForAudio.length < 10) {
+    // Too little speakable text remains once the markup is gone; skip TTS.
+    return nextPayload;
+  }
+
   const ttsStart = Date.now();
   const result = await textToSpeech({
     text: textForAudio,