fix(tts): strip markdown before sending text to TTS engines (#13237)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: 163c68539f
Co-authored-by: danielwanwx <144515713+danielwanwx@users.noreply.github.com>
Co-authored-by: sebslight <19554889+sebslight@users.noreply.github.com>
Reviewed-by: @sebslight
This commit is contained in:
danielwanwx
2026-02-12 07:46:57 -08:00
committed by GitHub
parent 4736fe7fde
commit a5ab9fac0c
4 changed files with 101 additions and 3 deletions

View File

@@ -140,7 +140,9 @@ describe("docker-setup.sh", () => {
const assocCheck = spawnSync(systemBash, ["-c", "declare -A _t=()"], {
encoding: "utf8",
});
if (assocCheck.status === null || assocCheck.status === 0) {
if (assocCheck.status === 0 || assocCheck.status === null) {
// Skip runtime check when system bash supports associative arrays
// (not Bash 3.2) or when /bin/bash is unavailable (e.g. Windows).
return;
}

View File

@@ -0,0 +1,67 @@
import { describe, expect, it } from "vitest";
import { stripMarkdown } from "../line/markdown-to-line.js";
/**
* Tests that stripMarkdown (used in the TTS pipeline via maybeApplyTtsToPayload)
* produces clean text suitable for speech synthesis.
*
* The TTS pipeline calls stripMarkdown() before sending text to TTS engines
* (OpenAI, ElevenLabs, Edge) so that formatting symbols are not read aloud
* (e.g. "hashtag hashtag hashtag" for ### headers).
*/
describe("TTS text preparation stripMarkdown", () => {
  // Single-line inputs: each formatting marker should vanish, leaving only the
  // words a speech engine is meant to pronounce.
  it("strips markdown headers before TTS", () => {
    const singleHeader = stripMarkdown("### System Design Basics");
    const headerThenBody = stripMarkdown("## Heading\nSome text");

    expect(singleHeader).toBe("System Design Basics");
    expect(headerThenBody).toBe("Heading\nSome text");
  });

  it("strips bold and italic markers before TTS", () => {
    const spoken = stripMarkdown("This is **important** and *useful*");

    expect(spoken).toBe("This is important and useful");
  });

  it("strips inline code markers before TTS", () => {
    const spoken = stripMarkdown("Use `consistent hashing` for distribution");

    expect(spoken).toBe("Use consistent hashing for distribution");
  });

  // Multi-line fixtures are assembled from one array entry per line so the
  // newline structure is explicit and independent of source indentation.
  it("handles a typical LLM reply with mixed markdown", () => {
    const replyLines = [
      "## Heading with **bold** and *italic*",
      "> A blockquote with `code`",
      "Some ~~deleted~~ content.",
    ];
    const expectedLines = [
      "Heading with bold and italic",
      "A blockquote with code",
      "Some deleted content.",
    ];

    expect(stripMarkdown(replyLines.join("\n"))).toBe(expectedLines.join("\n"));
  });

  it("handles markdown-heavy system design explanation", () => {
    const explanation = [
      "### B-tree vs LSM-tree",
      "**B-tree** uses _in-place updates_ while **LSM-tree** uses _append-only writes_.",
      "> Key insight: LSM-tree optimizes for write-heavy workloads.",
      "---",
      "Use `B-tree` for read-heavy, `LSM-tree` for write-heavy.",
    ].join("\n");

    const spoken = stripMarkdown(explanation);

    // None of the formatting tokens may survive into the spoken text.
    for (const marker of ["#", "**", "`", ">", "---"]) {
      expect(spoken).not.toContain(marker);
    }
    // The readable content must still be present once the markers are gone.
    for (const phrase of ["B-tree vs LSM-tree", "B-tree uses in-place updates"]) {
      expect(spoken).toContain(phrase);
    }
  });
});

View File

@@ -471,6 +471,31 @@ describe("tts", () => {
process.env.OPENCLAW_TTS_PREFS = prevPrefs;
});
it("skips auto-TTS when markdown stripping leaves text too short", async () => {
  // Capture the globals this test overrides so they can be restored afterwards.
  const prevPrefs = process.env.OPENCLAW_TTS_PREFS;
  const originalFetch = globalThis.fetch;

  // Use a unique prefs path for this run, and replace fetch with a mock so any
  // outbound TTS request is observable.
  process.env.OPENCLAW_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`;
  const fetchMock = vi.fn(async () => ({
    ok: true,
    arrayBuffer: async () => new ArrayBuffer(1),
  }));
  globalThis.fetch = fetchMock as unknown as typeof fetch;

  try {
    // The payload is almost entirely markdown syntax; once the markers are
    // stripped the remaining text is expected to be too short to synthesize.
    const payload = { text: "### **bold**" };
    const result = await maybeApplyTtsToPayload({
      payload,
      cfg: baseCfg,
      kind: "final",
      inboundAudio: true,
    });

    // The very same payload object comes back and no TTS request was made.
    expect(result).toBe(payload);
    expect(fetchMock).not.toHaveBeenCalled();
  } finally {
    // Restore in `finally` so a failing assertion cannot leak the fetch mock or
    // the temporary prefs path into later tests.
    globalThis.fetch = originalFetch;
    if (prevPrefs === undefined) {
      // Assigning undefined to process.env would store the string "undefined";
      // delete the key to truly unset it.
      delete process.env.OPENCLAW_TTS_PREFS;
    } else {
      process.env.OPENCLAW_TTS_PREFS = prevPrefs;
    }
  }
});
it("attempts auto-TTS when inbound audio gating is on and the message is audio", async () => {
const prevPrefs = process.env.OPENCLAW_TTS_PREFS;
process.env.OPENCLAW_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`;

View File

@@ -32,6 +32,7 @@ import {
import { resolveModel } from "../agents/pi-embedded-runner/model.js";
import { normalizeChannelId } from "../channels/plugins/index.js";
import { logVerbose } from "../globals.js";
import { stripMarkdown } from "../line/markdown-to-line.js";
import { isVoiceCompatibleAudio } from "../media/audio.js";
import { CONFIG_DIR, resolveUserPath } from "../utils.js";
@@ -1492,13 +1493,11 @@ export async function maybeApplyTtsToPayload(params: {
if (textForAudio.length > maxLength) {
if (!isSummarizationEnabled(prefsPath)) {
// Truncate text when summarization is disabled
logVerbose(
`TTS: truncating long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
);
textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`;
} else {
// Summarize text when enabled
try {
const summary = await summarizeText({
text: textForAudio,
@@ -1523,6 +1522,11 @@ export async function maybeApplyTtsToPayload(params: {
}
}
textForAudio = stripMarkdown(textForAudio).trim(); // strip markdown for TTS (### → "hashtag" etc.)
if (textForAudio.length < 10) {
return nextPayload;
}
const ttsStart = Date.now();
const result = await textToSpeech({
text: textForAudio,