mirror of
https://github.com/openclaw/openclaw.git
synced 2026-02-19 18:39:20 -05:00
fix(tts): strip markdown before sending text to TTS engines (#13237)
Merged via /review-pr -> /prepare-pr -> /merge-pr.
Prepared head SHA: 163c68539f
Co-authored-by: danielwanwx <144515713+danielwanwx@users.noreply.github.com>
Co-authored-by: sebslight <19554889+sebslight@users.noreply.github.com>
Reviewed-by: @sebslight
This commit is contained in:
@@ -140,7 +140,9 @@ describe("docker-setup.sh", () => {
|
||||
const assocCheck = spawnSync(systemBash, ["-c", "declare -A _t=()"], {
|
||||
encoding: "utf8",
|
||||
});
|
||||
if (assocCheck.status === null || assocCheck.status === 0) {
|
||||
if (assocCheck.status === 0 || assocCheck.status === null) {
|
||||
// Skip runtime check when system bash supports associative arrays
|
||||
// (not Bash 3.2) or when /bin/bash is unavailable (e.g. Windows).
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
67
src/tts/prepare-text.test.ts
Normal file
67
src/tts/prepare-text.test.ts
Normal file
@@ -0,0 +1,67 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { stripMarkdown } from "../line/markdown-to-line.js";
|
||||
|
||||
/**
 * Checks that stripMarkdown — which the TTS pipeline applies inside
 * maybeApplyTtsToPayload — yields plain text that is safe to hand to a
 * speech engine.
 *
 * Markdown is stripped before text is sent to the TTS engines
 * (OpenAI, ElevenLabs, Edge); otherwise formatting symbols get read aloud
 * (e.g. "hashtag hashtag hashtag" for a ### header).
 */
describe("TTS text preparation – stripMarkdown", () => {
  it("strips markdown headers before TTS", () => {
    const singleHeader = stripMarkdown("### System Design Basics");
    const headerWithBody = stripMarkdown("## Heading\nSome text");

    expect(singleHeader).toBe("System Design Basics");
    expect(headerWithBody).toBe("Heading\nSome text");
  });

  it("strips bold and italic markers before TTS", () => {
    const spoken = stripMarkdown("This is **important** and *useful*");

    expect(spoken).toBe("This is important and useful");
  });

  it("strips inline code markers before TTS", () => {
    const spoken = stripMarkdown("Use `consistent hashing` for distribution");

    expect(spoken).toBe("Use consistent hashing for distribution");
  });

  it("handles a typical LLM reply with mixed markdown", () => {
    // Paragraphs are separated by a blank line, hence the empty entries.
    const markdownLines = [
      "## Heading with **bold** and *italic*",
      "",
      "> A blockquote with `code`",
      "",
      "Some ~~deleted~~ content.",
    ];
    const expectedLines = [
      "Heading with bold and italic",
      "",
      "A blockquote with code",
      "",
      "Some deleted content.",
    ];

    const spoken = stripMarkdown(markdownLines.join("\n"));

    expect(spoken).toBe(expectedLines.join("\n"));
  });

  it("handles markdown-heavy system design explanation", () => {
    const markdownLines = [
      "### B-tree vs LSM-tree",
      "",
      "**B-tree** uses _in-place updates_ while **LSM-tree** uses _append-only writes_.",
      "",
      "> Key insight: LSM-tree optimizes for write-heavy workloads.",
      "",
      "---",
      "",
      "Use `B-tree` for read-heavy, `LSM-tree` for write-heavy.",
    ];

    const spoken = stripMarkdown(markdownLines.join("\n"));

    // None of the formatting symbols may survive into the spoken text.
    const forbiddenMarkers = ["#", "**", "`", ">", "---"];
    for (const marker of forbiddenMarkers) {
      expect(spoken).not.toContain(marker);
    }

    // The actual wording must still be there.
    expect(spoken).toContain("B-tree vs LSM-tree");
    expect(spoken).toContain("B-tree uses in-place updates");
  });
});
|
||||
@@ -471,6 +471,31 @@ describe("tts", () => {
|
||||
process.env.OPENCLAW_TTS_PREFS = prevPrefs;
|
||||
});
|
||||
|
||||
it("skips auto-TTS when markdown stripping leaves text too short", async () => {
  // Capture global state up front so it can be restored no matter how the test ends.
  const prevPrefs = process.env.OPENCLAW_TTS_PREFS;
  const originalFetch = globalThis.fetch;

  // Any network call made by a TTS provider would land on this mock.
  const fetchMock = vi.fn(async () => ({
    ok: true,
    arrayBuffer: async () => new ArrayBuffer(1),
  }));

  process.env.OPENCLAW_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`;
  globalThis.fetch = fetchMock as unknown as typeof fetch;

  try {
    // The payload is almost entirely markdown syntax; once the markers are
    // stripped the remaining text is expected to fall under the minimum
    // length that maybeApplyTtsToPayload requires before synthesizing audio.
    const payload = { text: "### **bold**" };
    const result = await maybeApplyTtsToPayload({
      payload,
      cfg: baseCfg,
      kind: "final",
      inboundAudio: true,
    });

    // The payload must come back untouched and no TTS request may be issued.
    expect(result).toBe(payload);
    expect(fetchMock).not.toHaveBeenCalled();
  } finally {
    // Restore globals even when an assertion above throws, so a failure here
    // cannot leak the mocked fetch or the temp prefs path into later tests.
    globalThis.fetch = originalFetch;
    if (prevPrefs === undefined) {
      // Assigning undefined to process.env would store the string "undefined".
      delete process.env.OPENCLAW_TTS_PREFS;
    } else {
      process.env.OPENCLAW_TTS_PREFS = prevPrefs;
    }
  }
});
|
||||
|
||||
it("attempts auto-TTS when inbound audio gating is on and the message is audio", async () => {
|
||||
const prevPrefs = process.env.OPENCLAW_TTS_PREFS;
|
||||
process.env.OPENCLAW_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`;
|
||||
|
||||
@@ -32,6 +32,7 @@ import {
|
||||
import { resolveModel } from "../agents/pi-embedded-runner/model.js";
|
||||
import { normalizeChannelId } from "../channels/plugins/index.js";
|
||||
import { logVerbose } from "../globals.js";
|
||||
import { stripMarkdown } from "../line/markdown-to-line.js";
|
||||
import { isVoiceCompatibleAudio } from "../media/audio.js";
|
||||
import { CONFIG_DIR, resolveUserPath } from "../utils.js";
|
||||
|
||||
@@ -1492,13 +1493,11 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
|
||||
if (textForAudio.length > maxLength) {
|
||||
if (!isSummarizationEnabled(prefsPath)) {
|
||||
// Truncate text when summarization is disabled
|
||||
logVerbose(
|
||||
`TTS: truncating long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
|
||||
);
|
||||
textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`;
|
||||
} else {
|
||||
// Summarize text when enabled
|
||||
try {
|
||||
const summary = await summarizeText({
|
||||
text: textForAudio,
|
||||
@@ -1523,6 +1522,11 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
}
|
||||
}
|
||||
|
||||
textForAudio = stripMarkdown(textForAudio).trim(); // strip markdown for TTS (### → "hashtag" etc.)
|
||||
if (textForAudio.length < 10) {
|
||||
return nextPayload;
|
||||
}
|
||||
|
||||
const ttsStart = Date.now();
|
||||
const result = await textToSpeech({
|
||||
text: textForAudio,
|
||||
|
||||
Reference in New Issue
Block a user