fix(tts): strip markdown before sending text to TTS engines (#13237)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: 163c68539f
Co-authored-by: danielwanwx <144515713+danielwanwx@users.noreply.github.com>
Co-authored-by: sebslight <19554889+sebslight@users.noreply.github.com>
Reviewed-by: @sebslight
2026-02-19 18:39:20 -05:00 · 2026-02-12 07:46:57 -08:00
parent 4736fe7fde
commit a5ab9fac0c
4 changed files with 101 additions and 3 deletions
--- a/src/docker-setup.test.ts
+++ b/src/docker-setup.test.ts
@@ -140,7 +140,9 @@ describe("docker-setup.sh", () => {
    const assocCheck = spawnSync(systemBash, ["-c", "declare -A _t=()"], {
      encoding: "utf8",
    });
-    if (assocCheck.status === null || assocCheck.status === 0) {
+    if (assocCheck.status === 0 || assocCheck.status === null) {
+      // Skip the runtime check when system bash supports associative arrays (status 0,
+      // i.e. not Bash 3.2) or when /bin/bash is unavailable (status null, e.g. Windows).
      return;
    }

--- a/src/tts/prepare-text.test.ts
+++ b/src/tts/prepare-text.test.ts
@@ -0,0 +1,67 @@
+import { describe, expect, it } from "vitest";
+import { stripMarkdown } from "../line/markdown-to-line.js";
+
+/**
+ * Tests that stripMarkdown (used in the TTS pipeline via maybeApplyTtsToPayload)
+ * produces clean text suitable for speech synthesis.
+ *
+ * The TTS pipeline calls stripMarkdown() before sending text to TTS engines
+ * (OpenAI, ElevenLabs, Edge) so that formatting symbols are not read aloud
+ * (e.g. "hashtag hashtag hashtag" for ### headers).
+ */
+describe("TTS text preparation – stripMarkdown", () => {
+  it("strips markdown headers before TTS", () => {
+    expect(stripMarkdown("### System Design Basics")).toBe("System Design Basics");
+    expect(stripMarkdown("## Heading\nSome text")).toBe("Heading\nSome text");
+  });
+
+  it("strips bold and italic markers before TTS", () => {
+    expect(stripMarkdown("This is **important** and *useful*")).toBe(
+      "This is important and useful",
+    );
+  });
+
+  it("strips inline code markers before TTS", () => {
+    expect(stripMarkdown("Use `consistent hashing` for distribution")).toBe(
+      "Use consistent hashing for distribution",
+    );
+  });
+
+  it("handles a typical LLM reply with mixed markdown", () => {
+    const input = `## Heading with **bold** and *italic*
+
+> A blockquote with \`code\`
+
+Some ~~deleted~~ content.`;
+
+    const result = stripMarkdown(input);
+
+    expect(result).toBe(`Heading with bold and italic
+
+A blockquote with code
+
+Some deleted content.`);
+  });
+
+  it("handles markdown-heavy system design explanation", () => {
+    const input = `### B-tree vs LSM-tree
+
+**B-tree** uses _in-place updates_ while **LSM-tree** uses _append-only writes_.
+
+> Key insight: LSM-tree optimizes for write-heavy workloads.
+
+---
+
+Use \`B-tree\` for read-heavy, \`LSM-tree\` for write-heavy.`;
+
+    const result = stripMarkdown(input);
+
+    expect(result).not.toContain("#");
+    expect(result).not.toContain("**");
+    expect(result).not.toContain("`");
+    expect(result).not.toContain(">");
+    expect(result).not.toContain("---");
+    expect(result).toContain("B-tree vs LSM-tree");
+    expect(result).toContain("B-tree uses in-place updates");
+  });
+});
--- a/src/tts/tts.test.ts
+++ b/src/tts/tts.test.ts
@@ -471,6 +471,31 @@ describe("tts", () => {
      process.env.OPENCLAW_TTS_PREFS = prevPrefs;
    });

+    it("skips auto-TTS when markdown stripping leaves text too short", async () => {
+      const prevPrefs = process.env.OPENCLAW_TTS_PREFS;
+      process.env.OPENCLAW_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`;
+      const originalFetch = globalThis.fetch;
+      const fetchMock = vi.fn(async () => ({
+        ok: true,
+        arrayBuffer: async () => new ArrayBuffer(1),
+      }));
+      globalThis.fetch = fetchMock as unknown as typeof fetch;
+
+      const payload = { text: "### **bold**" };
+      const result = await maybeApplyTtsToPayload({
+        payload,
+        cfg: baseCfg,
+        kind: "final",
+        inboundAudio: true,
+      });
+
+      expect(result).toBe(payload);
+      expect(fetchMock).not.toHaveBeenCalled();
+
+      globalThis.fetch = originalFetch;
+      process.env.OPENCLAW_TTS_PREFS = prevPrefs;
+    });
+
    it("attempts auto-TTS when inbound audio gating is on and the message is audio", async () => {
      const prevPrefs = process.env.OPENCLAW_TTS_PREFS;
      process.env.OPENCLAW_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`;
--- a/src/tts/tts.ts
+++ b/src/tts/tts.ts
@@ -32,6 +32,7 @@ import {
 import { resolveModel } from "../agents/pi-embedded-runner/model.js";
 import { normalizeChannelId } from "../channels/plugins/index.js";
 import { logVerbose } from "../globals.js";
+import { stripMarkdown } from "../line/markdown-to-line.js";
 import { isVoiceCompatibleAudio } from "../media/audio.js";
 import { CONFIG_DIR, resolveUserPath } from "../utils.js";

@@ -1492,13 +1493,11 @@ export async function maybeApplyTtsToPayload(params: {

  if (textForAudio.length > maxLength) {
    if (!isSummarizationEnabled(prefsPath)) {
-      // Truncate text when summarization is disabled
      logVerbose(
        `TTS: truncating long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
      );
      textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`;
    } else {
-      // Summarize text when enabled
      try {
        const summary = await summarizeText({
          text: textForAudio,
@@ -1523,6 +1522,11 @@ export async function maybeApplyTtsToPayload(params: {
    }
  }

+  textForAudio = stripMarkdown(textForAudio).trim(); // strip markdown for TTS (### → "hashtag" etc.)
+  if (textForAudio.length < 10) {
+    return nextPayload;
+  }
+
  const ttsStart = Date.now();
  const result = await textToSpeech({
    text: textForAudio,