fix: transcript corruption resilience (3 fixes)

Fix 1 - Strip tool_use blocks from aborted/errored assistant messages: Instead of keeping aborted/errored assistant messages with their (potentially incomplete) tool_use blocks, strip the tool_use blocks entirely. This prevents the API from expecting matching tool_results that don't exist, a mismatch that causes 400 errors. If the message has no remaining content after stripping, drop it entirely. If it has text content alongside tool calls, keep the text. Fix 2 - Don't cool down auth profiles for format errors: Format errors (400 Bad Request) indicate malformed session input, not provider unavailability. Previously, format errors put the auth profile into cooldown, which cascaded failures to ALL sessions sharing that profile (Slack, Telegram, and webchat would all go down from one corrupted session). Now format errors are treated as session-scoped: the failing session gets its error, but other sessions continue working normally. Fix 3 - Updated tests for new behavior: Tests are updated to reflect that aborted/errored assistant messages have their tool_use blocks stripped rather than passed through unchanged. Fixes: https://github.com/openclaw/openclaw/issues/15037
2026-02-19 18:39:20 -05:00 · 2026-02-12 16:37:54 -06:00
parent 4d0443391c
commit fb8862bb11
3 changed files with 73 additions and 26 deletions
--- a/src/agents/pi-embedded-runner/run.ts
+++ b/src/agents/pi-embedded-runner/run.ts
@@ -673,7 +673,12 @@ export async function runEmbeddedPiAgent(
              };
            }
            const promptFailoverReason = classifyFailoverReason(errorText);
-            if (promptFailoverReason && promptFailoverReason !== "timeout" && lastProfileId) {
+            // Don't mark auth profile as failed for format errors (400 Bad Request).
+            // Format errors indicate malformed session input (e.g., corrupted transcript),
+            // NOT a provider/auth issue. Cooling down the profile cascades failures to
+            // all sessions sharing the same auth profile.
+            // See: https://github.com/openclaw/openclaw/issues/15037
+            if (promptFailoverReason && promptFailoverReason !== "timeout" && promptFailoverReason !== "format" && lastProfileId) {
              await markAuthProfileFailure({
                store: authStore,
                profileId: lastProfileId,
--- a/src/agents/session-transcript-repair.test.ts
+++ b/src/agents/session-transcript-repair.test.ts
@@ -114,10 +114,12 @@ describe("sanitizeToolUseResultPairing", () => {
    expect(out.map((m) => m.role)).toEqual(["user", "assistant"]);
  });

-  it("skips tool call extraction for assistant messages with stopReason 'error'", () => {
+  it("strips tool_use blocks from assistant messages with stopReason 'error'", () => {
    // When an assistant message has stopReason: "error", its tool_use blocks may be
-    // incomplete/malformed. We should NOT create synthetic tool_results for them,
-    // as this causes API 400 errors: "unexpected tool_use_id found in tool_result blocks"
+    // incomplete/malformed. We strip them to prevent the API from expecting matching
+    // tool_results that don't exist.
+    // See: https://github.com/openclaw/openclaw/issues/4597
+    // See: https://github.com/openclaw/openclaw/issues/15037
    const input = [
      {
        role: "assistant",
@@ -131,15 +133,41 @@ describe("sanitizeToolUseResultPairing", () => {

    // Should NOT add synthetic tool results for errored messages
    expect(result.added).toHaveLength(0);
-    // The assistant message should be passed through unchanged
-    expect(result.messages[0]?.role).toBe("assistant");
-    expect(result.messages[1]?.role).toBe("user");
-    expect(result.messages).toHaveLength(2);
+    // The assistant message with only tool calls should be dropped entirely
+    expect(result.messages).toHaveLength(1);
+    expect(result.messages[0]?.role).toBe("user");
  });

-  it("skips tool call extraction for assistant messages with stopReason 'aborted'", () => {
+  it("strips tool_use blocks but keeps text from errored assistant messages", () => {
+    // When an errored assistant message has both text and tool_use blocks,
+    // strip the tool_use blocks but keep the text content.
+    const input = [
+      {
+        role: "assistant",
+        content: [
+          { type: "text", text: "Let me try that..." },
+          { type: "toolCall", id: "call_error", name: "exec", arguments: {} },
+        ],
+        stopReason: "error",
+      },
+      { role: "user", content: "something went wrong" },
+    ] as AgentMessage[];
+
+    const result = repairToolUseResultPairing(input);
+
+    expect(result.added).toHaveLength(0);
+    expect(result.messages).toHaveLength(2);
+    expect(result.messages[0]?.role).toBe("assistant");
+    // The assistant message should only have the text block, not the tool call
+    const content = (result.messages[0] as { content: unknown[] }).content;
+    expect(content).toHaveLength(1);
+    expect((content[0] as { type: string }).type).toBe("text");
+    expect(result.messages[1]?.role).toBe("user");
+  });
+
+  it("strips tool_use blocks from assistant messages with stopReason 'aborted'", () => {
    // When a request is aborted mid-stream, the assistant message may have incomplete
-    // tool_use blocks (with partialJson). We should NOT create synthetic tool_results.
+    // tool_use blocks (with partialJson). We strip them to prevent API 400 errors.
    const input = [
      {
        role: "assistant",
@@ -153,10 +181,9 @@ describe("sanitizeToolUseResultPairing", () => {

    // Should NOT add synthetic tool results for aborted messages
    expect(result.added).toHaveLength(0);
-    // Messages should be passed through without synthetic insertions
-    expect(result.messages).toHaveLength(2);
-    expect(result.messages[0]?.role).toBe("assistant");
-    expect(result.messages[1]?.role).toBe("user");
+    // The assistant message with only tool calls should be dropped
+    expect(result.messages).toHaveLength(1);
+    expect(result.messages[0]?.role).toBe("user");
  });

  it("still repairs tool results for normal assistant messages with stopReason 'toolUse'", () => {
@@ -178,9 +205,8 @@ describe("sanitizeToolUseResultPairing", () => {
  });

  it("drops orphan tool results that follow an aborted assistant message", () => {
-    // When an assistant message is aborted, any tool results that follow should be
-    // dropped as orphans (since we skip extracting tool calls from aborted messages).
-    // This addresses the edge case where a partial tool result was persisted before abort.
+    // When an assistant message is aborted, its tool_use blocks are stripped.
+    // Any tool results that follow should also be dropped as orphans.
    const input = [
      {
        role: "assistant",
@@ -199,11 +225,11 @@ describe("sanitizeToolUseResultPairing", () => {

    const result = repairToolUseResultPairing(input);

-    // The orphan tool result should be dropped
+    // The orphan tool result should be dropped, and the empty assistant message too
    expect(result.droppedOrphanCount).toBe(1);
-    expect(result.messages).toHaveLength(2);
-    expect(result.messages[0]?.role).toBe("assistant");
-    expect(result.messages[1]?.role).toBe("user");
+    // Only the user message should remain
+    expect(result.messages).toHaveLength(1);
+    expect(result.messages[0]?.role).toBe("user");
    // No synthetic results should be added
    expect(result.added).toHaveLength(0);
  });
--- a/src/agents/session-transcript-repair.ts
+++ b/src/agents/session-transcript-repair.ts
@@ -214,15 +214,31 @@ export function repairToolUseResultPairing(messages: AgentMessage[]): ToolUseRep

    const assistant = msg as Extract<AgentMessage, { role: "assistant" }>;

-    // Skip tool call extraction for aborted or errored assistant messages.
+    // Handle aborted or errored assistant messages.
    // When stopReason is "error" or "aborted", the tool_use blocks may be incomplete
-    // (e.g., partialJson: true) and should not have synthetic tool_results created.
-    // Creating synthetic results for incomplete tool calls causes API 400 errors:
-    // "unexpected tool_use_id found in tool_result blocks"
+    // (e.g., partialJson: true). We must NOT create synthetic tool_results for incomplete
+    // tool calls, but we also must NOT leave tool_use blocks in the message that the API
+    // will expect matching tool_results for.
+    // Fix: strip tool_use blocks from aborted/errored messages entirely.
+    // If the message has no remaining content after stripping, drop it.
    // See: https://github.com/openclaw/openclaw/issues/4597
+    // See: https://github.com/openclaw/openclaw/issues/15037
    const stopReason = (assistant as { stopReason?: string }).stopReason;
    if (stopReason === "error" || stopReason === "aborted") {
-      out.push(msg);
+      if (Array.isArray(assistant.content)) {
+        const nonToolContent = assistant.content.filter((block) => {
+          if (!block || typeof block !== "object") return true;
+          const rec = block as { type?: unknown };
+          return !TOOL_CALL_TYPES.has(rec.type as string);
+        });
+        if (nonToolContent.length > 0) {
+          out.push({ ...msg, content: nonToolContent } as AgentMessage);
+        }
+        // If all content was tool calls, drop the entire message
+        changed = true;
+      } else {
+        out.push(msg);
+      }
      continue;
    }