fix: transcript corruption resilience (3 fixes)

Fix 1 - Strip tool_use blocks from aborted/errored assistant messages
Instead of keeping aborted or errored assistant messages with their (potentially
incomplete) tool_use blocks, strip the tool_use blocks entirely. This prevents the
API from expecting matching tool_results that don't exist, which causes 400 errors.

If the message has no remaining content after stripping, drop it entirely.
If it has text content alongside tool calls, keep the text.

Fix 2 - Don't cool down auth profiles for format errors
Format errors (400 Bad Request) indicate malformed session input, not provider
unavailability. Previously, format errors put the auth profile into cooldown,
which cascaded failures to ALL sessions sharing that profile (Slack, Telegram,
webchat all go down from one corrupted session).

Now format errors are treated as session-scoped: the failing session gets its
error, but other sessions continue working normally.

Fix 3 - Updated tests for new behavior
Tests updated to reflect that aborted/errored assistant messages have their
tool_use blocks stripped rather than passed through unchanged.

Fixes: https://github.com/openclaw/openclaw/issues/15037
This commit is contained in:
Yash Chitneni
2026-02-12 16:37:54 -06:00
parent 4d0443391c
commit fb8862bb11
3 changed files with 73 additions and 26 deletions

View File

@@ -673,7 +673,12 @@ export async function runEmbeddedPiAgent(
};
}
const promptFailoverReason = classifyFailoverReason(errorText);
if (promptFailoverReason && promptFailoverReason !== "timeout" && lastProfileId) {
// Don't mark auth profile as failed for format errors (400 Bad Request).
// Format errors indicate malformed session input (e.g., corrupted transcript),
// NOT a provider/auth issue. Cooling down the profile cascades failures to
// all sessions sharing the same auth profile.
// See: https://github.com/openclaw/openclaw/issues/15037
if (promptFailoverReason && promptFailoverReason !== "timeout" && promptFailoverReason !== "format" && lastProfileId) {
await markAuthProfileFailure({
store: authStore,
profileId: lastProfileId,

View File

@@ -114,10 +114,12 @@ describe("sanitizeToolUseResultPairing", () => {
expect(out.map((m) => m.role)).toEqual(["user", "assistant"]);
});
it("skips tool call extraction for assistant messages with stopReason 'error'", () => {
it("strips tool_use blocks from assistant messages with stopReason 'error'", () => {
// When an assistant message has stopReason: "error", its tool_use blocks may be
// incomplete/malformed. We should NOT create synthetic tool_results for them,
// as this causes API 400 errors: "unexpected tool_use_id found in tool_result blocks"
// incomplete/malformed. We strip them to prevent the API from expecting matching
// tool_results that don't exist.
// See: https://github.com/openclaw/openclaw/issues/4597
// See: https://github.com/openclaw/openclaw/issues/15037
const input = [
{
role: "assistant",
@@ -131,15 +133,41 @@ describe("sanitizeToolUseResultPairing", () => {
// Should NOT add synthetic tool results for errored messages
expect(result.added).toHaveLength(0);
// The assistant message should be passed through unchanged
expect(result.messages[0]?.role).toBe("assistant");
expect(result.messages[1]?.role).toBe("user");
expect(result.messages).toHaveLength(2);
// The assistant message with only tool calls should be dropped entirely
expect(result.messages).toHaveLength(1);
expect(result.messages[0]?.role).toBe("user");
});
it("skips tool call extraction for assistant messages with stopReason 'aborted'", () => {
it("strips tool_use blocks but keeps text from errored assistant messages", () => {
  // Scenario: an errored assistant turn carries a text block AND a tool call.
  // Expectation: the tool call is removed while the text block is preserved.
  const input = [
    {
      role: "assistant",
      content: [
        { type: "text", text: "Let me try that..." },
        { type: "toolCall", id: "call_error", name: "exec", arguments: {} },
      ],
      stopReason: "error",
    },
    { role: "user", content: "something went wrong" },
  ] as AgentMessage[];

  const { added, messages } = repairToolUseResultPairing(input);
  const [assistantMsg, userMsg] = messages;

  // No synthetic tool results are expected, and both turns should remain.
  expect(added).toHaveLength(0);
  expect(messages).toHaveLength(2);

  // The assistant turn is kept, reduced to its single text block.
  expect(assistantMsg?.role).toBe("assistant");
  const remainingBlocks = (assistantMsg as { content: unknown[] }).content;
  expect(remainingBlocks).toHaveLength(1);
  expect((remainingBlocks[0] as { type: string }).type).toBe("text");

  // The trailing user turn is untouched.
  expect(userMsg?.role).toBe("user");
});
it("strips tool_use blocks from assistant messages with stopReason 'aborted'", () => {
// When a request is aborted mid-stream, the assistant message may have incomplete
// tool_use blocks (with partialJson). We should NOT create synthetic tool_results.
// tool_use blocks (with partialJson). We strip them to prevent API 400 errors.
const input = [
{
role: "assistant",
@@ -153,10 +181,9 @@ describe("sanitizeToolUseResultPairing", () => {
// Should NOT add synthetic tool results for aborted messages
expect(result.added).toHaveLength(0);
// Messages should be passed through without synthetic insertions
expect(result.messages).toHaveLength(2);
expect(result.messages[0]?.role).toBe("assistant");
expect(result.messages[1]?.role).toBe("user");
// The assistant message with only tool calls should be dropped
expect(result.messages).toHaveLength(1);
expect(result.messages[0]?.role).toBe("user");
});
it("still repairs tool results for normal assistant messages with stopReason 'toolUse'", () => {
@@ -178,9 +205,8 @@ describe("sanitizeToolUseResultPairing", () => {
});
it("drops orphan tool results that follow an aborted assistant message", () => {
// When an assistant message is aborted, any tool results that follow should be
// dropped as orphans (since we skip extracting tool calls from aborted messages).
// This addresses the edge case where a partial tool result was persisted before abort.
// When an assistant message is aborted, its tool_use blocks are stripped.
// Any tool results that follow should also be dropped as orphans.
const input = [
{
role: "assistant",
@@ -199,11 +225,11 @@ describe("sanitizeToolUseResultPairing", () => {
const result = repairToolUseResultPairing(input);
// The orphan tool result should be dropped
// The orphan tool result should be dropped, and the empty assistant message too
expect(result.droppedOrphanCount).toBe(1);
expect(result.messages).toHaveLength(2);
expect(result.messages[0]?.role).toBe("assistant");
expect(result.messages[1]?.role).toBe("user");
// Only the user message should remain
expect(result.messages).toHaveLength(1);
expect(result.messages[0]?.role).toBe("user");
// No synthetic results should be added
expect(result.added).toHaveLength(0);
});

View File

@@ -214,15 +214,31 @@ export function repairToolUseResultPairing(messages: AgentMessage[]): ToolUseRep
const assistant = msg as Extract<AgentMessage, { role: "assistant" }>;
// Skip tool call extraction for aborted or errored assistant messages.
// Handle aborted or errored assistant messages.
// When stopReason is "error" or "aborted", the tool_use blocks may be incomplete
// (e.g., partialJson: true) and should not have synthetic tool_results created.
// Creating synthetic results for incomplete tool calls causes API 400 errors:
// "unexpected tool_use_id found in tool_result blocks"
// (e.g., partialJson: true). We must NOT create synthetic tool_results for incomplete
// tool calls, but we also must NOT leave tool_use blocks in the message that the API
// will expect matching tool_results for.
// Fix: strip tool_use blocks from aborted/errored messages entirely.
// If the message has no remaining content after stripping, drop it.
// See: https://github.com/openclaw/openclaw/issues/4597
// See: https://github.com/openclaw/openclaw/issues/15037
const stopReason = (assistant as { stopReason?: string }).stopReason;
if (stopReason === "error" || stopReason === "aborted") {
out.push(msg);
if (Array.isArray(assistant.content)) {
const nonToolContent = assistant.content.filter((block) => {
if (!block || typeof block !== "object") return true;
const rec = block as { type?: unknown };
return !TOOL_CALL_TYPES.has(rec.type as string);
});
if (nonToolContent.length > 0) {
out.push({ ...msg, content: nonToolContent } as AgentMessage);
}
// Otherwise every block was a tool call: nothing is pushed, so the message is dropped.
changed = true;
} else {
out.push(msg);
}
continue;
}