From fbda9a93fda35558d57d3273dceffe05d9de10db Mon Sep 17 00:00:00 2001 From: Sebastian <19554889+sebslight@users.noreply.github.com> Date: Mon, 16 Feb 2026 20:59:44 -0500 Subject: [PATCH] fix(failover): align abort timeout detection and regressions --- CHANGELOG.md | 1 + src/agents/failover-error.e2e.test.ts | 17 +++++++++++++++++ src/agents/failover-error.ts | 2 +- src/agents/model-fallback.e2e.test.ts | 11 +++++++++++ ...ed-helpers.isbillingerrormessage.e2e.test.ts | 10 ++++++++++ 5 files changed, 40 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 53def58770..9ba9e8f63a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,6 +44,7 @@ Docs: https://docs.openclaw.ai - Agents/Tools: make loop detection progress-aware and phased by hard-blocking known `process(action=poll|log)` no-progress loops, warning on generic identical-call repeats, warning + no-progress-blocking ping-pong alternation loops (10/20), coalescing repeated warning spam into threshold buckets (including canonical ping-pong pairs), adding a global circuit breaker at 30 no-progress repeats, and emitting structured diagnostic `tool.loop` warning/error events for loop actions. (#16808) Thanks @akramcodez and @beca-oc. - Agents/Tools: scope the `message` tool schema to the active channel so Telegram uses `buttons` and Discord uses `components`. (#18215) Thanks @obviyus. - Agents/Models: probe the primary model when its auth-profile cooldown is near expiry (with per-provider throttling), so runs recover from temporary rate limits without staying on fallback models until restart. (#17478) Thanks @PlayerGhost. +- Agents/Failover: classify provider abort stop-reason errors (`Unhandled stop reason: abort`, `stop reason: abort`, `reason: abort`) as timeout-class failures so configured model fallback chains trigger instead of surfacing raw abort failures. (#18618) Thanks @sauerdaniel. - Agents/Context: raise default total bootstrap prompt cap from `24000` to `150000` chars (keeping `bootstrapMaxChars` at `20000`), include total-cap visibility in `/context`, and mark truncation from injected-vs-raw sizes so total-cap clipping is reflected accurately. - Memory/QMD: scope managed collection names per agent and precreate glob-backed collection directories before registration, preventing cross-agent collection clobbering and startup ENOENT failures in fresh workspaces. (#17194) Thanks @jonathanadams96. - Cron: preserve per-job schedule-error isolation in post-run maintenance recompute so malformed sibling jobs no longer abort persistence of successful runs. (#17852) Thanks @pierreeurope. diff --git a/src/agents/failover-error.e2e.test.ts b/src/agents/failover-error.e2e.test.ts index d81781a905..5fb9d06e60 100644 --- a/src/agents/failover-error.e2e.test.ts +++ b/src/agents/failover-error.e2e.test.ts @@ -2,6 +2,7 @@ import { describe, expect, it } from "vitest"; import { coerceToFailoverError, describeFailoverError, + isTimeoutError, resolveFailoverReasonFromError, } from "./failover-error.js"; @@ -27,6 +28,22 @@ describe("failover-error", () => { expect(resolveFailoverReasonFromError({ code: "ECONNRESET" })).toBe("timeout"); }); + it("infers timeout from abort stop-reason messages", () => { + expect(resolveFailoverReasonFromError({ message: "Unhandled stop reason: abort" })).toBe( + "timeout", + ); + expect(resolveFailoverReasonFromError({ message: "stop reason: abort" })).toBe("timeout"); + expect(resolveFailoverReasonFromError({ message: "reason: abort" })).toBe("timeout"); + }); + + it("treats AbortError reason=abort as timeout", () => { + const err = Object.assign(new Error("aborted"), { + name: "AbortError", + reason: "reason: abort", + }); + expect(isTimeoutError(err)).toBe(true); + }); + it("coerces failover-worthy errors into FailoverError with metadata", () => { const err = coerceToFailoverError("credit balance too low", { provider: "anthropic", diff --git a/src/agents/failover-error.ts b/src/agents/failover-error.ts index cfd9d37c46..6592cfc7f7 100644 --- a/src/agents/failover-error.ts +++ b/src/agents/failover-error.ts @@ -1,7 +1,7 @@ import { classifyFailoverReason, type FailoverReason } from "./pi-embedded-helpers.js"; const TIMEOUT_HINT_RE = - /timeout|timed out|deadline exceeded|context deadline exceeded|stop reason:\s*abort|unhandled stop reason:\s*abort/i; + /timeout|timed out|deadline exceeded|context deadline exceeded|stop reason:\s*abort|reason:\s*abort|unhandled stop reason:\s*abort/i; const ABORT_TIMEOUT_RE = /request was aborted|request aborted/i; export class FailoverError extends Error { diff --git a/src/agents/model-fallback.e2e.test.ts b/src/agents/model-fallback.e2e.test.ts index e650d0d470..5eb4734909 100644 --- a/src/agents/model-fallback.e2e.test.ts +++ b/src/agents/model-fallback.e2e.test.ts @@ -400,6 +400,17 @@ describe("runWithModelFallback", () => { }); }); + it("falls back on abort errors with reason: abort", async () => { + await expectFallsBackToHaiku({ + provider: "openai", + model: "gpt-4.1-mini", + firstError: Object.assign(new Error("aborted"), { + name: "AbortError", + reason: "reason: abort", + }), + }); + }); + it("falls back when message says aborted but error is a timeout", async () => { await expectFallsBackToHaiku({ provider: "openai", diff --git a/src/agents/pi-embedded-helpers.isbillingerrormessage.e2e.test.ts b/src/agents/pi-embedded-helpers.isbillingerrormessage.e2e.test.ts index 4f72364de1..d4b84e4d75 100644 --- a/src/agents/pi-embedded-helpers.isbillingerrormessage.e2e.test.ts +++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.e2e.test.ts @@ -10,6 +10,7 @@ import { isFailoverErrorMessage, isImageDimensionErrorMessage, isLikelyContextOverflowError, + isTimeoutErrorMessage, isTransientHttpError, parseImageDimensionError, parseImageSizeError, @@ -286,6 +287,15 @@ describe("isFailoverErrorMessage", () => { expect(isFailoverErrorMessage(sample)).toBe(true); } }); + + it("matches abort stop-reason timeout variants", () => { + const samples = ["Unhandled stop reason: abort", "stop reason: abort", "reason: abort"]; + for (const sample of samples) { + expect(isTimeoutErrorMessage(sample)).toBe(true); + expect(classifyFailoverReason(sample)).toBe("timeout"); + expect(isFailoverErrorMessage(sample)).toBe(true); + } + }); }); describe("parseImageSizeError", () => {