fix(failover): align abort timeout detection and regressions

This commit is contained in:
Sebastian
2026-02-16 20:59:44 -05:00
parent f242246839
commit fbda9a93fd
5 changed files with 40 additions and 1 deletions

View File

@@ -44,6 +44,7 @@ Docs: https://docs.openclaw.ai
- Agents/Tools: make loop detection progress-aware and phased by hard-blocking known `process(action=poll|log)` no-progress loops, warning on generic identical-call repeats, warning + no-progress-blocking ping-pong alternation loops (10/20), coalescing repeated warning spam into threshold buckets (including canonical ping-pong pairs), adding a global circuit breaker at 30 no-progress repeats, and emitting structured diagnostic `tool.loop` warning/error events for loop actions. (#16808) Thanks @akramcodez and @beca-oc.
- Agents/Tools: scope the `message` tool schema to the active channel so Telegram uses `buttons` and Discord uses `components`. (#18215) Thanks @obviyus.
- Agents/Models: probe the primary model when its auth-profile cooldown is near expiry (with per-provider throttling), so runs recover from temporary rate limits without staying on fallback models until restart. (#17478) Thanks @PlayerGhost.
- Agents/Failover: classify provider abort stop-reason errors (`Unhandled stop reason: abort`, `stop reason: abort`, `reason: abort`) as timeout-class failures so configured model fallback chains trigger instead of surfacing raw abort failures. (#18618) Thanks @sauerdaniel.
- Agents/Context: raise default total bootstrap prompt cap from `24000` to `150000` chars (keeping `bootstrapMaxChars` at `20000`), include total-cap visibility in `/context`, and mark truncation from injected-vs-raw sizes so total-cap clipping is reflected accurately.
- Memory/QMD: scope managed collection names per agent and precreate glob-backed collection directories before registration, preventing cross-agent collection clobbering and startup ENOENT failures in fresh workspaces. (#17194) Thanks @jonathanadams96.
- Cron: preserve per-job schedule-error isolation in post-run maintenance recompute so malformed sibling jobs no longer abort persistence of successful runs. (#17852) Thanks @pierreeurope.

View File

@@ -2,6 +2,7 @@ import { describe, expect, it } from "vitest";
import {
coerceToFailoverError,
describeFailoverError,
isTimeoutError,
resolveFailoverReasonFromError,
} from "./failover-error.js";
@@ -27,6 +28,22 @@ describe("failover-error", () => {
expect(resolveFailoverReasonFromError({ code: "ECONNRESET" })).toBe("timeout");
});
// Each abort stop-reason message form is expected to resolve to the "timeout" reason.
it("infers timeout from abort stop-reason messages", () => {
  const abortMessages = ["Unhandled stop reason: abort", "stop reason: abort", "reason: abort"];
  for (const message of abortMessages) {
    expect(resolveFailoverReasonFromError({ message })).toBe("timeout");
  }
});
// An Error named "AbortError" carrying `reason: "reason: abort"` is expected to be
// reported as a timeout by isTimeoutError.
it("treats AbortError reason=abort as timeout", () => {
  const abortDetails = { name: "AbortError", reason: "reason: abort" };
  // Object.assign attaches the extra fields onto a real Error instance.
  const abortError = Object.assign(new Error("aborted"), abortDetails);
  expect(isTimeoutError(abortError)).toBe(true);
});
it("coerces failover-worthy errors into FailoverError with metadata", () => {
const err = coerceToFailoverError("credit balance too low", {
provider: "anthropic",

View File

@@ -1,7 +1,7 @@
import { classifyFailoverReason, type FailoverReason } from "./pi-embedded-helpers.js";
/**
 * Case-insensitive message fragments treated as timeout hints.
 *
 * Covers generic timeout/deadline wording plus the abort stop-reason forms:
 * `unhandled stop reason: abort`, `stop reason: abort`, and the bare
 * `reason: abort` (whitespace after the colon is optional).
 */
const TIMEOUT_HINT_RE =
  /timeout|timed out|deadline exceeded|context deadline exceeded|stop reason:\s*abort|reason:\s*abort|unhandled stop reason:\s*abort/i;
/**
 * Case-insensitive phrases describing an aborted request.
 * NOTE(review): presumably consulted when deciding whether an abort should
 * count as a timeout — confirm against the usage sites, which are not visible here.
 */
const ABORT_TIMEOUT_RE = /request was aborted|request aborted/i;
export class FailoverError extends Error {

View File

@@ -400,6 +400,17 @@ describe("runWithModelFallback", () => {
});
});
// Feeds an AbortError tagged with `reason: "reason: abort"` as the first failure
// to the shared expectFallsBackToHaiku helper.
it("falls back on abort errors with reason: abort", async () => {
  const abortError = Object.assign(new Error("aborted"), {
    name: "AbortError",
    reason: "reason: abort",
  });
  await expectFallsBackToHaiku({
    provider: "openai",
    model: "gpt-4.1-mini",
    firstError: abortError,
  });
});
it("falls back when message says aborted but error is a timeout", async () => {
await expectFallsBackToHaiku({
provider: "openai",

View File

@@ -10,6 +10,7 @@ import {
isFailoverErrorMessage,
isImageDimensionErrorMessage,
isLikelyContextOverflowError,
isTimeoutErrorMessage,
isTransientHttpError,
parseImageDimensionError,
parseImageSizeError,
@@ -286,6 +287,15 @@ describe("isFailoverErrorMessage", () => {
expect(isFailoverErrorMessage(sample)).toBe(true);
}
});
// Every abort stop-reason variant must be recognised by all three classifiers:
// timeout-message detection, failover-reason classification, and failover-message detection.
it("matches abort stop-reason timeout variants", () => {
  const abortVariants = ["Unhandled stop reason: abort", "stop reason: abort", "reason: abort"];
  abortVariants.forEach((variant) => {
    expect(isTimeoutErrorMessage(variant)).toBe(true);
    expect(classifyFailoverReason(variant)).toBe("timeout");
    expect(isFailoverErrorMessage(variant)).toBe(true);
  });
});
});
describe("parseImageSizeError", () => {