fix(failover): align abort timeout detection and add regression tests

2026-02-19 18:39:20 -05:00 · 2026-02-16 20:59:44 -05:00
parent f242246839
commit fbda9a93fd
5 changed files with 40 additions and 1 deletion
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -44,6 +44,7 @@ Docs: https://docs.openclaw.ai
 - Agents/Tools: make loop detection progress-aware and phased by hard-blocking known `process(action=poll|log)` no-progress loops, warning on generic identical-call repeats, warning + no-progress-blocking ping-pong alternation loops (10/20), coalescing repeated warning spam into threshold buckets (including canonical ping-pong pairs), adding a global circuit breaker at 30 no-progress repeats, and emitting structured diagnostic `tool.loop` warning/error events for loop actions. (#16808) Thanks @akramcodez and @beca-oc.
 - Agents/Tools: scope the `message` tool schema to the active channel so Telegram uses `buttons` and Discord uses `components`. (#18215) Thanks @obviyus.
 - Agents/Models: probe the primary model when its auth-profile cooldown is near expiry (with per-provider throttling), so runs recover from temporary rate limits without staying on fallback models until restart. (#17478) Thanks @PlayerGhost.
+- Agents/Failover: classify provider abort stop-reason errors (`Unhandled stop reason: abort`, `stop reason: abort`, `reason: abort`) as timeout-class failures so configured model fallback chains trigger instead of surfacing raw abort failures. (#18618) Thanks @sauerdaniel.
 - Agents/Context: raise default total bootstrap prompt cap from `24000` to `150000` chars (keeping `bootstrapMaxChars` at `20000`), include total-cap visibility in `/context`, and mark truncation from injected-vs-raw sizes so total-cap clipping is reflected accurately.
 - Memory/QMD: scope managed collection names per agent and precreate glob-backed collection directories before registration, preventing cross-agent collection clobbering and startup ENOENT failures in fresh workspaces. (#17194) Thanks @jonathanadams96.
 - Cron: preserve per-job schedule-error isolation in post-run maintenance recompute so malformed sibling jobs no longer abort persistence of successful runs. (#17852) Thanks @pierreeurope.
--- a/src/agents/failover-error.e2e.test.ts
+++ b/src/agents/failover-error.e2e.test.ts
@@ -2,6 +2,7 @@ import { describe, expect, it } from "vitest";
 import {
  coerceToFailoverError,
  describeFailoverError,
+  isTimeoutError,
  resolveFailoverReasonFromError,
 } from "./failover-error.js";

@@ -27,6 +28,22 @@ describe("failover-error", () => {
    expect(resolveFailoverReasonFromError({ code: "ECONNRESET" })).toBe("timeout");
  });

+  it("infers timeout from abort stop-reason messages", () => {
+    expect(resolveFailoverReasonFromError({ message: "Unhandled stop reason: abort" })).toBe(
+      "timeout",
+    );
+    expect(resolveFailoverReasonFromError({ message: "stop reason: abort" })).toBe("timeout");
+    expect(resolveFailoverReasonFromError({ message: "reason: abort" })).toBe("timeout");
+  });
+
+  it("treats AbortError reason=abort as timeout", () => {
+    const err = Object.assign(new Error("aborted"), {
+      name: "AbortError",
+      reason: "reason: abort",
+    });
+    expect(isTimeoutError(err)).toBe(true);
+  });
+
  it("coerces failover-worthy errors into FailoverError with metadata", () => {
    const err = coerceToFailoverError("credit balance too low", {
      provider: "anthropic",
--- a/src/agents/failover-error.ts
+++ b/src/agents/failover-error.ts
@@ -1,7 +1,7 @@
 import { classifyFailoverReason, type FailoverReason } from "./pi-embedded-helpers.js";

 const TIMEOUT_HINT_RE =
-  /timeout|timed out|deadline exceeded|context deadline exceeded|stop reason:\s*abort|unhandled stop reason:\s*abort/i;
+  /timeout|timed out|deadline exceeded|context deadline exceeded|stop reason:\s*abort|reason:\s*abort|unhandled stop reason:\s*abort/i;
 const ABORT_TIMEOUT_RE = /request was aborted|request aborted/i;

 export class FailoverError extends Error {
--- a/src/agents/model-fallback.e2e.test.ts
+++ b/src/agents/model-fallback.e2e.test.ts
@@ -400,6 +400,17 @@ describe("runWithModelFallback", () => {
    });
  });

+  it("falls back on abort errors with reason: abort", async () => {
+    await expectFallsBackToHaiku({
+      provider: "openai",
+      model: "gpt-4.1-mini",
+      firstError: Object.assign(new Error("aborted"), {
+        name: "AbortError",
+        reason: "reason: abort",
+      }),
+    });
+  });
+
  it("falls back when message says aborted but error is a timeout", async () => {
    await expectFallsBackToHaiku({
      provider: "openai",
--- a/src/agents/pi-embedded-helpers.isbillingerrormessage.e2e.test.ts
+++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.e2e.test.ts
@@ -10,6 +10,7 @@ import {
  isFailoverErrorMessage,
  isImageDimensionErrorMessage,
  isLikelyContextOverflowError,
+  isTimeoutErrorMessage,
  isTransientHttpError,
  parseImageDimensionError,
  parseImageSizeError,
@@ -286,6 +287,15 @@ describe("isFailoverErrorMessage", () => {
      expect(isFailoverErrorMessage(sample)).toBe(true);
    }
  });
+
+  it("matches abort stop-reason timeout variants", () => {
+    const samples = ["Unhandled stop reason: abort", "stop reason: abort", "reason: abort"];
+    for (const sample of samples) {
+      expect(isTimeoutErrorMessage(sample)).toBe(true);
+      expect(classifyFailoverReason(sample)).toBe("timeout");
+      expect(isFailoverErrorMessage(sample)).toBe(true);
+    }
+  });
 });

 describe("parseImageSizeError", () => {