Mirror of https://github.com/openclaw/openclaw.git (synced 2026-02-19 18:39:20 -05:00)
fix: treat HTTP 503 as failover-eligible for LLM provider errors (#21086)
* fix: treat HTTP 503 as failover-eligible for LLM provider errors
When LLM SDKs wrap 503 responses, the leading "503" prefix is lost
(e.g. Google Gemini returns "high demand" / "UNAVAILABLE" without a
numeric prefix). The existing isTransientHttpError only matches
messages starting with "503 ...", so these wrapped errors silently
skip failover — no profile rotation, no model fallback.
This patch closes that gap:
- resolveFailoverReasonFromError: map HTTP status 503 → rate_limit
(covers structured error objects with a status field)
- ERROR_PATTERNS.overloaded: add /\b503\b/, "service unavailable",
"high demand" (covers message-only classification when the leading
status prefix is absent)
Existing isTransientHttpError behavior is unchanged; these additions
are complementary and only fire for errors that previously fell
through unclassified.
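For context, a minimal sketch of the two error shapes this distinction is about. The message text is taken from the Gemini-style examples in the tests below; the exact SDK wrapping varies by provider and is an assumption here:

    // Structured error: carries an HTTP status field, so
    // resolveFailoverReasonFromError can classify it directly.
    const structuredError = { status: 503 };

    // SDK-rewritten error: the numeric "503" prefix is gone, so only the
    // message-pattern path (ERROR_PATTERNS / classifyFailoverReason) can
    // recognize it as failover-eligible.
    const wrappedError = new Error(
      "This model is currently experiencing high demand. Please try again later.",
    );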
* fix: address review feedback — drop /\b503\b/ pattern, add test coverage
- Remove `/\b503\b/` from ERROR_PATTERNS.overloaded to resolve the
  semantic inconsistency noted by reviewers: `isTransientHttpError`
  already handles messages prefixed with "503" (→ "timeout"), so a
  redundant overloaded pattern would classify the same class of errors
  differently depending on message formatting (made concrete in the
  sketch after this list).
- Keep "service unavailable" and "high demand" patterns — these are the
real gap-fillers for SDK-rewritten messages that lack a numeric prefix.
- Add test case for JSON-wrapped 503 error body containing "overloaded"
to strengthen coverage.
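To make the dropped-pattern rationale concrete, a short sketch of how the same underlying 503 would have been classified two different ways depending on message formatting (both messages here are illustrative, not taken from the codebase):

    // Prefix intact: already covered by the existing string check.
    const prefixed = "503 Service Unavailable";
    //   -> "timeout" via isTransientHttpError

    // Prefix absent but "503" mid-message: the dropped /\b503\b/ regex
    // would have matched ERROR_PATTERNS.overloaded here instead.
    const midMessage = "upstream returned 503 while generating completion";
    //   -> would have classified as "rate_limit"
    // One failure class, two reasons; removing the regex keeps numeric
    // 503 handling on a single path.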
* fix: unify 503 classification — status 503 → timeout (consistent with isTransientHttpError)
resolveFailoverReasonFromError previously mapped status 503 → "rate_limit",
while the string-based isTransientHttpError mapped "503 ..." → "timeout".
Align both paths: structured {status: 503} now also returns "timeout",
matching the existing transient-error convention. Both reasons are
failover-eligible, so runtime behavior is unchanged.
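As a quick illustration of the aligned behavior (expected values taken from the tests added in this commit; how the caller consumes the reason is outside this diff and assumed):

    // Structured-status path and message-pattern path now both yield
    // failover-eligible reasons for 503-class provider errors.
    resolveFailoverReasonFromError({ status: 503 });
    //   -> "timeout"
    classifyFailoverReason("LLM error: service unavailable");
    //   -> "rate_limit"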
---------
Co-authored-by: Vincent Koc <vincentkoc@ieee.org>
@@ -13,6 +13,7 @@ describe("failover-error", () => {
     expect(resolveFailoverReasonFromError({ status: 403 })).toBe("auth");
     expect(resolveFailoverReasonFromError({ status: 408 })).toBe("timeout");
     expect(resolveFailoverReasonFromError({ status: 400 })).toBe("format");
+    expect(resolveFailoverReasonFromError({ status: 503 })).toBe("timeout");
   });

   it("infers format errors from error messages", () => {
@@ -161,6 +161,9 @@ export function resolveFailoverReasonFromError(err: unknown): FailoverReason | n
   if (status === 408) {
     return "timeout";
   }
+  if (status === 503) {
+    return "timeout";
+  }
   if (status === 400) {
     return "format";
   }
@@ -348,4 +348,17 @@ describe("classifyFailoverReason", () => {
       "rate_limit",
     );
   });
+  it("classifies provider high-demand / service-unavailable messages as rate_limit", () => {
+    expect(
+      classifyFailoverReason(
+        "This model is currently experiencing high demand. Please try again later.",
+      ),
+    ).toBe("rate_limit");
+    expect(classifyFailoverReason("LLM error: service unavailable")).toBe("rate_limit");
+    expect(
+      classifyFailoverReason(
+        '{"error":{"code":503,"message":"The model is overloaded. Please try later","status":"UNAVAILABLE"}}',
+      ),
+    ).toBe("rate_limit");
+  });
 });
@@ -583,7 +583,12 @@ const ERROR_PATTERNS = {
     "resource_exhausted",
     "usage limit",
   ],
-  overloaded: [/overloaded_error|"type"\s*:\s*"overloaded_error"/i, "overloaded"],
+  overloaded: [
+    /overloaded_error|"type"\s*:\s*"overloaded_error"/i,
+    "overloaded",
+    "service unavailable",
+    "high demand",
+  ],
   timeout: [
     "timeout",
     "timed out",