diff --git a/CHANGELOG.md b/CHANGELOG.md index 16fefee9d1..5dd2919ebc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -74,6 +74,7 @@ Docs: https://docs.openclaw.ai - Agents/OpenAI: force `store=true` for direct OpenAI Responses/Codex runs to preserve multi-turn server-side conversation state, while leaving proxy/non-OpenAI endpoints unchanged. (#16803) Thanks @mark9232 and @vignesh07. - Memory/FTS: make `buildFtsQuery` Unicode-aware so non-ASCII queries (including CJK) produce keyword tokens instead of falling back to vector-only search. (#17672) Thanks @KinGP5471. - Auto-reply/Compaction: resolve `memory/YYYY-MM-DD.md` placeholders with timezone-aware runtime dates and append a `Current time:` line to memory-flush turns, preventing wrong-year memory filenames without making the system prompt time-variant. (#17603, #17633) Thanks @nicholaspapadam-wq and @vignesh07. +- Auth/Cooldowns: auto-expire stale auth profile cooldowns when `cooldownUntil` or `disabledUntil` timestamps have passed, and reset `errorCount` so the next transient failure does not immediately escalate to a disproportionately long cooldown. Handles `cooldownUntil` and `disabledUntil` independently. (#3604) Thanks @nabbilkhan. - Agents: return an explicit timeout error reply when an embedded run times out before producing any payloads, preventing silent dropped turns during slow cache-refresh transitions. (#16659) Thanks @liaosvcaf and @vignesh07. - Group chats: always inject group chat context (name, participants, reply guidance) into the system prompt on every turn, not just the first. Prevents the model from losing awareness of which group it's in and incorrectly using the message tool to send to the same group. (#14447) Thanks @tyler6204. - Browser/Agents: when browser control service is unavailable, return explicit non-retry guidance (instead of "try again") so models do not loop on repeated browser tool calls until timeout. (#17673) Thanks @austenstone. diff --git a/src/agents/auth-profiles.cooldown-auto-expiry.test.ts b/src/agents/auth-profiles.cooldown-auto-expiry.test.ts new file mode 100644 index 0000000000..9bcd9c93d4 --- /dev/null +++ b/src/agents/auth-profiles.cooldown-auto-expiry.test.ts @@ -0,0 +1,159 @@ +import { describe, expect, it } from "vitest"; +import type { AuthProfileStore } from "./auth-profiles/types.js"; +import { resolveAuthProfileOrder } from "./auth-profiles/order.js"; +import { isProfileInCooldown } from "./auth-profiles/usage.js"; + +/** + * Integration tests for cooldown auto-expiry through resolveAuthProfileOrder. + * Verifies that profiles with expired cooldowns are treated as available and + * have their error state reset, preventing the escalation loop described in + * #3604, #13623, #15851, and #11972. + */ + +function makeStoreWithProfiles(): AuthProfileStore { + return { + version: 1, + profiles: { + "anthropic:default": { type: "api_key", provider: "anthropic", key: "sk-1" }, + "anthropic:secondary": { type: "api_key", provider: "anthropic", key: "sk-2" }, + "openai:default": { type: "api_key", provider: "openai", key: "sk-oi" }, + }, + usageStats: {}, + }; +} + +describe("resolveAuthProfileOrder — cooldown auto-expiry", () => { + it("places profile with expired cooldown in available list (round-robin path)", () => { + const store = makeStoreWithProfiles(); + store.usageStats = { + "anthropic:default": { + cooldownUntil: Date.now() - 10_000, + errorCount: 4, + failureCounts: { rate_limit: 4 }, + lastFailureAt: Date.now() - 70_000, + }, + }; + + const order = resolveAuthProfileOrder({ store, provider: "anthropic" }); + + // Profile should be in the result (available, not skipped) + expect(order).toContain("anthropic:default"); + + // Should no longer report as in cooldown + expect(isProfileInCooldown(store, "anthropic:default")).toBe(false); + + // Error state should have been reset + expect(store.usageStats?.["anthropic:default"]?.errorCount).toBe(0); + expect(store.usageStats?.["anthropic:default"]?.cooldownUntil).toBeUndefined(); + }); + + it("places profile with expired cooldown in available list (explicit-order path)", () => { + const store = makeStoreWithProfiles(); + store.order = { anthropic: ["anthropic:secondary", "anthropic:default"] }; + store.usageStats = { + "anthropic:default": { + cooldownUntil: Date.now() - 5_000, + errorCount: 3, + }, + }; + + const order = resolveAuthProfileOrder({ store, provider: "anthropic" }); + + // Both profiles available — explicit order respected + expect(order[0]).toBe("anthropic:secondary"); + expect(order).toContain("anthropic:default"); + + // Expired cooldown cleared + expect(store.usageStats?.["anthropic:default"]?.cooldownUntil).toBeUndefined(); + expect(store.usageStats?.["anthropic:default"]?.errorCount).toBe(0); + }); + + it("keeps profile with active cooldown in cooldown list", () => { + const futureMs = Date.now() + 300_000; + const store = makeStoreWithProfiles(); + store.usageStats = { + "anthropic:default": { + cooldownUntil: futureMs, + errorCount: 3, + }, + }; + + const order = resolveAuthProfileOrder({ store, provider: "anthropic" }); + + // Profile is still in the result (appended after available profiles) + expect(order).toContain("anthropic:default"); + + // Should still be in cooldown + expect(isProfileInCooldown(store, "anthropic:default")).toBe(true); + expect(store.usageStats?.["anthropic:default"]?.errorCount).toBe(3); + }); + + it("expired cooldown resets error count — prevents escalation on next failure", () => { + const store = makeStoreWithProfiles(); + store.usageStats = { + "anthropic:default": { + cooldownUntil: Date.now() - 1_000, + errorCount: 4, // Would cause 1-hour cooldown on next failure + failureCounts: { rate_limit: 4 }, + lastFailureAt: Date.now() - 3_700_000, + }, + }; + + resolveAuthProfileOrder({ store, provider: "anthropic" }); + + // After clearing, errorCount is 0. If the profile fails again, + // the next cooldown will be 60 seconds (errorCount 1) instead of + // 1 hour (errorCount 5). This is the core fix for #3604. + expect(store.usageStats?.["anthropic:default"]?.errorCount).toBe(0); + expect(store.usageStats?.["anthropic:default"]?.failureCounts).toBeUndefined(); + }); + + it("mixed active and expired cooldowns across profiles", () => { + const store = makeStoreWithProfiles(); + store.usageStats = { + "anthropic:default": { + cooldownUntil: Date.now() - 1_000, + errorCount: 3, + }, + "anthropic:secondary": { + cooldownUntil: Date.now() + 300_000, + errorCount: 2, + }, + }; + + const order = resolveAuthProfileOrder({ store, provider: "anthropic" }); + + // anthropic:default should be available (expired, cleared) + expect(store.usageStats?.["anthropic:default"]?.cooldownUntil).toBeUndefined(); + expect(store.usageStats?.["anthropic:default"]?.errorCount).toBe(0); + + // anthropic:secondary should still be in cooldown + expect(store.usageStats?.["anthropic:secondary"]?.cooldownUntil).toBeGreaterThan(Date.now()); + expect(store.usageStats?.["anthropic:secondary"]?.errorCount).toBe(2); + + // Available profile should come first + expect(order[0]).toBe("anthropic:default"); + }); + + it("does not affect profiles from other providers", () => { + const store = makeStoreWithProfiles(); + store.usageStats = { + "anthropic:default": { + cooldownUntil: Date.now() - 1_000, + errorCount: 4, + }, + "openai:default": { + cooldownUntil: Date.now() - 1_000, + errorCount: 3, + }, + }; + + // Resolve only anthropic + resolveAuthProfileOrder({ store, provider: "anthropic" }); + + // Both should be cleared since clearExpiredCooldowns sweeps all profiles + // in the store — this is intentional for correctness. + expect(store.usageStats?.["anthropic:default"]?.errorCount).toBe(0); + expect(store.usageStats?.["openai:default"]?.errorCount).toBe(0); + }); +}); diff --git a/src/agents/auth-profiles.ts b/src/agents/auth-profiles.ts index 14b383d041..ae55d2cc5f 100644 --- a/src/agents/auth-profiles.ts +++ b/src/agents/auth-profiles.ts @@ -33,6 +33,7 @@ export type { export { calculateAuthProfileCooldownMs, clearAuthProfileCooldown, + clearExpiredCooldowns, getSoonestCooldownExpiry, isProfileInCooldown, markAuthProfileCooldown, diff --git a/src/agents/auth-profiles/order.ts b/src/agents/auth-profiles/order.ts index 31b7814b5f..0301f4d6d5 100644 --- a/src/agents/auth-profiles/order.ts +++ b/src/agents/auth-profiles/order.ts @@ -2,7 +2,7 @@ import type { OpenClawConfig } from "../../config/config.js"; import type { AuthProfileStore } from "./types.js"; import { normalizeProviderId } from "../model-selection.js"; import { listProfilesForProvider } from "./profiles.js"; -import { isProfileInCooldown } from "./usage.js"; +import { clearExpiredCooldowns, isProfileInCooldown } from "./usage.js"; function resolveProfileUnusableUntil(stats: { cooldownUntil?: number; @@ -26,6 +26,11 @@ export function resolveAuthProfileOrder(params: { const { cfg, store, provider, preferredProfile } = params; const providerKey = normalizeProviderId(provider); const now = Date.now(); + + // Clear any cooldowns that have expired since the last check so profiles + // get a fresh error count and are not immediately re-penalized on the + // next transient failure. See #3604. + clearExpiredCooldowns(store, now); const storedOrder = (() => { const order = store.order; if (!order) { diff --git a/src/agents/auth-profiles/usage.test.ts b/src/agents/auth-profiles/usage.test.ts new file mode 100644 index 0000000000..af45781813 --- /dev/null +++ b/src/agents/auth-profiles/usage.test.ts @@ -0,0 +1,269 @@ +import { describe, expect, it } from "vitest"; +import type { AuthProfileStore } from "./types.js"; +import { clearExpiredCooldowns, isProfileInCooldown } from "./usage.js"; + +function makeStore(usageStats: AuthProfileStore["usageStats"]): AuthProfileStore { + return { + version: 1, + profiles: { + "anthropic:default": { type: "api_key", provider: "anthropic", key: "sk-test" }, + "openai:default": { type: "api_key", provider: "openai", key: "sk-test-2" }, + }, + usageStats, + }; +} + +// --------------------------------------------------------------------------- +// isProfileInCooldown +// --------------------------------------------------------------------------- + +describe("isProfileInCooldown", () => { + it("returns false when profile has no usage stats", () => { + const store = makeStore(undefined); + expect(isProfileInCooldown(store, "anthropic:default")).toBe(false); + }); + + it("returns true when cooldownUntil is in the future", () => { + const store = makeStore({ + "anthropic:default": { cooldownUntil: Date.now() + 60_000 }, + }); + expect(isProfileInCooldown(store, "anthropic:default")).toBe(true); + }); + + it("returns false when cooldownUntil has passed", () => { + const store = makeStore({ + "anthropic:default": { cooldownUntil: Date.now() - 1_000 }, + }); + expect(isProfileInCooldown(store, "anthropic:default")).toBe(false); + }); + + it("returns true when disabledUntil is in the future (even if cooldownUntil expired)", () => { + const store = makeStore({ + "anthropic:default": { + cooldownUntil: Date.now() - 1_000, + disabledUntil: Date.now() + 60_000, + }, + }); + expect(isProfileInCooldown(store, "anthropic:default")).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// clearExpiredCooldowns +// --------------------------------------------------------------------------- + +describe("clearExpiredCooldowns", () => { + it("returns false on empty usageStats", () => { + const store = makeStore(undefined); + expect(clearExpiredCooldowns(store)).toBe(false); + }); + + it("returns false when no profiles have cooldowns", () => { + const store = makeStore({ + "anthropic:default": { lastUsed: Date.now() }, + }); + expect(clearExpiredCooldowns(store)).toBe(false); + }); + + it("returns false when cooldown is still active", () => { + const future = Date.now() + 300_000; + const store = makeStore({ + "anthropic:default": { cooldownUntil: future, errorCount: 3 }, + }); + + expect(clearExpiredCooldowns(store)).toBe(false); + expect(store.usageStats?.["anthropic:default"]?.cooldownUntil).toBe(future); + expect(store.usageStats?.["anthropic:default"]?.errorCount).toBe(3); + }); + + it("clears expired cooldownUntil and resets errorCount", () => { + const store = makeStore({ + "anthropic:default": { + cooldownUntil: Date.now() - 1_000, + errorCount: 4, + failureCounts: { rate_limit: 3, timeout: 1 }, + lastFailureAt: Date.now() - 120_000, + }, + }); + + expect(clearExpiredCooldowns(store)).toBe(true); + + const stats = store.usageStats?.["anthropic:default"]; + expect(stats?.cooldownUntil).toBeUndefined(); + expect(stats?.errorCount).toBe(0); + expect(stats?.failureCounts).toBeUndefined(); + // lastFailureAt preserved for failureWindowMs decay + expect(stats?.lastFailureAt).toBeDefined(); + }); + + it("clears expired disabledUntil and disabledReason", () => { + const store = makeStore({ + "anthropic:default": { + disabledUntil: Date.now() - 1_000, + disabledReason: "billing", + errorCount: 2, + failureCounts: { billing: 2 }, + }, + }); + + expect(clearExpiredCooldowns(store)).toBe(true); + + const stats = store.usageStats?.["anthropic:default"]; + expect(stats?.disabledUntil).toBeUndefined(); + expect(stats?.disabledReason).toBeUndefined(); + expect(stats?.errorCount).toBe(0); + expect(stats?.failureCounts).toBeUndefined(); + }); + + it("handles independent expiry: cooldown expired but disabled still active", () => { + const future = Date.now() + 3_600_000; + const store = makeStore({ + "anthropic:default": { + cooldownUntil: Date.now() - 1_000, + disabledUntil: future, + disabledReason: "billing", + errorCount: 5, + failureCounts: { rate_limit: 3, billing: 2 }, + }, + }); + + expect(clearExpiredCooldowns(store)).toBe(true); + + const stats = store.usageStats?.["anthropic:default"]; + // cooldownUntil cleared + expect(stats?.cooldownUntil).toBeUndefined(); + // disabledUntil still active — not touched + expect(stats?.disabledUntil).toBe(future); + expect(stats?.disabledReason).toBe("billing"); + // errorCount NOT reset because profile still has an active unusable window + expect(stats?.errorCount).toBe(5); + expect(stats?.failureCounts).toEqual({ rate_limit: 3, billing: 2 }); + }); + + it("handles independent expiry: disabled expired but cooldown still active", () => { + const future = Date.now() + 300_000; + const store = makeStore({ + "anthropic:default": { + cooldownUntil: future, + disabledUntil: Date.now() - 1_000, + disabledReason: "billing", + errorCount: 3, + }, + }); + + expect(clearExpiredCooldowns(store)).toBe(true); + + const stats = store.usageStats?.["anthropic:default"]; + expect(stats?.cooldownUntil).toBe(future); + expect(stats?.disabledUntil).toBeUndefined(); + expect(stats?.disabledReason).toBeUndefined(); + // errorCount NOT reset because cooldown is still active + expect(stats?.errorCount).toBe(3); + }); + + it("resets errorCount only when both cooldown and disabled have expired", () => { + const store = makeStore({ + "anthropic:default": { + cooldownUntil: Date.now() - 2_000, + disabledUntil: Date.now() - 1_000, + disabledReason: "billing", + errorCount: 4, + failureCounts: { rate_limit: 2, billing: 2 }, + }, + }); + + expect(clearExpiredCooldowns(store)).toBe(true); + + const stats = store.usageStats?.["anthropic:default"]; + expect(stats?.cooldownUntil).toBeUndefined(); + expect(stats?.disabledUntil).toBeUndefined(); + expect(stats?.disabledReason).toBeUndefined(); + expect(stats?.errorCount).toBe(0); + expect(stats?.failureCounts).toBeUndefined(); + }); + + it("processes multiple profiles independently", () => { + const store = makeStore({ + "anthropic:default": { + cooldownUntil: Date.now() - 1_000, + errorCount: 3, + }, + "openai:default": { + cooldownUntil: Date.now() + 300_000, + errorCount: 2, + }, + }); + + expect(clearExpiredCooldowns(store)).toBe(true); + + // Anthropic: expired → cleared + expect(store.usageStats?.["anthropic:default"]?.cooldownUntil).toBeUndefined(); + expect(store.usageStats?.["anthropic:default"]?.errorCount).toBe(0); + + // OpenAI: still active → untouched + expect(store.usageStats?.["openai:default"]?.cooldownUntil).toBeGreaterThan(Date.now()); + expect(store.usageStats?.["openai:default"]?.errorCount).toBe(2); + }); + + it("accepts an explicit `now` timestamp for deterministic testing", () => { + const fixedNow = 1_700_000_000_000; + const store = makeStore({ + "anthropic:default": { + cooldownUntil: fixedNow - 1, + errorCount: 2, + }, + }); + + expect(clearExpiredCooldowns(store, fixedNow)).toBe(true); + expect(store.usageStats?.["anthropic:default"]?.cooldownUntil).toBeUndefined(); + expect(store.usageStats?.["anthropic:default"]?.errorCount).toBe(0); + }); + + it("clears cooldownUntil that equals exactly `now`", () => { + const fixedNow = 1_700_000_000_000; + const store = makeStore({ + "anthropic:default": { + cooldownUntil: fixedNow, + errorCount: 2, + }, + }); + + // ts >= cooldownUntil → should clear (cooldown "until" means the instant + // at cooldownUntil the profile becomes available again). + expect(clearExpiredCooldowns(store, fixedNow)).toBe(true); + expect(store.usageStats?.["anthropic:default"]?.cooldownUntil).toBeUndefined(); + expect(store.usageStats?.["anthropic:default"]?.errorCount).toBe(0); + }); + + it("ignores NaN and Infinity cooldown values", () => { + const store = makeStore({ + "anthropic:default": { + cooldownUntil: NaN, + errorCount: 2, + }, + "openai:default": { + cooldownUntil: Infinity, + errorCount: 3, + }, + }); + + expect(clearExpiredCooldowns(store)).toBe(false); + expect(store.usageStats?.["anthropic:default"]?.errorCount).toBe(2); + expect(store.usageStats?.["openai:default"]?.errorCount).toBe(3); + }); + + it("ignores zero and negative cooldown values", () => { + const store = makeStore({ + "anthropic:default": { + cooldownUntil: 0, + errorCount: 1, + }, + "openai:default": { + cooldownUntil: -1, + errorCount: 1, + }, + }); + + expect(clearExpiredCooldowns(store)).toBe(false); + }); +}); diff --git a/src/agents/auth-profiles/usage.ts b/src/agents/auth-profiles/usage.ts index 366eddf667..ada49f33b3 100644 --- a/src/agents/auth-profiles/usage.ts +++ b/src/agents/auth-profiles/usage.ts @@ -51,6 +51,77 @@ export function getSoonestCooldownExpiry( return soonest; } +/** + * Clear expired cooldowns from all profiles in the store. + * + * When `cooldownUntil` or `disabledUntil` has passed, the corresponding fields + * are removed and error counters are reset so the profile gets a fresh start + * (circuit-breaker half-open → closed). Without this, a stale `errorCount` + * causes the *next* transient failure to immediately escalate to a much longer + * cooldown — the root cause of profiles appearing "stuck" after rate limits. + * + * `cooldownUntil` and `disabledUntil` are handled independently: if a profile + * has both and only one has expired, only that field is cleared. + * + * Mutates the in-memory store; disk persistence happens lazily on the next + * store write (e.g. `markAuthProfileUsed` / `markAuthProfileFailure`), which + * matches the existing save pattern throughout the auth-profiles module. + * + * @returns `true` if any profile was modified. + */ +export function clearExpiredCooldowns(store: AuthProfileStore, now?: number): boolean { + const usageStats = store.usageStats; + if (!usageStats) { + return false; + } + + const ts = now ?? Date.now(); + let mutated = false; + + for (const [profileId, stats] of Object.entries(usageStats)) { + if (!stats) { + continue; + } + + let profileMutated = false; + const cooldownExpired = + typeof stats.cooldownUntil === "number" && + Number.isFinite(stats.cooldownUntil) && + stats.cooldownUntil > 0 && + ts >= stats.cooldownUntil; + const disabledExpired = + typeof stats.disabledUntil === "number" && + Number.isFinite(stats.disabledUntil) && + stats.disabledUntil > 0 && + ts >= stats.disabledUntil; + + if (cooldownExpired) { + stats.cooldownUntil = undefined; + profileMutated = true; + } + if (disabledExpired) { + stats.disabledUntil = undefined; + stats.disabledReason = undefined; + profileMutated = true; + } + + // Reset error counters when ALL cooldowns have expired so the profile gets + // a fair retry window. Preserves lastFailureAt for the failureWindowMs + // decay check in computeNextProfileUsageStats. + if (profileMutated && !resolveProfileUnusableUntil(stats)) { + stats.errorCount = 0; + stats.failureCounts = undefined; + } + + if (profileMutated) { + usageStats[profileId] = stats; + mutated = true; + } + } + + return mutated; +} + /** * Mark a profile as successfully used. Resets error count and updates lastUsed. * Uses store lock to avoid overwriting concurrent usage updates.