From 67014228cffa6c686f30d5a07345cf658fa890cc Mon Sep 17 00:00:00 2001 From: Sebastian <19554889+sebslight@users.noreply.github.com> Date: Mon, 16 Feb 2026 22:57:15 -0500 Subject: [PATCH] fix(subagents): harden announce retry guards --- CHANGELOG.md | 1 + ...agent-registry.announce-loop-guard.test.ts | 56 +++++++++++++++++-- .../subagent-registry.steer-restart.test.ts | 31 ++++++++++ src/agents/subagent-registry.ts | 23 ++++++-- src/web/test-helpers.ts | 24 +++++--- 5 files changed, 118 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 54e4028508..62e0835d0b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -56,6 +56,7 @@ Docs: https://docs.openclaw.ai - CLI/Status: fix `openclaw status --all` token summaries for bot-token-only channels so Mattermost/Zalo no longer show a bot+app warning. (#18527) Thanks @echo931. - Voice Call: add an optional stale call reaper (`staleCallReaperSeconds`) to end stuck calls when enabled. (#18437) - Auto-reply/Subagents: propagate group context (`groupId`, `groupChannel`, `space`) when spawning via `/subagents spawn`, matching tool-triggered subagent spawn behavior. +- Subagents: cap announce retry loops with max attempts and expiry to prevent infinite retry spam after deferred announces. (#18444) - Agents/Tools/exec: add a preflight guard that detects likely shell env var injection (e.g. `$DM_JSON`, `$TMPDIR`) in Python/Node scripts before execution, preventing recurring cron failures and wasted tokens when models emit mixed shell+language source. (#12836) - Agents/Tools: make loop detection progress-aware and phased by hard-blocking known `process(action=poll|log)` no-progress loops, warning on generic identical-call repeats, warning + no-progress-blocking ping-pong alternation loops (10/20), coalescing repeated warning spam into threshold buckets (including canonical ping-pong pairs), adding a global circuit breaker at 30 no-progress repeats, and emitting structured diagnostic `tool.loop` warning/error events for loop actions. (#16808) Thanks @akramcodez and @beca-oc. - Agents/Tools: scope the `message` tool schema to the active channel so Telegram uses `buttons` and Discord uses `components`. (#18215) Thanks @obviyus. diff --git a/src/agents/subagent-registry.announce-loop-guard.test.ts b/src/agents/subagent-registry.announce-loop-guard.test.ts index 7bf408bfc5..cfeef00269 100644 --- a/src/agents/subagent-registry.announce-loop-guard.test.ts +++ b/src/agents/subagent-registry.announce-loop-guard.test.ts @@ -38,9 +38,12 @@ vi.mock("./subagent-announce.js", () => ({ runSubagentAnnounceFlow: vi.fn().mockResolvedValue(false), })); +const loadSubagentRegistryFromDisk = vi.fn(() => new Map()); +const saveSubagentRegistryToDisk = vi.fn(); + vi.mock("./subagent-registry.store.js", () => ({ - loadSubagentRegistryFromDisk: () => new Map(), - saveSubagentRegistryToDisk: vi.fn(), + loadSubagentRegistryFromDisk, + saveSubagentRegistryToDisk, })); vi.mock("./subagent-announce-queue.js", () => ({ @@ -58,7 +61,10 @@ describe("announce loop guard (#18264)", () => { afterEach(() => { vi.useRealTimers(); - vi.restoreAllMocks(); + loadSubagentRegistryFromDisk.mockReset(); + loadSubagentRegistryFromDisk.mockReturnValue(new Map()); + saveSubagentRegistryToDisk.mockClear(); + vi.clearAllMocks(); }); test("SubagentRunRecord has announceRetryCount and lastAnnounceRetryAt fields", async () => { @@ -99,7 +105,7 @@ describe("announce loop guard (#18264)", () => { const now = Date.now(); // Add a run that ended 10 minutes ago (well past ANNOUNCE_EXPIRY_MS of 5 min) // with 3 retries already attempted - registry.addSubagentRunForTests({ + const entry = { runId: "test-expired-loop", childSessionKey: "agent:main:subagent:expired-child", requesterSessionKey: "agent:main:main", @@ -111,7 +117,9 @@ describe("announce loop guard (#18264)", () => { endedAt: now - 10 * 60_000, // 10 minutes ago announceRetryCount: 3, lastAnnounceRetryAt: now - 9 * 60_000, - }); + }; + + loadSubagentRegistryFromDisk.mockReturnValue(new Map([[entry.runId, entry]])); // Initialize the registry — this triggers resumeSubagentRun for persisted entries registry.initSubagentRegistry(); @@ -119,5 +127,43 @@ describe("announce loop guard (#18264)", () => { // The announce flow should NOT be called because the entry has exceeded // both the retry count and the expiry window. expect(announceFn).not.toHaveBeenCalled(); + + const runs = registry.listSubagentRunsForRequester("agent:main:main"); + const stored = runs.find((run) => run.runId === entry.runId); + expect(stored?.cleanupCompletedAt).toBeDefined(); + }); + + test("entries over retry budget are marked completed without announcing", async () => { + const registry = await import("./subagent-registry.js"); + const { runSubagentAnnounceFlow } = await import("./subagent-announce.js"); + const announceFn = vi.mocked(runSubagentAnnounceFlow); + announceFn.mockClear(); + + registry.resetSubagentRegistryForTests(); + + const now = Date.now(); + const entry = { + runId: "test-retry-budget", + childSessionKey: "agent:main:subagent:retry-budget", + requesterSessionKey: "agent:main:main", + requesterDisplayKey: "agent:main:main", + task: "retry budget test", + cleanup: "keep", + createdAt: now - 2 * 60_000, + startedAt: now - 90_000, + endedAt: now - 60_000, + announceRetryCount: 3, + lastAnnounceRetryAt: now - 30_000, + }; + + loadSubagentRegistryFromDisk.mockReturnValue(new Map([[entry.runId, entry]])); + + registry.initSubagentRegistry(); + + expect(announceFn).not.toHaveBeenCalled(); + + const runs = registry.listSubagentRunsForRequester("agent:main:main"); + const stored = runs.find((run) => run.runId === entry.runId); + expect(stored?.cleanupCompletedAt).toBeDefined(); }); }); diff --git a/src/agents/subagent-registry.steer-restart.test.ts b/src/agents/subagent-registry.steer-restart.test.ts index 776fa3faff..4e036efe6c 100644 --- a/src/agents/subagent-registry.steer-restart.test.ts +++ b/src/agents/subagent-registry.steer-restart.test.ts @@ -105,6 +105,37 @@ describe("subagent registry steer restarts", () => { expect(announce.childRunId).toBe("run-new"); }); + it("clears announce retry state when replacing after steer restart", () => { + mod.registerSubagentRun({ + runId: "run-retry-reset-old", + childSessionKey: "agent:main:subagent:retry-reset", + requesterSessionKey: "agent:main:main", + requesterDisplayKey: "main", + task: "retry reset", + cleanup: "keep", + }); + + const previous = mod.listSubagentRunsForRequester("agent:main:main")[0]; + expect(previous?.runId).toBe("run-retry-reset-old"); + if (previous) { + previous.announceRetryCount = 2; + previous.lastAnnounceRetryAt = Date.now(); + } + + const replaced = mod.replaceSubagentRunAfterSteer({ + previousRunId: "run-retry-reset-old", + nextRunId: "run-retry-reset-new", + fallback: previous, + }); + expect(replaced).toBe(true); + + const runs = mod.listSubagentRunsForRequester("agent:main:main"); + expect(runs).toHaveLength(1); + expect(runs[0].runId).toBe("run-retry-reset-new"); + expect(runs[0].announceRetryCount).toBeUndefined(); + expect(runs[0].lastAnnounceRetryAt).toBeUndefined(); + }); + it("restores announce for a finished run when steer replacement dispatch fails", async () => { mod.registerSubagentRun({ runId: "run-failed-restart", diff --git a/src/agents/subagent-registry.ts b/src/agents/subagent-registry.ts index 77eb2e2180..a19f1bd55a 100644 --- a/src/agents/subagent-registry.ts +++ b/src/agents/subagent-registry.ts @@ -1,6 +1,7 @@ import { loadConfig } from "../config/config.js"; import { callGateway } from "../gateway/call.js"; import { onAgentEvent } from "../infra/agent-events.js"; +import { defaultRuntime } from "../runtime.js"; import { type DeliveryContext, normalizeDeliveryContext } from "../utils/delivery-context.js"; import { resetAnnounceQueuesForTests } from "./subagent-announce-queue.js"; import { runSubagentAnnounceFlow, type SubagentRunOutcome } from "./subagent-announce.js"; @@ -53,7 +54,16 @@ const MAX_ANNOUNCE_RETRY_COUNT = 3; * succeeded. Guards against stale registry entries surviving gateway restarts. */ const ANNOUNCE_EXPIRY_MS = 5 * 60_000; // 5 minutes -// (Backoff constant removed — max-retry + expiry guards are sufficient.) + +function logAnnounceGiveUp(entry: SubagentRunRecord, reason: "retry-limit" | "expiry") { + const retryCount = entry.announceRetryCount ?? 0; + const endedAgoMs = + typeof entry.endedAt === "number" ? Math.max(0, Date.now() - entry.endedAt) : undefined; + const endedAgoLabel = endedAgoMs != null ? `${Math.round(endedAgoMs / 1000)}s` : "n/a"; + defaultRuntime.log( + `[warn] Subagent announce give up (${reason}) run=${entry.runId} child=${entry.childSessionKey} requester=${entry.requesterSessionKey} retries=${retryCount} endedAgo=${endedAgoLabel}`, + ); +} function persistSubagentRuns() { try { @@ -107,11 +117,13 @@ function resumeSubagentRun(runId: string) { } // Skip entries that have exhausted their retry budget or expired (#18264). if ((entry.announceRetryCount ?? 0) >= MAX_ANNOUNCE_RETRY_COUNT) { + logAnnounceGiveUp(entry, "retry-limit"); entry.cleanupCompletedAt = Date.now(); persistSubagentRuns(); return; } if (typeof entry.endedAt === "number" && Date.now() - entry.endedAt > ANNOUNCE_EXPIRY_MS) { + logAnnounceGiveUp(entry, "expiry"); entry.cleanupCompletedAt = Date.now(); persistSubagentRuns(); return; @@ -283,15 +295,17 @@ function finalizeSubagentCleanup(runId: string, cleanup: "delete" | "keep", didA return; } if (!didAnnounce) { + const now = Date.now(); const retryCount = (entry.announceRetryCount ?? 0) + 1; entry.announceRetryCount = retryCount; - entry.lastAnnounceRetryAt = Date.now(); + entry.lastAnnounceRetryAt = now; // Check if the announce has exceeded retry limits or expired (#18264). - const endedAgo = typeof entry.endedAt === "number" ? Date.now() - entry.endedAt : 0; + const endedAgo = typeof entry.endedAt === "number" ? now - entry.endedAt : 0; if (retryCount >= MAX_ANNOUNCE_RETRY_COUNT || endedAgo > ANNOUNCE_EXPIRY_MS) { // Give up: mark as completed to break the infinite retry loop. - entry.cleanupCompletedAt = Date.now(); + logAnnounceGiveUp(entry, retryCount >= MAX_ANNOUNCE_RETRY_COUNT ? "retry-limit" : "expiry"); + entry.cleanupCompletedAt = now; persistSubagentRuns(); retryDeferredCompletedAnnounces(runId); return; @@ -332,6 +346,7 @@ function retryDeferredCompletedAnnounces(excludeRunId?: string) { // Force-expire announces that have been pending too long (#18264). const endedAgo = now - (entry.endedAt ?? now); if (endedAgo > ANNOUNCE_EXPIRY_MS) { + logAnnounceGiveUp(entry, "expiry"); entry.cleanupCompletedAt = now; persistSubagentRuns(); continue; diff --git a/src/web/test-helpers.ts b/src/web/test-helpers.ts index ae87104d15..bf9f5be44f 100644 --- a/src/web/test-helpers.ts +++ b/src/web/test-helpers.ts @@ -91,14 +91,22 @@ export function resetBaileysMocks() { const recreated = createMockBaileys(); (globalThis as Record)[Symbol.for("openclaw:lastSocket")] = recreated.lastSocket; - // @ts-expect-error - baileys.makeWASocket = vi.fn(recreated.mod.makeWASocket); - // @ts-expect-error - baileys.useMultiFileAuthState = vi.fn(recreated.mod.useMultiFileAuthState); - // @ts-expect-error - baileys.fetchLatestBaileysVersion = vi.fn(recreated.mod.fetchLatestBaileysVersion); - // @ts-expect-error - baileys.makeCacheableSignalKeyStore = vi.fn(recreated.mod.makeCacheableSignalKeyStore); + + const makeWASocket = vi.mocked(baileys.makeWASocket); + makeWASocket.mockReset(); + makeWASocket.mockImplementation(recreated.mod.makeWASocket); + + const useMultiFileAuthState = vi.mocked(baileys.useMultiFileAuthState); + useMultiFileAuthState.mockReset(); + useMultiFileAuthState.mockImplementation(recreated.mod.useMultiFileAuthState); + + const fetchLatestBaileysVersion = vi.mocked(baileys.fetchLatestBaileysVersion); + fetchLatestBaileysVersion.mockReset(); + fetchLatestBaileysVersion.mockImplementation(recreated.mod.fetchLatestBaileysVersion); + + const makeCacheableSignalKeyStore = vi.mocked(baileys.makeCacheableSignalKeyStore); + makeCacheableSignalKeyStore.mockReset(); + makeCacheableSignalKeyStore.mockImplementation(recreated.mod.makeCacheableSignalKeyStore); } export function getLastSocket(): MockBaileysSocket {