mirror of
https://github.com/openclaw/openclaw.git
synced 2026-02-19 18:39:20 -05:00
fix(subagents): harden announce retry guards
This commit is contained in:
@@ -56,6 +56,7 @@ Docs: https://docs.openclaw.ai
|
||||
- CLI/Status: fix `openclaw status --all` token summaries for bot-token-only channels so Mattermost/Zalo no longer show a bot+app warning. (#18527) Thanks @echo931.
|
||||
- Voice Call: add an optional stale call reaper (`staleCallReaperSeconds`) to end stuck calls when enabled. (#18437)
|
||||
- Auto-reply/Subagents: propagate group context (`groupId`, `groupChannel`, `space`) when spawning via `/subagents spawn`, matching tool-triggered subagent spawn behavior.
|
||||
- Subagents: cap announce retry loops with max attempts and expiry to prevent infinite retry spam after deferred announces. (#18444)
|
||||
- Agents/Tools/exec: add a preflight guard that detects likely shell env var injection (e.g. `$DM_JSON`, `$TMPDIR`) in Python/Node scripts before execution, preventing recurring cron failures and wasted tokens when models emit mixed shell+language source. (#12836)
|
||||
- Agents/Tools: make loop detection progress-aware and phased by hard-blocking known `process(action=poll|log)` no-progress loops, warning on generic identical-call repeats, warning + no-progress-blocking ping-pong alternation loops (10/20), coalescing repeated warning spam into threshold buckets (including canonical ping-pong pairs), adding a global circuit breaker at 30 no-progress repeats, and emitting structured diagnostic `tool.loop` warning/error events for loop actions. (#16808) Thanks @akramcodez and @beca-oc.
|
||||
- Agents/Tools: scope the `message` tool schema to the active channel so Telegram uses `buttons` and Discord uses `components`. (#18215) Thanks @obviyus.
|
||||
|
||||
@@ -38,9 +38,12 @@ vi.mock("./subagent-announce.js", () => ({
|
||||
runSubagentAnnounceFlow: vi.fn().mockResolvedValue(false),
|
||||
}));
|
||||
|
||||
const loadSubagentRegistryFromDisk = vi.fn(() => new Map());
|
||||
const saveSubagentRegistryToDisk = vi.fn();
|
||||
|
||||
vi.mock("./subagent-registry.store.js", () => ({
|
||||
loadSubagentRegistryFromDisk: () => new Map(),
|
||||
saveSubagentRegistryToDisk: vi.fn(),
|
||||
loadSubagentRegistryFromDisk,
|
||||
saveSubagentRegistryToDisk,
|
||||
}));
|
||||
|
||||
vi.mock("./subagent-announce-queue.js", () => ({
|
||||
@@ -58,7 +61,10 @@ describe("announce loop guard (#18264)", () => {
|
||||
|
||||
afterEach(() => {
|
||||
vi.useRealTimers();
|
||||
vi.restoreAllMocks();
|
||||
loadSubagentRegistryFromDisk.mockReset();
|
||||
loadSubagentRegistryFromDisk.mockReturnValue(new Map());
|
||||
saveSubagentRegistryToDisk.mockClear();
|
||||
vi.clearAllMocks();
|
||||
});
|
||||
|
||||
test("SubagentRunRecord has announceRetryCount and lastAnnounceRetryAt fields", async () => {
|
||||
@@ -99,7 +105,7 @@ describe("announce loop guard (#18264)", () => {
|
||||
const now = Date.now();
|
||||
// Add a run that ended 10 minutes ago (well past ANNOUNCE_EXPIRY_MS of 5 min)
|
||||
// with 3 retries already attempted
|
||||
registry.addSubagentRunForTests({
|
||||
const entry = {
|
||||
runId: "test-expired-loop",
|
||||
childSessionKey: "agent:main:subagent:expired-child",
|
||||
requesterSessionKey: "agent:main:main",
|
||||
@@ -111,7 +117,9 @@ describe("announce loop guard (#18264)", () => {
|
||||
endedAt: now - 10 * 60_000, // 10 minutes ago
|
||||
announceRetryCount: 3,
|
||||
lastAnnounceRetryAt: now - 9 * 60_000,
|
||||
});
|
||||
};
|
||||
|
||||
loadSubagentRegistryFromDisk.mockReturnValue(new Map([[entry.runId, entry]]));
|
||||
|
||||
// Initialize the registry — this triggers resumeSubagentRun for persisted entries
|
||||
registry.initSubagentRegistry();
|
||||
@@ -119,5 +127,43 @@ describe("announce loop guard (#18264)", () => {
|
||||
// The announce flow should NOT be called because the entry has exceeded
|
||||
// both the retry count and the expiry window.
|
||||
expect(announceFn).not.toHaveBeenCalled();
|
||||
|
||||
const runs = registry.listSubagentRunsForRequester("agent:main:main");
|
||||
const stored = runs.find((run) => run.runId === entry.runId);
|
||||
expect(stored?.cleanupCompletedAt).toBeDefined();
|
||||
});
|
||||
|
||||
test("entries over retry budget are marked completed without announcing", async () => {
|
||||
const registry = await import("./subagent-registry.js");
|
||||
const { runSubagentAnnounceFlow } = await import("./subagent-announce.js");
|
||||
const announceFn = vi.mocked(runSubagentAnnounceFlow);
|
||||
announceFn.mockClear();
|
||||
|
||||
registry.resetSubagentRegistryForTests();
|
||||
|
||||
const now = Date.now();
|
||||
const entry = {
|
||||
runId: "test-retry-budget",
|
||||
childSessionKey: "agent:main:subagent:retry-budget",
|
||||
requesterSessionKey: "agent:main:main",
|
||||
requesterDisplayKey: "agent:main:main",
|
||||
task: "retry budget test",
|
||||
cleanup: "keep",
|
||||
createdAt: now - 2 * 60_000,
|
||||
startedAt: now - 90_000,
|
||||
endedAt: now - 60_000,
|
||||
announceRetryCount: 3,
|
||||
lastAnnounceRetryAt: now - 30_000,
|
||||
};
|
||||
|
||||
loadSubagentRegistryFromDisk.mockReturnValue(new Map([[entry.runId, entry]]));
|
||||
|
||||
registry.initSubagentRegistry();
|
||||
|
||||
expect(announceFn).not.toHaveBeenCalled();
|
||||
|
||||
const runs = registry.listSubagentRunsForRequester("agent:main:main");
|
||||
const stored = runs.find((run) => run.runId === entry.runId);
|
||||
expect(stored?.cleanupCompletedAt).toBeDefined();
|
||||
});
|
||||
});
|
||||
|
||||
@@ -105,6 +105,37 @@ describe("subagent registry steer restarts", () => {
|
||||
expect(announce.childRunId).toBe("run-new");
|
||||
});
|
||||
|
||||
it("clears announce retry state when replacing after steer restart", () => {
|
||||
mod.registerSubagentRun({
|
||||
runId: "run-retry-reset-old",
|
||||
childSessionKey: "agent:main:subagent:retry-reset",
|
||||
requesterSessionKey: "agent:main:main",
|
||||
requesterDisplayKey: "main",
|
||||
task: "retry reset",
|
||||
cleanup: "keep",
|
||||
});
|
||||
|
||||
const previous = mod.listSubagentRunsForRequester("agent:main:main")[0];
|
||||
expect(previous?.runId).toBe("run-retry-reset-old");
|
||||
if (previous) {
|
||||
previous.announceRetryCount = 2;
|
||||
previous.lastAnnounceRetryAt = Date.now();
|
||||
}
|
||||
|
||||
const replaced = mod.replaceSubagentRunAfterSteer({
|
||||
previousRunId: "run-retry-reset-old",
|
||||
nextRunId: "run-retry-reset-new",
|
||||
fallback: previous,
|
||||
});
|
||||
expect(replaced).toBe(true);
|
||||
|
||||
const runs = mod.listSubagentRunsForRequester("agent:main:main");
|
||||
expect(runs).toHaveLength(1);
|
||||
expect(runs[0].runId).toBe("run-retry-reset-new");
|
||||
expect(runs[0].announceRetryCount).toBeUndefined();
|
||||
expect(runs[0].lastAnnounceRetryAt).toBeUndefined();
|
||||
});
|
||||
|
||||
it("restores announce for a finished run when steer replacement dispatch fails", async () => {
|
||||
mod.registerSubagentRun({
|
||||
runId: "run-failed-restart",
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import { loadConfig } from "../config/config.js";
|
||||
import { callGateway } from "../gateway/call.js";
|
||||
import { onAgentEvent } from "../infra/agent-events.js";
|
||||
import { defaultRuntime } from "../runtime.js";
|
||||
import { type DeliveryContext, normalizeDeliveryContext } from "../utils/delivery-context.js";
|
||||
import { resetAnnounceQueuesForTests } from "./subagent-announce-queue.js";
|
||||
import { runSubagentAnnounceFlow, type SubagentRunOutcome } from "./subagent-announce.js";
|
||||
@@ -53,7 +54,16 @@ const MAX_ANNOUNCE_RETRY_COUNT = 3;
|
||||
* succeeded. Guards against stale registry entries surviving gateway restarts.
|
||||
*/
|
||||
const ANNOUNCE_EXPIRY_MS = 5 * 60_000; // 5 minutes
|
||||
// (Backoff constant removed — max-retry + expiry guards are sufficient.)
|
||||
|
||||
function logAnnounceGiveUp(entry: SubagentRunRecord, reason: "retry-limit" | "expiry") {
|
||||
const retryCount = entry.announceRetryCount ?? 0;
|
||||
const endedAgoMs =
|
||||
typeof entry.endedAt === "number" ? Math.max(0, Date.now() - entry.endedAt) : undefined;
|
||||
const endedAgoLabel = endedAgoMs != null ? `${Math.round(endedAgoMs / 1000)}s` : "n/a";
|
||||
defaultRuntime.log(
|
||||
`[warn] Subagent announce give up (${reason}) run=${entry.runId} child=${entry.childSessionKey} requester=${entry.requesterSessionKey} retries=${retryCount} endedAgo=${endedAgoLabel}`,
|
||||
);
|
||||
}
|
||||
|
||||
function persistSubagentRuns() {
|
||||
try {
|
||||
@@ -107,11 +117,13 @@ function resumeSubagentRun(runId: string) {
|
||||
}
|
||||
// Skip entries that have exhausted their retry budget or expired (#18264).
|
||||
if ((entry.announceRetryCount ?? 0) >= MAX_ANNOUNCE_RETRY_COUNT) {
|
||||
logAnnounceGiveUp(entry, "retry-limit");
|
||||
entry.cleanupCompletedAt = Date.now();
|
||||
persistSubagentRuns();
|
||||
return;
|
||||
}
|
||||
if (typeof entry.endedAt === "number" && Date.now() - entry.endedAt > ANNOUNCE_EXPIRY_MS) {
|
||||
logAnnounceGiveUp(entry, "expiry");
|
||||
entry.cleanupCompletedAt = Date.now();
|
||||
persistSubagentRuns();
|
||||
return;
|
||||
@@ -283,15 +295,17 @@ function finalizeSubagentCleanup(runId: string, cleanup: "delete" | "keep", didA
|
||||
return;
|
||||
}
|
||||
if (!didAnnounce) {
|
||||
const now = Date.now();
|
||||
const retryCount = (entry.announceRetryCount ?? 0) + 1;
|
||||
entry.announceRetryCount = retryCount;
|
||||
entry.lastAnnounceRetryAt = Date.now();
|
||||
entry.lastAnnounceRetryAt = now;
|
||||
|
||||
// Check if the announce has exceeded retry limits or expired (#18264).
|
||||
const endedAgo = typeof entry.endedAt === "number" ? Date.now() - entry.endedAt : 0;
|
||||
const endedAgo = typeof entry.endedAt === "number" ? now - entry.endedAt : 0;
|
||||
if (retryCount >= MAX_ANNOUNCE_RETRY_COUNT || endedAgo > ANNOUNCE_EXPIRY_MS) {
|
||||
// Give up: mark as completed to break the infinite retry loop.
|
||||
entry.cleanupCompletedAt = Date.now();
|
||||
logAnnounceGiveUp(entry, retryCount >= MAX_ANNOUNCE_RETRY_COUNT ? "retry-limit" : "expiry");
|
||||
entry.cleanupCompletedAt = now;
|
||||
persistSubagentRuns();
|
||||
retryDeferredCompletedAnnounces(runId);
|
||||
return;
|
||||
@@ -332,6 +346,7 @@ function retryDeferredCompletedAnnounces(excludeRunId?: string) {
|
||||
// Force-expire announces that have been pending too long (#18264).
|
||||
const endedAgo = now - (entry.endedAt ?? now);
|
||||
if (endedAgo > ANNOUNCE_EXPIRY_MS) {
|
||||
logAnnounceGiveUp(entry, "expiry");
|
||||
entry.cleanupCompletedAt = now;
|
||||
persistSubagentRuns();
|
||||
continue;
|
||||
|
||||
@@ -91,14 +91,22 @@ export function resetBaileysMocks() {
|
||||
const recreated = createMockBaileys();
|
||||
(globalThis as Record<PropertyKey, unknown>)[Symbol.for("openclaw:lastSocket")] =
|
||||
recreated.lastSocket;
|
||||
// @ts-expect-error
|
||||
baileys.makeWASocket = vi.fn(recreated.mod.makeWASocket);
|
||||
// @ts-expect-error
|
||||
baileys.useMultiFileAuthState = vi.fn(recreated.mod.useMultiFileAuthState);
|
||||
// @ts-expect-error
|
||||
baileys.fetchLatestBaileysVersion = vi.fn(recreated.mod.fetchLatestBaileysVersion);
|
||||
// @ts-expect-error
|
||||
baileys.makeCacheableSignalKeyStore = vi.fn(recreated.mod.makeCacheableSignalKeyStore);
|
||||
|
||||
const makeWASocket = vi.mocked(baileys.makeWASocket);
|
||||
makeWASocket.mockReset();
|
||||
makeWASocket.mockImplementation(recreated.mod.makeWASocket);
|
||||
|
||||
const useMultiFileAuthState = vi.mocked(baileys.useMultiFileAuthState);
|
||||
useMultiFileAuthState.mockReset();
|
||||
useMultiFileAuthState.mockImplementation(recreated.mod.useMultiFileAuthState);
|
||||
|
||||
const fetchLatestBaileysVersion = vi.mocked(baileys.fetchLatestBaileysVersion);
|
||||
fetchLatestBaileysVersion.mockReset();
|
||||
fetchLatestBaileysVersion.mockImplementation(recreated.mod.fetchLatestBaileysVersion);
|
||||
|
||||
const makeCacheableSignalKeyStore = vi.mocked(baileys.makeCacheableSignalKeyStore);
|
||||
makeCacheableSignalKeyStore.mockReset();
|
||||
makeCacheableSignalKeyStore.mockImplementation(recreated.mod.makeCacheableSignalKeyStore);
|
||||
}
|
||||
|
||||
export function getLastSocket(): MockBaileysSocket {
|
||||
|
||||
Reference in New Issue
Block a user