From e5eb5b3e43d78a0beca7bd917f65eeadb73a033d Mon Sep 17 00:00:00 2001 From: Sk Akram Date: Tue, 17 Feb 2026 01:46:35 +0530 Subject: [PATCH] feat: add stuck loop detection and exponential backoff infrastructure for agent polling (#17118) Merged via /review-pr -> /prepare-pr -> /merge-pr. Prepared head SHA: eebabf679b983e5a660fb3cef371e1303f11f615 Co-authored-by: akramcodez <179671552+akramcodez@users.noreply.github.com> Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com> Reviewed-by: @gumadeiras --- CHANGELOG.md | 1 + src/agents/command-poll-backoff.test.ts | 173 ++++++ src/agents/command-poll-backoff.ts | 82 +++ .../pi-tools.before-tool-call.e2e.test.ts | 4 + src/agents/pi-tools.before-tool-call.test.ts | 363 ++++++++++++ src/agents/pi-tools.before-tool-call.ts | 126 ++++- src/agents/tool-loop-detection.test.ts | 422 ++++++++++++++ src/agents/tool-loop-detection.ts | 527 ++++++++++++++++++ src/infra/diagnostic-events.ts | 16 +- src/logging/diagnostic-session-state.ts | 11 + src/logging/diagnostic.ts | 46 ++ 11 files changed, 1769 insertions(+), 2 deletions(-) create mode 100644 src/agents/command-poll-backoff.test.ts create mode 100644 src/agents/command-poll-backoff.ts create mode 100644 src/agents/pi-tools.before-tool-call.test.ts create mode 100644 src/agents/tool-loop-detection.test.ts create mode 100644 src/agents/tool-loop-detection.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 6569dd0305..c2539747ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ Docs: https://docs.openclaw.ai - Gateway/Config: prevent `config.patch` object-array merges from falling back to full-array replacement when some patch entries lack `id`, so partial `agents.list` updates no longer drop unrelated agents. (#17989) Thanks @stakeswky. - Config/Discord: require string IDs in Discord allowlists, keep onboarding inputs string-only, and add doctor repair for numeric entries. (#18220) Thanks @thewilloftheshadow. - Agents/Models: probe the primary model when its auth-profile cooldown is near expiry (with per-provider throttling), so runs recover from temporary rate limits without staying on fallback models until restart. (#17478) Thanks @PlayerGhost. +- Agents/Tools: make loop detection progress-aware and phased by hard-blocking known `process(action=poll|log)` no-progress loops, warning on generic identical-call repeats, warning + no-progress-blocking ping-pong alternation loops (10/20), coalescing repeated warning spam into threshold buckets (including canonical ping-pong pairs), adding a global circuit breaker at 30 no-progress repeats, and emitting structured diagnostic `tool.loop` warning/error events for loop actions. (#16808) Thanks @akramcodez and @beca-oc. - Agents/Tools: scope the `message` tool schema to the active channel so Telegram uses `buttons` and Discord uses `components`. (#18215) Thanks @obviyus. - Discord: optimize reaction notification handling to skip unnecessary message fetches in `off`/`all`/`allowlist` modes, streamline reaction routing, and improve reaction emoji formatting. (#18248) Thanks @thewilloftheshadow and @victorGPT. - Telegram: keep draft-stream preview replies attached to the user message for `replyToMode: "all"` in groups and DMs, preserving threaded reply context from preview through finalization. (#17880) Thanks @yinghaosang. diff --git a/src/agents/command-poll-backoff.test.ts b/src/agents/command-poll-backoff.test.ts new file mode 100644 index 0000000000..a83272b386 --- /dev/null +++ b/src/agents/command-poll-backoff.test.ts @@ -0,0 +1,173 @@ +import { describe, expect, it } from "vitest"; +import type { SessionState } from "../logging/diagnostic-session-state.js"; +import { + calculateBackoffMs, + getCommandPollSuggestion, + pruneStaleCommandPolls, + recordCommandPoll, + resetCommandPollCount, +} from "./command-poll-backoff.js"; + +describe("command-poll-backoff", () => { + describe("calculateBackoffMs", () => { + it("returns 5s for first poll", () => { + expect(calculateBackoffMs(0)).toBe(5000); + }); + + it("returns 10s for second poll", () => { + expect(calculateBackoffMs(1)).toBe(10000); + }); + + it("returns 30s for third poll", () => { + expect(calculateBackoffMs(2)).toBe(30000); + }); + + it("returns 60s for fourth and subsequent polls (capped)", () => { + expect(calculateBackoffMs(3)).toBe(60000); + expect(calculateBackoffMs(4)).toBe(60000); + expect(calculateBackoffMs(10)).toBe(60000); + expect(calculateBackoffMs(100)).toBe(60000); + }); + }); + + describe("recordCommandPoll", () => { + it("returns 5s on first no-output poll", () => { + const state: SessionState = { + lastActivity: Date.now(), + state: "processing", + queueDepth: 0, + }; + const retryMs = recordCommandPoll(state, "cmd-123", false); + expect(retryMs).toBe(5000); + expect(state.commandPollCounts?.get("cmd-123")?.count).toBe(0); // First poll = index 0 + }); + + it("increments count and increases backoff on consecutive no-output polls", () => { + const state: SessionState = { + lastActivity: Date.now(), + state: "processing", + queueDepth: 0, + }; + + expect(recordCommandPoll(state, "cmd-123", false)).toBe(5000); // count=0 -> 5s + expect(recordCommandPoll(state, "cmd-123", false)).toBe(10000); // count=1 -> 10s + expect(recordCommandPoll(state, "cmd-123", false)).toBe(30000); // count=2 -> 30s + expect(recordCommandPoll(state, "cmd-123", false)).toBe(60000); // count=3 -> 60s + expect(recordCommandPoll(state, "cmd-123", false)).toBe(60000); // count=4 -> 60s (capped) + + expect(state.commandPollCounts?.get("cmd-123")?.count).toBe(4); // 5 polls = index 4 + }); + + it("resets count when poll returns new output", () => { + const state: SessionState = { + lastActivity: Date.now(), + state: "processing", + queueDepth: 0, + }; + + recordCommandPoll(state, "cmd-123", false); + recordCommandPoll(state, "cmd-123", false); + recordCommandPoll(state, "cmd-123", false); + expect(state.commandPollCounts?.get("cmd-123")?.count).toBe(2); // 3 polls = index 2 + + // New output resets count + const retryMs = recordCommandPoll(state, "cmd-123", true); + expect(retryMs).toBe(5000); // Back to first poll delay + expect(state.commandPollCounts?.get("cmd-123")?.count).toBe(0); + }); + + it("tracks different commands independently", () => { + const state: SessionState = { + lastActivity: Date.now(), + state: "processing", + queueDepth: 0, + }; + + recordCommandPoll(state, "cmd-1", false); + recordCommandPoll(state, "cmd-1", false); + recordCommandPoll(state, "cmd-2", false); + + expect(state.commandPollCounts?.get("cmd-1")?.count).toBe(1); // 2 polls = index 1 + expect(state.commandPollCounts?.get("cmd-2")?.count).toBe(0); // 1 poll = index 0 + }); + }); + + describe("getCommandPollSuggestion", () => { + it("returns undefined for untracked command", () => { + const state: SessionState = { + lastActivity: Date.now(), + state: "processing", + queueDepth: 0, + }; + expect(getCommandPollSuggestion(state, "unknown")).toBeUndefined(); + }); + + it("returns current backoff for tracked command", () => { + const state: SessionState = { + lastActivity: Date.now(), + state: "processing", + queueDepth: 0, + }; + + recordCommandPoll(state, "cmd-123", false); + recordCommandPoll(state, "cmd-123", false); + + expect(getCommandPollSuggestion(state, "cmd-123")).toBe(10000); + }); + }); + + describe("resetCommandPollCount", () => { + it("removes command from tracking", () => { + const state: SessionState = { + lastActivity: Date.now(), + state: "processing", + queueDepth: 0, + }; + + recordCommandPoll(state, "cmd-123", false); + expect(state.commandPollCounts?.has("cmd-123")).toBe(true); + + resetCommandPollCount(state, "cmd-123"); + expect(state.commandPollCounts?.has("cmd-123")).toBe(false); + }); + + it("is safe to call on untracked command", () => { + const state: SessionState = { + lastActivity: Date.now(), + state: "processing", + queueDepth: 0, + }; + + expect(() => resetCommandPollCount(state, "unknown")).not.toThrow(); + }); + }); + + describe("pruneStaleCommandPolls", () => { + it("removes polls older than maxAge", () => { + const state: SessionState = { + lastActivity: Date.now(), + state: "processing", + queueDepth: 0, + commandPollCounts: new Map([ + ["cmd-old", { count: 5, lastPollAt: Date.now() - 7200000 }], // 2 hours ago + ["cmd-new", { count: 3, lastPollAt: Date.now() - 1000 }], // 1 second ago + ]), + }; + + pruneStaleCommandPolls(state, 3600000); // 1 hour max age + + expect(state.commandPollCounts?.has("cmd-old")).toBe(false); + expect(state.commandPollCounts?.has("cmd-new")).toBe(true); + }); + + it("handles empty state gracefully", () => { + const state: SessionState = { + lastActivity: Date.now(), + state: "idle", + queueDepth: 0, + }; + + expect(() => pruneStaleCommandPolls(state)).not.toThrow(); + }); + }); +}); diff --git a/src/agents/command-poll-backoff.ts b/src/agents/command-poll-backoff.ts new file mode 100644 index 0000000000..d26134892f --- /dev/null +++ b/src/agents/command-poll-backoff.ts @@ -0,0 +1,82 @@ +import type { SessionState } from "../logging/diagnostic-session-state.js"; + +// Exponential backoff schedule for command polling +const BACKOFF_SCHEDULE_MS = [5000, 10000, 30000, 60000]; + +/** + * Calculate suggested retry delay based on consecutive no-output poll count. + * Implements exponential backoff schedule: 5s → 10s → 30s → 60s (capped). + */ +export function calculateBackoffMs(consecutiveNoOutputPolls: number): number { + const index = Math.min(consecutiveNoOutputPolls, BACKOFF_SCHEDULE_MS.length - 1); + return BACKOFF_SCHEDULE_MS[index] ?? 60000; +} + +/** + * Record a command poll and return suggested retry delay. + * @param state Session state to track polling in + * @param commandId Unique identifier for the command being polled + * @param hasNewOutput Whether this poll returned new output + * @returns Suggested delay in milliseconds before next poll + */ +export function recordCommandPoll( + state: SessionState, + commandId: string, + hasNewOutput: boolean, +): number { + if (!state.commandPollCounts) { + state.commandPollCounts = new Map(); + } + + const existing = state.commandPollCounts.get(commandId); + const now = Date.now(); + + if (hasNewOutput) { + state.commandPollCounts.set(commandId, { count: 0, lastPollAt: now }); + return BACKOFF_SCHEDULE_MS[0] ?? 5000; + } + + const newCount = (existing?.count ?? -1) + 1; + state.commandPollCounts.set(commandId, { count: newCount, lastPollAt: now }); + + return calculateBackoffMs(newCount); +} + +/** + * Get current suggested backoff for a command without modifying state. + * Useful for checking current backoff level. + */ +export function getCommandPollSuggestion( + state: SessionState, + commandId: string, +): number | undefined { + const pollData = state.commandPollCounts?.get(commandId); + if (!pollData) { + return undefined; + } + return calculateBackoffMs(pollData.count); +} + +/** + * Reset poll count for a command (e.g., when command completes). + */ +export function resetCommandPollCount(state: SessionState, commandId: string): void { + state.commandPollCounts?.delete(commandId); +} + +/** + * Prune stale command poll records (older than 1 hour). + * Call periodically to prevent memory bloat. + */ +export function pruneStaleCommandPolls(state: SessionState, maxAgeMs = 3600000): void { + if (!state.commandPollCounts) { + return; + } + + const now = Date.now(); + for (const [commandId, data] of state.commandPollCounts.entries()) { + if (now - data.lastPollAt > maxAgeMs) { + state.commandPollCounts.delete(commandId); + } + } +} diff --git a/src/agents/pi-tools.before-tool-call.e2e.test.ts b/src/agents/pi-tools.before-tool-call.e2e.test.ts index f6a81bf1fc..20145cb2af 100644 --- a/src/agents/pi-tools.before-tool-call.e2e.test.ts +++ b/src/agents/pi-tools.before-tool-call.e2e.test.ts @@ -1,4 +1,5 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; +import { resetDiagnosticSessionStateForTest } from "../logging/diagnostic-session-state.js"; import { getGlobalHookRunner } from "../plugins/hook-runner-global.js"; import { toClientToolDefinitions, toToolDefinitions } from "./pi-tool-definition-adapter.js"; import { wrapToolWithBeforeToolCallHook } from "./pi-tools.before-tool-call.js"; @@ -14,6 +15,7 @@ describe("before_tool_call hook integration", () => { }; beforeEach(() => { + resetDiagnosticSessionStateForTest(); hookRunner = { hasHooks: vi.fn(), runBeforeToolCall: vi.fn(), @@ -115,6 +117,7 @@ describe("before_tool_call hook deduplication (#15502)", () => { }; beforeEach(() => { + resetDiagnosticSessionStateForTest(); hookRunner = { hasHooks: vi.fn(() => true), runBeforeToolCall: vi.fn(async () => undefined), @@ -153,6 +156,7 @@ describe("before_tool_call hook integration for client tools", () => { }; beforeEach(() => { + resetDiagnosticSessionStateForTest(); hookRunner = { hasHooks: vi.fn(), runBeforeToolCall: vi.fn(), diff --git a/src/agents/pi-tools.before-tool-call.test.ts b/src/agents/pi-tools.before-tool-call.test.ts new file mode 100644 index 0000000000..34ae63ffc6 --- /dev/null +++ b/src/agents/pi-tools.before-tool-call.test.ts @@ -0,0 +1,363 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; +import type { AnyAgentTool } from "./tools/common.js"; +import { + onDiagnosticEvent, + resetDiagnosticEventsForTest, + type DiagnosticToolLoopEvent, +} from "../infra/diagnostic-events.js"; +import { resetDiagnosticSessionStateForTest } from "../logging/diagnostic-session-state.js"; +import { getGlobalHookRunner } from "../plugins/hook-runner-global.js"; +import { wrapToolWithBeforeToolCallHook } from "./pi-tools.before-tool-call.js"; +import { CRITICAL_THRESHOLD, GLOBAL_CIRCUIT_BREAKER_THRESHOLD } from "./tool-loop-detection.js"; + +vi.mock("../plugins/hook-runner-global.js"); + +const mockGetGlobalHookRunner = vi.mocked(getGlobalHookRunner); + +describe("before_tool_call loop detection behavior", () => { + let hookRunner: { + hasHooks: ReturnType; + runBeforeToolCall: ReturnType; + }; + + beforeEach(() => { + resetDiagnosticSessionStateForTest(); + resetDiagnosticEventsForTest(); + hookRunner = { + hasHooks: vi.fn(), + runBeforeToolCall: vi.fn(), + }; + // oxlint-disable-next-line typescript/no-explicit-any + mockGetGlobalHookRunner.mockReturnValue(hookRunner as any); + hookRunner.hasHooks.mockReturnValue(false); + }); + + it("blocks known poll loops when no progress repeats", async () => { + const execute = vi.fn().mockResolvedValue({ + content: [{ type: "text", text: "(no new output)\n\nProcess still running." }], + details: { status: "running", aggregated: "steady" }, + }); + // oxlint-disable-next-line typescript/no-explicit-any + const tool = wrapToolWithBeforeToolCallHook({ name: "process", execute } as any, { + agentId: "main", + sessionKey: "main", + }); + const params = { action: "poll", sessionId: "sess-1" }; + + for (let i = 0; i < CRITICAL_THRESHOLD; i += 1) { + await expect(tool.execute(`poll-${i}`, params, undefined, undefined)).resolves.toBeDefined(); + } + + await expect( + tool.execute(`poll-${CRITICAL_THRESHOLD}`, params, undefined, undefined), + ).rejects.toThrow("CRITICAL"); + }); + + it("does not block known poll loops when output progresses", async () => { + const execute = vi.fn().mockImplementation(async (toolCallId: string) => { + return { + content: [{ type: "text", text: `output ${toolCallId}` }], + details: { status: "running", aggregated: `output ${toolCallId}` }, + }; + }); + // oxlint-disable-next-line typescript/no-explicit-any + const tool = wrapToolWithBeforeToolCallHook({ name: "process", execute } as any, { + agentId: "main", + sessionKey: "main", + }); + const params = { action: "poll", sessionId: "sess-2" }; + + for (let i = 0; i < CRITICAL_THRESHOLD + 5; i += 1) { + await expect( + tool.execute(`poll-progress-${i}`, params, undefined, undefined), + ).resolves.toBeDefined(); + } + }); + + it("keeps generic repeated calls warn-only below global breaker", async () => { + const execute = vi.fn().mockResolvedValue({ + content: [{ type: "text", text: "same output" }], + details: { ok: true }, + }); + // oxlint-disable-next-line typescript/no-explicit-any + const tool = wrapToolWithBeforeToolCallHook({ name: "read", execute } as any, { + agentId: "main", + sessionKey: "main", + }); + const params = { path: "/tmp/file" }; + + for (let i = 0; i < CRITICAL_THRESHOLD + 5; i += 1) { + await expect(tool.execute(`read-${i}`, params, undefined, undefined)).resolves.toBeDefined(); + } + }); + + it("blocks generic repeated no-progress calls at global breaker threshold", async () => { + const execute = vi.fn().mockResolvedValue({ + content: [{ type: "text", text: "same output" }], + details: { ok: true }, + }); + // oxlint-disable-next-line typescript/no-explicit-any + const tool = wrapToolWithBeforeToolCallHook({ name: "read", execute } as any, { + agentId: "main", + sessionKey: "main", + }); + const params = { path: "/tmp/file" }; + + for (let i = 0; i < GLOBAL_CIRCUIT_BREAKER_THRESHOLD; i += 1) { + await expect(tool.execute(`read-${i}`, params, undefined, undefined)).resolves.toBeDefined(); + } + + await expect( + tool.execute(`read-${GLOBAL_CIRCUIT_BREAKER_THRESHOLD}`, params, undefined, undefined), + ).rejects.toThrow("global circuit breaker"); + }); + + it("coalesces repeated generic warning events into threshold buckets", async () => { + const emitted: DiagnosticToolLoopEvent[] = []; + const stop = onDiagnosticEvent((evt) => { + if (evt.type === "tool.loop" && evt.level === "warning") { + emitted.push(evt); + } + }); + try { + const execute = vi.fn().mockResolvedValue({ + content: [{ type: "text", text: "same output" }], + details: { ok: true }, + }); + // oxlint-disable-next-line typescript/no-explicit-any + const tool = wrapToolWithBeforeToolCallHook({ name: "read", execute } as any, { + agentId: "main", + sessionKey: "main", + }); + const params = { path: "/tmp/file" }; + + for (let i = 0; i < 21; i += 1) { + await tool.execute(`read-bucket-${i}`, params, undefined, undefined); + } + + const genericWarns = emitted.filter((evt) => evt.detector === "generic_repeat"); + expect(genericWarns.map((evt) => evt.count)).toEqual([10, 20]); + } finally { + stop(); + } + }); + + it("emits structured warning diagnostic events for ping-pong loops", async () => { + const emitted: DiagnosticToolLoopEvent[] = []; + const stop = onDiagnosticEvent((evt) => { + if (evt.type === "tool.loop") { + emitted.push(evt); + } + }); + try { + const readExecute = vi.fn().mockResolvedValue({ + content: [{ type: "text", text: "read ok" }], + details: { ok: true }, + }); + const listExecute = vi.fn().mockResolvedValue({ + content: [{ type: "text", text: "list ok" }], + details: { ok: true }, + }); + const readTool = wrapToolWithBeforeToolCallHook( + { name: "read", execute: readExecute } as unknown as AnyAgentTool, + { + agentId: "main", + sessionKey: "main", + }, + ); + const listTool = wrapToolWithBeforeToolCallHook( + { name: "list", execute: listExecute } as unknown as AnyAgentTool, + { + agentId: "main", + sessionKey: "main", + }, + ); + + for (let i = 0; i < 9; i += 1) { + if (i % 2 === 0) { + await readTool.execute(`read-${i}`, { path: "/a.txt" }, undefined, undefined); + } else { + await listTool.execute(`list-${i}`, { dir: "/workspace" }, undefined, undefined); + } + } + + await listTool.execute("list-9", { dir: "/workspace" }, undefined, undefined); + await readTool.execute("read-10", { path: "/a.txt" }, undefined, undefined); + await listTool.execute("list-11", { dir: "/workspace" }, undefined, undefined); + + const pingPongWarns = emitted.filter( + (evt) => evt.level === "warning" && evt.detector === "ping_pong", + ); + expect(pingPongWarns).toHaveLength(1); + const loopEvent = pingPongWarns[0]; + expect(loopEvent?.type).toBe("tool.loop"); + expect(loopEvent?.level).toBe("warning"); + expect(loopEvent?.action).toBe("warn"); + expect(loopEvent?.detector).toBe("ping_pong"); + expect(loopEvent?.count).toBe(10); + expect(loopEvent?.toolName).toBe("list"); + } finally { + stop(); + } + }); + + it("blocks ping-pong loops at critical threshold and emits critical diagnostic events", async () => { + const emitted: DiagnosticToolLoopEvent[] = []; + const stop = onDiagnosticEvent((evt) => { + if (evt.type === "tool.loop") { + emitted.push(evt); + } + }); + try { + const readExecute = vi.fn().mockResolvedValue({ + content: [{ type: "text", text: "read ok" }], + details: { ok: true }, + }); + const listExecute = vi.fn().mockResolvedValue({ + content: [{ type: "text", text: "list ok" }], + details: { ok: true }, + }); + const readTool = wrapToolWithBeforeToolCallHook( + { name: "read", execute: readExecute } as unknown as AnyAgentTool, + { + agentId: "main", + sessionKey: "main", + }, + ); + const listTool = wrapToolWithBeforeToolCallHook( + { name: "list", execute: listExecute } as unknown as AnyAgentTool, + { + agentId: "main", + sessionKey: "main", + }, + ); + + for (let i = 0; i < CRITICAL_THRESHOLD - 1; i += 1) { + if (i % 2 === 0) { + await readTool.execute(`read-${i}`, { path: "/a.txt" }, undefined, undefined); + } else { + await listTool.execute(`list-${i}`, { dir: "/workspace" }, undefined, undefined); + } + } + + await expect( + listTool.execute( + `list-${CRITICAL_THRESHOLD - 1}`, + { dir: "/workspace" }, + undefined, + undefined, + ), + ).rejects.toThrow("CRITICAL"); + + const loopEvent = emitted.at(-1); + expect(loopEvent?.type).toBe("tool.loop"); + expect(loopEvent?.level).toBe("critical"); + expect(loopEvent?.action).toBe("block"); + expect(loopEvent?.detector).toBe("ping_pong"); + expect(loopEvent?.count).toBe(CRITICAL_THRESHOLD); + expect(loopEvent?.toolName).toBe("list"); + } finally { + stop(); + } + }); + + it("does not block ping-pong at critical threshold when outcomes are progressing", async () => { + const emitted: DiagnosticToolLoopEvent[] = []; + const stop = onDiagnosticEvent((evt) => { + if (evt.type === "tool.loop") { + emitted.push(evt); + } + }); + try { + const readExecute = vi.fn().mockImplementation(async (toolCallId: string) => ({ + content: [{ type: "text", text: `read ${toolCallId}` }], + details: { ok: true }, + })); + const listExecute = vi.fn().mockImplementation(async (toolCallId: string) => ({ + content: [{ type: "text", text: `list ${toolCallId}` }], + details: { ok: true }, + })); + const readTool = wrapToolWithBeforeToolCallHook( + { name: "read", execute: readExecute } as unknown as AnyAgentTool, + { + agentId: "main", + sessionKey: "main", + }, + ); + const listTool = wrapToolWithBeforeToolCallHook( + { name: "list", execute: listExecute } as unknown as AnyAgentTool, + { + agentId: "main", + sessionKey: "main", + }, + ); + + for (let i = 0; i < CRITICAL_THRESHOLD - 1; i += 1) { + if (i % 2 === 0) { + await readTool.execute(`read-${i}`, { path: "/a.txt" }, undefined, undefined); + } else { + await listTool.execute(`list-${i}`, { dir: "/workspace" }, undefined, undefined); + } + } + + await expect( + listTool.execute( + `list-${CRITICAL_THRESHOLD - 1}`, + { dir: "/workspace" }, + undefined, + undefined, + ), + ).resolves.toBeDefined(); + + const criticalPingPong = emitted.find( + (evt) => evt.level === "critical" && evt.detector === "ping_pong", + ); + expect(criticalPingPong).toBeUndefined(); + const warningPingPong = emitted.find( + (evt) => evt.level === "warning" && evt.detector === "ping_pong", + ); + expect(warningPingPong).toBeTruthy(); + } finally { + stop(); + } + }); + + it("emits structured critical diagnostic events when blocking loops", async () => { + const emitted: DiagnosticToolLoopEvent[] = []; + const stop = onDiagnosticEvent((evt) => { + if (evt.type === "tool.loop") { + emitted.push(evt); + } + }); + try { + const execute = vi.fn().mockResolvedValue({ + content: [{ type: "text", text: "(no new output)\n\nProcess still running." }], + details: { status: "running", aggregated: "steady" }, + }); + // oxlint-disable-next-line typescript/no-explicit-any + const tool = wrapToolWithBeforeToolCallHook({ name: "process", execute } as any, { + agentId: "main", + sessionKey: "main", + }); + const params = { action: "poll", sessionId: "sess-crit" }; + + for (let i = 0; i < CRITICAL_THRESHOLD; i += 1) { + await tool.execute(`poll-${i}`, params, undefined, undefined); + } + + await expect( + tool.execute(`poll-${CRITICAL_THRESHOLD}`, params, undefined, undefined), + ).rejects.toThrow("CRITICAL"); + + const loopEvent = emitted.at(-1); + expect(loopEvent?.type).toBe("tool.loop"); + expect(loopEvent?.level).toBe("critical"); + expect(loopEvent?.action).toBe("block"); + expect(loopEvent?.detector).toBe("known_poll_no_progress"); + expect(loopEvent?.count).toBe(CRITICAL_THRESHOLD); + expect(loopEvent?.toolName).toBe("process"); + } finally { + stop(); + } + }); +}); diff --git a/src/agents/pi-tools.before-tool-call.ts b/src/agents/pi-tools.before-tool-call.ts index 26761f3127..44ad96a8cc 100644 --- a/src/agents/pi-tools.before-tool-call.ts +++ b/src/agents/pi-tools.before-tool-call.ts @@ -1,3 +1,4 @@ +import type { SessionState } from "../logging/diagnostic-session-state.js"; import type { AnyAgentTool } from "./tools/common.js"; import { createSubsystemLogger } from "../logging/subsystem.js"; import { getGlobalHookRunner } from "../plugins/hook-runner-global.js"; @@ -15,6 +16,57 @@ const log = createSubsystemLogger("agents/tools"); const BEFORE_TOOL_CALL_WRAPPED = Symbol("beforeToolCallWrapped"); const adjustedParamsByToolCallId = new Map(); const MAX_TRACKED_ADJUSTED_PARAMS = 1024; +const LOOP_WARNING_BUCKET_SIZE = 10; +const MAX_LOOP_WARNING_KEYS = 256; + +function shouldEmitLoopWarning(state: SessionState, warningKey: string, count: number): boolean { + if (!state.toolLoopWarningBuckets) { + state.toolLoopWarningBuckets = new Map(); + } + const bucket = Math.floor(count / LOOP_WARNING_BUCKET_SIZE); + const lastBucket = state.toolLoopWarningBuckets.get(warningKey) ?? 0; + if (bucket <= lastBucket) { + return false; + } + state.toolLoopWarningBuckets.set(warningKey, bucket); + if (state.toolLoopWarningBuckets.size > MAX_LOOP_WARNING_KEYS) { + const oldest = state.toolLoopWarningBuckets.keys().next().value; + if (oldest) { + state.toolLoopWarningBuckets.delete(oldest); + } + } + return true; +} + +async function recordLoopOutcome(args: { + ctx?: HookContext; + toolName: string; + toolParams: unknown; + toolCallId?: string; + result?: unknown; + error?: unknown; +}): Promise { + if (!args.ctx?.sessionKey) { + return; + } + try { + const { getDiagnosticSessionState } = await import("../logging/diagnostic-session-state.js"); + const { recordToolCallOutcome } = await import("./tool-loop-detection.js"); + const sessionState = getDiagnosticSessionState({ + sessionKey: args.ctx.sessionKey, + sessionId: args.ctx?.agentId, + }); + recordToolCallOutcome(sessionState, { + toolName: args.toolName, + toolParams: args.toolParams, + toolCallId: args.toolCallId, + result: args.result, + error: args.error, + }); + } catch (err) { + log.warn(`tool loop outcome tracking failed: tool=${args.toolName} error=${String(err)}`); + } +} export async function runBeforeToolCallHook(args: { toolName: string; @@ -25,6 +77,58 @@ export async function runBeforeToolCallHook(args: { const toolName = normalizeToolName(args.toolName || "tool"); const params = args.params; + if (args.ctx?.sessionKey) { + const { getDiagnosticSessionState } = await import("../logging/diagnostic-session-state.js"); + const { logToolLoopAction } = await import("../logging/diagnostic.js"); + const { detectToolCallLoop, recordToolCall } = await import("./tool-loop-detection.js"); + + const sessionState = getDiagnosticSessionState({ + sessionKey: args.ctx.sessionKey, + sessionId: args.ctx?.agentId, + }); + + const loopResult = detectToolCallLoop(sessionState, toolName, params); + + if (loopResult.stuck) { + if (loopResult.level === "critical") { + log.error(`Blocking ${toolName} due to critical loop: ${loopResult.message}`); + logToolLoopAction({ + sessionKey: args.ctx.sessionKey, + sessionId: args.ctx?.agentId, + toolName, + level: "critical", + action: "block", + detector: loopResult.detector, + count: loopResult.count, + message: loopResult.message, + pairedToolName: loopResult.pairedToolName, + }); + return { + blocked: true, + reason: loopResult.message, + }; + } else { + const warningKey = loopResult.warningKey ?? `${loopResult.detector}:${toolName}`; + if (shouldEmitLoopWarning(sessionState, warningKey, loopResult.count)) { + log.warn(`Loop warning for ${toolName}: ${loopResult.message}`); + logToolLoopAction({ + sessionKey: args.ctx.sessionKey, + sessionId: args.ctx?.agentId, + toolName, + level: "warning", + action: "warn", + detector: loopResult.detector, + count: loopResult.count, + message: loopResult.message, + pairedToolName: loopResult.pairedToolName, + }); + } + } + } + + recordToolCall(sessionState, toolName, params, args.toolCallId); + } + const hookRunner = getGlobalHookRunner(); if (!hookRunner?.hasHooks("before_tool_call")) { return { blocked: false, params: args.params }; @@ -95,7 +199,27 @@ export function wrapToolWithBeforeToolCallHook( } } } - return await execute(toolCallId, outcome.params, signal, onUpdate); + const normalizedToolName = normalizeToolName(toolName || "tool"); + try { + const result = await execute(toolCallId, outcome.params, signal, onUpdate); + await recordLoopOutcome({ + ctx, + toolName: normalizedToolName, + toolParams: outcome.params, + toolCallId, + result, + }); + return result; + } catch (err) { + await recordLoopOutcome({ + ctx, + toolName: normalizedToolName, + toolParams: outcome.params, + toolCallId, + error: err, + }); + throw err; + } }, }; Object.defineProperty(wrappedTool, BEFORE_TOOL_CALL_WRAPPED, { diff --git a/src/agents/tool-loop-detection.test.ts b/src/agents/tool-loop-detection.test.ts new file mode 100644 index 0000000000..1955c6634f --- /dev/null +++ b/src/agents/tool-loop-detection.test.ts @@ -0,0 +1,422 @@ +import { describe, expect, it } from "vitest"; +import type { SessionState } from "../logging/diagnostic-session-state.js"; +import { + CRITICAL_THRESHOLD, + GLOBAL_CIRCUIT_BREAKER_THRESHOLD, + TOOL_CALL_HISTORY_SIZE, + WARNING_THRESHOLD, + detectToolCallLoop, + getToolCallStats, + hashToolCall, + recordToolCall, + recordToolCallOutcome, +} from "./tool-loop-detection.js"; + +function createState(): SessionState { + return { + lastActivity: Date.now(), + state: "processing", + queueDepth: 0, + }; +} + +function recordSuccessfulCall( + state: SessionState, + toolName: string, + params: unknown, + result: unknown, + index: number, +): void { + const toolCallId = `${toolName}-${index}`; + recordToolCall(state, toolName, params, toolCallId); + recordToolCallOutcome(state, { + toolName, + toolParams: params, + toolCallId, + result, + }); +} + +describe("tool-loop-detection", () => { + describe("hashToolCall", () => { + it("creates consistent hash for same tool and params", () => { + const hash1 = hashToolCall("read", { path: "/file.txt" }); + const hash2 = hashToolCall("read", { path: "/file.txt" }); + expect(hash1).toBe(hash2); + }); + + it("creates different hashes for different params", () => { + const hash1 = hashToolCall("read", { path: "/file1.txt" }); + const hash2 = hashToolCall("read", { path: "/file2.txt" }); + expect(hash1).not.toBe(hash2); + }); + + it("creates different hashes for different tools", () => { + const hash1 = hashToolCall("read", { path: "/file.txt" }); + const hash2 = hashToolCall("write", { path: "/file.txt" }); + expect(hash1).not.toBe(hash2); + }); + + it("handles non-object params", () => { + expect(() => hashToolCall("tool", "string-param")).not.toThrow(); + expect(() => hashToolCall("tool", 123)).not.toThrow(); + expect(() => hashToolCall("tool", null)).not.toThrow(); + }); + + it("produces deterministic hashes regardless of key order", () => { + const hash1 = hashToolCall("tool", { a: 1, b: 2 }); + const hash2 = hashToolCall("tool", { b: 2, a: 1 }); + expect(hash1).toBe(hash2); + }); + + it("keeps hashes fixed-size even for large params", () => { + const payload = { data: "x".repeat(20_000) }; + const hash = hashToolCall("read", payload); + expect(hash.startsWith("read:")).toBe(true); + expect(hash.length).toBe("read:".length + 64); + }); + }); + + describe("recordToolCall", () => { + it("adds tool call to empty history", () => { + const state = createState(); + + recordToolCall(state, "read", { path: "/file.txt" }, "call-1"); + + expect(state.toolCallHistory).toHaveLength(1); + expect(state.toolCallHistory?.[0]?.toolName).toBe("read"); + expect(state.toolCallHistory?.[0]?.toolCallId).toBe("call-1"); + }); + + it("maintains sliding window of last N calls", () => { + const state = createState(); + + for (let i = 0; i < TOOL_CALL_HISTORY_SIZE + 10; i += 1) { + recordToolCall(state, "tool", { iteration: i }, `call-${i}`); + } + + expect(state.toolCallHistory).toHaveLength(TOOL_CALL_HISTORY_SIZE); + + const oldestCall = state.toolCallHistory?.[0]; + expect(oldestCall?.argsHash).toBe(hashToolCall("tool", { iteration: 10 })); + }); + + it("records timestamp for each call", () => { + const state = createState(); + const before = Date.now(); + recordToolCall(state, "tool", { arg: 1 }, "call-ts"); + const after = Date.now(); + + const timestamp = state.toolCallHistory?.[0]?.timestamp ?? 0; + expect(timestamp).toBeGreaterThanOrEqual(before); + expect(timestamp).toBeLessThanOrEqual(after); + }); + }); + + describe("detectToolCallLoop", () => { + it("does not flag unique tool calls", () => { + const state = createState(); + + for (let i = 0; i < 15; i += 1) { + recordToolCall(state, "read", { path: `/file${i}.txt` }, `call-${i}`); + } + + const result = detectToolCallLoop(state, "read", { path: "/new-file.txt" }); + expect(result.stuck).toBe(false); + }); + + it("warns on generic repeated tool+args calls", () => { + const state = createState(); + for (let i = 0; i < WARNING_THRESHOLD; i += 1) { + recordToolCall(state, "read", { path: "/same.txt" }, `warn-${i}`); + } + + const result = detectToolCallLoop(state, "read", { path: "/same.txt" }); + + expect(result.stuck).toBe(true); + if (result.stuck) { + expect(result.level).toBe("warning"); + expect(result.detector).toBe("generic_repeat"); + expect(result.count).toBe(WARNING_THRESHOLD); + expect(result.message).toContain("WARNING"); + expect(result.message).toContain(`${WARNING_THRESHOLD} times`); + } + }); + + it("keeps generic loops warn-only below global breaker threshold", () => { + const state = createState(); + const params = { path: "/same.txt" }; + const result = { + content: [{ type: "text", text: "same output" }], + details: { ok: true }, + }; + + for (let i = 0; i < CRITICAL_THRESHOLD; i += 1) { + recordSuccessfulCall(state, "read", params, result, i); + } + + const loopResult = detectToolCallLoop(state, "read", params); + expect(loopResult.stuck).toBe(true); + if (loopResult.stuck) { + expect(loopResult.level).toBe("warning"); + } + }); + + it("warns for known polling no-progress loops", () => { + const state = createState(); + const params = { action: "poll", sessionId: "sess-1" }; + const result = { + content: [{ type: "text", text: "(no new output)\n\nProcess still running." }], + details: { status: "running", aggregated: "steady" }, + }; + + for (let i = 0; i < WARNING_THRESHOLD; i += 1) { + recordSuccessfulCall(state, "process", params, result, i); + } + + const loopResult = detectToolCallLoop(state, "process", params); + expect(loopResult.stuck).toBe(true); + if (loopResult.stuck) { + expect(loopResult.level).toBe("warning"); + expect(loopResult.detector).toBe("known_poll_no_progress"); + expect(loopResult.message).toContain("no progress"); + } + }); + + it("blocks known polling no-progress loops at critical threshold", () => { + const state = createState(); + const params = { action: "poll", sessionId: "sess-1" }; + const result = { + content: [{ type: "text", text: "(no new output)\n\nProcess still running." }], + details: { status: "running", aggregated: "steady" }, + }; + + for (let i = 0; i < CRITICAL_THRESHOLD; i += 1) { + recordSuccessfulCall(state, "process", params, result, i); + } + + const loopResult = detectToolCallLoop(state, "process", params); + expect(loopResult.stuck).toBe(true); + if (loopResult.stuck) { + expect(loopResult.level).toBe("critical"); + expect(loopResult.detector).toBe("known_poll_no_progress"); + expect(loopResult.message).toContain("CRITICAL"); + } + }); + + it("does not block known polling when output progresses", () => { + const state = createState(); + const params = { action: "poll", sessionId: "sess-1" }; + + for (let i = 0; i < CRITICAL_THRESHOLD + 5; i += 1) { + const result = { + content: [{ type: "text", text: `line ${i}` }], + details: { status: "running", aggregated: `line ${i}` }, + }; + recordSuccessfulCall(state, "process", params, result, i); + } + + const loopResult = detectToolCallLoop(state, "process", params); + expect(loopResult.stuck).toBe(false); + }); + + it("blocks any tool with global no-progress breaker at 30", () => { + const state = createState(); + const params = { path: "/same.txt" }; + const result = { + content: [{ type: "text", text: "same output" }], + details: { ok: true }, + }; + + for (let i = 0; i < GLOBAL_CIRCUIT_BREAKER_THRESHOLD; i += 1) { + recordSuccessfulCall(state, "read", params, result, i); + } + + const loopResult = detectToolCallLoop(state, "read", params); + expect(loopResult.stuck).toBe(true); + if (loopResult.stuck) { + expect(loopResult.level).toBe("critical"); + expect(loopResult.detector).toBe("global_circuit_breaker"); + expect(loopResult.message).toContain("global circuit breaker"); + } + }); + + it("warns on ping-pong alternating patterns", () => { + const state = createState(); + const readParams = { path: "/a.txt" }; + const listParams = { dir: "/workspace" }; + + for (let i = 0; i < WARNING_THRESHOLD - 1; i += 1) { + if (i % 2 === 0) { + recordToolCall(state, "read", readParams, `read-${i}`); + } else { + recordToolCall(state, "list", listParams, `list-${i}`); + } + } + + const loopResult = detectToolCallLoop(state, "list", listParams); + expect(loopResult.stuck).toBe(true); + if (loopResult.stuck) { + expect(loopResult.level).toBe("warning"); + expect(loopResult.detector).toBe("ping_pong"); + expect(loopResult.count).toBe(WARNING_THRESHOLD); + expect(loopResult.message).toContain("ping-pong loop"); + } + }); + + it("blocks ping-pong alternating patterns at critical threshold", () => { + const state = createState(); + const readParams = { path: "/a.txt" }; + const listParams = { dir: "/workspace" }; + + for (let i = 0; i < CRITICAL_THRESHOLD - 1; i += 1) { + if (i % 2 === 0) { + recordSuccessfulCall( + state, + "read", + readParams, + { content: [{ type: "text", text: "read stable" }], details: { ok: true } }, + i, + ); + } else { + recordSuccessfulCall( + state, + "list", + listParams, + { content: [{ type: "text", text: "list stable" }], details: { ok: true } }, + i, + ); + } + } + + const loopResult = detectToolCallLoop(state, "list", listParams); + expect(loopResult.stuck).toBe(true); + if (loopResult.stuck) { + expect(loopResult.level).toBe("critical"); + expect(loopResult.detector).toBe("ping_pong"); + expect(loopResult.count).toBe(CRITICAL_THRESHOLD); + expect(loopResult.message).toContain("CRITICAL"); + expect(loopResult.message).toContain("ping-pong loop"); + } + }); + + it("does not block ping-pong at critical threshold when outcomes are progressing", () => { + const state = createState(); + const readParams = { path: "/a.txt" }; + const listParams = { dir: "/workspace" }; + + for (let i = 0; i < CRITICAL_THRESHOLD - 1; i += 1) { + if (i % 2 === 0) { + recordSuccessfulCall( + state, + "read", + readParams, + { content: [{ type: "text", text: `read ${i}` }], details: { ok: true } }, + i, + ); + } else { + recordSuccessfulCall( + state, + "list", + listParams, + { content: [{ type: "text", text: `list ${i}` }], details: { ok: true } }, + i, + ); + } + } + + const loopResult = detectToolCallLoop(state, "list", listParams); + expect(loopResult.stuck).toBe(true); + if (loopResult.stuck) { + expect(loopResult.level).toBe("warning"); + expect(loopResult.detector).toBe("ping_pong"); + expect(loopResult.count).toBe(CRITICAL_THRESHOLD); + } + }); + + it("does not flag ping-pong when alternation is broken", () => { + const state = createState(); + recordToolCall(state, "read", { path: "/a.txt" }, "a1"); + recordToolCall(state, "list", { dir: "/workspace" }, "b1"); + recordToolCall(state, "read", { path: "/a.txt" }, "a2"); + recordToolCall(state, "write", { path: "/tmp/out.txt" }, "c1"); // breaks alternation + + const loopResult = detectToolCallLoop(state, "list", { dir: "/workspace" }); + expect(loopResult.stuck).toBe(false); + }); + + it("records fixed-size result hashes for large tool outputs", () => { + const state = createState(); + const params = { action: "log", sessionId: "sess-big" }; + const toolCallId = "log-big"; + recordToolCall(state, "process", params, toolCallId); + recordToolCallOutcome(state, { + toolName: "process", + toolParams: params, + toolCallId, + result: { + content: [{ type: "text", text: "y".repeat(40_000) }], + details: { status: "running", totalLines: 1, totalChars: 40_000 }, + }, + }); + + const entry = state.toolCallHistory?.find((call) => call.toolCallId === toolCallId); + expect(typeof entry?.resultHash).toBe("string"); + expect(entry?.resultHash?.length).toBe(64); + }); + + it("handles empty history", () => { + const state = createState(); + + const result = detectToolCallLoop(state, "tool", { arg: 1 }); + expect(result.stuck).toBe(false); + }); + }); + + describe("getToolCallStats", () => { + it("returns zero stats for empty history", () => { + const state = createState(); + + const stats = getToolCallStats(state); + expect(stats.totalCalls).toBe(0); + expect(stats.uniquePatterns).toBe(0); + expect(stats.mostFrequent).toBeNull(); + }); + + it("counts total calls and unique patterns", () => { + const state = createState(); + + for (let i = 0; i < 5; i += 1) { + recordToolCall(state, "read", { path: "/file.txt" }, `same-${i}`); + } + + recordToolCall(state, "write", { path: "/output.txt" }, "write-1"); + recordToolCall(state, "list", { dir: "/home" }, "list-1"); + recordToolCall(state, "read", { path: "/other.txt" }, "read-other"); + + const stats = getToolCallStats(state); + expect(stats.totalCalls).toBe(8); + expect(stats.uniquePatterns).toBe(4); + }); + + it("identifies most frequent pattern", () => { + const state = createState(); + + for (let i = 0; i < 3; i += 1) { + recordToolCall(state, "read", { path: "/file1.txt" }, `p1-${i}`); + } + + for (let i = 0; i < 7; i += 1) { + recordToolCall(state, "read", { path: "/file2.txt" }, `p2-${i}`); + } + + for (let i = 0; i < 2; i += 1) { + recordToolCall(state, "write", { path: "/output.txt" }, `p3-${i}`); + } + + const stats = getToolCallStats(state); + expect(stats.mostFrequent?.toolName).toBe("read"); + expect(stats.mostFrequent?.count).toBe(7); + }); + }); +}); diff --git a/src/agents/tool-loop-detection.ts b/src/agents/tool-loop-detection.ts new file mode 100644 index 0000000000..b32ad4cb2f --- /dev/null +++ b/src/agents/tool-loop-detection.ts @@ -0,0 +1,527 @@ +import { createHash } from "node:crypto"; +import type { SessionState } from "../logging/diagnostic-session-state.js"; +import { createSubsystemLogger } from "../logging/subsystem.js"; +import { isPlainObject } from "../utils.js"; + +const log = createSubsystemLogger("agents/loop-detection"); + +export type LoopDetectorKind = + | "generic_repeat" + | "known_poll_no_progress" + | "global_circuit_breaker" + | "ping_pong"; + +export type LoopDetectionResult = + | { stuck: false } + | { + stuck: true; + level: "warning" | "critical"; + detector: LoopDetectorKind; + count: number; + message: string; + pairedToolName?: string; + warningKey?: string; + }; + +export const TOOL_CALL_HISTORY_SIZE = 30; +export const WARNING_THRESHOLD = 10; +export const CRITICAL_THRESHOLD = 20; +export const GLOBAL_CIRCUIT_BREAKER_THRESHOLD = 30; + +/** + * Hash a tool call for pattern matching. + * Uses tool name + deterministic JSON serialization digest of params. + */ +export function hashToolCall(toolName: string, params: unknown): string { + return `${toolName}:${digestStable(params)}`; +} + +function stableStringify(value: unknown): string { + if (value === null || typeof value !== "object") { + return JSON.stringify(value); + } + if (Array.isArray(value)) { + return `[${value.map(stableStringify).join(",")}]`; + } + const obj = value as Record; + const keys = Object.keys(obj).toSorted(); + return `{${keys.map((k) => `${JSON.stringify(k)}:${stableStringify(obj[k])}`).join(",")}}`; +} + +function digestStable(value: unknown): string { + const serialized = stableStringifyFallback(value); + return createHash("sha256").update(serialized).digest("hex"); +} + +function stableStringifyFallback(value: unknown): string { + try { + return stableStringify(value); + } catch { + if (value === null || value === undefined) { + return `${value}`; + } + if (typeof value === "string") { + return value; + } + if (typeof value === "number" || typeof value === "boolean" || typeof value === "bigint") { + return `${value}`; + } + if (value instanceof Error) { + return `${value.name}:${value.message}`; + } + return Object.prototype.toString.call(value); + } +} + +function isKnownPollToolCall(toolName: string, params: unknown): boolean { + if (toolName === "command_status") { + return true; + } + if (toolName !== "process" || !isPlainObject(params)) { + return false; + } + const action = params.action; + return action === "poll" || action === "log"; +} + +function extractTextContent(result: unknown): string { + if (!isPlainObject(result) || !Array.isArray(result.content)) { + return ""; + } + return result.content + .filter( + (entry): entry is { type: string; text: string } => + isPlainObject(entry) && typeof entry.type === "string" && typeof entry.text === "string", + ) + .map((entry) => entry.text) + .join("\n") + .trim(); +} + +function formatErrorForHash(error: unknown): string { + if (error instanceof Error) { + return error.message || error.name; + } + if (typeof error === "string") { + return error; + } + if (typeof error === "number" || typeof error === "boolean" || typeof error === "bigint") { + return `${error}`; + } + return stableStringify(error); +} + +function hashToolOutcome( + toolName: string, + params: unknown, + result: unknown, + error: unknown, +): string | undefined { + if (error !== undefined) { + return `error:${digestStable(formatErrorForHash(error))}`; + } + if (!isPlainObject(result)) { + return result === undefined ? undefined : digestStable(result); + } + + const details = isPlainObject(result.details) ? result.details : {}; + const text = extractTextContent(result); + if (isKnownPollToolCall(toolName, params) && toolName === "process" && isPlainObject(params)) { + const action = params.action; + if (action === "poll") { + return digestStable({ + action, + status: details.status, + exitCode: details.exitCode ?? null, + exitSignal: details.exitSignal ?? null, + aggregated: details.aggregated ?? null, + text, + }); + } + if (action === "log") { + return digestStable({ + action, + status: details.status, + totalLines: details.totalLines ?? null, + totalChars: details.totalChars ?? null, + truncated: details.truncated ?? null, + exitCode: details.exitCode ?? null, + exitSignal: details.exitSignal ?? null, + text, + }); + } + } + + return digestStable({ + details, + text, + }); +} + +function getNoProgressStreak( + history: Array<{ toolName: string; argsHash: string; resultHash?: string }>, + toolName: string, + argsHash: string, +): { count: number; latestResultHash?: string } { + let streak = 0; + let latestResultHash: string | undefined; + + for (let i = history.length - 1; i >= 0; i -= 1) { + const record = history[i]; + if (!record || record.toolName !== toolName || record.argsHash !== argsHash) { + continue; + } + if (typeof record.resultHash !== "string" || !record.resultHash) { + continue; + } + if (!latestResultHash) { + latestResultHash = record.resultHash; + streak = 1; + continue; + } + if (record.resultHash !== latestResultHash) { + break; + } + streak += 1; + } + + return { count: streak, latestResultHash }; +} + +function getPingPongStreak( + history: Array<{ toolName: string; argsHash: string; resultHash?: string }>, + currentSignature: string, +): { + count: number; + pairedToolName?: string; + pairedSignature?: string; + noProgressEvidence: boolean; +} { + const last = history.at(-1); + if (!last) { + return { count: 0, noProgressEvidence: false }; + } + + let otherSignature: string | undefined; + let otherToolName: string | undefined; + for (let i = history.length - 2; i >= 0; i -= 1) { + const call = history[i]; + if (!call) { + continue; + } + if (call.argsHash !== last.argsHash) { + otherSignature = call.argsHash; + otherToolName = call.toolName; + break; + } + } + + if (!otherSignature || !otherToolName) { + return { count: 0, noProgressEvidence: false }; + } + + let alternatingTailCount = 0; + for (let i = history.length - 1; i >= 0; i -= 1) { + const call = history[i]; + if (!call) { + continue; + } + const expected = alternatingTailCount % 2 === 0 ? last.argsHash : otherSignature; + if (call.argsHash !== expected) { + break; + } + alternatingTailCount += 1; + } + + if (alternatingTailCount < 2) { + return { count: 0, noProgressEvidence: false }; + } + + const expectedCurrentSignature = otherSignature; + if (currentSignature !== expectedCurrentSignature) { + return { count: 0, noProgressEvidence: false }; + } + + const tailStart = Math.max(0, history.length - alternatingTailCount); + let firstHashA: string | undefined; + let firstHashB: string | undefined; + let noProgressEvidence = true; + for (let i = tailStart; i < history.length; i += 1) { + const call = history[i]; + if (!call) { + continue; + } + if (!call.resultHash) { + noProgressEvidence = false; + break; + } + if (call.argsHash === last.argsHash) { + if (!firstHashA) { + firstHashA = call.resultHash; + } else if (firstHashA !== call.resultHash) { + noProgressEvidence = false; + break; + } + continue; + } + if (call.argsHash === otherSignature) { + if (!firstHashB) { + firstHashB = call.resultHash; + } else if (firstHashB !== call.resultHash) { + noProgressEvidence = false; + break; + } + continue; + } + noProgressEvidence = false; + break; + } + + // Need repeated stable outcomes on both sides before treating ping-pong as no-progress. + if (!firstHashA || !firstHashB) { + noProgressEvidence = false; + } + + return { + count: alternatingTailCount + 1, + pairedToolName: last.toolName, + pairedSignature: last.argsHash, + noProgressEvidence, + }; +} + +function canonicalPairKey(signatureA: string, signatureB: string): string { + return [signatureA, signatureB].toSorted().join("|"); +} + +/** + * Detect if an agent is stuck in a repetitive tool call loop. + * Checks if the same tool+params combination has been called excessively. + */ +export function detectToolCallLoop( + state: SessionState, + toolName: string, + params: unknown, +): LoopDetectionResult { + const history = state.toolCallHistory ?? []; + const currentHash = hashToolCall(toolName, params); + const noProgress = getNoProgressStreak(history, toolName, currentHash); + const noProgressStreak = noProgress.count; + const knownPollTool = isKnownPollToolCall(toolName, params); + const pingPong = getPingPongStreak(history, currentHash); + + if (noProgressStreak >= GLOBAL_CIRCUIT_BREAKER_THRESHOLD) { + log.error( + `Global circuit breaker triggered: ${toolName} repeated ${noProgressStreak} times with no progress`, + ); + return { + stuck: true, + level: "critical", + detector: "global_circuit_breaker", + count: noProgressStreak, + message: `CRITICAL: ${toolName} has repeated identical no-progress outcomes ${noProgressStreak} times. Session execution blocked by global circuit breaker to prevent runaway loops.`, + warningKey: `global:${toolName}:${currentHash}:${noProgress.latestResultHash ?? "none"}`, + }; + } + + if (knownPollTool && noProgressStreak >= CRITICAL_THRESHOLD) { + log.error(`Critical polling loop detected: ${toolName} repeated ${noProgressStreak} times`); + return { + stuck: true, + level: "critical", + detector: "known_poll_no_progress", + count: noProgressStreak, + message: `CRITICAL: Called ${toolName} with identical arguments and no progress ${noProgressStreak} times. This appears to be a stuck polling loop. Session execution blocked to prevent resource waste.`, + warningKey: `poll:${toolName}:${currentHash}:${noProgress.latestResultHash ?? "none"}`, + }; + } + + if (knownPollTool && noProgressStreak >= WARNING_THRESHOLD) { + log.warn(`Polling loop warning: ${toolName} repeated ${noProgressStreak} times`); + return { + stuck: true, + level: "warning", + detector: "known_poll_no_progress", + count: noProgressStreak, + message: `WARNING: You have called ${toolName} ${noProgressStreak} times with identical arguments and no progress. Stop polling and either (1) increase wait time between checks, or (2) report the task as failed if the process is stuck.`, + warningKey: `poll:${toolName}:${currentHash}:${noProgress.latestResultHash ?? "none"}`, + }; + } + + const pingPongWarningKey = pingPong.pairedSignature + ? `pingpong:${canonicalPairKey(currentHash, pingPong.pairedSignature)}` + : `pingpong:${toolName}:${currentHash}`; + + if (pingPong.count >= CRITICAL_THRESHOLD && pingPong.noProgressEvidence) { + log.error( + `Critical ping-pong loop detected: alternating calls count=${pingPong.count} currentTool=${toolName}`, + ); + return { + stuck: true, + level: "critical", + detector: "ping_pong", + count: pingPong.count, + message: `CRITICAL: You are alternating between repeated tool-call patterns (${pingPong.count} consecutive calls) with no progress. This appears to be a stuck ping-pong loop. Session execution blocked to prevent resource waste.`, + pairedToolName: pingPong.pairedToolName, + warningKey: pingPongWarningKey, + }; + } + + if (pingPong.count >= WARNING_THRESHOLD) { + log.warn( + `Ping-pong loop warning: alternating calls count=${pingPong.count} currentTool=${toolName}`, + ); + return { + stuck: true, + level: "warning", + detector: "ping_pong", + count: pingPong.count, + message: `WARNING: You are alternating between repeated tool-call patterns (${pingPong.count} consecutive calls). This looks like a ping-pong loop; stop retrying and report the task as failed.`, + pairedToolName: pingPong.pairedToolName, + warningKey: pingPongWarningKey, + }; + } + + // Generic detector: warn-only for repeated identical calls. + const recentCount = history.filter( + (h) => h.toolName === toolName && h.argsHash === currentHash, + ).length; + + if (!knownPollTool && recentCount >= WARNING_THRESHOLD) { + log.warn(`Loop warning: ${toolName} called ${recentCount} times with identical arguments`); + return { + stuck: true, + level: "warning", + detector: "generic_repeat", + count: recentCount, + message: `WARNING: You have called ${toolName} ${recentCount} times with identical arguments. If this is not making progress, stop retrying and report the task as failed.`, + warningKey: `generic:${toolName}:${currentHash}`, + }; + } + + return { stuck: false }; +} + +/** + * Record a tool call in the session's history for loop detection. + * Maintains sliding window of last N calls. + */ +export function recordToolCall( + state: SessionState, + toolName: string, + params: unknown, + toolCallId?: string, +): void { + if (!state.toolCallHistory) { + state.toolCallHistory = []; + } + + state.toolCallHistory.push({ + toolName, + argsHash: hashToolCall(toolName, params), + toolCallId, + timestamp: Date.now(), + }); + + if (state.toolCallHistory.length > TOOL_CALL_HISTORY_SIZE) { + state.toolCallHistory.shift(); + } +} + +/** + * Record a completed tool call outcome so loop detection can identify no-progress repeats. + */ +export function recordToolCallOutcome( + state: SessionState, + params: { + toolName: string; + toolParams: unknown; + toolCallId?: string; + result?: unknown; + error?: unknown; + }, +): void { + const resultHash = hashToolOutcome( + params.toolName, + params.toolParams, + params.result, + params.error, + ); + if (!resultHash) { + return; + } + + if (!state.toolCallHistory) { + state.toolCallHistory = []; + } + + const argsHash = hashToolCall(params.toolName, params.toolParams); + let matched = false; + for (let i = state.toolCallHistory.length - 1; i >= 0; i -= 1) { + const call = state.toolCallHistory[i]; + if (!call) { + continue; + } + if (params.toolCallId && call.toolCallId !== params.toolCallId) { + continue; + } + if (call.toolName !== params.toolName || call.argsHash !== argsHash) { + continue; + } + if (call.resultHash !== undefined) { + continue; + } + call.resultHash = resultHash; + matched = true; + break; + } + + if (!matched) { + state.toolCallHistory.push({ + toolName: params.toolName, + argsHash, + toolCallId: params.toolCallId, + resultHash, + timestamp: Date.now(), + }); + } + + if (state.toolCallHistory.length > TOOL_CALL_HISTORY_SIZE) { + state.toolCallHistory.splice(0, state.toolCallHistory.length - TOOL_CALL_HISTORY_SIZE); + } +} + +/** + * Get current tool call statistics for a session (for debugging/monitoring). + */ +export function getToolCallStats(state: SessionState): { + totalCalls: number; + uniquePatterns: number; + mostFrequent: { toolName: string; count: number } | null; +} { + const history = state.toolCallHistory ?? []; + const patterns = new Map(); + + for (const call of history) { + const key = call.argsHash; + const existing = patterns.get(key); + if (existing) { + existing.count += 1; + } else { + patterns.set(key, { toolName: call.toolName, count: 1 }); + } + } + + let mostFrequent: { toolName: string; count: number } | null = null; + for (const pattern of patterns.values()) { + if (!mostFrequent || pattern.count > mostFrequent.count) { + mostFrequent = pattern; + } + } + + return { + totalCalls: history.length, + uniquePatterns: patterns.size, + mostFrequent, + }; +} diff --git a/src/infra/diagnostic-events.ts b/src/infra/diagnostic-events.ts index 6b9f9f2d1b..cf8958dd71 100644 --- a/src/infra/diagnostic-events.ts +++ b/src/infra/diagnostic-events.ts @@ -134,6 +134,19 @@ export type DiagnosticHeartbeatEvent = DiagnosticBaseEvent & { queued: number; }; +export type DiagnosticToolLoopEvent = DiagnosticBaseEvent & { + type: "tool.loop"; + sessionKey?: string; + sessionId?: string; + toolName: string; + level: "warning" | "critical"; + action: "warn" | "block"; + detector: "generic_repeat" | "known_poll_no_progress" | "global_circuit_breaker" | "ping_pong"; + count: number; + message: string; + pairedToolName?: string; +}; + export type DiagnosticEventPayload = | DiagnosticUsageEvent | DiagnosticWebhookReceivedEvent @@ -146,7 +159,8 @@ export type DiagnosticEventPayload = | DiagnosticLaneEnqueueEvent | DiagnosticLaneDequeueEvent | DiagnosticRunAttemptEvent - | DiagnosticHeartbeatEvent; + | DiagnosticHeartbeatEvent + | DiagnosticToolLoopEvent; export type DiagnosticEventInput = DiagnosticEventPayload extends infer Event ? Event extends DiagnosticEventPayload diff --git a/src/logging/diagnostic-session-state.ts b/src/logging/diagnostic-session-state.ts index aa5224550b..30ea1249aa 100644 --- a/src/logging/diagnostic-session-state.ts +++ b/src/logging/diagnostic-session-state.ts @@ -6,6 +6,17 @@ export type SessionState = { lastActivity: number; state: SessionStateValue; queueDepth: number; + toolCallHistory?: ToolCallRecord[]; + toolLoopWarningBuckets?: Map; + commandPollCounts?: Map; +}; + +export type ToolCallRecord = { + toolName: string; + argsHash: string; + toolCallId?: string; + resultHash?: string; + timestamp: number; }; export type SessionRef = { diff --git a/src/logging/diagnostic.ts b/src/logging/diagnostic.ts index 73a3ac8938..3751416c13 100644 --- a/src/logging/diagnostic.ts +++ b/src/logging/diagnostic.ts @@ -256,6 +256,42 @@ export function logRunAttempt(params: SessionRef & { runId: string; attempt: num markActivity(); } +export function logToolLoopAction( + params: SessionRef & { + toolName: string; + level: "warning" | "critical"; + action: "warn" | "block"; + detector: "generic_repeat" | "known_poll_no_progress" | "global_circuit_breaker" | "ping_pong"; + count: number; + message: string; + pairedToolName?: string; + }, +) { + const payload = `tool loop: sessionId=${params.sessionId ?? "unknown"} sessionKey=${ + params.sessionKey ?? "unknown" + } tool=${params.toolName} level=${params.level} action=${params.action} detector=${ + params.detector + } count=${params.count}${params.pairedToolName ? ` pairedTool=${params.pairedToolName}` : ""} message="${params.message}"`; + if (params.level === "critical") { + diag.error(payload); + } else { + diag.warn(payload); + } + emitDiagnosticEvent({ + type: "tool.loop", + sessionId: params.sessionId, + sessionKey: params.sessionKey, + toolName: params.toolName, + level: params.level, + action: params.action, + detector: params.detector, + count: params.count, + message: params.message, + pairedToolName: params.pairedToolName, + }); + markActivity(); +} + export function logActiveRuns() { const activeSessions = Array.from(diagnosticSessionStates.entries()) .filter(([, s]) => s.state === "processing") @@ -314,6 +350,16 @@ export function startDiagnosticHeartbeat() { queued: totalQueued, }); + import("../agents/command-poll-backoff.js") + .then(({ pruneStaleCommandPolls }) => { + for (const [, state] of diagnosticSessionStates) { + pruneStaleCommandPolls(state); + } + }) + .catch((err) => { + diag.debug(`command-poll-backoff prune failed: ${String(err)}`); + }); + for (const [, state] of diagnosticSessionStates) { const ageMs = now - state.lastActivity; if (state.state === "processing" && ageMs > 120_000) {