From 8af4712c4066660735313f2cc26220656362e705 Mon Sep 17 00:00:00 2001 From: Marcus Widing Date: Mon, 16 Feb 2026 14:09:16 +0100 Subject: [PATCH] fix(cron): prevent spin loop when job completes within scheduled second (#17821) When a cron job fires and completes within the same wall-clock second it was scheduled for, the next-run computation could return undefined or the same second, causing the scheduler to re-trigger the job hundreds of times in a tight loop. Two-layer fix: 1. computeJobNextRunAtMs: When computeNextRunAtMs returns undefined for a cron-kind schedule (edge case where floored nowSecondMs matches the schedule), retry with the ceiling (next second) as reference time. This ensures we always get the next valid occurrence. 2. applyJobResult: Add MIN_REFIRE_GAP_MS (2s) safety net for cron-kind jobs. After a successful run, nextRunAtMs is guaranteed to be at least 2s in the future. This breaks any remaining spin-loop edge cases without affecting normal daily/hourly schedules (where the natural next run is hours/days away). Fixes #17821 --- src/cron/service.issue-regressions.test.ts | 63 ++++++++++++++++++++++ src/cron/service/jobs.ts | 13 ++++- src/cron/service/timer.ts | 22 +++++++- 3 files changed, 96 insertions(+), 2 deletions(-) diff --git a/src/cron/service.issue-regressions.test.ts b/src/cron/service.issue-regressions.test.ts index e7dd528551..6c4522a599 100644 --- a/src/cron/service.issue-regressions.test.ts +++ b/src/cron/service.issue-regressions.test.ts @@ -514,6 +514,69 @@ describe("Cron issue regressions", () => { } }); + it("prevents spin loop when cron job completes within the scheduled second (#17821)", async () => { + const store = await makeStorePath(); + // Simulate a cron job "0 13 * * *" (daily 13:00 UTC) that fires exactly + // at 13:00:00.000 and completes 7ms later (still in the same second). + const scheduledAt = Date.parse("2026-02-15T13:00:00.000Z"); + const nextDay = scheduledAt + 86_400_000; + + const cronJob: CronJob = { + id: "spin-loop-17821", + name: "daily noon", + enabled: true, + createdAtMs: scheduledAt - 86_400_000, + updatedAtMs: scheduledAt - 86_400_000, + schedule: { kind: "cron", expr: "0 13 * * *", tz: "UTC" }, + sessionTarget: "isolated", + wakeMode: "next-heartbeat", + payload: { kind: "agentTurn", message: "briefing" }, + delivery: { mode: "announce" }, + state: { nextRunAtMs: scheduledAt }, + }; + await fs.writeFile( + store.storePath, + JSON.stringify({ version: 1, jobs: [cronJob] }, null, 2), + "utf-8", + ); + + let now = scheduledAt; + let fireCount = 0; + const events: CronEvent[] = []; + const state = createCronServiceState({ + cronEnabled: true, + storePath: store.storePath, + log: noopLogger, + nowMs: () => now, + enqueueSystemEvent: vi.fn(), + requestHeartbeatNow: vi.fn(), + onEvent: (evt) => { + events.push(evt); + }, + runIsolatedAgentJob: vi.fn(async () => { + // Job completes very quickly (7ms) — still within the same second + now += 7; + fireCount++; + return { status: "ok" as const, summary: "done" }; + }), + }); + + // First timer tick — should fire the job exactly once + await onTimer(state); + + expect(fireCount).toBe(1); + + const job = state.store?.jobs.find((j) => j.id === "spin-loop-17821"); + expect(job).toBeDefined(); + // nextRunAtMs MUST be in the future (next day), not the same second + expect(job!.state.nextRunAtMs).toBeDefined(); + expect(job!.state.nextRunAtMs).toBeGreaterThanOrEqual(nextDay); + + // Second timer tick (simulating the timer re-arm) — should NOT fire again + await onTimer(state); + expect(fireCount).toBe(1); + }); + it("records per-job start time and duration for batched due jobs", async () => { const store = await makeStorePath(); const dueAt = Date.parse("2026-02-06T10:05:01.000Z"); diff --git a/src/cron/service/jobs.ts b/src/cron/service/jobs.ts index 0d529843e7..86cb6e87c8 100644 --- a/src/cron/service/jobs.ts +++ b/src/cron/service/jobs.ts @@ -96,7 +96,18 @@ export function computeJobNextRunAtMs(job: CronJob, nowMs: number): number | und : null; return atMs !== null ? atMs : undefined; } - return computeNextRunAtMs(job.schedule, nowMs); + const next = computeNextRunAtMs(job.schedule, nowMs); + // Guard against the scheduler returning a time within the same second as + // nowMs. When a cron job completes within the same wall-clock second it + // was scheduled for, some croner versions/timezone combinations may return + // the current second (or computeNextRunAtMs may return undefined, which + // triggers recomputation). Advancing to the next second and retrying + // ensures we always land on the *next* occurrence. (See #17821) + if (next === undefined && job.schedule.kind === "cron") { + const nextSecondMs = (Math.floor(nowMs / 1000) + 1) * 1000; + return computeNextRunAtMs(job.schedule, nextSecondMs); + } + return next; } /** Maximum consecutive schedule errors before auto-disabling a job. */ diff --git a/src/cron/service/timer.ts b/src/cron/service/timer.ts index cf38396201..a4942eab38 100644 --- a/src/cron/service/timer.ts +++ b/src/cron/service/timer.ts @@ -15,6 +15,15 @@ import { ensureLoaded, persist } from "./store.js"; const MAX_TIMER_DELAY_MS = 60_000; +/** + * Minimum gap between consecutive fires of the same cron job. This is a + * safety net that prevents spin-loops when `computeJobNextRunAtMs` returns + * a value within the same second as the just-completed run. The guard + * is intentionally generous (2 s) so it never masks a legitimate schedule + * but always breaks an infinite re-trigger cycle. (See #17821) + */ +const MIN_REFIRE_GAP_MS = 2_000; + /** * Maximum wall-clock time for a single job execution. Acts as a safety net * on top of the per-provider / per-agent timeouts to prevent one stuck job @@ -107,7 +116,18 @@ function applyJobResult( "cron: applying error backoff", ); } else if (job.enabled) { - job.state.nextRunAtMs = computeJobNextRunAtMs(job, result.endedAt); + const naturalNext = computeJobNextRunAtMs(job, result.endedAt); + if (job.schedule.kind === "cron") { + // Safety net: ensure the next fire is at least MIN_REFIRE_GAP_MS + // after the current run ended. Prevents spin-loops when the + // schedule computation lands in the same second due to + // timezone/croner edge cases (see #17821). + const minNext = result.endedAt + MIN_REFIRE_GAP_MS; + job.state.nextRunAtMs = + naturalNext !== undefined ? Math.max(naturalNext, minNext) : minNext; + } else { + job.state.nextRunAtMs = naturalNext; + } } else { job.state.nextRunAtMs = undefined; }