From ccb09653e302672962d458e908db8acfff1acf22 Mon Sep 17 00:00:00 2001 From: mbelinky Date: Mon, 13 Apr 2026 19:06:25 +0200 Subject: [PATCH] fix(cron): stop unresolved next-run refire loops --- CHANGELOG.md | 2 +- src/cron/service.armtimer-tight-loop.test.ts | 37 ++++++++ src/cron/service/timer.regression.test.ts | 89 +++++++++++++++++++- src/cron/service/timer.ts | 50 ++++++++++- 4 files changed, 171 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ed0ff441240..96c1a6a093a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,7 +18,7 @@ Docs: https://docs.openclaw.ai - Browser/CDP: let managed local Chrome readiness, status probes, and managed loopback CDP control bypass browser SSRF policy for their own loopback control plane, so OpenClaw no longer misclassifies a healthy child browser as "not reachable after start". (#65695, #66043) Thanks @mbelinky. - Gateway/sessions: stop heartbeat, cron-event, and exec-event turns from overwriting shared-session routing and origin metadata, preventing synthetic `heartbeat` targets from poisoning later cron or user delivery. (#63733, #35300) - Browser/CDP: let local attach-only `manual-cdp` profiles reuse the local loopback CDP control plane under strict default policy and remote-class probe timeouts, so tabs/snapshot stop falsely reporting a live local browser session as not running. (#65611, #66080) Thanks @mbelinky. - +- Cron/scheduler: stop inventing short retries when cron next-run calculation returns no valid future slot, and keep a maintenance wake armed so enabled unscheduled jobs recover without entering a refire loop. (#66019, #66083) Thanks @mbelinky. ## 2026.4.12 ### Changes diff --git a/src/cron/service.armtimer-tight-loop.test.ts b/src/cron/service.armtimer-tight-loop.test.ts index b725adc78d6..c8cca51af98 100644 --- a/src/cron/service.armtimer-tight-loop.test.ts +++ b/src/cron/service.armtimer-tight-loop.test.ts @@ -141,6 +141,43 @@ describe("CronService - armTimer tight loop prevention", () => { timeoutSpy.mockRestore(); }); + it("keeps a maintenance wake armed when enabled jobs have no nextRunAtMs", () => { + const timeoutSpy = vi.spyOn(globalThis, "setTimeout"); + const now = Date.parse("2026-02-28T12:32:00.000Z"); + + const state = createTimerState({ + storePath: "/tmp/test-cron/jobs.json", + now, + }); + state.store = { + version: 1, + jobs: [ + { + id: "missing-next-run", + name: "missing-next-run", + enabled: true, + deleteAfterRun: false, + createdAtMs: now - 60_000, + updatedAtMs: now - 60_000, + schedule: { kind: "cron", expr: "*/15 * * * *" }, + sessionTarget: "isolated" as const, + wakeMode: "next-heartbeat" as const, + payload: { kind: "agentTurn" as const, message: "test" }, + delivery: { mode: "none" as const }, + state: {}, + }, + ], + }; + + armTimer(state); + + expect(state.timer).not.toBeNull(); + const delays = extractTimeoutDelays(timeoutSpy); + expect(delays).toContain(60_000); + + timeoutSpy.mockRestore(); + }); + it("breaks the onTimer→armTimer hot-loop with stuck runningAtMs", async () => { const timeoutSpy = vi.spyOn(globalThis, "setTimeout"); const store = await makeStorePath(); diff --git a/src/cron/service/timer.regression.test.ts b/src/cron/service/timer.regression.test.ts index a9ba523407e..f010df729bc 100644 --- a/src/cron/service/timer.regression.test.ts +++ b/src/cron/service/timer.regression.test.ts @@ -1061,11 +1061,11 @@ describe("cron service timer regressions", () => { expect(job.state.lastStatus).toBe("ok"); expect(job.state.scheduleErrorCount).toBe(1); expect(job.state.lastError).toMatch(/^schedule error:/); - expect(job.state.nextRunAtMs).toBe(endedAt + 2_000); + expect(job.state.nextRunAtMs).toBeUndefined(); expect(job.enabled).toBe(true); }); - it("falls back to backoff schedule when cron next-run computation throws on error path (#30905)", () => { + it("keeps state updates when cron next-run computation throws on error path (#30905)", () => { const startedAt = Date.parse("2026-03-02T12:05:00.000Z"); const endedAt = startedAt + 25; const state = createCronServiceState({ @@ -1100,10 +1100,93 @@ describe("cron service timer regressions", () => { expect(job.state.consecutiveErrors).toBe(1); expect(job.state.scheduleErrorCount).toBe(1); expect(job.state.lastError).toMatch(/^schedule error:/); - expect(job.state.nextRunAtMs).toBe(endedAt + 30_000); + expect(job.state.nextRunAtMs).toBeUndefined(); expect(job.enabled).toBe(true); }); + it("does not synthesize a 2s retry when cron schedule computation returns undefined (#66019)", () => { + const startedAt = Date.parse("2026-04-13T15:40:00.000Z"); + const endedAt = startedAt + 50; + const state = createCronServiceState({ + cronEnabled: true, + storePath: "/tmp/cron-66019-success.json", + log: noopLogger, + nowMs: () => endedAt, + enqueueSystemEvent: vi.fn(), + requestHeartbeatNow: vi.fn(), + runIsolatedAgentJob: createDefaultIsolatedRunner(), + }); + const job = createIsolatedRegressionJob({ + id: "cron-66019-success", + name: "cron-66019-success", + scheduledAt: startedAt, + schedule: { kind: "cron", expr: "0 7 * * *", tz: "Asia/Shanghai" }, + payload: { kind: "agentTurn", message: "ping" }, + state: { nextRunAtMs: startedAt - 1_000, runningAtMs: startedAt - 500 }, + }); + const nextRunSpy = vi.spyOn(schedule, "computeNextRunAtMs").mockReturnValue(undefined); + + try { + const shouldDelete = applyJobResult(state, job, { + status: "ok", + delivered: true, + startedAt, + endedAt, + }); + + expect(shouldDelete).toBe(false); + expect(job.state.runningAtMs).toBeUndefined(); + expect(job.state.lastRunAtMs).toBe(startedAt); + expect(job.state.lastStatus).toBe("ok"); + expect(job.state.nextRunAtMs).toBeUndefined(); + expect(job.enabled).toBe(true); + } finally { + nextRunSpy.mockRestore(); + } + }); + + it("does not synthesize backoff retries when cron schedule computation returns undefined (#66019)", () => { + const startedAt = Date.parse("2026-04-13T15:45:00.000Z"); + const endedAt = startedAt + 25; + const state = createCronServiceState({ + cronEnabled: true, + storePath: "/tmp/cron-66019-error.json", + log: noopLogger, + nowMs: () => endedAt, + enqueueSystemEvent: vi.fn(), + requestHeartbeatNow: vi.fn(), + runIsolatedAgentJob: createDefaultIsolatedRunner(), + }); + const job = createIsolatedRegressionJob({ + id: "cron-66019-error", + name: "cron-66019-error", + scheduledAt: startedAt, + schedule: { kind: "cron", expr: "0 7 * * *", tz: "Asia/Shanghai" }, + payload: { kind: "agentTurn", message: "ping" }, + state: { nextRunAtMs: startedAt - 1_000, runningAtMs: startedAt - 500 }, + }); + const nextRunSpy = vi.spyOn(schedule, "computeNextRunAtMs").mockReturnValue(undefined); + + try { + const shouldDelete = applyJobResult(state, job, { + status: "error", + error: "synthetic failure", + startedAt, + endedAt, + }); + + expect(shouldDelete).toBe(false); + expect(job.state.runningAtMs).toBeUndefined(); + expect(job.state.lastRunAtMs).toBe(startedAt); + expect(job.state.lastStatus).toBe("error"); + expect(job.state.consecutiveErrors).toBe(1); + expect(job.state.nextRunAtMs).toBeUndefined(); + expect(job.enabled).toBe(true); + } finally { + nextRunSpy.mockRestore(); + } + }); + it("force run preserves 'every' anchor while recording manual lastRunAtMs", () => { const nowMs = Date.now(); const everyMs = 24 * 60 * 60 * 1_000; diff --git a/src/cron/service/timer.ts b/src/cron/service/timer.ts index 8cfe3b19270..5bfd0c9e7a1 100644 --- a/src/cron/service/timer.ts +++ b/src/cron/service/timer.ts @@ -240,6 +240,27 @@ function isTransientCronError(error: string | undefined, retryOn?: CronRetryOn[] return keys.some((k) => TRANSIENT_PATTERNS[k]?.test(error)); } +function resolveCronNextRunWithLowerBound(params: { + state: CronServiceState; + job: CronJob; + naturalNext: number | undefined; + lowerBoundMs: number; + context: "completion" | "error_backoff"; +}): number | undefined { + if (params.naturalNext === undefined) { + params.state.deps.log.warn( + { + jobId: params.job.id, + jobName: params.job.name, + context: params.context, + }, + "cron: next run unresolved; clearing schedule to avoid a refire loop", + ); + return undefined; + } + return Math.max(params.naturalNext, params.lowerBoundMs); +} + function resolveRetryConfig(cronConfig?: CronConfig) { const retry = cronConfig?.retry; return { @@ -518,7 +539,17 @@ export function applyJobResult( const backoffNext = result.endedAt + backoff; // Use whichever is later: the natural next run or the backoff delay. job.state.nextRunAtMs = - normalNext !== undefined ? Math.max(normalNext, backoffNext) : backoffNext; + job.schedule.kind === "cron" + ? resolveCronNextRunWithLowerBound({ + state, + job, + naturalNext: normalNext, + lowerBoundMs: backoffNext, + context: "error_backoff", + }) + : normalNext !== undefined + ? Math.max(normalNext, backoffNext) + : backoffNext; state.deps.log.info( { jobId: job.id, @@ -547,8 +578,13 @@ export function applyJobResult( // schedule computation lands in the same second due to // timezone/croner edge cases (see #17821). const minNext = result.endedAt + MIN_REFIRE_GAP_MS; - job.state.nextRunAtMs = - naturalNext !== undefined ? Math.max(naturalNext, minNext) : minNext; + job.state.nextRunAtMs = resolveCronNextRunWithLowerBound({ + state, + job, + naturalNext, + lowerBoundMs: minNext, + context: "completion", + }); } else { job.state.nextRunAtMs = naturalNext; } @@ -609,6 +645,14 @@ export function armTimer(state: CronServiceState) { const withNextRun = state.store?.jobs.filter((j) => j.enabled && hasScheduledNextRunAtMs(j.state.nextRunAtMs)) .length ?? 0; + if (enabledCount > 0) { + armRunningRecheckTimer(state); + state.deps.log.debug( + { jobCount, enabledCount, withNextRun, delayMs: MAX_TIMER_DELAY_MS }, + "cron: timer armed for maintenance recheck", + ); + return; + } state.deps.log.debug( { jobCount, enabledCount, withNextRun }, "cron: armTimer skipped - no jobs with nextRunAtMs",