diff --git a/CHANGELOG.md b/CHANGELOG.md index ae4be395195..9e5e094727f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -286,6 +286,7 @@ Docs: https://docs.openclaw.ai - Discord/plugin native command aliases: let plugins declare provider-specific slash names so native Discord registration can avoid built-in command collisions; the bundled Talk voice plugin now uses `/talkvoice` natively on Discord while keeping text `/voice`. - Daemon/Windows schtasks status normalization: derive runtime state from locale-neutral numeric `Last Run Result` codes only (without language string matching) and surface unknown when numeric result data is unavailable, preventing locale-specific misclassification drift. (#39153) Thanks @scoootscooob. - Telegram/polling conflict recovery: reset the polling `webhookCleared` latch on `getUpdates` 409 conflicts so webhook cleanup re-runs on restart cycles and polling avoids infinite conflict loops. (#39205) Thanks @amittell. +- Heartbeat/requests-in-flight scheduling: stop advancing `nextDueMs` and avoid immediate `scheduleNext()` timer overrides on requests-in-flight skips, so wake-layer retry cooldowns are honored and heartbeat cadence no longer drifts under sustained contention. (#39182) Thanks @MumuTW. ## 2026.3.2 diff --git a/src/infra/heartbeat-runner.scheduler.test.ts b/src/infra/heartbeat-runner.scheduler.test.ts index dab56c28215..4a184650128 100644 --- a/src/infra/heartbeat-runner.scheduler.test.ts +++ b/src/infra/heartbeat-runner.scheduler.test.ts @@ -158,13 +158,55 @@ describe("startHeartbeatRunner", () => { await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000); expect(runSpy).toHaveBeenCalledTimes(1); - // Timer should be rescheduled; next heartbeat should still fire - await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000); + // The wake layer retries after DEFAULT_RETRY_MS (1 s). No scheduleNext() + // is called inside runOnce, so we must wait for the full cooldown. 
+ await vi.advanceTimersByTimeAsync(1_000); expect(runSpy).toHaveBeenCalledTimes(2); runner.stop(); }); + it("does not push nextDueMs forward on repeated requests-in-flight skips", async () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date(0)); + + // Simulate a long-running heartbeat: the first 5 calls return + // requests-in-flight (retries from the wake layer), then the 6th succeeds. + let callCount = 0; + const runSpy = vi.fn().mockImplementation(async () => { + callCount++; + if (callCount <= 5) { + return { status: "skipped", reason: "requests-in-flight" }; + } + return { status: "ran", durationMs: 1 }; + }); + + const runner = startHeartbeatRunner({ + cfg: { + agents: { defaults: { heartbeat: { every: "30m" } } }, + } as OpenClawConfig, + runOnce: runSpy, + }); + + // Trigger the first heartbeat at t=30m — returns requests-in-flight. + await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000); + expect(runSpy).toHaveBeenCalledTimes(1); + + // Simulate 4 more retries at short intervals (wake layer retries). + for (let i = 0; i < 4; i++) { + requestHeartbeatNow({ reason: "retry", coalesceMs: 0 }); + await vi.advanceTimersByTimeAsync(1_000); + } + expect(runSpy).toHaveBeenCalledTimes(5); + + // The next interval tick at ~t=60m should still fire — the schedule + // must not have been pushed to t=30m * 6 = 180m by the 5 skipped runs. 
+ await vi.advanceTimersByTimeAsync(30 * 60_000); + expect(runSpy).toHaveBeenCalledTimes(6); + + runner.stop(); + }); + it("routes targeted wake requests to the requested agent/session", async () => { vi.useFakeTimers(); vi.setSystemTime(new Date(0)); diff --git a/src/infra/heartbeat-runner.ts b/src/infra/heartbeat-runner.ts index 71953e1da78..c3c58d34c1e 100644 --- a/src/infra/heartbeat-runner.ts +++ b/src/infra/heartbeat-runner.ts @@ -1190,8 +1190,10 @@ export function startHeartbeatRunner(opts: { continue; } if (res.status === "skipped" && res.reason === "requests-in-flight") { - advanceAgentSchedule(agent, now); - scheduleNext(); + // Do not advance the schedule — the main lane is busy and the wake + // layer will retry shortly (DEFAULT_RETRY_MS = 1 s). Calling + // scheduleNext() here would register a 0 ms timer that races with + // the wake layer's 1 s retry and wins, bypassing the cooldown. return res; } if (res.status !== "skipped" || res.reason !== "disabled") {