fix(heartbeat): keep requests-in-flight retries from drifting schedule (#39182, thanks @MumuTW)

Co-authored-by: MumuTW <clothl47364@gmail.com>
2026-05-06 11:20:43 +00:00 · 2026-03-07 22:10:51 +00:00
parent 42bf4998d3
commit 733f7af92b
3 changed files with 49 additions and 4 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -286,6 +286,7 @@ Docs: https://docs.openclaw.ai
 - Discord/plugin native command aliases: let plugins declare provider-specific slash names so native Discord registration can avoid built-in command collisions; the bundled Talk voice plugin now uses `/talkvoice` natively on Discord while keeping text `/voice`.
 - Daemon/Windows schtasks status normalization: derive runtime state from locale-neutral numeric `Last Run Result` codes only (without language string matching) and surface unknown when numeric result data is unavailable, preventing locale-specific misclassification drift. (#39153) Thanks @scoootscooob.
 - Telegram/polling conflict recovery: reset the polling `webhookCleared` latch on `getUpdates` 409 conflicts so webhook cleanup re-runs on restart cycles and polling avoids infinite conflict loops. (#39205) Thanks @amittell.
+- Heartbeat/requests-in-flight scheduling: stop advancing `nextDueMs` and avoid immediate `scheduleNext()` timer overrides on requests-in-flight skips, so wake-layer retry cooldowns are honored and heartbeat cadence no longer drifts under sustained contention. (#39182) Thanks @MumuTW.

 ## 2026.3.2

--- a/src/infra/heartbeat-runner.scheduler.test.ts
+++ b/src/infra/heartbeat-runner.scheduler.test.ts
@@ -158,13 +158,55 @@ describe("startHeartbeatRunner", () => {
    await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000);
    expect(runSpy).toHaveBeenCalledTimes(1);

-    // Timer should be rescheduled; next heartbeat should still fire
-    await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000);
+    // The wake layer retries after DEFAULT_RETRY_MS (1 s). The runner no longer
+    // calls scheduleNext() on this skip, so we must wait for the full cooldown.
+    await vi.advanceTimersByTimeAsync(1_000);
    expect(runSpy).toHaveBeenCalledTimes(2);

    runner.stop();
  });

+  it("does not push nextDueMs forward on repeated requests-in-flight skips", async () => {
+    vi.useFakeTimers();
+    vi.setSystemTime(new Date(0));
+
+    // Simulate a long-running heartbeat: the first 5 calls return
+    // requests-in-flight (retries from the wake layer), then the 6th succeeds.
+    let callCount = 0;
+    const runSpy = vi.fn().mockImplementation(async () => {
+      callCount++;
+      if (callCount <= 5) {
+        return { status: "skipped", reason: "requests-in-flight" };
+      }
+      return { status: "ran", durationMs: 1 };
+    });
+
+    const runner = startHeartbeatRunner({
+      cfg: {
+        agents: { defaults: { heartbeat: { every: "30m" } } },
+      } as OpenClawConfig,
+      runOnce: runSpy,
+    });
+
+    // Trigger the first heartbeat at t=30m — returns requests-in-flight.
+    await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000);
+    expect(runSpy).toHaveBeenCalledTimes(1);
+
+    // Simulate 4 more retries at short intervals (wake layer retries).
+    for (let i = 0; i < 4; i++) {
+      requestHeartbeatNow({ reason: "retry", coalesceMs: 0 });
+      await vi.advanceTimersByTimeAsync(1_000);
+    }
+    expect(runSpy).toHaveBeenCalledTimes(5);
+
+    // The next interval tick at ~t=60m should still fire — the schedule must
+    // not have been pushed to t=30m * 6 = 180m by the 5 skipped attempts.
+    await vi.advanceTimersByTimeAsync(30 * 60_000);
+    expect(runSpy).toHaveBeenCalledTimes(6);
+
+    runner.stop();
+  });
+
  it("routes targeted wake requests to the requested agent/session", async () => {
    vi.useFakeTimers();
    vi.setSystemTime(new Date(0));
--- a/src/infra/heartbeat-runner.ts
+++ b/src/infra/heartbeat-runner.ts
@@ -1190,8 +1190,10 @@ export function startHeartbeatRunner(opts: {
        continue;
      }
      if (res.status === "skipped" && res.reason === "requests-in-flight") {
-        advanceAgentSchedule(agent, now);
-        scheduleNext();
+        // Do not advance the schedule — the main lane is busy and the wake
+        // layer will retry shortly (DEFAULT_RETRY_MS = 1 s).  Calling
+        // scheduleNext() here would register a 0 ms timer that races with
+        // the wake layer's 1 s retry and wins, bypassing the cooldown.
        return res;
      }
      if (res.status !== "skipped" || res.reason !== "disabled") {