From fd74fc5a4f556d4c597a4de70bb2948ff503b746 Mon Sep 17 00:00:00 2001 From: hcl Date: Sat, 25 Apr 2026 17:03:43 +0800 Subject: [PATCH] fix(heartbeat): clamp scheduler delay to Node setTimeout cap (#71414) (#71478) * fix(heartbeat): clamp scheduler delay to Node setTimeout cap (#71414) When `agents.defaults.heartbeat.every` resolves to >2_147_483_647 ms (~24.85d), the previous scheduleNext() called setTimeout with the raw delay. Node clamps any delay > 2^31-1 to 1 ms, fires the callback, and the heartbeat re-arms with the same oversized value - a tight loop that floods the log with TimeoutOverflowWarning and crashes the gateway with exit code 1. Clamp the computed delay to HEARTBEAT_MAX_TIMEOUT_MS (2_147_483_647) before calling setTimeout. The worst case is now one heartbeat every ~24.85d instead of crash-loop. Warn once per process when clamping fires, so a misconfigured "365d" remains visible without flooding. This is a defense-in-depth fix at the scheduler layer; loadConfig-level rejection is a broader change with more blast radius and a separate question (some users may legitimately want "every: 365d" to mean "effectively never"). The clamped behaviour is closer to that intent than the crash is. Test: new scheduler test sets heartbeat.every="365d" with fake timers, advances 60s, and asserts runSpy was never called (with the bug, it would be called ~60_000 times). * style: format heartbeat scheduler clamp * fix: share safe timeout delay clamp (#71478) (thanks @hclsys) --------- Co-authored-by: Peter Steinberger --- CHANGELOG.md | 1 + src/agents/runtime-auth-refresh.ts | 4 +- src/gateway/call.ts | 3 +- src/gateway/client.ts | 5 +- src/gateway/probe.ts | 5 +- src/gateway/server-chat.ts | 6 +- src/gateway/server-methods/agent-job.ts | 9 ++- .../server-methods/agent-wait-dedupe.ts | 4 +- src/infra/heartbeat-runner.scheduler.test.ts | 18 +++++ .../heartbeat-runner.timeout-warning.test.ts | 70 +++++++++++++++++++ src/infra/heartbeat-runner.ts | 12 +++- src/utils/timer-delay.test.ts | 34 +++++++++ src/utils/timer-delay.ts | 19 +++++ 13 files changed, 172 insertions(+), 18 deletions(-) create mode 100644 src/infra/heartbeat-runner.timeout-warning.test.ts create mode 100644 src/utils/timer-delay.test.ts create mode 100644 src/utils/timer-delay.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 8395a9129fa..84c5bd96f38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Heartbeat: clamp oversized scheduler delays through the shared safe timer helper, preventing `every` values over Node's timeout cap from becoming a 1 ms crash loop. Fixes #71414. (#71478) Thanks @hclsys. - Telegram: remove the startup persisted-offset `getUpdates` preflight so polling restarts do not self-conflict before the runner starts. Fixes #69304. (#69779) Thanks @chinar-amrutkar. - Browser/Playwright: ignore benign already-handled route races during guarded navigation so browser-page tasks no longer fail when Playwright tears down a route mid-flight. (#68708) Thanks @Steady-ai. - Browser/aria snapshots: bind `format=aria` `axN` refs to live DOM nodes through backend DOM ids when Playwright is available, so follow-up browser actions can use those refs without timing out. (#62434) Thanks @MrKipler. diff --git a/src/agents/runtime-auth-refresh.ts b/src/agents/runtime-auth-refresh.ts index 0860e1e4031..915bc2e6ebe 100644 --- a/src/agents/runtime-auth-refresh.ts +++ b/src/agents/runtime-auth-refresh.ts @@ -1,9 +1,9 @@ -const MAX_SAFE_TIMEOUT_MS = 2_147_483_647; +import { resolveSafeTimeoutDelayMs } from "../utils/timer-delay.js"; export function clampRuntimeAuthRefreshDelayMs(params: { refreshAt: number; now: number; minDelayMs: number; }): number { - return Math.min(MAX_SAFE_TIMEOUT_MS, Math.max(params.minDelayMs, params.refreshAt - params.now)); + return resolveSafeTimeoutDelayMs(params.refreshAt - params.now, { minMs: params.minDelayMs }); } diff --git a/src/gateway/call.ts b/src/gateway/call.ts index 8a39007c31a..18d95cae599 100644 --- a/src/gateway/call.ts +++ b/src/gateway/call.ts @@ -15,6 +15,7 @@ import { type GatewayClientMode, type GatewayClientName, } from "../utils/message-channel.js"; +import { resolveSafeTimeoutDelayMs } from "../utils/timer-delay.js"; import { VERSION } from "../version.js"; import { GatewayClient, type GatewayClientOptions } from "./client.js"; import { @@ -289,7 +290,7 @@ function resolveGatewayCallTimeout(timeoutValue: unknown): { } { const timeoutMs = typeof timeoutValue === "number" && Number.isFinite(timeoutValue) ? timeoutValue : 10_000; - const safeTimerTimeoutMs = Math.max(1, Math.min(Math.floor(timeoutMs), 2_147_483_647)); + const safeTimerTimeoutMs = resolveSafeTimeoutDelayMs(timeoutMs); return { timeoutMs, safeTimerTimeoutMs }; } diff --git a/src/gateway/client.ts b/src/gateway/client.ts index 997b7227df5..8cb2bcdf5b9 100644 --- a/src/gateway/client.ts +++ b/src/gateway/client.ts @@ -24,6 +24,7 @@ import { type GatewayClientMode, type GatewayClientName, } from "../utils/message-channel.js"; +import { resolveSafeTimeoutDelayMs } from "../utils/timer-delay.js"; import { VERSION } from "../version.js"; import { buildDeviceAuthPayloadV3 } from "./device-auth.js"; import { resolveConnectChallengeTimeoutMs } from "./handshake-timeouts.js"; @@ -210,7 +211,7 @@ export class GatewayClient { }; this.requestTimeoutMs = typeof opts.requestTimeoutMs === "number" && Number.isFinite(opts.requestTimeoutMs) - ? Math.max(1, Math.min(Math.floor(opts.requestTimeoutMs), 2_147_483_647)) + ? resolveSafeTimeoutDelayMs(opts.requestTimeoutMs) : 30_000; } @@ -931,7 +932,7 @@ export class GatewayClient { opts?.timeoutMs === null ? null : typeof opts?.timeoutMs === "number" && Number.isFinite(opts.timeoutMs) - ? Math.max(1, Math.min(Math.floor(opts.timeoutMs), 2_147_483_647)) + ? resolveSafeTimeoutDelayMs(opts.timeoutMs) : expectFinal ? null : this.requestTimeoutMs; diff --git a/src/gateway/probe.ts b/src/gateway/probe.ts index b39b0fec14c..8763bec5e9b 100644 --- a/src/gateway/probe.ts +++ b/src/gateway/probe.ts @@ -1,6 +1,7 @@ import { randomUUID } from "node:crypto"; import { formatErrorMessage } from "../infra/errors.js"; import type { SystemPresence } from "../infra/system-presence.js"; +import { MAX_SAFE_TIMEOUT_DELAY_MS, resolveSafeTimeoutDelayMs } from "../utils/timer-delay.js"; import { GatewayClient, GatewayClientRequestError } from "./client.js"; import { READ_SCOPE } from "./method-scopes.js"; import { isLoopbackHost } from "./net.js"; @@ -46,14 +47,14 @@ export type GatewayProbeResult = { }; export const MIN_PROBE_TIMEOUT_MS = 250; -export const MAX_TIMER_DELAY_MS = 2_147_483_647; +export const MAX_TIMER_DELAY_MS = MAX_SAFE_TIMEOUT_DELAY_MS; const PAIRING_REQUIRED_PATTERN = /\bpairing required\b/i; const OPERATOR_READ_SCOPE = "operator.read"; const OPERATOR_WRITE_SCOPE = "operator.write"; const OPERATOR_ADMIN_SCOPE = "operator.admin"; export function clampProbeTimeoutMs(timeoutMs: number): number { - return Math.min(MAX_TIMER_DELAY_MS, Math.max(MIN_PROBE_TIMEOUT_MS, timeoutMs)); + return resolveSafeTimeoutDelayMs(timeoutMs, { minMs: MIN_PROBE_TIMEOUT_MS }); } function formatProbeCloseError(close: GatewayProbeClose): string { diff --git a/src/gateway/server-chat.ts b/src/gateway/server-chat.ts index bf8f2eb8ed8..10ba7dbbcd0 100644 --- a/src/gateway/server-chat.ts +++ b/src/gateway/server-chat.ts @@ -10,6 +10,7 @@ import { type AgentEventPayload, getAgentRunContext } from "../infra/agent-event import { detectErrorKind, type ErrorKind } from "../infra/errors.js"; import { resolveHeartbeatVisibility } from "../infra/heartbeat-visibility.js"; import { stripInlineDirectiveTagsForDisplay } from "../utils/directive-tags.js"; +import { setSafeTimeout } from "../utils/timer-delay.js"; import { isSuppressedControlReplyLeadFragment, isSuppressedControlReplyText, @@ -671,11 +672,10 @@ export function createAgentEventHandler({ opts?: { skipChatErrorFinal?: boolean }, ) => { clearPendingTerminalLifecycleError(evt.runId); - const delayMs = Math.max(1, Math.min(Math.floor(lifecycleErrorRetryGraceMs), 2_147_483_647)); - const timer = setTimeout(() => { + const timer = setSafeTimeout(() => { pendingTerminalLifecycleErrors.delete(evt.runId); finalizeLifecycleEvent(evt, opts); - }, delayMs); + }, lifecycleErrorRetryGraceMs); timer.unref?.(); pendingTerminalLifecycleErrors.set(evt.runId, timer); }; diff --git a/src/gateway/server-methods/agent-job.ts b/src/gateway/server-methods/agent-job.ts index 360e445f697..f32a1d8b0e2 100644 --- a/src/gateway/server-methods/agent-job.ts +++ b/src/gateway/server-methods/agent-job.ts @@ -1,4 +1,5 @@ import { onAgentEvent } from "../../infra/agent-events.js"; +import { setSafeTimeout } from "../../utils/timer-delay.js"; const AGENT_RUN_CACHE_TTL_MS = 10 * 60_000; /** @@ -254,8 +255,7 @@ export async function waitForAgentJob(params: { ) => { clearPendingErrorTimer(); clearPendingTimeoutTimer(); - const effectiveDelay = Math.max(1, Math.min(Math.floor(delayMs), 2_147_483_647)); - const timerRef = setTimeout(() => { + const timerRef = setSafeTimeout(() => { const latest = ignoreCachedSnapshot ? undefined : getCachedAgentRun(runId); if (latest) { finish(latest); @@ -263,7 +263,7 @@ export async function waitForAgentJob(params: { } recordAgentRunSnapshot(snapshot); finish(snapshot); - }, effectiveDelay); + }, delayMs); timerRef.unref?.(); if (kind === "error") { pendingErrorTimer = timerRef; @@ -335,8 +335,7 @@ export async function waitForAgentJob(params: { finish(snapshot); }); - const timerDelayMs = Math.max(1, Math.min(Math.floor(timeoutMs), 2_147_483_647)); - const timer = setTimeout(() => finish(null), timerDelayMs); + const timer = setSafeTimeout(() => finish(null), timeoutMs); onAbort = () => finish(null); signal?.addEventListener("abort", onAbort, { once: true }); }); diff --git a/src/gateway/server-methods/agent-wait-dedupe.ts b/src/gateway/server-methods/agent-wait-dedupe.ts index 50629beb3eb..d38b368d8ff 100644 --- a/src/gateway/server-methods/agent-wait-dedupe.ts +++ b/src/gateway/server-methods/agent-wait-dedupe.ts @@ -1,3 +1,4 @@ +import { setSafeTimeout } from "../../utils/timer-delay.js"; import type { DedupeEntry } from "../server-shared.js"; export type AgentWaitTerminalSnapshot = { @@ -194,8 +195,7 @@ export async function waitForTerminalGatewayDedupe(params: { return; } - const timeoutDelayMs = Math.max(1, Math.min(Math.floor(params.timeoutMs), 2_147_483_647)); - timeoutHandle = setTimeout(() => finish(null), timeoutDelayMs); + timeoutHandle = setSafeTimeout(() => finish(null), params.timeoutMs); timeoutHandle.unref?.(); onAbort = () => finish(null); diff --git a/src/infra/heartbeat-runner.scheduler.test.ts b/src/infra/heartbeat-runner.scheduler.test.ts index a81453b855b..8b0f7f18aa9 100644 --- a/src/infra/heartbeat-runner.scheduler.test.ts +++ b/src/infra/heartbeat-runner.scheduler.test.ts @@ -346,6 +346,24 @@ describe("startHeartbeatRunner", () => { runner.stop(); }); + it("clamps oversized scheduler delays so heartbeats do not fire in a tight loop (#71414)", async () => { + useFakeHeartbeatTime(); + const runSpy = vi.fn().mockResolvedValue({ status: "ran", durationMs: 1 }); + // 365d resolves to ~31_536_000_000 ms, well past Node setTimeout's + // 2_147_483_647 ms cap. Without clamping, setTimeout would fire after + // 1ms and re-arm in a tight loop, exhausting the runner. + const runner = startHeartbeatRunner({ + cfg: heartbeatConfig([{ id: "main", heartbeat: { every: "365d" } }]), + runOnce: runSpy, + stableSchedulerSeed: TEST_SCHEDULER_SEED, + }); + // Advance well past the broken 1ms re-arm but well under the clamped cap + // (~24.85d). If the bug is present, runSpy gets called many times. + await vi.advanceTimersByTimeAsync(60_000); + expect(runSpy).not.toHaveBeenCalled(); + runner.stop(); + }); + it("does not fan out to unrelated agents for session-scoped exec wakes", async () => { useFakeHeartbeatTime(); const runSpy = vi.fn().mockResolvedValue({ status: "ran", durationMs: 1 }); diff --git a/src/infra/heartbeat-runner.timeout-warning.test.ts b/src/infra/heartbeat-runner.timeout-warning.test.ts new file mode 100644 index 00000000000..90f18d4e658 --- /dev/null +++ b/src/infra/heartbeat-runner.timeout-warning.test.ts @@ -0,0 +1,70 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; +import type { OpenClawConfig } from "../config/config.js"; + +function createHeartbeatConfig(every: string): OpenClawConfig { + return { + agents: { + defaults: { heartbeat: { every } }, + list: [{ id: "main", heartbeat: { every } }], + }, + } as OpenClawConfig; +} + +describe("startHeartbeatRunner timeout overflow warnings", () => { + afterEach(() => { + vi.useRealTimers(); + vi.resetModules(); + vi.restoreAllMocks(); + }); + + it("warns once per runner lifetime when clamping an oversized scheduler delay", async () => { + const warn = vi.fn(); + const noop = vi.fn(); + const logger = { + subsystem: "gateway/heartbeat", + isEnabled: vi.fn(() => true), + trace: noop, + debug: noop, + info: noop, + warn, + error: noop, + fatal: noop, + raw: noop, + child: vi.fn(() => logger), + }; + + vi.doMock("../logging/subsystem.js", async () => { + const actual = + await vi.importActual("../logging/subsystem.js"); + return { + ...actual, + createSubsystemLogger: vi.fn(() => logger), + }; + }); + + const [{ startHeartbeatRunner }, { resetHeartbeatWakeStateForTests }] = await Promise.all([ + import("./heartbeat-runner.js"), + import("./heartbeat-wake.js"), + ]); + + vi.useFakeTimers(); + vi.setSystemTime(new Date(0)); + const cfg = createHeartbeatConfig("365d"); + const runnerA = startHeartbeatRunner({ + cfg, + runOnce: vi.fn().mockResolvedValue({ status: "ran", durationMs: 1 }), + stableSchedulerSeed: "seed-0", + }); + const runnerB = startHeartbeatRunner({ + cfg, + runOnce: vi.fn().mockResolvedValue({ status: "ran", durationMs: 1 }), + stableSchedulerSeed: "seed-0", + }); + + expect(warn).toHaveBeenCalledTimes(2); + + runnerA.stop(); + runnerB.stop(); + resetHeartbeatWakeStateForTests(); + }); +}); diff --git a/src/infra/heartbeat-runner.ts b/src/infra/heartbeat-runner.ts index 6992f8bb044..71c52baec37 100644 --- a/src/infra/heartbeat-runner.ts +++ b/src/infra/heartbeat-runner.ts @@ -64,6 +64,7 @@ import { normalizeOptionalString, } from "../shared/string-coerce.js"; import { escapeRegExp } from "../utils.js"; +import { MAX_SAFE_TIMEOUT_DELAY_MS, resolveSafeTimeoutDelayMs } from "../utils/timer-delay.js"; import { loadOrCreateDeviceIdentity } from "./device-identity.js"; import { formatErrorMessage, hasErrnoCode } from "./errors.js"; import { isWithinActiveHours } from "./heartbeat-active-hours.js"; @@ -1330,6 +1331,7 @@ export function startHeartbeatRunner(opts: { stopped: false, }; let initialized = false; + let heartbeatTimeoutOverflowWarned = false; const resolveNextDue = ( now: number, @@ -1384,7 +1386,15 @@ export function startHeartbeatRunner(opts: { if (!Number.isFinite(nextDue)) { return; } - const delay = Math.max(0, nextDue - now); + const rawDelay = Math.max(0, nextDue - now); + if (rawDelay > MAX_SAFE_TIMEOUT_DELAY_MS && !heartbeatTimeoutOverflowWarned) { + heartbeatTimeoutOverflowWarned = true; + log.warn("heartbeat: scheduled delay exceeds Node setTimeout cap; clamping to ~24.85d", { + rawDelayMs: rawDelay, + clampedMs: MAX_SAFE_TIMEOUT_DELAY_MS, + }); + } + const delay = resolveSafeTimeoutDelayMs(rawDelay, { minMs: 0 }); state.timer = setTimeout(() => { state.timer = null; requestHeartbeatNow({ reason: "interval", coalesceMs: 0 }); diff --git a/src/utils/timer-delay.test.ts b/src/utils/timer-delay.test.ts new file mode 100644 index 00000000000..c9f552486a6 --- /dev/null +++ b/src/utils/timer-delay.test.ts @@ -0,0 +1,34 @@ +import { describe, expect, it, vi } from "vitest"; +import { + MAX_SAFE_TIMEOUT_DELAY_MS, + resolveSafeTimeoutDelayMs, + setSafeTimeout, +} from "./timer-delay.js"; + +describe("resolveSafeTimeoutDelayMs", () => { + it("clamps to Node's signed-32-bit timer ceiling", () => { + expect(resolveSafeTimeoutDelayMs(3_000_000_000)).toBe(MAX_SAFE_TIMEOUT_DELAY_MS); + }); + + it("respects custom minimums", () => { + expect(resolveSafeTimeoutDelayMs(10, { minMs: 250 })).toBe(250); + expect(resolveSafeTimeoutDelayMs(10, { minMs: 0 })).toBe(10); + }); + + it("falls back to the minimum for non-finite input", () => { + expect(resolveSafeTimeoutDelayMs(Number.POSITIVE_INFINITY, { minMs: 250 })).toBe(250); + expect(resolveSafeTimeoutDelayMs(Number.NaN)).toBe(1); + }); +}); + +describe("setSafeTimeout", () => { + it("arms setTimeout with the clamped delay", () => { + const timeoutSpy = vi.spyOn(globalThis, "setTimeout"); + + const timer = setSafeTimeout(() => undefined, 3_000_000_000); + clearTimeout(timer); + + expect(timeoutSpy).toHaveBeenCalledWith(expect.any(Function), MAX_SAFE_TIMEOUT_DELAY_MS); + timeoutSpy.mockRestore(); + }); +}); diff --git a/src/utils/timer-delay.ts b/src/utils/timer-delay.ts new file mode 100644 index 00000000000..70f4c089aae --- /dev/null +++ b/src/utils/timer-delay.ts @@ -0,0 +1,19 @@ +export const MAX_SAFE_TIMEOUT_DELAY_MS = 2_147_483_647; + +export function resolveSafeTimeoutDelayMs(delayMs: number, opts?: { minMs?: number }): number { + const rawMinMs = opts?.minMs ?? 1; + const minMs = Math.min( + MAX_SAFE_TIMEOUT_DELAY_MS, + Math.max(0, Number.isFinite(rawMinMs) ? Math.floor(rawMinMs) : 1), + ); + const candidateMs = Number.isFinite(delayMs) ? Math.floor(delayMs) : minMs; + return Math.min(MAX_SAFE_TIMEOUT_DELAY_MS, Math.max(minMs, candidateMs)); +} + +export function setSafeTimeout( + callback: () => void, + delayMs: number, + opts?: { minMs?: number }, +): NodeJS.Timeout { + return setTimeout(callback, resolveSafeTimeoutDelayMs(delayMs, opts)); +}