fix(heartbeat): clamp scheduler delay to Node setTimeout cap (#71414) (#71478)

* fix(heartbeat): clamp scheduler delay to Node setTimeout cap (#71414)

When `agents.defaults.heartbeat.every` resolves to >2_147_483_647 ms
(~24.85d), the previous scheduleNext() called setTimeout with the raw
delay. Node clamps any delay > 2^31-1 to 1 ms, fires the callback, and
the heartbeat re-arms with the same oversized value - a tight loop that
floods the log with TimeoutOverflowWarning and crashes the gateway with
exit code 1.

Clamp the computed delay to HEARTBEAT_MAX_TIMEOUT_MS (2_147_483_647)
before calling setTimeout. The worst case is now one heartbeat every
~24.85d instead of crash-loop. Warn once per process when clamping
fires, so a misconfigured "365d" remains visible without flooding.

This is a defense-in-depth fix at the scheduler layer; loadConfig-level
rejection is a broader change with more blast radius and a separate
question (some users may legitimately want "every: 365d" to mean
"effectively never"). The clamped behaviour is closer to that intent
than the crash is.

Test: new scheduler test sets heartbeat.every="365d" with fake timers,
advances 60s, and asserts runSpy was never called (with the bug, it
would be called ~60_000 times).

* style: format heartbeat scheduler clamp

* fix: share safe timeout delay clamp (#71478) (thanks @hclsys)

---------

Co-authored-by: Peter Steinberger <steipete@gmail.com>
This commit is contained in:
hcl
2026-04-25 17:03:43 +08:00
committed by GitHub
parent a33f7b7d05
commit fd74fc5a4f
13 changed files with 172 additions and 18 deletions

View File

@@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Heartbeat: clamp oversized scheduler delays through the shared safe timer helper, preventing `every` values over Node's timeout cap from becoming a 1 ms crash loop. Fixes #71414. (#71478) Thanks @hclsys.
- Telegram: remove the startup persisted-offset `getUpdates` preflight so polling restarts do not self-conflict before the runner starts. Fixes #69304. (#69779) Thanks @chinar-amrutkar.
- Browser/Playwright: ignore benign already-handled route races during guarded navigation so browser-page tasks no longer fail when Playwright tears down a route mid-flight. (#68708) Thanks @Steady-ai.
- Browser/aria snapshots: bind `format=aria` `axN` refs to live DOM nodes through backend DOM ids when Playwright is available, so follow-up browser actions can use those refs without timing out. (#62434) Thanks @MrKipler.

View File

@@ -1,9 +1,9 @@
const MAX_SAFE_TIMEOUT_MS = 2_147_483_647;
import { resolveSafeTimeoutDelayMs } from "../utils/timer-delay.js";
export function clampRuntimeAuthRefreshDelayMs(params: {
refreshAt: number;
now: number;
minDelayMs: number;
}): number {
return Math.min(MAX_SAFE_TIMEOUT_MS, Math.max(params.minDelayMs, params.refreshAt - params.now));
return resolveSafeTimeoutDelayMs(params.refreshAt - params.now, { minMs: params.minDelayMs });
}

View File

@@ -15,6 +15,7 @@ import {
type GatewayClientMode,
type GatewayClientName,
} from "../utils/message-channel.js";
import { resolveSafeTimeoutDelayMs } from "../utils/timer-delay.js";
import { VERSION } from "../version.js";
import { GatewayClient, type GatewayClientOptions } from "./client.js";
import {
@@ -289,7 +290,7 @@ function resolveGatewayCallTimeout(timeoutValue: unknown): {
} {
const timeoutMs =
typeof timeoutValue === "number" && Number.isFinite(timeoutValue) ? timeoutValue : 10_000;
const safeTimerTimeoutMs = Math.max(1, Math.min(Math.floor(timeoutMs), 2_147_483_647));
const safeTimerTimeoutMs = resolveSafeTimeoutDelayMs(timeoutMs);
return { timeoutMs, safeTimerTimeoutMs };
}

View File

@@ -24,6 +24,7 @@ import {
type GatewayClientMode,
type GatewayClientName,
} from "../utils/message-channel.js";
import { resolveSafeTimeoutDelayMs } from "../utils/timer-delay.js";
import { VERSION } from "../version.js";
import { buildDeviceAuthPayloadV3 } from "./device-auth.js";
import { resolveConnectChallengeTimeoutMs } from "./handshake-timeouts.js";
@@ -210,7 +211,7 @@ export class GatewayClient {
};
this.requestTimeoutMs =
typeof opts.requestTimeoutMs === "number" && Number.isFinite(opts.requestTimeoutMs)
? Math.max(1, Math.min(Math.floor(opts.requestTimeoutMs), 2_147_483_647))
? resolveSafeTimeoutDelayMs(opts.requestTimeoutMs)
: 30_000;
}
@@ -931,7 +932,7 @@ export class GatewayClient {
opts?.timeoutMs === null
? null
: typeof opts?.timeoutMs === "number" && Number.isFinite(opts.timeoutMs)
? Math.max(1, Math.min(Math.floor(opts.timeoutMs), 2_147_483_647))
? resolveSafeTimeoutDelayMs(opts.timeoutMs)
: expectFinal
? null
: this.requestTimeoutMs;

View File

@@ -1,6 +1,7 @@
import { randomUUID } from "node:crypto";
import { formatErrorMessage } from "../infra/errors.js";
import type { SystemPresence } from "../infra/system-presence.js";
import { MAX_SAFE_TIMEOUT_DELAY_MS, resolveSafeTimeoutDelayMs } from "../utils/timer-delay.js";
import { GatewayClient, GatewayClientRequestError } from "./client.js";
import { READ_SCOPE } from "./method-scopes.js";
import { isLoopbackHost } from "./net.js";
@@ -46,14 +47,14 @@ export type GatewayProbeResult = {
};
export const MIN_PROBE_TIMEOUT_MS = 250;
export const MAX_TIMER_DELAY_MS = 2_147_483_647;
export const MAX_TIMER_DELAY_MS = MAX_SAFE_TIMEOUT_DELAY_MS;
const PAIRING_REQUIRED_PATTERN = /\bpairing required\b/i;
const OPERATOR_READ_SCOPE = "operator.read";
const OPERATOR_WRITE_SCOPE = "operator.write";
const OPERATOR_ADMIN_SCOPE = "operator.admin";
export function clampProbeTimeoutMs(timeoutMs: number): number {
return Math.min(MAX_TIMER_DELAY_MS, Math.max(MIN_PROBE_TIMEOUT_MS, timeoutMs));
return resolveSafeTimeoutDelayMs(timeoutMs, { minMs: MIN_PROBE_TIMEOUT_MS });
}
function formatProbeCloseError(close: GatewayProbeClose): string {

View File

@@ -10,6 +10,7 @@ import { type AgentEventPayload, getAgentRunContext } from "../infra/agent-event
import { detectErrorKind, type ErrorKind } from "../infra/errors.js";
import { resolveHeartbeatVisibility } from "../infra/heartbeat-visibility.js";
import { stripInlineDirectiveTagsForDisplay } from "../utils/directive-tags.js";
import { setSafeTimeout } from "../utils/timer-delay.js";
import {
isSuppressedControlReplyLeadFragment,
isSuppressedControlReplyText,
@@ -671,11 +672,10 @@ export function createAgentEventHandler({
opts?: { skipChatErrorFinal?: boolean },
) => {
clearPendingTerminalLifecycleError(evt.runId);
const delayMs = Math.max(1, Math.min(Math.floor(lifecycleErrorRetryGraceMs), 2_147_483_647));
const timer = setTimeout(() => {
const timer = setSafeTimeout(() => {
pendingTerminalLifecycleErrors.delete(evt.runId);
finalizeLifecycleEvent(evt, opts);
}, delayMs);
}, lifecycleErrorRetryGraceMs);
timer.unref?.();
pendingTerminalLifecycleErrors.set(evt.runId, timer);
};

View File

@@ -1,4 +1,5 @@
import { onAgentEvent } from "../../infra/agent-events.js";
import { setSafeTimeout } from "../../utils/timer-delay.js";
const AGENT_RUN_CACHE_TTL_MS = 10 * 60_000;
/**
@@ -254,8 +255,7 @@ export async function waitForAgentJob(params: {
) => {
clearPendingErrorTimer();
clearPendingTimeoutTimer();
const effectiveDelay = Math.max(1, Math.min(Math.floor(delayMs), 2_147_483_647));
const timerRef = setTimeout(() => {
const timerRef = setSafeTimeout(() => {
const latest = ignoreCachedSnapshot ? undefined : getCachedAgentRun(runId);
if (latest) {
finish(latest);
@@ -263,7 +263,7 @@ export async function waitForAgentJob(params: {
}
recordAgentRunSnapshot(snapshot);
finish(snapshot);
}, effectiveDelay);
}, delayMs);
timerRef.unref?.();
if (kind === "error") {
pendingErrorTimer = timerRef;
@@ -335,8 +335,7 @@ export async function waitForAgentJob(params: {
finish(snapshot);
});
const timerDelayMs = Math.max(1, Math.min(Math.floor(timeoutMs), 2_147_483_647));
const timer = setTimeout(() => finish(null), timerDelayMs);
const timer = setSafeTimeout(() => finish(null), timeoutMs);
onAbort = () => finish(null);
signal?.addEventListener("abort", onAbort, { once: true });
});

View File

@@ -1,3 +1,4 @@
import { setSafeTimeout } from "../../utils/timer-delay.js";
import type { DedupeEntry } from "../server-shared.js";
export type AgentWaitTerminalSnapshot = {
@@ -194,8 +195,7 @@ export async function waitForTerminalGatewayDedupe(params: {
return;
}
const timeoutDelayMs = Math.max(1, Math.min(Math.floor(params.timeoutMs), 2_147_483_647));
timeoutHandle = setTimeout(() => finish(null), timeoutDelayMs);
timeoutHandle = setSafeTimeout(() => finish(null), params.timeoutMs);
timeoutHandle.unref?.();
onAbort = () => finish(null);

View File

@@ -346,6 +346,24 @@ describe("startHeartbeatRunner", () => {
runner.stop();
});
it("clamps oversized scheduler delays so heartbeats do not fire in a tight loop (#71414)", async () => {
useFakeHeartbeatTime();
const runSpy = vi.fn().mockResolvedValue({ status: "ran", durationMs: 1 });
// 365d resolves to ~31_536_000_000 ms, well past Node setTimeout's
// 2_147_483_647 ms cap. Without clamping, setTimeout would fire after
// 1ms and re-arm in a tight loop, exhausting the runner.
const runner = startHeartbeatRunner({
cfg: heartbeatConfig([{ id: "main", heartbeat: { every: "365d" } }]),
runOnce: runSpy,
stableSchedulerSeed: TEST_SCHEDULER_SEED,
});
// Advance well past the broken 1ms re-arm but well under the clamped cap
// (~24.85d). If the bug is present, runSpy gets called many times.
await vi.advanceTimersByTimeAsync(60_000);
expect(runSpy).not.toHaveBeenCalled();
runner.stop();
});
it("does not fan out to unrelated agents for session-scoped exec wakes", async () => {
useFakeHeartbeatTime();
const runSpy = vi.fn().mockResolvedValue({ status: "ran", durationMs: 1 });

View File

@@ -0,0 +1,70 @@
import { afterEach, describe, expect, it, vi } from "vitest";
import type { OpenClawConfig } from "../config/config.js";
function createHeartbeatConfig(every: string): OpenClawConfig {
return {
agents: {
defaults: { heartbeat: { every } },
list: [{ id: "main", heartbeat: { every } }],
},
} as OpenClawConfig;
}
describe("startHeartbeatRunner timeout overflow warnings", () => {
afterEach(() => {
vi.useRealTimers();
vi.resetModules();
vi.restoreAllMocks();
});
it("warns once per runner lifetime when clamping an oversized scheduler delay", async () => {
const warn = vi.fn();
const noop = vi.fn();
const logger = {
subsystem: "gateway/heartbeat",
isEnabled: vi.fn(() => true),
trace: noop,
debug: noop,
info: noop,
warn,
error: noop,
fatal: noop,
raw: noop,
child: vi.fn(() => logger),
};
vi.doMock("../logging/subsystem.js", async () => {
const actual =
await vi.importActual<typeof import("../logging/subsystem.js")>("../logging/subsystem.js");
return {
...actual,
createSubsystemLogger: vi.fn(() => logger),
};
});
const [{ startHeartbeatRunner }, { resetHeartbeatWakeStateForTests }] = await Promise.all([
import("./heartbeat-runner.js"),
import("./heartbeat-wake.js"),
]);
vi.useFakeTimers();
vi.setSystemTime(new Date(0));
const cfg = createHeartbeatConfig("365d");
const runnerA = startHeartbeatRunner({
cfg,
runOnce: vi.fn().mockResolvedValue({ status: "ran", durationMs: 1 }),
stableSchedulerSeed: "seed-0",
});
const runnerB = startHeartbeatRunner({
cfg,
runOnce: vi.fn().mockResolvedValue({ status: "ran", durationMs: 1 }),
stableSchedulerSeed: "seed-0",
});
expect(warn).toHaveBeenCalledTimes(2);
runnerA.stop();
runnerB.stop();
resetHeartbeatWakeStateForTests();
});
});

View File

@@ -64,6 +64,7 @@ import {
normalizeOptionalString,
} from "../shared/string-coerce.js";
import { escapeRegExp } from "../utils.js";
import { MAX_SAFE_TIMEOUT_DELAY_MS, resolveSafeTimeoutDelayMs } from "../utils/timer-delay.js";
import { loadOrCreateDeviceIdentity } from "./device-identity.js";
import { formatErrorMessage, hasErrnoCode } from "./errors.js";
import { isWithinActiveHours } from "./heartbeat-active-hours.js";
@@ -1330,6 +1331,7 @@ export function startHeartbeatRunner(opts: {
stopped: false,
};
let initialized = false;
let heartbeatTimeoutOverflowWarned = false;
const resolveNextDue = (
now: number,
@@ -1384,7 +1386,15 @@ export function startHeartbeatRunner(opts: {
if (!Number.isFinite(nextDue)) {
return;
}
const delay = Math.max(0, nextDue - now);
const rawDelay = Math.max(0, nextDue - now);
if (rawDelay > MAX_SAFE_TIMEOUT_DELAY_MS && !heartbeatTimeoutOverflowWarned) {
heartbeatTimeoutOverflowWarned = true;
log.warn("heartbeat: scheduled delay exceeds Node setTimeout cap; clamping to ~24.85d", {
rawDelayMs: rawDelay,
clampedMs: MAX_SAFE_TIMEOUT_DELAY_MS,
});
}
const delay = resolveSafeTimeoutDelayMs(rawDelay, { minMs: 0 });
state.timer = setTimeout(() => {
state.timer = null;
requestHeartbeatNow({ reason: "interval", coalesceMs: 0 });

View File

@@ -0,0 +1,34 @@
import { describe, expect, it, vi } from "vitest";
import {
MAX_SAFE_TIMEOUT_DELAY_MS,
resolveSafeTimeoutDelayMs,
setSafeTimeout,
} from "./timer-delay.js";
describe("resolveSafeTimeoutDelayMs", () => {
it("clamps to Node's signed-32-bit timer ceiling", () => {
expect(resolveSafeTimeoutDelayMs(3_000_000_000)).toBe(MAX_SAFE_TIMEOUT_DELAY_MS);
});
it("respects custom minimums", () => {
expect(resolveSafeTimeoutDelayMs(10, { minMs: 250 })).toBe(250);
expect(resolveSafeTimeoutDelayMs(10, { minMs: 0 })).toBe(10);
});
it("falls back to the minimum for non-finite input", () => {
expect(resolveSafeTimeoutDelayMs(Number.POSITIVE_INFINITY, { minMs: 250 })).toBe(250);
expect(resolveSafeTimeoutDelayMs(Number.NaN)).toBe(1);
});
});
describe("setSafeTimeout", () => {
it("arms setTimeout with the clamped delay", () => {
const timeoutSpy = vi.spyOn(globalThis, "setTimeout");
const timer = setSafeTimeout(() => undefined, 3_000_000_000);
clearTimeout(timer);
expect(timeoutSpy).toHaveBeenCalledWith(expect.any(Function), MAX_SAFE_TIMEOUT_DELAY_MS);
timeoutSpy.mockRestore();
});
});

19
src/utils/timer-delay.ts Normal file
View File

@@ -0,0 +1,19 @@
export const MAX_SAFE_TIMEOUT_DELAY_MS = 2_147_483_647;
export function resolveSafeTimeoutDelayMs(delayMs: number, opts?: { minMs?: number }): number {
const rawMinMs = opts?.minMs ?? 1;
const minMs = Math.min(
MAX_SAFE_TIMEOUT_DELAY_MS,
Math.max(0, Number.isFinite(rawMinMs) ? Math.floor(rawMinMs) : 1),
);
const candidateMs = Number.isFinite(delayMs) ? Math.floor(delayMs) : minMs;
return Math.min(MAX_SAFE_TIMEOUT_DELAY_MS, Math.max(minMs, candidateMs));
}
export function setSafeTimeout(
callback: () => void,
delayMs: number,
opts?: { minMs?: number },
): NodeJS.Timeout {
return setTimeout(callback, resolveSafeTimeoutDelayMs(delayMs, opts));
}