mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 06:20:43 +00:00
* fix(heartbeat): clamp scheduler delay to Node setTimeout cap (#71414) When `agents.defaults.heartbeat.every` resolves to >2_147_483_647 ms (~24.85d), the previous scheduleNext() called setTimeout with the raw delay. Node clamps any delay > 2^31-1 to 1 ms, fires the callback, and the heartbeat re-arms with the same oversized value - a tight loop that floods the log with TimeoutOverflowWarning and crashes the gateway with exit code 1. Clamp the computed delay to HEARTBEAT_MAX_TIMEOUT_MS (2_147_483_647) before calling setTimeout. The worst case is now one heartbeat every ~24.85d instead of crash-loop. Warn once per process when clamping fires, so a misconfigured "365d" remains visible without flooding. This is a defense-in-depth fix at the scheduler layer; loadConfig-level rejection is a broader change with more blast radius and a separate question (some users may legitimately want "every: 365d" to mean "effectively never"). The clamped behaviour is closer to that intent than the crash is. Test: new scheduler test sets heartbeat.every="365d" with fake timers, advances 60s, and asserts runSpy was never called (with the bug, it would be called ~60_000 times). * style: format heartbeat scheduler clamp * fix: share safe timeout delay clamp (#71478) (thanks @hclsys) --------- Co-authored-by: Peter Steinberger <steipete@gmail.com>
This commit is contained in:
@@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
### Fixes
|
||||
|
||||
- Heartbeat: clamp oversized scheduler delays through the shared safe timer helper, preventing `every` values over Node's timeout cap from becoming a 1 ms crash loop. Fixes #71414. (#71478) Thanks @hclsys.
|
||||
- Telegram: remove the startup persisted-offset `getUpdates` preflight so polling restarts do not self-conflict before the runner starts. Fixes #69304. (#69779) Thanks @chinar-amrutkar.
|
||||
- Browser/Playwright: ignore benign already-handled route races during guarded navigation so browser-page tasks no longer fail when Playwright tears down a route mid-flight. (#68708) Thanks @Steady-ai.
|
||||
- Browser/aria snapshots: bind `format=aria` `axN` refs to live DOM nodes through backend DOM ids when Playwright is available, so follow-up browser actions can use those refs without timing out. (#62434) Thanks @MrKipler.
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
const MAX_SAFE_TIMEOUT_MS = 2_147_483_647;
|
||||
import { resolveSafeTimeoutDelayMs } from "../utils/timer-delay.js";
|
||||
|
||||
export function clampRuntimeAuthRefreshDelayMs(params: {
|
||||
refreshAt: number;
|
||||
now: number;
|
||||
minDelayMs: number;
|
||||
}): number {
|
||||
return Math.min(MAX_SAFE_TIMEOUT_MS, Math.max(params.minDelayMs, params.refreshAt - params.now));
|
||||
return resolveSafeTimeoutDelayMs(params.refreshAt - params.now, { minMs: params.minDelayMs });
|
||||
}
|
||||
|
||||
@@ -15,6 +15,7 @@ import {
|
||||
type GatewayClientMode,
|
||||
type GatewayClientName,
|
||||
} from "../utils/message-channel.js";
|
||||
import { resolveSafeTimeoutDelayMs } from "../utils/timer-delay.js";
|
||||
import { VERSION } from "../version.js";
|
||||
import { GatewayClient, type GatewayClientOptions } from "./client.js";
|
||||
import {
|
||||
@@ -289,7 +290,7 @@ function resolveGatewayCallTimeout(timeoutValue: unknown): {
|
||||
} {
|
||||
const timeoutMs =
|
||||
typeof timeoutValue === "number" && Number.isFinite(timeoutValue) ? timeoutValue : 10_000;
|
||||
const safeTimerTimeoutMs = Math.max(1, Math.min(Math.floor(timeoutMs), 2_147_483_647));
|
||||
const safeTimerTimeoutMs = resolveSafeTimeoutDelayMs(timeoutMs);
|
||||
return { timeoutMs, safeTimerTimeoutMs };
|
||||
}
|
||||
|
||||
|
||||
@@ -24,6 +24,7 @@ import {
|
||||
type GatewayClientMode,
|
||||
type GatewayClientName,
|
||||
} from "../utils/message-channel.js";
|
||||
import { resolveSafeTimeoutDelayMs } from "../utils/timer-delay.js";
|
||||
import { VERSION } from "../version.js";
|
||||
import { buildDeviceAuthPayloadV3 } from "./device-auth.js";
|
||||
import { resolveConnectChallengeTimeoutMs } from "./handshake-timeouts.js";
|
||||
@@ -210,7 +211,7 @@ export class GatewayClient {
|
||||
};
|
||||
this.requestTimeoutMs =
|
||||
typeof opts.requestTimeoutMs === "number" && Number.isFinite(opts.requestTimeoutMs)
|
||||
? Math.max(1, Math.min(Math.floor(opts.requestTimeoutMs), 2_147_483_647))
|
||||
? resolveSafeTimeoutDelayMs(opts.requestTimeoutMs)
|
||||
: 30_000;
|
||||
}
|
||||
|
||||
@@ -931,7 +932,7 @@ export class GatewayClient {
|
||||
opts?.timeoutMs === null
|
||||
? null
|
||||
: typeof opts?.timeoutMs === "number" && Number.isFinite(opts.timeoutMs)
|
||||
? Math.max(1, Math.min(Math.floor(opts.timeoutMs), 2_147_483_647))
|
||||
? resolveSafeTimeoutDelayMs(opts.timeoutMs)
|
||||
: expectFinal
|
||||
? null
|
||||
: this.requestTimeoutMs;
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import { randomUUID } from "node:crypto";
|
||||
import { formatErrorMessage } from "../infra/errors.js";
|
||||
import type { SystemPresence } from "../infra/system-presence.js";
|
||||
import { MAX_SAFE_TIMEOUT_DELAY_MS, resolveSafeTimeoutDelayMs } from "../utils/timer-delay.js";
|
||||
import { GatewayClient, GatewayClientRequestError } from "./client.js";
|
||||
import { READ_SCOPE } from "./method-scopes.js";
|
||||
import { isLoopbackHost } from "./net.js";
|
||||
@@ -46,14 +47,14 @@ export type GatewayProbeResult = {
|
||||
};
|
||||
|
||||
export const MIN_PROBE_TIMEOUT_MS = 250;
|
||||
export const MAX_TIMER_DELAY_MS = 2_147_483_647;
|
||||
export const MAX_TIMER_DELAY_MS = MAX_SAFE_TIMEOUT_DELAY_MS;
|
||||
const PAIRING_REQUIRED_PATTERN = /\bpairing required\b/i;
|
||||
const OPERATOR_READ_SCOPE = "operator.read";
|
||||
const OPERATOR_WRITE_SCOPE = "operator.write";
|
||||
const OPERATOR_ADMIN_SCOPE = "operator.admin";
|
||||
|
||||
export function clampProbeTimeoutMs(timeoutMs: number): number {
|
||||
return Math.min(MAX_TIMER_DELAY_MS, Math.max(MIN_PROBE_TIMEOUT_MS, timeoutMs));
|
||||
return resolveSafeTimeoutDelayMs(timeoutMs, { minMs: MIN_PROBE_TIMEOUT_MS });
|
||||
}
|
||||
|
||||
function formatProbeCloseError(close: GatewayProbeClose): string {
|
||||
|
||||
@@ -10,6 +10,7 @@ import { type AgentEventPayload, getAgentRunContext } from "../infra/agent-event
|
||||
import { detectErrorKind, type ErrorKind } from "../infra/errors.js";
|
||||
import { resolveHeartbeatVisibility } from "../infra/heartbeat-visibility.js";
|
||||
import { stripInlineDirectiveTagsForDisplay } from "../utils/directive-tags.js";
|
||||
import { setSafeTimeout } from "../utils/timer-delay.js";
|
||||
import {
|
||||
isSuppressedControlReplyLeadFragment,
|
||||
isSuppressedControlReplyText,
|
||||
@@ -671,11 +672,10 @@ export function createAgentEventHandler({
|
||||
opts?: { skipChatErrorFinal?: boolean },
|
||||
) => {
|
||||
clearPendingTerminalLifecycleError(evt.runId);
|
||||
const delayMs = Math.max(1, Math.min(Math.floor(lifecycleErrorRetryGraceMs), 2_147_483_647));
|
||||
const timer = setTimeout(() => {
|
||||
const timer = setSafeTimeout(() => {
|
||||
pendingTerminalLifecycleErrors.delete(evt.runId);
|
||||
finalizeLifecycleEvent(evt, opts);
|
||||
}, delayMs);
|
||||
}, lifecycleErrorRetryGraceMs);
|
||||
timer.unref?.();
|
||||
pendingTerminalLifecycleErrors.set(evt.runId, timer);
|
||||
};
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import { onAgentEvent } from "../../infra/agent-events.js";
|
||||
import { setSafeTimeout } from "../../utils/timer-delay.js";
|
||||
|
||||
const AGENT_RUN_CACHE_TTL_MS = 10 * 60_000;
|
||||
/**
|
||||
@@ -254,8 +255,7 @@ export async function waitForAgentJob(params: {
|
||||
) => {
|
||||
clearPendingErrorTimer();
|
||||
clearPendingTimeoutTimer();
|
||||
const effectiveDelay = Math.max(1, Math.min(Math.floor(delayMs), 2_147_483_647));
|
||||
const timerRef = setTimeout(() => {
|
||||
const timerRef = setSafeTimeout(() => {
|
||||
const latest = ignoreCachedSnapshot ? undefined : getCachedAgentRun(runId);
|
||||
if (latest) {
|
||||
finish(latest);
|
||||
@@ -263,7 +263,7 @@ export async function waitForAgentJob(params: {
|
||||
}
|
||||
recordAgentRunSnapshot(snapshot);
|
||||
finish(snapshot);
|
||||
}, effectiveDelay);
|
||||
}, delayMs);
|
||||
timerRef.unref?.();
|
||||
if (kind === "error") {
|
||||
pendingErrorTimer = timerRef;
|
||||
@@ -335,8 +335,7 @@ export async function waitForAgentJob(params: {
|
||||
finish(snapshot);
|
||||
});
|
||||
|
||||
const timerDelayMs = Math.max(1, Math.min(Math.floor(timeoutMs), 2_147_483_647));
|
||||
const timer = setTimeout(() => finish(null), timerDelayMs);
|
||||
const timer = setSafeTimeout(() => finish(null), timeoutMs);
|
||||
onAbort = () => finish(null);
|
||||
signal?.addEventListener("abort", onAbort, { once: true });
|
||||
});
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import { setSafeTimeout } from "../../utils/timer-delay.js";
|
||||
import type { DedupeEntry } from "../server-shared.js";
|
||||
|
||||
export type AgentWaitTerminalSnapshot = {
|
||||
@@ -194,8 +195,7 @@ export async function waitForTerminalGatewayDedupe(params: {
|
||||
return;
|
||||
}
|
||||
|
||||
const timeoutDelayMs = Math.max(1, Math.min(Math.floor(params.timeoutMs), 2_147_483_647));
|
||||
timeoutHandle = setTimeout(() => finish(null), timeoutDelayMs);
|
||||
timeoutHandle = setSafeTimeout(() => finish(null), params.timeoutMs);
|
||||
timeoutHandle.unref?.();
|
||||
|
||||
onAbort = () => finish(null);
|
||||
|
||||
@@ -346,6 +346,24 @@ describe("startHeartbeatRunner", () => {
|
||||
runner.stop();
|
||||
});
|
||||
|
||||
it("clamps oversized scheduler delays so heartbeats do not fire in a tight loop (#71414)", async () => {
|
||||
useFakeHeartbeatTime();
|
||||
const runSpy = vi.fn().mockResolvedValue({ status: "ran", durationMs: 1 });
|
||||
// 365d resolves to ~31_536_000_000 ms, well past Node setTimeout's
|
||||
// 2_147_483_647 ms cap. Without clamping, setTimeout would fire after
|
||||
// 1ms and re-arm in a tight loop, exhausting the runner.
|
||||
const runner = startHeartbeatRunner({
|
||||
cfg: heartbeatConfig([{ id: "main", heartbeat: { every: "365d" } }]),
|
||||
runOnce: runSpy,
|
||||
stableSchedulerSeed: TEST_SCHEDULER_SEED,
|
||||
});
|
||||
// Advance well past the broken 1ms re-arm but well under the clamped cap
|
||||
// (~24.85d). If the bug is present, runSpy gets called many times.
|
||||
await vi.advanceTimersByTimeAsync(60_000);
|
||||
expect(runSpy).not.toHaveBeenCalled();
|
||||
runner.stop();
|
||||
});
|
||||
|
||||
it("does not fan out to unrelated agents for session-scoped exec wakes", async () => {
|
||||
useFakeHeartbeatTime();
|
||||
const runSpy = vi.fn().mockResolvedValue({ status: "ran", durationMs: 1 });
|
||||
|
||||
70
src/infra/heartbeat-runner.timeout-warning.test.ts
Normal file
70
src/infra/heartbeat-runner.timeout-warning.test.ts
Normal file
@@ -0,0 +1,70 @@
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
|
||||
function createHeartbeatConfig(every: string): OpenClawConfig {
|
||||
return {
|
||||
agents: {
|
||||
defaults: { heartbeat: { every } },
|
||||
list: [{ id: "main", heartbeat: { every } }],
|
||||
},
|
||||
} as OpenClawConfig;
|
||||
}
|
||||
|
||||
describe("startHeartbeatRunner timeout overflow warnings", () => {
|
||||
afterEach(() => {
|
||||
vi.useRealTimers();
|
||||
vi.resetModules();
|
||||
vi.restoreAllMocks();
|
||||
});
|
||||
|
||||
it("warns once per runner lifetime when clamping an oversized scheduler delay", async () => {
|
||||
const warn = vi.fn();
|
||||
const noop = vi.fn();
|
||||
const logger = {
|
||||
subsystem: "gateway/heartbeat",
|
||||
isEnabled: vi.fn(() => true),
|
||||
trace: noop,
|
||||
debug: noop,
|
||||
info: noop,
|
||||
warn,
|
||||
error: noop,
|
||||
fatal: noop,
|
||||
raw: noop,
|
||||
child: vi.fn(() => logger),
|
||||
};
|
||||
|
||||
vi.doMock("../logging/subsystem.js", async () => {
|
||||
const actual =
|
||||
await vi.importActual<typeof import("../logging/subsystem.js")>("../logging/subsystem.js");
|
||||
return {
|
||||
...actual,
|
||||
createSubsystemLogger: vi.fn(() => logger),
|
||||
};
|
||||
});
|
||||
|
||||
const [{ startHeartbeatRunner }, { resetHeartbeatWakeStateForTests }] = await Promise.all([
|
||||
import("./heartbeat-runner.js"),
|
||||
import("./heartbeat-wake.js"),
|
||||
]);
|
||||
|
||||
vi.useFakeTimers();
|
||||
vi.setSystemTime(new Date(0));
|
||||
const cfg = createHeartbeatConfig("365d");
|
||||
const runnerA = startHeartbeatRunner({
|
||||
cfg,
|
||||
runOnce: vi.fn().mockResolvedValue({ status: "ran", durationMs: 1 }),
|
||||
stableSchedulerSeed: "seed-0",
|
||||
});
|
||||
const runnerB = startHeartbeatRunner({
|
||||
cfg,
|
||||
runOnce: vi.fn().mockResolvedValue({ status: "ran", durationMs: 1 }),
|
||||
stableSchedulerSeed: "seed-0",
|
||||
});
|
||||
|
||||
expect(warn).toHaveBeenCalledTimes(2);
|
||||
|
||||
runnerA.stop();
|
||||
runnerB.stop();
|
||||
resetHeartbeatWakeStateForTests();
|
||||
});
|
||||
});
|
||||
@@ -64,6 +64,7 @@ import {
|
||||
normalizeOptionalString,
|
||||
} from "../shared/string-coerce.js";
|
||||
import { escapeRegExp } from "../utils.js";
|
||||
import { MAX_SAFE_TIMEOUT_DELAY_MS, resolveSafeTimeoutDelayMs } from "../utils/timer-delay.js";
|
||||
import { loadOrCreateDeviceIdentity } from "./device-identity.js";
|
||||
import { formatErrorMessage, hasErrnoCode } from "./errors.js";
|
||||
import { isWithinActiveHours } from "./heartbeat-active-hours.js";
|
||||
@@ -1330,6 +1331,7 @@ export function startHeartbeatRunner(opts: {
|
||||
stopped: false,
|
||||
};
|
||||
let initialized = false;
|
||||
let heartbeatTimeoutOverflowWarned = false;
|
||||
|
||||
const resolveNextDue = (
|
||||
now: number,
|
||||
@@ -1384,7 +1386,15 @@ export function startHeartbeatRunner(opts: {
|
||||
if (!Number.isFinite(nextDue)) {
|
||||
return;
|
||||
}
|
||||
const delay = Math.max(0, nextDue - now);
|
||||
const rawDelay = Math.max(0, nextDue - now);
|
||||
if (rawDelay > MAX_SAFE_TIMEOUT_DELAY_MS && !heartbeatTimeoutOverflowWarned) {
|
||||
heartbeatTimeoutOverflowWarned = true;
|
||||
log.warn("heartbeat: scheduled delay exceeds Node setTimeout cap; clamping to ~24.85d", {
|
||||
rawDelayMs: rawDelay,
|
||||
clampedMs: MAX_SAFE_TIMEOUT_DELAY_MS,
|
||||
});
|
||||
}
|
||||
const delay = resolveSafeTimeoutDelayMs(rawDelay, { minMs: 0 });
|
||||
state.timer = setTimeout(() => {
|
||||
state.timer = null;
|
||||
requestHeartbeatNow({ reason: "interval", coalesceMs: 0 });
|
||||
|
||||
34
src/utils/timer-delay.test.ts
Normal file
34
src/utils/timer-delay.test.ts
Normal file
@@ -0,0 +1,34 @@
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import {
|
||||
MAX_SAFE_TIMEOUT_DELAY_MS,
|
||||
resolveSafeTimeoutDelayMs,
|
||||
setSafeTimeout,
|
||||
} from "./timer-delay.js";
|
||||
|
||||
describe("resolveSafeTimeoutDelayMs", () => {
|
||||
it("clamps to Node's signed-32-bit timer ceiling", () => {
|
||||
expect(resolveSafeTimeoutDelayMs(3_000_000_000)).toBe(MAX_SAFE_TIMEOUT_DELAY_MS);
|
||||
});
|
||||
|
||||
it("respects custom minimums", () => {
|
||||
expect(resolveSafeTimeoutDelayMs(10, { minMs: 250 })).toBe(250);
|
||||
expect(resolveSafeTimeoutDelayMs(10, { minMs: 0 })).toBe(10);
|
||||
});
|
||||
|
||||
it("falls back to the minimum for non-finite input", () => {
|
||||
expect(resolveSafeTimeoutDelayMs(Number.POSITIVE_INFINITY, { minMs: 250 })).toBe(250);
|
||||
expect(resolveSafeTimeoutDelayMs(Number.NaN)).toBe(1);
|
||||
});
|
||||
});
|
||||
|
||||
describe("setSafeTimeout", () => {
|
||||
it("arms setTimeout with the clamped delay", () => {
|
||||
const timeoutSpy = vi.spyOn(globalThis, "setTimeout");
|
||||
|
||||
const timer = setSafeTimeout(() => undefined, 3_000_000_000);
|
||||
clearTimeout(timer);
|
||||
|
||||
expect(timeoutSpy).toHaveBeenCalledWith(expect.any(Function), MAX_SAFE_TIMEOUT_DELAY_MS);
|
||||
timeoutSpy.mockRestore();
|
||||
});
|
||||
});
|
||||
19
src/utils/timer-delay.ts
Normal file
19
src/utils/timer-delay.ts
Normal file
@@ -0,0 +1,19 @@
|
||||
export const MAX_SAFE_TIMEOUT_DELAY_MS = 2_147_483_647;
|
||||
|
||||
export function resolveSafeTimeoutDelayMs(delayMs: number, opts?: { minMs?: number }): number {
|
||||
const rawMinMs = opts?.minMs ?? 1;
|
||||
const minMs = Math.min(
|
||||
MAX_SAFE_TIMEOUT_DELAY_MS,
|
||||
Math.max(0, Number.isFinite(rawMinMs) ? Math.floor(rawMinMs) : 1),
|
||||
);
|
||||
const candidateMs = Number.isFinite(delayMs) ? Math.floor(delayMs) : minMs;
|
||||
return Math.min(MAX_SAFE_TIMEOUT_DELAY_MS, Math.max(minMs, candidateMs));
|
||||
}
|
||||
|
||||
export function setSafeTimeout(
|
||||
callback: () => void,
|
||||
delayMs: number,
|
||||
opts?: { minMs?: number },
|
||||
): NodeJS.Timeout {
|
||||
return setTimeout(callback, resolveSafeTimeoutDelayMs(delayMs, opts));
|
||||
}
|
||||
Reference in New Issue
Block a user