fix: stabilize event loop health samples

This commit is contained in:
Shakker
2026-05-06 00:36:07 +01:00
parent 5af1fe1bd0
commit 7af1a87830
3 changed files with 196 additions and 24 deletions

View File

@@ -95,6 +95,7 @@ Docs: https://docs.openclaw.ai
- Agents/generated media: treat attachment-style message tool actions as completed chat sends, preventing duplicate fallback media posts when generated files were already uploaded.
- Control UI/sessions: show each session's agent runtime in the Sessions table and allow filtering by runtime labels, matching the Agents panel runtime wording. Thanks @vincentkoc.
- Discord/streaming: show live reasoning text in progress drafts instead of a bare `Reasoning` status line.
- Gateway/status: avoid marking fast repeated health/status samples as event-loop degraded from CPU/utilization alone until the Gateway has accumulated a sustained sampling window. Thanks @shakkernerd.
- Doctor/status: warn when `OPENCLAW_GATEWAY_TOKEN` would shadow a different active `gateway.auth.token` source for local CLI commands, while avoiding false positives when config points at the same env token. Fixes #74271. Thanks @yelog.
- Gateway/HTTP: avoid loading managed outgoing-image media handlers for unrelated requests, so disabled OpenAI-compatible routes return 404 without waiting on lazy media sidecars. Thanks @vincentkoc.
- Gateway/OpenAI-compatible: send the assistant role SSE chunk as soon as streaming chat-completion headers are accepted, so cold agent setup cannot leave `/v1/chat/completions` clients with a bodyless 200 response until their idle timeout fires.

View File

@@ -0,0 +1,121 @@
import type { monitorEventLoopDelay, performance } from "node:perf_hooks";
import { describe, expect, it, vi } from "vitest";
import { createGatewayEventLoopHealthMonitor } from "./event-loop-health.js";
// Shapes of the Node.js readings that the test harness fakes, derived from the
// real APIs so the fakes stay aligned with their signatures.
type CpuUsage = ReturnType<typeof process.cpuUsage>;
type DelayMonitor = ReturnType<typeof monitorEventLoopDelay>;
type EventLoopUtilization = ReturnType<typeof performance.eventLoopUtilization>;
// The (optional) first argument of the monitor factory, i.e. its injectable
// dependencies, with `undefined` stripped so individual members can be indexed.
type GatewayEventLoopHealthMonitorDeps = NonNullable<
Parameters<typeof createGatewayEventLoopHealthMonitor>[0]
>;
/**
 * Creates a health monitor backed entirely by deterministic fakes.
 *
 * The fake clock starts at 10_000 ms; `setNow` moves it to an offset from that
 * start. Cumulative fake CPU time is derived from the clock
 * (`clock * cpuMsPerWallMs`, reported in microseconds), and the fake
 * event-loop utilization always reports the configured `utilization` value.
 *
 * @param params Optional overrides; both `cpuMsPerWallMs` and `utilization`
 *   default to 1.
 * @returns The monitor under test, the two spy readers, and setters for the
 *   fake clock and the fake delay readings.
 */
function createMonitorHarness(params?: { cpuMsPerWallMs?: number; utilization?: number }) {
  const NANOS_PER_MS = 1_000_000;
  const MICROS_PER_MS = 1_000;
  const baseTimeMs = 10_000;
  const cpuRate = params?.cpuMsPerWallMs ?? 1;
  const fixedUtilization = params?.utilization ?? 1;

  // Mutable state read lazily by the fakes below.
  let clockMs = baseTimeMs;
  let fakeP99Ms = 0;
  let fakeMaxMs = 0;

  // Stand-in for the delay histogram; readings are exposed in nanoseconds and
  // the same value is returned regardless of the percentile requested.
  const fakeDelayMonitor = {
    enable: vi.fn(),
    disable: vi.fn(),
    reset: vi.fn(),
    percentile: vi.fn(() => fakeP99Ms * NANOS_PER_MS),
    get max() {
      return fakeMaxMs * NANOS_PER_MS;
    },
  } as unknown as DelayMonitor;

  // Without a baseline this returns the cumulative reading; with one it
  // returns the difference against that baseline.
  const cpuUsage = vi.fn((previous?: CpuUsage) => {
    const cumulative = {
      user: Math.round(clockMs * cpuRate * MICROS_PER_MS),
      system: 0,
    };
    return previous
      ? {
          user: cumulative.user - previous.user,
          system: cumulative.system - previous.system,
        }
      : cumulative;
  }) as NonNullable<GatewayEventLoopHealthMonitorDeps["cpuUsage"]>;

  // With two readings the active time is their difference; otherwise the
  // current clock value is used as the cumulative active time.
  const eventLoopUtilization = vi.fn(
    (current?: EventLoopUtilization, previous?: EventLoopUtilization) => {
      const active = current && previous ? current.active - previous.active : clockMs;
      return { idle: 0, active, utilization: fixedUtilization };
    },
  ) as NonNullable<GatewayEventLoopHealthMonitorDeps["eventLoopUtilization"]>;

  const monitor = createGatewayEventLoopHealthMonitor({
    now: () => clockMs,
    cpuUsage,
    eventLoopUtilization,
    createDelayMonitor: () => fakeDelayMonitor,
  });

  function setNow(value: number) {
    clockMs = baseTimeMs + value;
  }

  // Omitted fields keep their previous value.
  function setDelay(value: { p99Ms?: number; maxMs?: number }) {
    fakeP99Ms = value.p99Ms ?? fakeP99Ms;
    fakeMaxMs = value.maxMs ?? fakeMaxMs;
  }

  return { monitor, cpuUsage, eventLoopUtilization, setNow, setDelay };
}
describe("createGatewayEventLoopHealthMonitor", () => {
  it("waits for a sustained sample window before reporting CPU-only saturation", () => {
    const { monitor, cpuUsage, eventLoopUtilization, setNow } = createMonitorHarness();

    // 42 ms after creation: no snapshot yet, and each reader has still only
    // been called once.
    setNow(42);
    const earlySnapshot = monitor.snapshot();
    expect(earlySnapshot).toBeUndefined();
    expect(cpuUsage).toHaveBeenCalledTimes(1);
    expect(eventLoopUtilization).toHaveBeenCalledTimes(1);

    // 1_000 ms after creation: the snapshot covers the full interval since
    // creation and reports both load-based reasons.
    setNow(1_000);
    const sustainedSnapshot = monitor.snapshot();
    expect(sustainedSnapshot).toMatchObject({
      degraded: true,
      reasons: ["event_loop_utilization", "cpu"],
      intervalMs: 1_000,
      delayP99Ms: 0,
      delayMaxMs: 0,
      utilization: 1,
      cpuCoreRatio: 1,
    });
  });

  it("does not wait for the sustained sample window before reporting event-loop delay", () => {
    const { monitor, setDelay, setNow } = createMonitorHarness();

    // A 1_500 ms max delay is reported even though only 42 ms have elapsed.
    setDelay({ maxMs: 1_500 });
    setNow(42);
    const delayedSnapshot = monitor.snapshot();
    expect(delayedSnapshot).toMatchObject({
      degraded: true,
      reasons: ["event_loop_delay"],
      intervalMs: 42,
      delayP99Ms: 0,
      delayMaxMs: 1_500,
    });
  });

  it("returns a non-degraded snapshot when the sustained load sample is healthy", () => {
    const lightLoad = { cpuMsPerWallMs: 0.1, utilization: 0.2 };
    const { monitor, setNow } = createMonitorHarness(lightLoad);

    setNow(1_000);
    const healthySnapshot = monitor.snapshot();
    expect(healthySnapshot).toMatchObject({
      degraded: false,
      reasons: [],
      intervalMs: 1_000,
      utilization: 0.2,
      cpuCoreRatio: 0.1,
    });
  });
});

View File

@@ -4,6 +4,7 @@ const EVENT_LOOP_MONITOR_RESOLUTION_MS = 20;
// Event-loop delay (p99 or max, in milliseconds) at or above which a sample
// gets the "event_loop_delay" reason.
const EVENT_LOOP_DELAY_WARN_MS = 1_000;
// Event-loop utilization at or above which a sustained sample gets the
// "event_loop_utilization" reason.
const EVENT_LOOP_UTILIZATION_WARN = 0.95;
// CPU time divided by wall-clock interval at or above which a sustained sample
// gets the "cpu" reason.
const CPU_CORE_RATIO_WARN = 0.9;
// Minimum sample interval (milliseconds) before the utilization and CPU
// thresholds are allowed to contribute a reason; delay is not gated by this.
const SUSTAINED_LOAD_SAMPLE_MIN_INTERVAL_MS = 1_000;
// Aliases for the Node.js perf_hooks return types used by the monitor.
type EventLoopDelayMonitor = ReturnType<typeof monitorEventLoopDelay>;
type EventLoopUtilization = ReturnType<typeof performance.eventLoopUtilization>;
@@ -26,6 +27,17 @@ export type GatewayEventLoopHealthMonitor = {
stop: () => void;
};
// Signature of the event-loop utilization reader (same as the perf_hooks one).
type EventLoopUtilizationReader = typeof performance.eventLoopUtilization;
// Builds a delay monitor for the given sampling resolution in milliseconds.
type EventLoopDelayMonitorFactory = (resolutionMs: number) => EventLoopDelayMonitor;
// Optional overrides for the clock and system readers used by
// createGatewayEventLoopHealthMonitor; any member left out falls back to the
// real Date.now / process.cpuUsage / perf_hooks implementation.
type GatewayEventLoopHealthMonitorDeps = {
now?: () => number;
cpuUsage?: typeof process.cpuUsage;
eventLoopUtilization?: EventLoopUtilizationReader;
createDelayMonitor?: EventLoopDelayMonitorFactory;
};
function roundMetric(value: number, digits = 3): number {
if (!Number.isFinite(value)) {
return 0;
@@ -38,14 +50,49 @@ function nanosecondsToMilliseconds(value: number): number {
return roundMetric(value / 1_000_000, 1);
}
export function createGatewayEventLoopHealthMonitor(): GatewayEventLoopHealthMonitor {
/**
 * Derives the degradation reasons for a single health sample.
 *
 * Event-loop delay is evaluated for every sample. Utilization and CPU are only
 * evaluated when the sample interval is at least
 * SUSTAINED_LOAD_SAMPLE_MIN_INTERVAL_MS.
 *
 * @param params Metrics of the sample: interval and delays in milliseconds,
 *   plus the utilization and CPU-per-wall-clock ratios.
 * @returns The matching reasons, always in the order "event_loop_delay",
 *   "event_loop_utilization", "cpu"; empty when no threshold is reached.
 */
function resolveGatewayEventLoopHealthReasons(params: {
  intervalMs: number;
  delayP99Ms: number;
  delayMaxMs: number;
  utilization: number;
  cpuCoreRatio: number;
}): GatewayEventLoopHealthReason[] {
  const { intervalMs, delayP99Ms, delayMaxMs, utilization, cpuCoreRatio } = params;
  const windowIsSustained = intervalMs >= SUSTAINED_LOAD_SAMPLE_MIN_INTERVAL_MS;

  // Each entry pairs a reason with whether this sample triggers it; the array
  // order fixes the order of the returned reasons.
  const checks: Array<[GatewayEventLoopHealthReason, boolean]> = [
    [
      "event_loop_delay",
      delayP99Ms >= EVENT_LOOP_DELAY_WARN_MS || delayMaxMs >= EVENT_LOOP_DELAY_WARN_MS,
    ],
    ["event_loop_utilization", windowIsSustained && utilization >= EVENT_LOOP_UTILIZATION_WARN],
    ["cpu", windowIsSustained && cpuCoreRatio >= CPU_CORE_RATIO_WARN],
  ];

  return checks.filter(([, triggered]) => triggered).map(([reason]) => reason);
}
export function createGatewayEventLoopHealthMonitor(
deps: GatewayEventLoopHealthMonitorDeps = {},
): GatewayEventLoopHealthMonitor {
const nowMs = deps.now ?? Date.now;
const readCpuUsage = deps.cpuUsage ?? process.cpuUsage.bind(process);
const readEventLoopUtilization =
deps.eventLoopUtilization ?? performance.eventLoopUtilization.bind(performance);
const createDelayMonitor =
deps.createDelayMonitor ??
((resolutionMs: number) => monitorEventLoopDelay({ resolution: resolutionMs }));
let monitor: EventLoopDelayMonitor | null = null;
let lastWallAt = Date.now();
let lastCpuUsage: CpuUsage | null = process.cpuUsage();
let lastEventLoopUtilization: EventLoopUtilization | null = performance.eventLoopUtilization();
let lastWallAt = nowMs();
let lastCpuUsage: CpuUsage | null = readCpuUsage();
let lastEventLoopUtilization: EventLoopUtilization | null = readEventLoopUtilization();
try {
monitor = monitorEventLoopDelay({ resolution: EVENT_LOOP_MONITOR_RESOLUTION_MS });
monitor = createDelayMonitor(EVENT_LOOP_MONITOR_RESOLUTION_MS);
monitor.enable();
monitor.reset();
} catch {
@@ -58,33 +105,36 @@ export function createGatewayEventLoopHealthMonitor(): GatewayEventLoopHealthMon
return undefined;
}
const now = Date.now();
const now = nowMs();
const intervalMs = Math.max(1, now - lastWallAt);
const cpuUsage = process.cpuUsage(lastCpuUsage);
const currentEventLoopUtilization = performance.eventLoopUtilization();
const utilization = roundMetric(
performance.eventLoopUtilization(currentEventLoopUtilization, lastEventLoopUtilization)
.utilization,
);
const delayP99Ms = nanosecondsToMilliseconds(monitor.percentile(99));
const delayMaxMs = nanosecondsToMilliseconds(monitor.max);
const hasDelayWarning =
delayP99Ms >= EVENT_LOOP_DELAY_WARN_MS || delayMaxMs >= EVENT_LOOP_DELAY_WARN_MS;
if (!hasDelayWarning && intervalMs < SUSTAINED_LOAD_SAMPLE_MIN_INTERVAL_MS) {
monitor.reset();
return undefined;
}
const cpuUsage = readCpuUsage(lastCpuUsage);
const currentEventLoopUtilization = readEventLoopUtilization();
const utilization = roundMetric(
readEventLoopUtilization(currentEventLoopUtilization, lastEventLoopUtilization).utilization,
);
const cpuTotalMs = roundMetric((cpuUsage.user + cpuUsage.system) / 1_000, 1);
const cpuCoreRatio = roundMetric(cpuTotalMs / intervalMs);
const reasons: GatewayEventLoopHealthReason[] = [];
if (delayP99Ms >= EVENT_LOOP_DELAY_WARN_MS || delayMaxMs >= EVENT_LOOP_DELAY_WARN_MS) {
reasons.push("event_loop_delay");
}
if (utilization >= EVENT_LOOP_UTILIZATION_WARN) {
reasons.push("event_loop_utilization");
}
if (cpuCoreRatio >= CPU_CORE_RATIO_WARN) {
reasons.push("cpu");
}
const reasons = resolveGatewayEventLoopHealthReasons({
intervalMs,
delayP99Ms,
delayMaxMs,
utilization,
cpuCoreRatio,
});
monitor.reset();
lastWallAt = now;
lastCpuUsage = process.cpuUsage();
lastCpuUsage = readCpuUsage();
lastEventLoopUtilization = currentEventLoopUtilization;
return {