mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:30:42 +00:00
fix: stabilize event loop health samples
This commit is contained in:
@@ -95,6 +95,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Agents/generated media: treat attachment-style message tool actions as completed chat sends, preventing duplicate fallback media posts when generated files were already uploaded.
|
||||
- Control UI/sessions: show each session's agent runtime in the Sessions table and allow filtering by runtime labels, matching the Agents panel runtime wording. Thanks @vincentkoc.
|
||||
- Discord/streaming: show live reasoning text in progress drafts instead of a bare `Reasoning` status line.
|
||||
- Gateway/status: avoid marking fast repeated health/status samples as event-loop degraded from CPU/utilization alone until the Gateway has accumulated a sustained sampling window. Thanks @shakkernerd.
|
||||
- Doctor/status: warn when `OPENCLAW_GATEWAY_TOKEN` would shadow a different active `gateway.auth.token` source for local CLI commands, while avoiding false positives when config points at the same env token. Fixes #74271. Thanks @yelog.
|
||||
- Gateway/HTTP: avoid loading managed outgoing-image media handlers for unrelated requests, so disabled OpenAI-compatible routes return 404 without waiting on lazy media sidecars. Thanks @vincentkoc.
|
||||
- Gateway/OpenAI-compatible: send the assistant role SSE chunk as soon as streaming chat-completion headers are accepted, so cold agent setup cannot leave `/v1/chat/completions` clients with a bodyless 200 response until their idle timeout fires.
|
||||
|
||||
121
src/gateway/server/event-loop-health.test.ts
Normal file
121
src/gateway/server/event-loop-health.test.ts
Normal file
@@ -0,0 +1,121 @@
|
||||
import type { monitorEventLoopDelay, performance } from "node:perf_hooks";
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import { createGatewayEventLoopHealthMonitor } from "./event-loop-health.js";
|
||||
|
||||
/** Counter pair produced by `process.cpuUsage()`. */
type CpuUsage = ReturnType<typeof process.cpuUsage>;

/** Handle produced by `monitorEventLoopDelay()`. */
type DelayMonitor = ReturnType<typeof monitorEventLoopDelay>;

/** Result object produced by `performance.eventLoopUtilization()`. */
type EventLoopUtilization = ReturnType<typeof performance.eventLoopUtilization>;

/**
 * The dependency bag taken as the first argument of
 * `createGatewayEventLoopHealthMonitor`, with `undefined` stripped so its
 * members can be indexed directly.
 */
type GatewayEventLoopHealthMonitorDeps = NonNullable<
  Parameters<typeof createGatewayEventLoopHealthMonitor>[0]
>;
|
||||
|
||||
/**
 * Builds a health monitor wired to fully controllable fakes.
 *
 * The fake clock starts at a fixed origin and only moves when `setNow` is
 * called. CPU usage grows linearly with the clock at `cpuMsPerWallMs`
 * (default 1) and the event-loop utilization is pinned to `utilization`
 * (default 1). The delay histogram reports whatever `setDelay` last stored.
 *
 * @param params Optional overrides for the simulated CPU rate and utilization.
 * @returns The monitor under test, the two spy readers, and the clock/delay setters.
 */
function createMonitorHarness(params?: { cpuMsPerWallMs?: number; utilization?: number }) {
  const NANOS_PER_MS = 1_000_000;
  const MICROS_PER_MS = 1_000;
  const clockOriginMs = 10_000;

  const cpuRate = params?.cpuMsPerWallMs ?? 1;
  const fixedUtilization = params?.utilization ?? 1;

  let clockMs = clockOriginMs;
  let p99DelayMs = 0;
  let maxDelayMs = 0;

  // Only the members the monitor touches are faked, hence the double cast.
  const delayMonitor = {
    enable: vi.fn(),
    disable: vi.fn(),
    reset: vi.fn(),
    // The fake reports nanoseconds, so the stored millisecond values are scaled up.
    percentile: vi.fn(() => p99DelayMs * NANOS_PER_MS),
    get max() {
      return maxDelayMs * NANOS_PER_MS;
    },
  } as unknown as DelayMonitor;

  const cpuUsage = vi.fn((previous?: CpuUsage) => {
    // Cumulative user time in microseconds, derived purely from the fake clock.
    const userMicros = Math.round(clockMs * cpuRate * MICROS_PER_MS);
    return previous
      ? { user: userMicros - previous.user, system: 0 - previous.system }
      : { user: userMicros, system: 0 };
  }) as NonNullable<GatewayEventLoopHealthMonitorDeps["cpuUsage"]>;

  const eventLoopUtilization = vi.fn(
    (current?: EventLoopUtilization, previous?: EventLoopUtilization) => {
      const isDeltaRequest = Boolean(current) && Boolean(previous);
      if (isDeltaRequest && current && previous) {
        return {
          idle: 0,
          active: current.active - previous.active,
          utilization: fixedUtilization,
        };
      }
      return { idle: 0, active: clockMs, utilization: fixedUtilization };
    },
  ) as NonNullable<GatewayEventLoopHealthMonitorDeps["eventLoopUtilization"]>;

  const monitor = createGatewayEventLoopHealthMonitor({
    now: () => clockMs,
    cpuUsage,
    eventLoopUtilization,
    createDelayMonitor: () => delayMonitor,
  });

  /** Moves the fake clock to `value` milliseconds after the harness origin. */
  const setNow = (value: number) => {
    clockMs = clockOriginMs + value;
  };

  /** Overrides the reported delay figures; omitted fields keep their previous value. */
  const setDelay = (value: { p99Ms?: number; maxMs?: number }) => {
    p99DelayMs = value.p99Ms ?? p99DelayMs;
    maxDelayMs = value.maxMs ?? maxDelayMs;
  };

  return { monitor, cpuUsage, eventLoopUtilization, setNow, setDelay };
}
|
||||
|
||||
// Behavioural spec for the gateway event-loop health monitor, driven through
// the fake clock / fake reader harness defined above.
describe("createGatewayEventLoopHealthMonitor", () => {
  it("waits for a sustained sample window before reporting CPU-only saturation", () => {
    const { monitor, cpuUsage, eventLoopUtilization, setNow } = createMonitorHarness();

    // 42 ms in, with no delay warning: the sample is withheld and the readers
    // have only been hit by the baseline reads taken at construction.
    setNow(42);
    const earlySnapshot = monitor.snapshot();
    expect(earlySnapshot).toBeUndefined();
    expect(cpuUsage).toHaveBeenCalledTimes(1);
    expect(eventLoopUtilization).toHaveBeenCalledTimes(1);

    // A full second after creation the saturated defaults are reported.
    setNow(1_000);
    const sustainedSnapshot = monitor.snapshot();
    expect(sustainedSnapshot).toMatchObject({
      degraded: true,
      reasons: ["event_loop_utilization", "cpu"],
      intervalMs: 1_000,
      delayP99Ms: 0,
      delayMaxMs: 0,
      utilization: 1,
      cpuCoreRatio: 1,
    });
  });

  it("does not wait for the sustained sample window before reporting event-loop delay", () => {
    const { monitor, setDelay, setNow } = createMonitorHarness();
    setDelay({ maxMs: 1_500 });
    setNow(42);

    // Only the delay reason is listed, even though the harness defaults
    // simulate saturated CPU and utilization.
    const snapshot = monitor.snapshot();
    expect(snapshot).toMatchObject({
      degraded: true,
      reasons: ["event_loop_delay"],
      intervalMs: 42,
      delayP99Ms: 0,
      delayMaxMs: 1_500,
    });
  });

  it("returns a non-degraded snapshot when the sustained load sample is healthy", () => {
    const { monitor, setNow } = createMonitorHarness({ cpuMsPerWallMs: 0.1, utilization: 0.2 });
    setNow(1_000);

    const snapshot = monitor.snapshot();
    expect(snapshot).toMatchObject({
      degraded: false,
      reasons: [],
      intervalMs: 1_000,
      utilization: 0.2,
      cpuCoreRatio: 0.1,
    });
  });
});
|
||||
@@ -4,6 +4,7 @@ const EVENT_LOOP_MONITOR_RESOLUTION_MS = 20;
|
||||
/** p99 or max event-loop delay (ms) at or above which `event_loop_delay` is reported. */
const EVENT_LOOP_DELAY_WARN_MS = 1_000;

/** Event-loop utilization at or above which `event_loop_utilization` is reported. */
const EVENT_LOOP_UTILIZATION_WARN = 0.95;

/** CPU time / wall time ratio at or above which `cpu` is reported. */
const CPU_CORE_RATIO_WARN = 0.9;

/** Minimum sample interval (ms) before utilization and CPU readings are evaluated. */
const SUSTAINED_LOAD_SAMPLE_MIN_INTERVAL_MS = 1_000;
|
||||
|
||||
type EventLoopDelayMonitor = ReturnType<typeof monitorEventLoopDelay>;
|
||||
type EventLoopUtilization = ReturnType<typeof performance.eventLoopUtilization>;
|
||||
@@ -26,6 +27,17 @@ export type GatewayEventLoopHealthMonitor = {
|
||||
stop: () => void;
|
||||
};
|
||||
|
||||
/** Signature of `performance.eventLoopUtilization`. */
type EventLoopUtilizationReader = typeof performance.eventLoopUtilization;

/** Creates an event-loop delay monitor for the given resolution in milliseconds. */
type EventLoopDelayMonitorFactory = (resolutionMs: number) => EventLoopDelayMonitor;

/**
 * Optional overrides for the runtime readers the health monitor relies on.
 * Any member left out falls back to the real Node.js implementation.
 */
type GatewayEventLoopHealthMonitorDeps = {
  /** Wall-clock source; falls back to `Date.now`. */
  now?: () => number;
  /** CPU usage reader; falls back to `process.cpuUsage`. */
  cpuUsage?: typeof process.cpuUsage;
  /** Utilization reader; falls back to `performance.eventLoopUtilization`. */
  eventLoopUtilization?: EventLoopUtilizationReader;
  /** Delay monitor factory; falls back to `monitorEventLoopDelay`. */
  createDelayMonitor?: EventLoopDelayMonitorFactory;
};
|
||||
|
||||
function roundMetric(value: number, digits = 3): number {
|
||||
if (!Number.isFinite(value)) {
|
||||
return 0;
|
||||
@@ -38,14 +50,49 @@ function nanosecondsToMilliseconds(value: number): number {
|
||||
return roundMetric(value / 1_000_000, 1);
|
||||
}
|
||||
|
||||
export function createGatewayEventLoopHealthMonitor(): GatewayEventLoopHealthMonitor {
|
||||
/**
 * Derives the list of degradation reasons for one health sample.
 *
 * Event-loop delay is always evaluated. Utilization and CPU ratio are only
 * evaluated once the sample interval reaches
 * `SUSTAINED_LOAD_SAMPLE_MIN_INTERVAL_MS`.
 *
 * @param params Metrics of the sample: interval, p99/max delay (ms),
 *   event-loop utilization and CPU-to-wall-time ratio.
 * @returns Matching reasons in the fixed order delay, utilization, cpu;
 *   empty when none of the thresholds is reached.
 */
function resolveGatewayEventLoopHealthReasons(params: {
  intervalMs: number;
  delayP99Ms: number;
  delayMaxMs: number;
  utilization: number;
  cpuCoreRatio: number;
}): GatewayEventLoopHealthReason[] {
  const { intervalMs, delayP99Ms, delayMaxMs, utilization, cpuCoreRatio } = params;

  const windowIsSustained = intervalMs >= SUSTAINED_LOAD_SAMPLE_MIN_INTERVAL_MS;
  const delayExceeded = [delayP99Ms, delayMaxMs].some(
    (delayMs) => delayMs >= EVENT_LOOP_DELAY_WARN_MS,
  );

  // Table order defines the order of the returned reasons.
  const checks: Array<[GatewayEventLoopHealthReason, boolean]> = [
    ["event_loop_delay", delayExceeded],
    ["event_loop_utilization", windowIsSustained && utilization >= EVENT_LOOP_UTILIZATION_WARN],
    ["cpu", windowIsSustained && cpuCoreRatio >= CPU_CORE_RATIO_WARN],
  ];

  return checks.filter(([, triggered]) => triggered).map(([reason]) => reason);
}
|
||||
|
||||
export function createGatewayEventLoopHealthMonitor(
|
||||
deps: GatewayEventLoopHealthMonitorDeps = {},
|
||||
): GatewayEventLoopHealthMonitor {
|
||||
const nowMs = deps.now ?? Date.now;
|
||||
const readCpuUsage = deps.cpuUsage ?? process.cpuUsage.bind(process);
|
||||
const readEventLoopUtilization =
|
||||
deps.eventLoopUtilization ?? performance.eventLoopUtilization.bind(performance);
|
||||
const createDelayMonitor =
|
||||
deps.createDelayMonitor ??
|
||||
((resolutionMs: number) => monitorEventLoopDelay({ resolution: resolutionMs }));
|
||||
let monitor: EventLoopDelayMonitor | null = null;
|
||||
let lastWallAt = Date.now();
|
||||
let lastCpuUsage: CpuUsage | null = process.cpuUsage();
|
||||
let lastEventLoopUtilization: EventLoopUtilization | null = performance.eventLoopUtilization();
|
||||
let lastWallAt = nowMs();
|
||||
let lastCpuUsage: CpuUsage | null = readCpuUsage();
|
||||
let lastEventLoopUtilization: EventLoopUtilization | null = readEventLoopUtilization();
|
||||
|
||||
try {
|
||||
monitor = monitorEventLoopDelay({ resolution: EVENT_LOOP_MONITOR_RESOLUTION_MS });
|
||||
monitor = createDelayMonitor(EVENT_LOOP_MONITOR_RESOLUTION_MS);
|
||||
monitor.enable();
|
||||
monitor.reset();
|
||||
} catch {
|
||||
@@ -58,33 +105,36 @@ export function createGatewayEventLoopHealthMonitor(): GatewayEventLoopHealthMon
|
||||
return undefined;
|
||||
}
|
||||
|
||||
const now = Date.now();
|
||||
const now = nowMs();
|
||||
const intervalMs = Math.max(1, now - lastWallAt);
|
||||
const cpuUsage = process.cpuUsage(lastCpuUsage);
|
||||
const currentEventLoopUtilization = performance.eventLoopUtilization();
|
||||
const utilization = roundMetric(
|
||||
performance.eventLoopUtilization(currentEventLoopUtilization, lastEventLoopUtilization)
|
||||
.utilization,
|
||||
);
|
||||
const delayP99Ms = nanosecondsToMilliseconds(monitor.percentile(99));
|
||||
const delayMaxMs = nanosecondsToMilliseconds(monitor.max);
|
||||
const hasDelayWarning =
|
||||
delayP99Ms >= EVENT_LOOP_DELAY_WARN_MS || delayMaxMs >= EVENT_LOOP_DELAY_WARN_MS;
|
||||
|
||||
if (!hasDelayWarning && intervalMs < SUSTAINED_LOAD_SAMPLE_MIN_INTERVAL_MS) {
|
||||
monitor.reset();
|
||||
return undefined;
|
||||
}
|
||||
|
||||
const cpuUsage = readCpuUsage(lastCpuUsage);
|
||||
const currentEventLoopUtilization = readEventLoopUtilization();
|
||||
const utilization = roundMetric(
|
||||
readEventLoopUtilization(currentEventLoopUtilization, lastEventLoopUtilization).utilization,
|
||||
);
|
||||
const cpuTotalMs = roundMetric((cpuUsage.user + cpuUsage.system) / 1_000, 1);
|
||||
const cpuCoreRatio = roundMetric(cpuTotalMs / intervalMs);
|
||||
const reasons: GatewayEventLoopHealthReason[] = [];
|
||||
|
||||
if (delayP99Ms >= EVENT_LOOP_DELAY_WARN_MS || delayMaxMs >= EVENT_LOOP_DELAY_WARN_MS) {
|
||||
reasons.push("event_loop_delay");
|
||||
}
|
||||
if (utilization >= EVENT_LOOP_UTILIZATION_WARN) {
|
||||
reasons.push("event_loop_utilization");
|
||||
}
|
||||
if (cpuCoreRatio >= CPU_CORE_RATIO_WARN) {
|
||||
reasons.push("cpu");
|
||||
}
|
||||
const reasons = resolveGatewayEventLoopHealthReasons({
|
||||
intervalMs,
|
||||
delayP99Ms,
|
||||
delayMaxMs,
|
||||
utilization,
|
||||
cpuCoreRatio,
|
||||
});
|
||||
|
||||
monitor.reset();
|
||||
lastWallAt = now;
|
||||
lastCpuUsage = process.cpuUsage();
|
||||
lastCpuUsage = readCpuUsage();
|
||||
lastEventLoopUtilization = currentEventLoopUtilization;
|
||||
|
||||
return {
|
||||
|
||||
Reference in New Issue
Block a user