fix(gateway): expose event loop health in readiness

This commit is contained in:
Vincent Koc
2026-04-28 03:56:15 -07:00
parent 9f7932fbcc
commit 75ba8398f9
6 changed files with 167 additions and 4 deletions

View File

@@ -145,7 +145,7 @@ When you set `--url`, the CLI does not fall back to config or environment creden
openclaw gateway health --url ws://127.0.0.1:18789
```
The HTTP `/healthz` endpoint is a liveness probe: it returns once the server can answer HTTP. The HTTP `/readyz` endpoint is stricter and stays red while startup sidecars, channels, or configured hooks are still settling.
The HTTP `/healthz` endpoint is a liveness probe: it returns once the server can answer HTTP. The HTTP `/readyz` endpoint is stricter and stays red while startup sidecars, channels, or configured hooks are still settling. Local or authenticated detailed readiness responses include an `eventLoop` diagnostic block with event-loop delay, event-loop utilization, CPU core ratio, a `degraded` flag, and the `reasons` behind that flag. The block is informational and does not change the `ready` result.
### `gateway usage-cost`

View File

@@ -112,6 +112,7 @@ export async function runGatewayClosePrelude(params: {
disposeBrowserAuthRateLimiter: () => void;
stopModelPricingRefresh?: () => void;
stopChannelHealthMonitor?: () => void;
stopReadinessEventLoopHealth?: () => void;
clearSecretsRuntimeSnapshot?: () => void;
closeMcpServer?: () => Promise<void>;
}): Promise<void> {
@@ -122,6 +123,7 @@ export async function runGatewayClosePrelude(params: {
params.disposeBrowserAuthRateLimiter();
params.stopModelPricingRefresh?.();
params.stopChannelHealthMonitor?.();
params.stopReadinessEventLoopHealth?.();
params.clearSecretsRuntimeSnapshot?.();
await params.closeMcpServer?.().catch(() => {});
}

View File

@@ -86,6 +86,7 @@ import { STARTUP_UNAVAILABLE_GATEWAY_METHODS } from "./server-startup-unavailabl
import { startGatewayEarlyRuntime, startGatewayPostAttachRuntime } from "./server-startup.js";
import { createWizardSessionTracker } from "./server-wizard-sessions.js";
import { attachGatewayWsHandlers } from "./server-ws-runtime.js";
import { createGatewayEventLoopHealthMonitor } from "./server/event-loop-health.js";
import {
getHealthCache,
getHealthVersion,
@@ -565,6 +566,7 @@ export async function startGatewayServer(
throw new Error(gatewayTls.error ?? "gateway tls: failed to enable");
}
const serverStartedAt = Date.now();
const readinessEventLoopHealth = createGatewayEventLoopHealthMonitor();
let startupSidecarsReady = minimalTestGateway;
const channelManager = createChannelManager({
getRuntimeConfig: () =>
@@ -582,6 +584,7 @@ export async function startGatewayServer(
channelManager,
startedAt: serverStartedAt,
getStartupPending: () => !startupSidecarsReady,
getEventLoopHealth: readinessEventLoopHealth.snapshot,
});
log.info("starting HTTP server...");
const {
@@ -682,6 +685,7 @@ export async function startGatewayServer(
disposeBrowserAuthRateLimiter: () => browserAuthRateLimiter.dispose(),
stopModelPricingRefresh: runtimeState.stopModelPricingRefresh,
stopChannelHealthMonitor: () => runtimeState?.channelHealthMonitor?.stop(),
stopReadinessEventLoopHealth: readinessEventLoopHealth.stop,
clearSecretsRuntimeSnapshot,
closeMcpServer: closeMcpLoopbackServerOnDemand,
});

View File

@@ -0,0 +1,108 @@
import { monitorEventLoopDelay, performance } from "node:perf_hooks";
/** Value passed as the `resolution` option (milliseconds) to `monitorEventLoopDelay`. */
const EVENT_LOOP_MONITOR_RESOLUTION_MS = 20;
/** A p99 or max event-loop delay at or above this many milliseconds adds the "event_loop_delay" reason. */
const EVENT_LOOP_DELAY_WARN_MS = 1_000;
/** An event-loop utilization at or above this value adds the "event_loop_utilization" reason. */
const EVENT_LOOP_UTILIZATION_WARN = 0.95;
/** A CPU-time-to-elapsed-time ratio at or above this value adds the "cpu" reason. */
const CPU_CORE_RATIO_WARN = 0.9;
/** Histogram handle returned by `monitorEventLoopDelay`. */
type EventLoopDelayMonitor = ReturnType<typeof monitorEventLoopDelay>;
/** Reading returned by `performance.eventLoopUtilization`. */
type EventLoopUtilization = ReturnType<typeof performance.eventLoopUtilization>;
/** Reading returned by `process.cpuUsage`. */
type CpuUsage = ReturnType<typeof process.cpuUsage>;
/** Identifies which threshold a sampling window crossed. */
export type GatewayEventLoopHealthReason = "event_loop_delay" | "event_loop_utilization" | "cpu";
/** Metrics for one sampling window, as produced by the monitor's `snapshot()`. */
export type GatewayEventLoopHealth = {
// True when `reasons` is non-empty.
degraded: boolean;
// Thresholds crossed during the window; empty when healthy.
reasons: GatewayEventLoopHealthReason[];
// Length of the sampling window in milliseconds (at least 1).
intervalMs: number;
// 99th-percentile event-loop delay for the window, in milliseconds.
delayP99Ms: number;
// Largest event-loop delay recorded in the window, in milliseconds.
delayMaxMs: number;
// Event-loop utilization over the window.
utilization: number;
// CPU time (user + system) consumed in the window divided by the window length.
cpuCoreRatio: number;
};
/** Handle returned by `createGatewayEventLoopHealthMonitor`. */
export type GatewayEventLoopHealthMonitor = {
// Returns the metrics for the window since the previous call, or `undefined`
// when the delay monitor is unavailable or the monitor has been stopped.
snapshot: () => GatewayEventLoopHealth | undefined;
// Disables the delay monitor; later `snapshot()` calls return `undefined`.
stop: () => void;
};
/**
 * Rounds a metric to a fixed number of decimal places.
 *
 * @param value - Raw reading; `NaN` and ±`Infinity` are reported as `0`.
 * @param digits - Decimal places to keep (defaults to 3).
 * @returns The rounded reading, or `0` for non-finite input.
 */
function roundMetric(value: number, digits = 3): number {
  const scale = Math.pow(10, digits);
  return Number.isFinite(value) ? Math.round(value * scale) / scale : 0;
}
/**
 * Converts a nanosecond reading to milliseconds, rounded to one decimal place.
 *
 * @param value - Duration in nanoseconds.
 * @returns The duration in milliseconds.
 */
function nanosecondsToMilliseconds(value: number): number {
  const milliseconds = value / 1_000_000;
  return roundMetric(milliseconds, 1);
}
/**
 * Creates a sampler that reports event-loop delay, event-loop utilization and
 * CPU usage for the window between consecutive `snapshot()` calls.
 *
 * The first window starts when this function is called. Every `snapshot()`
 * closes the current window, resets the delay histogram and opens the next one.
 *
 * NOTE(review): because each call resets the window, callers that sample in
 * quick succession observe very short windows; consider rate-limiting callers
 * if that produces noisy readings.
 *
 * @returns A monitor whose `snapshot()` yields the metrics for the elapsed
 *   window (or `undefined` when the delay monitor could not be created or the
 *   monitor has been stopped) and whose `stop()` disables sampling.
 */
export function createGatewayEventLoopHealthMonitor(): GatewayEventLoopHealthMonitor {
  let monitor: EventLoopDelayMonitor | null = null;
  // Readings taken at the start of the current window; cleared by stop().
  // `at` uses performance.now() rather than Date.now() so that system clock
  // adjustments cannot shrink (or stretch) the measured window.
  let baseline: { at: number; cpu: CpuUsage; elu: EventLoopUtilization } | null = {
    at: performance.now(),
    cpu: process.cpuUsage(),
    elu: performance.eventLoopUtilization(),
  };
  try {
    monitor = monitorEventLoopDelay({ resolution: EVENT_LOOP_MONITOR_RESOLUTION_MS });
    monitor.enable();
    monitor.reset();
  } catch {
    // Best effort: without a delay monitor, snapshot() reports `undefined`.
    monitor = null;
  }
  return {
    snapshot: () => {
      if (!monitor || !baseline) {
        return undefined;
      }
      const now = performance.now();
      // Single CPU reading used both for this window's delta and as the next
      // window's baseline, so no CPU time falls between two separate readings.
      const cpuNow = process.cpuUsage();
      const eluNow = performance.eventLoopUtilization();

      // Clamp to 1 ms so the CPU ratio never divides by zero.
      const elapsedMs = Math.max(1, now - baseline.at);
      const intervalMs = Math.round(elapsedMs);
      const utilization = roundMetric(
        performance.eventLoopUtilization(eluNow, baseline.elu).utilization,
      );
      const delayP99Ms = nanosecondsToMilliseconds(monitor.percentile(99));
      const delayMaxMs = nanosecondsToMilliseconds(monitor.max);
      // process.cpuUsage() reports microseconds.
      const cpuDeltaMicros =
        cpuNow.user - baseline.cpu.user + (cpuNow.system - baseline.cpu.system);
      const cpuTotalMs = roundMetric(cpuDeltaMicros / 1_000, 1);
      const cpuCoreRatio = roundMetric(cpuTotalMs / elapsedMs);

      const reasons: GatewayEventLoopHealthReason[] = [];
      if (delayP99Ms >= EVENT_LOOP_DELAY_WARN_MS || delayMaxMs >= EVENT_LOOP_DELAY_WARN_MS) {
        reasons.push("event_loop_delay");
      }
      if (utilization >= EVENT_LOOP_UTILIZATION_WARN) {
        reasons.push("event_loop_utilization");
      }
      if (cpuCoreRatio >= CPU_CORE_RATIO_WARN) {
        reasons.push("cpu");
      }

      // Open the next window.
      monitor.reset();
      baseline = { at: now, cpu: cpuNow, elu: eluNow };

      return {
        degraded: reasons.length > 0,
        reasons,
        intervalMs,
        delayP99Ms,
        delayMaxMs,
        utilization,
        cpuCoreRatio,
      };
    },
    stop: () => {
      monitor?.disable();
      monitor = null;
      baseline = null;
    },
  };
}

View File

@@ -64,6 +64,7 @@ function createReadinessHarness(params: {
startedAgoMs: number;
accounts: Record<string, Partial<ChannelAccountSnapshot>>;
getStartupPending?: () => boolean;
getEventLoopHealth?: Parameters<typeof createReadinessChecker>[0]["getEventLoopHealth"];
cacheTtlMs?: number;
}) {
const startedAt = Date.now() - params.startedAgoMs;
@@ -74,6 +75,7 @@ function createReadinessHarness(params: {
channelManager: manager,
startedAt,
getStartupPending: params.getStartupPending,
getEventLoopHealth: params.getEventLoopHealth,
cacheTtlMs: params.cacheTtlMs,
}),
};
@@ -273,4 +275,37 @@ describe("createReadinessChecker", () => {
expect(manager.getRuntimeSnapshot).toHaveBeenCalledTimes(2);
});
});
it("adds event-loop health to detailed readiness without changing readiness state", () => {
  withReadinessClock(() => {
    // Five minutes of uptime with no channel accounts: the checker is ready.
    const startedAgoMs = 5 * 60_000;
    const { readiness } = createReadinessHarness({
      startedAgoMs,
      accounts: {},
      // Report a degraded event loop; readiness must still come back ready.
      getEventLoopHealth() {
        return {
          degraded: true,
          reasons: ["cpu", "event_loop_utilization"],
          intervalMs: 2_000,
          delayP99Ms: 42.1,
          delayMaxMs: 88.7,
          utilization: 0.991,
          cpuCoreRatio: 0.973,
        };
      },
    });

    const result = readiness();

    expect(result).toEqual({
      ready: true,
      failing: [],
      uptimeMs: startedAgoMs,
      eventLoop: {
        degraded: true,
        reasons: ["cpu", "event_loop_utilization"],
        intervalMs: 2_000,
        delayP99Ms: 42.1,
        delayMaxMs: 88.7,
        utilization: 0.991,
        cpuCoreRatio: 0.973,
      },
    });
  });
});
});

View File

@@ -7,11 +7,13 @@ import {
type ChannelHealthEvaluation,
} from "../channel-health-policy.js";
import type { ChannelManager } from "../server-channels.js";
import type { GatewayEventLoopHealth } from "./event-loop-health.js";
/** Outcome of one readiness evaluation. */
export type ReadinessResult = {
// Whether the gateway is considered ready.
ready: boolean;
// Identifiers of the checks that are currently failing (for example "startup-sidecars").
failing: string[];
// Milliseconds elapsed since the `startedAt` timestamp given to the checker.
uptimeMs: number;
// Event-loop diagnostics; present only when the health getter returns a value.
eventLoop?: GatewayEventLoopHealth;
};
/** Function that evaluates readiness and returns the current result. */
export type ReadinessChecker = () => ReadinessResult;
@@ -35,6 +37,7 @@ export function createReadinessChecker(deps: {
channelManager: ChannelManager;
startedAt: number;
getStartupPending?: () => boolean;
getEventLoopHealth?: () => GatewayEventLoopHealth | undefined;
cacheTtlMs?: number;
}): ReadinessChecker {
const { channelManager, startedAt } = deps;
@@ -46,10 +49,13 @@ export function createReadinessChecker(deps: {
const now = Date.now();
const uptimeMs = now - startedAt;
if (deps.getStartupPending?.()) {
return { ready: false, failing: ["startup-sidecars"], uptimeMs };
return withEventLoopHealth(
{ ready: false, failing: ["startup-sidecars"], uptimeMs },
deps.getEventLoopHealth,
);
}
if (cachedState && now - cachedAt < cacheTtlMs) {
return { ...cachedState, uptimeMs };
return withEventLoopHealth({ ...cachedState, uptimeMs }, deps.getEventLoopHealth);
}
const snapshot = channelManager.getRuntimeSnapshot();
@@ -79,6 +85,14 @@ export function createReadinessChecker(deps: {
cachedAt = now;
cachedState = { ready: failing.length === 0, failing };
return { ...cachedState, uptimeMs };
return withEventLoopHealth({ ...cachedState, uptimeMs }, deps.getEventLoopHealth);
};
}
/**
 * Attaches event-loop diagnostics to a readiness result.
 *
 * @param result - Readiness result to decorate.
 * @param getEventLoopHealth - Optional getter for the current diagnostics.
 * @returns `result` itself when no getter is given or it yields nothing;
 *   otherwise a copy of `result` with the diagnostics under `eventLoop`.
 */
function withEventLoopHealth(
  result: ReadinessResult,
  getEventLoopHealth?: () => GatewayEventLoopHealth | undefined,
): ReadinessResult {
  if (!getEventLoopHealth) {
    return result;
  }
  const health = getEventLoopHealth();
  if (!health) {
    return result;
  }
  return { ...result, eventLoop: health };
}