From 75ba8398f939d1793322fed78f70f06a031c36bf Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Tue, 28 Apr 2026 03:56:15 -0700 Subject: [PATCH] fix(gateway): expose event loop health in readiness --- docs/cli/gateway.md | 2 +- src/gateway/server-close.ts | 2 + src/gateway/server.impl.ts | 4 + src/gateway/server/event-loop-health.ts | 108 ++++++++++++++++++++++++ src/gateway/server/readiness.test.ts | 35 ++++++++ src/gateway/server/readiness.ts | 20 ++++- 6 files changed, 167 insertions(+), 4 deletions(-) create mode 100644 src/gateway/server/event-loop-health.ts diff --git a/docs/cli/gateway.md b/docs/cli/gateway.md index 47a547c061c..4da4db1b67f 100644 --- a/docs/cli/gateway.md +++ b/docs/cli/gateway.md @@ -145,7 +145,7 @@ When you set `--url`, the CLI does not fall back to config or environment creden openclaw gateway health --url ws://127.0.0.1:18789 ``` -The HTTP `/healthz` endpoint is a liveness probe: it returns once the server can answer HTTP. The HTTP `/readyz` endpoint is stricter and stays red while startup sidecars, channels, or configured hooks are still settling. +The HTTP `/healthz` endpoint is a liveness probe: it returns once the server can answer HTTP. The HTTP `/readyz` endpoint is stricter and stays red while startup sidecars, channels, or configured hooks are still settling. Local or authenticated detailed readiness responses include an `eventLoop` diagnostic block with event-loop delay, event-loop utilization, CPU core ratio, and a `degraded` flag. 
### `gateway usage-cost` diff --git a/src/gateway/server-close.ts b/src/gateway/server-close.ts index 099ba1792e8..67a196f366c 100644 --- a/src/gateway/server-close.ts +++ b/src/gateway/server-close.ts @@ -112,6 +112,7 @@ export async function runGatewayClosePrelude(params: { disposeBrowserAuthRateLimiter: () => void; stopModelPricingRefresh?: () => void; stopChannelHealthMonitor?: () => void; + stopReadinessEventLoopHealth?: () => void; clearSecretsRuntimeSnapshot?: () => void; closeMcpServer?: () => Promise<void>; }): Promise<void> { @@ -122,6 +123,7 @@ export async function runGatewayClosePrelude(params: { params.disposeBrowserAuthRateLimiter(); params.stopModelPricingRefresh?.(); params.stopChannelHealthMonitor?.(); + params.stopReadinessEventLoopHealth?.(); params.clearSecretsRuntimeSnapshot?.(); await params.closeMcpServer?.().catch(() => {}); } diff --git a/src/gateway/server.impl.ts b/src/gateway/server.impl.ts index cfd8756092a..218ef48991c 100644 --- a/src/gateway/server.impl.ts +++ b/src/gateway/server.impl.ts @@ -86,6 +86,7 @@ import { STARTUP_UNAVAILABLE_GATEWAY_METHODS } from "./server-startup-unavailabl import { startGatewayEarlyRuntime, startGatewayPostAttachRuntime } from "./server-startup.js"; import { createWizardSessionTracker } from "./server-wizard-sessions.js"; import { attachGatewayWsHandlers } from "./server-ws-runtime.js"; +import { createGatewayEventLoopHealthMonitor } from "./server/event-loop-health.js"; import { getHealthCache, getHealthVersion, @@ -565,6 +566,7 @@ export async function startGatewayServer( throw new Error(gatewayTls.error ?? 
"gateway tls: failed to enable"); } const serverStartedAt = Date.now(); + const readinessEventLoopHealth = createGatewayEventLoopHealthMonitor(); let startupSidecarsReady = minimalTestGateway; const channelManager = createChannelManager({ getRuntimeConfig: () => @@ -582,6 +584,7 @@ export async function startGatewayServer( channelManager, startedAt: serverStartedAt, getStartupPending: () => !startupSidecarsReady, + getEventLoopHealth: readinessEventLoopHealth.snapshot, }); log.info("starting HTTP server..."); const { @@ -682,6 +685,7 @@ export async function startGatewayServer( disposeBrowserAuthRateLimiter: () => browserAuthRateLimiter.dispose(), stopModelPricingRefresh: runtimeState.stopModelPricingRefresh, stopChannelHealthMonitor: () => runtimeState?.channelHealthMonitor?.stop(), + stopReadinessEventLoopHealth: readinessEventLoopHealth.stop, clearSecretsRuntimeSnapshot, closeMcpServer: closeMcpLoopbackServerOnDemand, }); diff --git a/src/gateway/server/event-loop-health.ts b/src/gateway/server/event-loop-health.ts new file mode 100644 index 00000000000..f59d2981fdf --- /dev/null +++ b/src/gateway/server/event-loop-health.ts @@ -0,0 +1,108 @@ +import { monitorEventLoopDelay, performance } from "node:perf_hooks"; + +const EVENT_LOOP_MONITOR_RESOLUTION_MS = 20; +const EVENT_LOOP_DELAY_WARN_MS = 1_000; +const EVENT_LOOP_UTILIZATION_WARN = 0.95; +const CPU_CORE_RATIO_WARN = 0.9; + +type EventLoopDelayMonitor = ReturnType<typeof monitorEventLoopDelay>; +type EventLoopUtilization = ReturnType<typeof performance.eventLoopUtilization>; +type CpuUsage = ReturnType<typeof process.cpuUsage>; + +export type GatewayEventLoopHealthReason = "event_loop_delay" | "event_loop_utilization" | "cpu"; + +export type GatewayEventLoopHealth = { + degraded: boolean; + reasons: GatewayEventLoopHealthReason[]; + intervalMs: number; + delayP99Ms: number; + delayMaxMs: number; + utilization: number; + cpuCoreRatio: number; +}; + +export type GatewayEventLoopHealthMonitor = { + snapshot: () => GatewayEventLoopHealth | undefined; + stop: () => void; +}; + +function roundMetric(value: 
number, digits = 3): number { + if (!Number.isFinite(value)) { + return 0; + } + const factor = 10 ** digits; + return Math.round(value * factor) / factor; +} + +function nanosecondsToMilliseconds(value: number): number { + return roundMetric(value / 1_000_000, 1); +} + +export function createGatewayEventLoopHealthMonitor(): GatewayEventLoopHealthMonitor { + let monitor: EventLoopDelayMonitor | null = null; + let lastWallAt = Date.now(); + let lastCpuUsage: CpuUsage | null = process.cpuUsage(); + let lastEventLoopUtilization: EventLoopUtilization | null = performance.eventLoopUtilization(); + + try { + monitor = monitorEventLoopDelay({ resolution: EVENT_LOOP_MONITOR_RESOLUTION_MS }); + monitor.enable(); + monitor.reset(); + } catch { + monitor = null; + } + + return { + snapshot: () => { + if (!monitor || !lastCpuUsage || !lastEventLoopUtilization || lastWallAt <= 0) { + return undefined; + } + + const now = Date.now(); + const intervalMs = Math.max(1, now - lastWallAt); + const cpuUsage = process.cpuUsage(lastCpuUsage); + const currentEventLoopUtilization = performance.eventLoopUtilization(); + const utilization = roundMetric( + performance.eventLoopUtilization(currentEventLoopUtilization, lastEventLoopUtilization) + .utilization, + ); + const delayP99Ms = nanosecondsToMilliseconds(monitor.percentile(99)); + const delayMaxMs = nanosecondsToMilliseconds(monitor.max); + const cpuTotalMs = roundMetric((cpuUsage.user + cpuUsage.system) / 1_000, 1); + const cpuCoreRatio = roundMetric(cpuTotalMs / intervalMs); + const reasons: GatewayEventLoopHealthReason[] = []; + + if (delayP99Ms >= EVENT_LOOP_DELAY_WARN_MS || delayMaxMs >= EVENT_LOOP_DELAY_WARN_MS) { + reasons.push("event_loop_delay"); + } + if (utilization >= EVENT_LOOP_UTILIZATION_WARN) { + reasons.push("event_loop_utilization"); + } + if (cpuCoreRatio >= CPU_CORE_RATIO_WARN) { + reasons.push("cpu"); + } + + monitor.reset(); + lastWallAt = now; + lastCpuUsage = process.cpuUsage(); + lastEventLoopUtilization = 
currentEventLoopUtilization; + + return { + degraded: reasons.length > 0, + reasons, + intervalMs, + delayP99Ms, + delayMaxMs, + utilization, + cpuCoreRatio, + }; + }, + stop: () => { + monitor?.disable(); + monitor = null; + lastWallAt = 0; + lastCpuUsage = null; + lastEventLoopUtilization = null; + }, + }; +} diff --git a/src/gateway/server/readiness.test.ts b/src/gateway/server/readiness.test.ts index 6541de9d0ca..bdde7d4418f 100644 --- a/src/gateway/server/readiness.test.ts +++ b/src/gateway/server/readiness.test.ts @@ -64,6 +64,7 @@ function createReadinessHarness(params: { startedAgoMs: number; accounts: Record>; getStartupPending?: () => boolean; + getEventLoopHealth?: Parameters<typeof createReadinessChecker>[0]["getEventLoopHealth"]; cacheTtlMs?: number; }) { const startedAt = Date.now() - params.startedAgoMs; @@ -74,6 +75,7 @@ function createReadinessHarness(params: { channelManager: manager, startedAt, getStartupPending: params.getStartupPending, + getEventLoopHealth: params.getEventLoopHealth, cacheTtlMs: params.cacheTtlMs, }), }; @@ -273,4 +275,37 @@ describe("createReadinessChecker", () => { expect(manager.getRuntimeSnapshot).toHaveBeenCalledTimes(2); }); }); + + it("adds event-loop health to detailed readiness without changing readiness state", () => { + withReadinessClock(() => { + const { readiness } = createReadinessHarness({ + startedAgoMs: 5 * 60_000, + accounts: {}, + getEventLoopHealth: () => ({ + degraded: true, + reasons: ["cpu", "event_loop_utilization"], + intervalMs: 2_000, + delayP99Ms: 42.1, + delayMaxMs: 88.7, + utilization: 0.991, + cpuCoreRatio: 0.973, + }), + }); + + expect(readiness()).toEqual({ + ready: true, + failing: [], + uptimeMs: 300_000, + eventLoop: { + degraded: true, + reasons: ["cpu", "event_loop_utilization"], + intervalMs: 2_000, + delayP99Ms: 42.1, + delayMaxMs: 88.7, + utilization: 0.991, + cpuCoreRatio: 0.973, + }, + }); + }); + }); }); diff --git a/src/gateway/server/readiness.ts b/src/gateway/server/readiness.ts index 40a6a1ff800..953002a5ba5 
100644 --- a/src/gateway/server/readiness.ts +++ b/src/gateway/server/readiness.ts @@ -7,11 +7,13 @@ import { type ChannelHealthEvaluation, } from "../channel-health-policy.js"; import type { ChannelManager } from "../server-channels.js"; +import type { GatewayEventLoopHealth } from "./event-loop-health.js"; export type ReadinessResult = { ready: boolean; failing: string[]; uptimeMs: number; + eventLoop?: GatewayEventLoopHealth; }; export type ReadinessChecker = () => ReadinessResult; @@ -35,6 +37,7 @@ export function createReadinessChecker(deps: { channelManager: ChannelManager; startedAt: number; getStartupPending?: () => boolean; + getEventLoopHealth?: () => GatewayEventLoopHealth | undefined; cacheTtlMs?: number; }): ReadinessChecker { const { channelManager, startedAt } = deps; @@ -46,10 +49,13 @@ export function createReadinessChecker(deps: { const now = Date.now(); const uptimeMs = now - startedAt; if (deps.getStartupPending?.()) { - return { ready: false, failing: ["startup-sidecars"], uptimeMs }; + return withEventLoopHealth( + { ready: false, failing: ["startup-sidecars"], uptimeMs }, + deps.getEventLoopHealth, + ); } if (cachedState && now - cachedAt < cacheTtlMs) { - return { ...cachedState, uptimeMs }; + return withEventLoopHealth({ ...cachedState, uptimeMs }, deps.getEventLoopHealth); } const snapshot = channelManager.getRuntimeSnapshot(); @@ -79,6 +85,14 @@ export function createReadinessChecker(deps: { cachedAt = now; cachedState = { ready: failing.length === 0, failing }; - return { ...cachedState, uptimeMs }; + return withEventLoopHealth({ ...cachedState, uptimeMs }, deps.getEventLoopHealth); }; } + +function withEventLoopHealth( + result: ReadinessResult, + getEventLoopHealth?: () => GatewayEventLoopHealth | undefined, +): ReadinessResult { + const eventLoop = getEventLoopHealth?.(); + return eventLoop ? { ...result, eventLoop } : result; +}