mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:30:42 +00:00
fix(gateway): expose event loop health in readiness
This commit is contained in:
@@ -145,7 +145,7 @@ When you set `--url`, the CLI does not fall back to config or environment creden
|
||||
openclaw gateway health --url ws://127.0.0.1:18789
|
||||
```
|
||||
|
||||
The HTTP `/healthz` endpoint is a liveness probe: it returns once the server can answer HTTP. The HTTP `/readyz` endpoint is stricter and stays red while startup sidecars, channels, or configured hooks are still settling.
|
||||
The HTTP `/healthz` endpoint is a liveness probe: it returns once the server can answer HTTP. The HTTP `/readyz` endpoint is stricter and stays red while startup sidecars, channels, or configured hooks are still settling. Local or authenticated detailed readiness responses include an `eventLoop` diagnostic block with event-loop delay, event-loop utilization, CPU core ratio, and a `degraded` flag.
|
||||
|
||||
### `gateway usage-cost`
|
||||
|
||||
|
||||
@@ -112,6 +112,7 @@ export async function runGatewayClosePrelude(params: {
|
||||
disposeBrowserAuthRateLimiter: () => void;
|
||||
stopModelPricingRefresh?: () => void;
|
||||
stopChannelHealthMonitor?: () => void;
|
||||
stopReadinessEventLoopHealth?: () => void;
|
||||
clearSecretsRuntimeSnapshot?: () => void;
|
||||
closeMcpServer?: () => Promise<void>;
|
||||
}): Promise<void> {
|
||||
@@ -122,6 +123,7 @@ export async function runGatewayClosePrelude(params: {
|
||||
params.disposeBrowserAuthRateLimiter();
|
||||
params.stopModelPricingRefresh?.();
|
||||
params.stopChannelHealthMonitor?.();
|
||||
params.stopReadinessEventLoopHealth?.();
|
||||
params.clearSecretsRuntimeSnapshot?.();
|
||||
await params.closeMcpServer?.().catch(() => {});
|
||||
}
|
||||
|
||||
@@ -86,6 +86,7 @@ import { STARTUP_UNAVAILABLE_GATEWAY_METHODS } from "./server-startup-unavailabl
|
||||
import { startGatewayEarlyRuntime, startGatewayPostAttachRuntime } from "./server-startup.js";
|
||||
import { createWizardSessionTracker } from "./server-wizard-sessions.js";
|
||||
import { attachGatewayWsHandlers } from "./server-ws-runtime.js";
|
||||
import { createGatewayEventLoopHealthMonitor } from "./server/event-loop-health.js";
|
||||
import {
|
||||
getHealthCache,
|
||||
getHealthVersion,
|
||||
@@ -565,6 +566,7 @@ export async function startGatewayServer(
|
||||
throw new Error(gatewayTls.error ?? "gateway tls: failed to enable");
|
||||
}
|
||||
const serverStartedAt = Date.now();
|
||||
const readinessEventLoopHealth = createGatewayEventLoopHealthMonitor();
|
||||
let startupSidecarsReady = minimalTestGateway;
|
||||
const channelManager = createChannelManager({
|
||||
getRuntimeConfig: () =>
|
||||
@@ -582,6 +584,7 @@ export async function startGatewayServer(
|
||||
channelManager,
|
||||
startedAt: serverStartedAt,
|
||||
getStartupPending: () => !startupSidecarsReady,
|
||||
getEventLoopHealth: readinessEventLoopHealth.snapshot,
|
||||
});
|
||||
log.info("starting HTTP server...");
|
||||
const {
|
||||
@@ -682,6 +685,7 @@ export async function startGatewayServer(
|
||||
disposeBrowserAuthRateLimiter: () => browserAuthRateLimiter.dispose(),
|
||||
stopModelPricingRefresh: runtimeState.stopModelPricingRefresh,
|
||||
stopChannelHealthMonitor: () => runtimeState?.channelHealthMonitor?.stop(),
|
||||
stopReadinessEventLoopHealth: readinessEventLoopHealth.stop,
|
||||
clearSecretsRuntimeSnapshot,
|
||||
closeMcpServer: closeMcpLoopbackServerOnDemand,
|
||||
});
|
||||
|
||||
108
src/gateway/server/event-loop-health.ts
Normal file
108
src/gateway/server/event-loop-health.ts
Normal file
@@ -0,0 +1,108 @@
|
||||
import { monitorEventLoopDelay, performance } from "node:perf_hooks";
|
||||
|
||||
/** Sampling resolution, in milliseconds, passed to `monitorEventLoopDelay`. */
const EVENT_LOOP_MONITOR_RESOLUTION_MS = 20;

/** A p99 or max event-loop delay at or above this many milliseconds marks the loop as degraded. */
const EVENT_LOOP_DELAY_WARN_MS = 1000;

/** An event-loop utilization at or above this fraction marks the loop as degraded. */
const EVENT_LOOP_UTILIZATION_WARN = 0.95;

/** A CPU-time-to-elapsed-time ratio at or above this value marks the process as degraded. */
const CPU_CORE_RATIO_WARN = 0.9;
|
||||
|
||||
/** Histogram handle returned by `monitorEventLoopDelay`. */
type EventLoopDelayMonitor = ReturnType<typeof monitorEventLoopDelay>;

/** Reading returned by `performance.eventLoopUtilization`. */
type EventLoopUtilization = ReturnType<typeof performance.eventLoopUtilization>;

/** Reading returned by `process.cpuUsage`. */
type CpuUsage = ReturnType<typeof process.cpuUsage>;

/** Identifies which threshold caused a snapshot to be flagged as degraded. */
export type GatewayEventLoopHealthReason =
  | "event_loop_delay"
  | "event_loop_utilization"
  | "cpu";

/** Diagnostic snapshot describing one sampling window of event-loop and CPU activity. */
export type GatewayEventLoopHealth = {
  /** True when at least one entry is present in `reasons`. */
  degraded: boolean;
  /** Thresholds that were met or exceeded during the window. */
  reasons: GatewayEventLoopHealthReason[];
  /** Length of the sampling window in milliseconds (never less than 1). */
  intervalMs: number;
  /** 99th-percentile event-loop delay for the window, in milliseconds. */
  delayP99Ms: number;
  /** Maximum event-loop delay for the window, in milliseconds. */
  delayMaxMs: number;
  /** Event-loop utilization for the window. */
  utilization: number;
  /** CPU time (user + system) consumed in the window divided by the window length. */
  cpuCoreRatio: number;
};

/** Handle for taking event-loop health snapshots and releasing the underlying monitor. */
export type GatewayEventLoopHealthMonitor = {
  /** Returns the current snapshot, or `undefined` when no data is available. */
  snapshot: () => GatewayEventLoopHealth | undefined;
  /** Stops monitoring. */
  stop: () => void;
};
|
||||
|
||||
/**
 * Rounds a metric to a fixed number of decimal places.
 *
 * @param value - Number to round.
 * @param digits - Decimal places to keep (defaults to 3).
 * @returns The rounded value, or 0 when `value` is NaN or infinite.
 */
function roundMetric(value: number, digits = 3): number {
  if (Number.isFinite(value)) {
    const scale = Math.pow(10, digits);
    return Math.round(value * scale) / scale;
  }
  // Non-finite readings are reported as zero rather than propagated.
  return 0;
}
|
||||
|
||||
/**
 * Converts a duration in nanoseconds to milliseconds, rounded to one decimal place.
 *
 * @param value - Duration in nanoseconds.
 * @returns The duration in milliseconds.
 */
function nanosecondsToMilliseconds(value: number): number {
  const NANOSECONDS_PER_MILLISECOND = 1_000_000;
  const milliseconds = value / NANOSECONDS_PER_MILLISECOND;
  return roundMetric(milliseconds, 1);
}
|
||||
|
||||
/**
 * Creates a monitor that reports event-loop delay, event-loop utilization, and
 * process CPU usage for the window between successive `snapshot()` calls.
 *
 * Every `snapshot()` call reports the window that started at the previous
 * snapshot (or at creation) and then starts a new window.
 *
 * NOTE(review): because each snapshot resets the window, back-to-back callers
 * observe very short windows; confirm callers poll at a sensible cadence.
 *
 * @returns An object with `snapshot`, which yields the current health reading
 *   (or `undefined` when the delay monitor could not be started or the monitor
 *   has been stopped), and `stop`, which disables the monitor and is safe to
 *   call more than once.
 */
export function createGatewayEventLoopHealthMonitor(): GatewayEventLoopHealthMonitor {
  let monitor: EventLoopDelayMonitor | null = null;
  // Window start on the monotonic clock so wall-clock adjustments cannot
  // shrink or stretch the measured interval; null once stopped.
  let windowStartedAt: number | null = performance.now();
  let lastCpuUsage: CpuUsage | null = process.cpuUsage();
  let lastEventLoopUtilization: EventLoopUtilization | null = performance.eventLoopUtilization();

  try {
    monitor = monitorEventLoopDelay({ resolution: EVENT_LOOP_MONITOR_RESOLUTION_MS });
    monitor.enable();
    monitor.reset();
  } catch {
    // Best effort: without a delay monitor, snapshot() reports undefined.
    monitor = null;
  }

  return {
    snapshot: () => {
      if (
        !monitor ||
        !lastCpuUsage ||
        !lastEventLoopUtilization ||
        windowStartedAt === null
      ) {
        return undefined;
      }

      const now = performance.now();
      // Clamp to 1 ms so the CPU ratio below never divides by zero.
      const intervalMs = Math.max(1, Math.round(now - windowStartedAt));

      // One absolute CPU reading serves as both the end of this window and the
      // baseline of the next, so no CPU time falls between two windows.
      const currentCpuUsage = process.cpuUsage();
      const cpuMicros =
        currentCpuUsage.user -
        lastCpuUsage.user +
        (currentCpuUsage.system - lastCpuUsage.system);

      const currentEventLoopUtilization = performance.eventLoopUtilization();
      const utilization = roundMetric(
        performance.eventLoopUtilization(currentEventLoopUtilization, lastEventLoopUtilization)
          .utilization,
      );
      const delayP99Ms = nanosecondsToMilliseconds(monitor.percentile(99));
      const delayMaxMs = nanosecondsToMilliseconds(monitor.max);
      // process.cpuUsage() reports microseconds; convert to milliseconds.
      const cpuTotalMs = roundMetric(cpuMicros / 1_000, 1);
      const cpuCoreRatio = roundMetric(cpuTotalMs / intervalMs);
      const reasons: GatewayEventLoopHealthReason[] = [];

      if (delayP99Ms >= EVENT_LOOP_DELAY_WARN_MS || delayMaxMs >= EVENT_LOOP_DELAY_WARN_MS) {
        reasons.push("event_loop_delay");
      }
      if (utilization >= EVENT_LOOP_UTILIZATION_WARN) {
        reasons.push("event_loop_utilization");
      }
      if (cpuCoreRatio >= CPU_CORE_RATIO_WARN) {
        reasons.push("cpu");
      }

      // Start the next sampling window from the readings taken above.
      monitor.reset();
      windowStartedAt = now;
      lastCpuUsage = currentCpuUsage;
      lastEventLoopUtilization = currentEventLoopUtilization;

      return {
        degraded: reasons.length > 0,
        reasons,
        intervalMs,
        delayP99Ms,
        delayMaxMs,
        utilization,
        cpuCoreRatio,
      };
    },
    stop: () => {
      monitor?.disable();
      // Clearing the baselines makes every later snapshot() return undefined.
      monitor = null;
      windowStartedAt = null;
      lastCpuUsage = null;
      lastEventLoopUtilization = null;
    },
  };
}
|
||||
@@ -64,6 +64,7 @@ function createReadinessHarness(params: {
|
||||
startedAgoMs: number;
|
||||
accounts: Record<string, Partial<ChannelAccountSnapshot>>;
|
||||
getStartupPending?: () => boolean;
|
||||
getEventLoopHealth?: Parameters<typeof createReadinessChecker>[0]["getEventLoopHealth"];
|
||||
cacheTtlMs?: number;
|
||||
}) {
|
||||
const startedAt = Date.now() - params.startedAgoMs;
|
||||
@@ -74,6 +75,7 @@ function createReadinessHarness(params: {
|
||||
channelManager: manager,
|
||||
startedAt,
|
||||
getStartupPending: params.getStartupPending,
|
||||
getEventLoopHealth: params.getEventLoopHealth,
|
||||
cacheTtlMs: params.cacheTtlMs,
|
||||
}),
|
||||
};
|
||||
@@ -273,4 +275,37 @@ describe("createReadinessChecker", () => {
|
||||
expect(manager.getRuntimeSnapshot).toHaveBeenCalledTimes(2);
|
||||
});
|
||||
});
|
||||
|
||||
it("adds event-loop health to detailed readiness without changing readiness state", () => {
  withReadinessClock(() => {
    // Provider reporting a degraded event loop; the readiness verdict below
    // must remain `ready: true` regardless of this diagnostic.
    const provideEventLoopHealth: NonNullable<
      Parameters<typeof createReadinessChecker>[0]["getEventLoopHealth"]
    > = () => ({
      degraded: true,
      reasons: ["cpu", "event_loop_utilization"],
      intervalMs: 2_000,
      delayP99Ms: 42.1,
      delayMaxMs: 88.7,
      utilization: 0.991,
      cpuCoreRatio: 0.973,
    });

    const harness = createReadinessHarness({
      startedAgoMs: 5 * 60_000,
      accounts: {},
      getEventLoopHealth: provideEventLoopHealth,
    });

    const result = harness.readiness();

    // The diagnostic block is attached verbatim next to the unchanged verdict.
    expect(result).toEqual({
      ready: true,
      failing: [],
      uptimeMs: 300_000,
      eventLoop: provideEventLoopHealth(),
    });
  });
});
|
||||
});
|
||||
|
||||
@@ -7,11 +7,13 @@ import {
|
||||
type ChannelHealthEvaluation,
|
||||
} from "../channel-health-policy.js";
|
||||
import type { ChannelManager } from "../server-channels.js";
|
||||
import type { GatewayEventLoopHealth } from "./event-loop-health.js";
|
||||
|
||||
/** Outcome of a readiness evaluation. */
export type ReadinessResult = {
  /** Whether the gateway is considered ready. */
  ready: boolean;
  /** Identifiers of the checks that are currently failing. */
  failing: string[];
  /** Milliseconds elapsed since the recorded start time. */
  uptimeMs: number;
  /** Event-loop diagnostic block; present only when a health provider returned one. */
  eventLoop?: GatewayEventLoopHealth;
};

/** Function that evaluates readiness each time it is called. */
export type ReadinessChecker = () => ReadinessResult;
|
||||
@@ -35,6 +37,7 @@ export function createReadinessChecker(deps: {
|
||||
channelManager: ChannelManager;
|
||||
startedAt: number;
|
||||
getStartupPending?: () => boolean;
|
||||
getEventLoopHealth?: () => GatewayEventLoopHealth | undefined;
|
||||
cacheTtlMs?: number;
|
||||
}): ReadinessChecker {
|
||||
const { channelManager, startedAt } = deps;
|
||||
@@ -46,10 +49,13 @@ export function createReadinessChecker(deps: {
|
||||
const now = Date.now();
|
||||
const uptimeMs = now - startedAt;
|
||||
if (deps.getStartupPending?.()) {
|
||||
return { ready: false, failing: ["startup-sidecars"], uptimeMs };
|
||||
return withEventLoopHealth(
|
||||
{ ready: false, failing: ["startup-sidecars"], uptimeMs },
|
||||
deps.getEventLoopHealth,
|
||||
);
|
||||
}
|
||||
if (cachedState && now - cachedAt < cacheTtlMs) {
|
||||
return { ...cachedState, uptimeMs };
|
||||
return withEventLoopHealth({ ...cachedState, uptimeMs }, deps.getEventLoopHealth);
|
||||
}
|
||||
|
||||
const snapshot = channelManager.getRuntimeSnapshot();
|
||||
@@ -79,6 +85,14 @@ export function createReadinessChecker(deps: {
|
||||
|
||||
cachedAt = now;
|
||||
cachedState = { ready: failing.length === 0, failing };
|
||||
return { ...cachedState, uptimeMs };
|
||||
return withEventLoopHealth({ ...cachedState, uptimeMs }, deps.getEventLoopHealth);
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Attaches the event-loop diagnostic block to a readiness result.
 *
 * @param result - Readiness result to decorate.
 * @param getEventLoopHealth - Optional provider of the current event-loop health.
 * @returns A copy of `result` with `eventLoop` set when the provider yields a
 *   value; otherwise `result` itself, untouched.
 */
function withEventLoopHealth(
  result: ReadinessResult,
  getEventLoopHealth?: () => GatewayEventLoopHealth | undefined,
): ReadinessResult {
  const health = getEventLoopHealth == null ? undefined : getEventLoopHealth();
  if (!health) {
    // No provider, or the provider has nothing to report.
    return result;
  }
  return { ...result, eventLoop: health };
}
|
||||
|
||||
Reference in New Issue
Block a user