mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 10:40:43 +00:00
fix(gateway): expose event loop health in readiness
This commit is contained in:
@@ -145,7 +145,7 @@ When you set `--url`, the CLI does not fall back to config or environment creden
|
|||||||
openclaw gateway health --url ws://127.0.0.1:18789
|
openclaw gateway health --url ws://127.0.0.1:18789
|
||||||
```
|
```
|
||||||
|
|
||||||
The HTTP `/healthz` endpoint is a liveness probe: it returns once the server can answer HTTP. The HTTP `/readyz` endpoint is stricter and stays red while startup sidecars, channels, or configured hooks are still settling.
|
The HTTP `/healthz` endpoint is a liveness probe: it returns once the server can answer HTTP. The HTTP `/readyz` endpoint is stricter and stays red while startup sidecars, channels, or configured hooks are still settling. Local or authenticated detailed readiness responses include an `eventLoop` diagnostic block with event-loop delay, event-loop utilization, CPU core ratio, and a `degraded` flag.
|
||||||
|
|
||||||
### `gateway usage-cost`
|
### `gateway usage-cost`
|
||||||
|
|
||||||
|
|||||||
@@ -112,6 +112,7 @@ export async function runGatewayClosePrelude(params: {
|
|||||||
disposeBrowserAuthRateLimiter: () => void;
|
disposeBrowserAuthRateLimiter: () => void;
|
||||||
stopModelPricingRefresh?: () => void;
|
stopModelPricingRefresh?: () => void;
|
||||||
stopChannelHealthMonitor?: () => void;
|
stopChannelHealthMonitor?: () => void;
|
||||||
|
stopReadinessEventLoopHealth?: () => void;
|
||||||
clearSecretsRuntimeSnapshot?: () => void;
|
clearSecretsRuntimeSnapshot?: () => void;
|
||||||
closeMcpServer?: () => Promise<void>;
|
closeMcpServer?: () => Promise<void>;
|
||||||
}): Promise<void> {
|
}): Promise<void> {
|
||||||
@@ -122,6 +123,7 @@ export async function runGatewayClosePrelude(params: {
|
|||||||
params.disposeBrowserAuthRateLimiter();
|
params.disposeBrowserAuthRateLimiter();
|
||||||
params.stopModelPricingRefresh?.();
|
params.stopModelPricingRefresh?.();
|
||||||
params.stopChannelHealthMonitor?.();
|
params.stopChannelHealthMonitor?.();
|
||||||
|
params.stopReadinessEventLoopHealth?.();
|
||||||
params.clearSecretsRuntimeSnapshot?.();
|
params.clearSecretsRuntimeSnapshot?.();
|
||||||
await params.closeMcpServer?.().catch(() => {});
|
await params.closeMcpServer?.().catch(() => {});
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -86,6 +86,7 @@ import { STARTUP_UNAVAILABLE_GATEWAY_METHODS } from "./server-startup-unavailabl
|
|||||||
import { startGatewayEarlyRuntime, startGatewayPostAttachRuntime } from "./server-startup.js";
|
import { startGatewayEarlyRuntime, startGatewayPostAttachRuntime } from "./server-startup.js";
|
||||||
import { createWizardSessionTracker } from "./server-wizard-sessions.js";
|
import { createWizardSessionTracker } from "./server-wizard-sessions.js";
|
||||||
import { attachGatewayWsHandlers } from "./server-ws-runtime.js";
|
import { attachGatewayWsHandlers } from "./server-ws-runtime.js";
|
||||||
|
import { createGatewayEventLoopHealthMonitor } from "./server/event-loop-health.js";
|
||||||
import {
|
import {
|
||||||
getHealthCache,
|
getHealthCache,
|
||||||
getHealthVersion,
|
getHealthVersion,
|
||||||
@@ -565,6 +566,7 @@ export async function startGatewayServer(
|
|||||||
throw new Error(gatewayTls.error ?? "gateway tls: failed to enable");
|
throw new Error(gatewayTls.error ?? "gateway tls: failed to enable");
|
||||||
}
|
}
|
||||||
const serverStartedAt = Date.now();
|
const serverStartedAt = Date.now();
|
||||||
|
const readinessEventLoopHealth = createGatewayEventLoopHealthMonitor();
|
||||||
let startupSidecarsReady = minimalTestGateway;
|
let startupSidecarsReady = minimalTestGateway;
|
||||||
const channelManager = createChannelManager({
|
const channelManager = createChannelManager({
|
||||||
getRuntimeConfig: () =>
|
getRuntimeConfig: () =>
|
||||||
@@ -582,6 +584,7 @@ export async function startGatewayServer(
|
|||||||
channelManager,
|
channelManager,
|
||||||
startedAt: serverStartedAt,
|
startedAt: serverStartedAt,
|
||||||
getStartupPending: () => !startupSidecarsReady,
|
getStartupPending: () => !startupSidecarsReady,
|
||||||
|
getEventLoopHealth: readinessEventLoopHealth.snapshot,
|
||||||
});
|
});
|
||||||
log.info("starting HTTP server...");
|
log.info("starting HTTP server...");
|
||||||
const {
|
const {
|
||||||
@@ -682,6 +685,7 @@ export async function startGatewayServer(
|
|||||||
disposeBrowserAuthRateLimiter: () => browserAuthRateLimiter.dispose(),
|
disposeBrowserAuthRateLimiter: () => browserAuthRateLimiter.dispose(),
|
||||||
stopModelPricingRefresh: runtimeState.stopModelPricingRefresh,
|
stopModelPricingRefresh: runtimeState.stopModelPricingRefresh,
|
||||||
stopChannelHealthMonitor: () => runtimeState?.channelHealthMonitor?.stop(),
|
stopChannelHealthMonitor: () => runtimeState?.channelHealthMonitor?.stop(),
|
||||||
|
stopReadinessEventLoopHealth: readinessEventLoopHealth.stop,
|
||||||
clearSecretsRuntimeSnapshot,
|
clearSecretsRuntimeSnapshot,
|
||||||
closeMcpServer: closeMcpLoopbackServerOnDemand,
|
closeMcpServer: closeMcpLoopbackServerOnDemand,
|
||||||
});
|
});
|
||||||
|
|||||||
108
src/gateway/server/event-loop-health.ts
Normal file
108
src/gateway/server/event-loop-health.ts
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
import { monitorEventLoopDelay, performance } from "node:perf_hooks";
|
||||||
|
|
||||||
|
/** Sampling resolution, in milliseconds, passed to `monitorEventLoopDelay`. */
const EVENT_LOOP_MONITOR_RESOLUTION_MS = 20;
|
||||||
|
/** A p99 or max event-loop delay at or above this many milliseconds adds the "event_loop_delay" reason. */
const EVENT_LOOP_DELAY_WARN_MS = 1_000;
|
||||||
|
/** An event-loop utilization at or above this value adds the "event_loop_utilization" reason. */
const EVENT_LOOP_UTILIZATION_WARN = 0.95;
|
||||||
|
/** A CPU-time-to-wall-time ratio at or above this value adds the "cpu" reason. */
const CPU_CORE_RATIO_WARN = 0.9;
|
||||||
|
|
||||||
|
/** Handle returned by `monitorEventLoopDelay` from `node:perf_hooks`. */
type EventLoopDelayMonitor = ReturnType<typeof monitorEventLoopDelay>;
|
||||||
|
/** Reading returned by `performance.eventLoopUtilization()`. */
type EventLoopUtilization = ReturnType<typeof performance.eventLoopUtilization>;
|
||||||
|
/** Reading returned by `process.cpuUsage()`. */
type CpuUsage = ReturnType<typeof process.cpuUsage>;
|
||||||
|
|
||||||
|
/** Identifies which measurement crossed its warning threshold. */
export type GatewayEventLoopHealthReason = "event_loop_delay" | "event_loop_utilization" | "cpu";
|
||||||
|
|
||||||
|
/** Event-loop diagnostics for one measurement window, as produced by the health monitor. */
export type GatewayEventLoopHealth = {
|
||||||
|
/** True when `reasons` is non-empty. */
degraded: boolean;
|
||||||
|
/** Every threshold that was met or exceeded during the window. */
reasons: GatewayEventLoopHealthReason[];
|
||||||
|
/** Length of the measurement window in wall-clock milliseconds (at least 1). */
intervalMs: number;
|
||||||
|
/** 99th-percentile event-loop delay over the window, in milliseconds. */
delayP99Ms: number;
|
||||||
|
/** Maximum event-loop delay over the window, in milliseconds. */
delayMaxMs: number;
|
||||||
|
/** Event-loop utilization over the window, as computed by `performance.eventLoopUtilization`. */
utilization: number;
|
||||||
|
/** Process CPU time (user + system) divided by the window length. */
cpuCoreRatio: number;
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Handle for a running event-loop health monitor. */
export type GatewayEventLoopHealthMonitor = {
|
||||||
|
/** Returns the current health metrics, or `undefined` when the monitor is unavailable or stopped. */
snapshot: () => GatewayEventLoopHealth | undefined;
|
||||||
|
/** Disables the underlying delay monitor and drops all baselines. */
stop: () => void;
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
 * Rounds a metric to a fixed number of decimal places.
 *
 * The result is always a finite number so it can be serialized safely:
 * non-finite inputs collapse to 0, and inputs that cannot be scaled without
 * overflowing (or a degenerate `digits`) are returned unrounded.
 *
 * @param value - The raw metric value.
 * @param digits - Number of decimal places to keep (default 3).
 * @returns The rounded value, or 0 when `value` is NaN or infinite.
 */
function roundMetric(value: number, digits = 3): number {
  if (!Number.isFinite(value)) {
    return 0;
  }
  const factor = 10 ** digits;
  const scaled = value * factor;
  if (!Number.isFinite(scaled)) {
    // Scaling overflowed (huge value) or `digits` was degenerate; dividing back
    // would yield Infinity/NaN, so return the already-finite input untouched.
    return value;
  }
  return Math.round(scaled) / factor;
}
|
||||||
|
|
||||||
|
/**
 * Converts a nanosecond reading into milliseconds, rounded to one decimal place.
 *
 * @param value - Duration in nanoseconds.
 * @returns The duration in milliseconds.
 */
function nanosecondsToMilliseconds(value: number): number {
  const nanosecondsPerMillisecond = 1_000_000;
  const milliseconds = value / nanosecondsPerMillisecond;
  return roundMetric(milliseconds, 1);
}
|
||||||
|
|
||||||
|
/**
 * Creates a monitor that reports event-loop delay, event-loop utilization and
 * process CPU usage for the window since the previous sample.
 *
 * Each fresh sample resets the delay histogram and moves the CPU / utilization
 * baselines forward, so consecutive samples describe consecutive windows.
 * Calls that arrive before `minSampleIntervalMs` has elapsed since the last
 * fresh sample receive a copy of that sample instead: a window of only a few
 * milliseconds consists almost entirely of the synchronous work between the two
 * calls, which pushes utilization and CPU ratio towards 1 and would report a
 * false `degraded` state when several readiness probes arrive back to back.
 *
 * @param options.minSampleIntervalMs - Minimum window length in milliseconds
 *   before a new sample is taken (default 1000). Pass 0 to sample on every call.
 * @returns A monitor whose `snapshot()` yields the metrics (or `undefined` when
 *   the delay monitor could not be started or after `stop()`), and whose
 *   `stop()` disables the delay monitor and drops all baselines.
 */
export function createGatewayEventLoopHealthMonitor(
  options: { minSampleIntervalMs?: number } = {},
): GatewayEventLoopHealthMonitor {
  const minSampleIntervalMs = Math.max(0, options.minSampleIntervalMs ?? 1_000);

  let monitor: EventLoopDelayMonitor | null = null;
  let lastWallAt = Date.now();
  let lastCpuUsage: CpuUsage | null = process.cpuUsage();
  let lastEventLoopUtilization: EventLoopUtilization | null = performance.eventLoopUtilization();
  let lastSnapshot: GatewayEventLoopHealth | null = null;

  try {
    monitor = monitorEventLoopDelay({ resolution: EVENT_LOOP_MONITOR_RESOLUTION_MS });
    monitor.enable();
    monitor.reset();
  } catch {
    // Best effort: without a delay histogram the monitor stays disabled and
    // snapshot() reports undefined instead of failing gateway startup.
    monitor = null;
  }

  return {
    snapshot: () => {
      if (!monitor || !lastCpuUsage || !lastEventLoopUtilization || lastWallAt <= 0) {
        return undefined;
      }

      const now = Date.now();
      const elapsedMs = now - lastWallAt;
      // Serve the previous sample while the current window is still too short.
      // A negative elapsed time (wall clock stepped backwards) falls through to
      // a fresh sample so a stale result is not pinned indefinitely.
      if (lastSnapshot && elapsedMs >= 0 && elapsedMs < minSampleIntervalMs) {
        return { ...lastSnapshot, reasons: [...lastSnapshot.reasons] };
      }

      const intervalMs = Math.max(1, elapsedMs);
      // Read the absolute CPU counters once and diff manually, so the same
      // reading becomes the next baseline and no CPU time falls between windows.
      const currentCpuUsage = process.cpuUsage();
      const cpuUserMicros = currentCpuUsage.user - lastCpuUsage.user;
      const cpuSystemMicros = currentCpuUsage.system - lastCpuUsage.system;
      const currentEventLoopUtilization = performance.eventLoopUtilization();
      const utilization = roundMetric(
        performance.eventLoopUtilization(currentEventLoopUtilization, lastEventLoopUtilization)
          .utilization,
      );
      const delayP99Ms = nanosecondsToMilliseconds(monitor.percentile(99));
      const delayMaxMs = nanosecondsToMilliseconds(monitor.max);
      // process.cpuUsage() reports microseconds.
      const cpuTotalMs = roundMetric((cpuUserMicros + cpuSystemMicros) / 1_000, 1);
      const cpuCoreRatio = roundMetric(cpuTotalMs / intervalMs);
      const reasons: GatewayEventLoopHealthReason[] = [];

      if (delayP99Ms >= EVENT_LOOP_DELAY_WARN_MS || delayMaxMs >= EVENT_LOOP_DELAY_WARN_MS) {
        reasons.push("event_loop_delay");
      }
      if (utilization >= EVENT_LOOP_UTILIZATION_WARN) {
        reasons.push("event_loop_utilization");
      }
      if (cpuCoreRatio >= CPU_CORE_RATIO_WARN) {
        reasons.push("cpu");
      }

      // Start the next measurement window from this sample.
      monitor.reset();
      lastWallAt = now;
      lastCpuUsage = currentCpuUsage;
      lastEventLoopUtilization = currentEventLoopUtilization;

      const health: GatewayEventLoopHealth = {
        degraded: reasons.length > 0,
        reasons,
        intervalMs,
        delayP99Ms,
        delayMaxMs,
        utilization,
        cpuCoreRatio,
      };
      lastSnapshot = health;
      // Hand out a copy so callers cannot mutate the cached sample.
      return { ...health, reasons: [...reasons] };
    },
    stop: () => {
      monitor?.disable();
      monitor = null;
      lastWallAt = 0;
      lastCpuUsage = null;
      lastEventLoopUtilization = null;
      lastSnapshot = null;
    },
  };
}
|
||||||
@@ -64,6 +64,7 @@ function createReadinessHarness(params: {
|
|||||||
startedAgoMs: number;
|
startedAgoMs: number;
|
||||||
accounts: Record<string, Partial<ChannelAccountSnapshot>>;
|
accounts: Record<string, Partial<ChannelAccountSnapshot>>;
|
||||||
getStartupPending?: () => boolean;
|
getStartupPending?: () => boolean;
|
||||||
|
getEventLoopHealth?: Parameters<typeof createReadinessChecker>[0]["getEventLoopHealth"];
|
||||||
cacheTtlMs?: number;
|
cacheTtlMs?: number;
|
||||||
}) {
|
}) {
|
||||||
const startedAt = Date.now() - params.startedAgoMs;
|
const startedAt = Date.now() - params.startedAgoMs;
|
||||||
@@ -74,6 +75,7 @@ function createReadinessHarness(params: {
|
|||||||
channelManager: manager,
|
channelManager: manager,
|
||||||
startedAt,
|
startedAt,
|
||||||
getStartupPending: params.getStartupPending,
|
getStartupPending: params.getStartupPending,
|
||||||
|
getEventLoopHealth: params.getEventLoopHealth,
|
||||||
cacheTtlMs: params.cacheTtlMs,
|
cacheTtlMs: params.cacheTtlMs,
|
||||||
}),
|
}),
|
||||||
};
|
};
|
||||||
@@ -273,4 +275,37 @@ describe("createReadinessChecker", () => {
|
|||||||
expect(manager.getRuntimeSnapshot).toHaveBeenCalledTimes(2);
|
expect(manager.getRuntimeSnapshot).toHaveBeenCalledTimes(2);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("adds event-loop health to detailed readiness without changing readiness state", () => {
|
||||||
|
withReadinessClock(() => {
|
||||||
|
const { readiness } = createReadinessHarness({
|
||||||
|
startedAgoMs: 5 * 60_000,
|
||||||
|
accounts: {},
|
||||||
|
getEventLoopHealth: () => ({
|
||||||
|
degraded: true,
|
||||||
|
reasons: ["cpu", "event_loop_utilization"],
|
||||||
|
intervalMs: 2_000,
|
||||||
|
delayP99Ms: 42.1,
|
||||||
|
delayMaxMs: 88.7,
|
||||||
|
utilization: 0.991,
|
||||||
|
cpuCoreRatio: 0.973,
|
||||||
|
}),
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(readiness()).toEqual({
|
||||||
|
ready: true,
|
||||||
|
failing: [],
|
||||||
|
uptimeMs: 300_000,
|
||||||
|
eventLoop: {
|
||||||
|
degraded: true,
|
||||||
|
reasons: ["cpu", "event_loop_utilization"],
|
||||||
|
intervalMs: 2_000,
|
||||||
|
delayP99Ms: 42.1,
|
||||||
|
delayMaxMs: 88.7,
|
||||||
|
utilization: 0.991,
|
||||||
|
cpuCoreRatio: 0.973,
|
||||||
|
},
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -7,11 +7,13 @@ import {
|
|||||||
type ChannelHealthEvaluation,
|
type ChannelHealthEvaluation,
|
||||||
} from "../channel-health-policy.js";
|
} from "../channel-health-policy.js";
|
||||||
import type { ChannelManager } from "../server-channels.js";
|
import type { ChannelManager } from "../server-channels.js";
|
||||||
|
import type { GatewayEventLoopHealth } from "./event-loop-health.js";
|
||||||
|
|
||||||
export type ReadinessResult = {
|
export type ReadinessResult = {
|
||||||
ready: boolean;
|
ready: boolean;
|
||||||
failing: string[];
|
failing: string[];
|
||||||
uptimeMs: number;
|
uptimeMs: number;
|
||||||
|
eventLoop?: GatewayEventLoopHealth;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type ReadinessChecker = () => ReadinessResult;
|
export type ReadinessChecker = () => ReadinessResult;
|
||||||
@@ -35,6 +37,7 @@ export function createReadinessChecker(deps: {
|
|||||||
channelManager: ChannelManager;
|
channelManager: ChannelManager;
|
||||||
startedAt: number;
|
startedAt: number;
|
||||||
getStartupPending?: () => boolean;
|
getStartupPending?: () => boolean;
|
||||||
|
getEventLoopHealth?: () => GatewayEventLoopHealth | undefined;
|
||||||
cacheTtlMs?: number;
|
cacheTtlMs?: number;
|
||||||
}): ReadinessChecker {
|
}): ReadinessChecker {
|
||||||
const { channelManager, startedAt } = deps;
|
const { channelManager, startedAt } = deps;
|
||||||
@@ -46,10 +49,13 @@ export function createReadinessChecker(deps: {
|
|||||||
const now = Date.now();
|
const now = Date.now();
|
||||||
const uptimeMs = now - startedAt;
|
const uptimeMs = now - startedAt;
|
||||||
if (deps.getStartupPending?.()) {
|
if (deps.getStartupPending?.()) {
|
||||||
return { ready: false, failing: ["startup-sidecars"], uptimeMs };
|
return withEventLoopHealth(
|
||||||
|
{ ready: false, failing: ["startup-sidecars"], uptimeMs },
|
||||||
|
deps.getEventLoopHealth,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
if (cachedState && now - cachedAt < cacheTtlMs) {
|
if (cachedState && now - cachedAt < cacheTtlMs) {
|
||||||
return { ...cachedState, uptimeMs };
|
return withEventLoopHealth({ ...cachedState, uptimeMs }, deps.getEventLoopHealth);
|
||||||
}
|
}
|
||||||
|
|
||||||
const snapshot = channelManager.getRuntimeSnapshot();
|
const snapshot = channelManager.getRuntimeSnapshot();
|
||||||
@@ -79,6 +85,14 @@ export function createReadinessChecker(deps: {
|
|||||||
|
|
||||||
cachedAt = now;
|
cachedAt = now;
|
||||||
cachedState = { ready: failing.length === 0, failing };
|
cachedState = { ready: failing.length === 0, failing };
|
||||||
return { ...cachedState, uptimeMs };
|
return withEventLoopHealth({ ...cachedState, uptimeMs }, deps.getEventLoopHealth);
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Attaches the event-loop diagnostic block to a readiness result.
 *
 * @param result - The readiness result to decorate.
 * @param getEventLoopHealth - Optional provider of the current event-loop health.
 * @returns A copy of `result` with an `eventLoop` field when the provider
 *   yields a value; otherwise `result` itself, unchanged.
 */
function withEventLoopHealth(
  result: ReadinessResult,
  getEventLoopHealth?: () => GatewayEventLoopHealth | undefined,
): ReadinessResult {
  if (!getEventLoopHealth) {
    return result;
  }
  const health = getEventLoopHealth();
  if (!health) {
    return result;
  }
  return { ...result, eventLoop: health };
}
|
||||||
|
|||||||
Reference in New Issue
Block a user