diff --git a/CHANGELOG.md b/CHANGELOG.md index b7ddc1ed51e..d8b46cc428c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -95,6 +95,7 @@ Docs: https://docs.openclaw.ai - Agents/generated media: treat attachment-style message tool actions as completed chat sends, preventing duplicate fallback media posts when generated files were already uploaded. - Control UI/sessions: show each session's agent runtime in the Sessions table and allow filtering by runtime labels, matching the Agents panel runtime wording. Thanks @vincentkoc. - Discord/streaming: show live reasoning text in progress drafts instead of a bare `Reasoning` status line. +- Gateway/status: avoid marking fast repeated health/status samples as event-loop degraded from CPU/utilization alone until the Gateway has accumulated a sustained sampling window. Thanks @shakkernerd. - Doctor/status: warn when `OPENCLAW_GATEWAY_TOKEN` would shadow a different active `gateway.auth.token` source for local CLI commands, while avoiding false positives when config points at the same env token. Fixes #74271. Thanks @yelog. - Gateway/HTTP: avoid loading managed outgoing-image media handlers for unrelated requests, so disabled OpenAI-compatible routes return 404 without waiting on lazy media sidecars. Thanks @vincentkoc. - Gateway/OpenAI-compatible: send the assistant role SSE chunk as soon as streaming chat-completion headers are accepted, so cold agent setup cannot leave `/v1/chat/completions` clients with a bodyless 200 response until their idle timeout fires. diff --git a/src/gateway/server/event-loop-health.test.ts b/src/gateway/server/event-loop-health.test.ts new file mode 100644 index 00000000000..575a4e68fd7 --- /dev/null +++ b/src/gateway/server/event-loop-health.test.ts @@ -0,0 +1,121 @@ +import type { monitorEventLoopDelay, performance } from "node:perf_hooks"; +import { describe, expect, it, vi } from "vitest"; +import { createGatewayEventLoopHealthMonitor } from "./event-loop-health.js"; + +type CpuUsage = ReturnType; +type DelayMonitor = ReturnType; +type EventLoopUtilization = ReturnType; +type GatewayEventLoopHealthMonitorDeps = NonNullable< + Parameters[0] +>; + +function createMonitorHarness(params?: { cpuMsPerWallMs?: number; utilization?: number }) { + const startedAt = 10_000; + let nowMs = startedAt; + let delayP99Ms = 0; + let delayMaxMs = 0; + const cpuMsPerWallMs = params?.cpuMsPerWallMs ?? 1; + const utilization = params?.utilization ?? 1; + const delayMonitor = { + enable: vi.fn(), + disable: vi.fn(), + reset: vi.fn(), + percentile: vi.fn(() => delayP99Ms * 1_000_000), + get max() { + return delayMaxMs * 1_000_000; + }, + } as unknown as DelayMonitor; + const cpuUsage = vi.fn((previous?: CpuUsage) => { + const current = { + user: Math.round(nowMs * cpuMsPerWallMs * 1_000), + system: 0, + }; + if (!previous) { + return current; + } + return { + user: current.user - previous.user, + system: current.system - previous.system, + }; + }) as NonNullable; + const eventLoopUtilization = vi.fn( + (current?: EventLoopUtilization, previous?: EventLoopUtilization) => { + if (!current || !previous) { + return { idle: 0, active: nowMs, utilization }; + } + return { + idle: 0, + active: current.active - previous.active, + utilization, + }; + }, + ) as NonNullable; + const monitor = createGatewayEventLoopHealthMonitor({ + now: () => nowMs, + cpuUsage, + eventLoopUtilization, + createDelayMonitor: () => delayMonitor, + }); + + return { + monitor, + cpuUsage, + eventLoopUtilization, + setNow: (value: number) => { + nowMs = startedAt + value; + }, + setDelay: (value: { p99Ms?: number; maxMs?: number }) => { + delayP99Ms = value.p99Ms ?? delayP99Ms; + delayMaxMs = value.maxMs ?? delayMaxMs; + }, + }; +} + +describe("createGatewayEventLoopHealthMonitor", () => { + it("waits for a sustained sample window before reporting CPU-only saturation", () => { + const harness = createMonitorHarness(); + + harness.setNow(42); + expect(harness.monitor.snapshot()).toBeUndefined(); + expect(harness.cpuUsage).toHaveBeenCalledTimes(1); + expect(harness.eventLoopUtilization).toHaveBeenCalledTimes(1); + + harness.setNow(1_000); + expect(harness.monitor.snapshot()).toMatchObject({ + degraded: true, + reasons: ["event_loop_utilization", "cpu"], + intervalMs: 1_000, + delayP99Ms: 0, + delayMaxMs: 0, + utilization: 1, + cpuCoreRatio: 1, + }); + }); + + it("does not wait for the sustained sample window before reporting event-loop delay", () => { + const harness = createMonitorHarness(); + harness.setDelay({ maxMs: 1_500 }); + harness.setNow(42); + + expect(harness.monitor.snapshot()).toMatchObject({ + degraded: true, + reasons: ["event_loop_delay"], + intervalMs: 42, + delayP99Ms: 0, + delayMaxMs: 1_500, + }); + }); + + it("returns a non-degraded snapshot when the sustained load sample is healthy", () => { + const harness = createMonitorHarness({ cpuMsPerWallMs: 0.1, utilization: 0.2 }); + harness.setNow(1_000); + + expect(harness.monitor.snapshot()).toMatchObject({ + degraded: false, + reasons: [], + intervalMs: 1_000, + utilization: 0.2, + cpuCoreRatio: 0.1, + }); + }); +}); diff --git a/src/gateway/server/event-loop-health.ts b/src/gateway/server/event-loop-health.ts index f59d2981fdf..2a8c06b5a9e 100644 --- a/src/gateway/server/event-loop-health.ts +++ b/src/gateway/server/event-loop-health.ts @@ -4,6 +4,7 @@ const EVENT_LOOP_MONITOR_RESOLUTION_MS = 20; const EVENT_LOOP_DELAY_WARN_MS = 1_000; const EVENT_LOOP_UTILIZATION_WARN = 0.95; const CPU_CORE_RATIO_WARN = 0.9; +const SUSTAINED_LOAD_SAMPLE_MIN_INTERVAL_MS = 1_000; type EventLoopDelayMonitor = ReturnType; type EventLoopUtilization = ReturnType; @@ -26,6 +27,17 @@ export type GatewayEventLoopHealthMonitor = { stop: () => void; }; +type EventLoopUtilizationReader = typeof performance.eventLoopUtilization; + +type EventLoopDelayMonitorFactory = (resolutionMs: number) => EventLoopDelayMonitor; + +type GatewayEventLoopHealthMonitorDeps = { + now?: () => number; + cpuUsage?: typeof process.cpuUsage; + eventLoopUtilization?: EventLoopUtilizationReader; + createDelayMonitor?: EventLoopDelayMonitorFactory; +}; + function roundMetric(value: number, digits = 3): number { if (!Number.isFinite(value)) { return 0; @@ -38,14 +50,49 @@ function nanosecondsToMilliseconds(value: number): number { return roundMetric(value / 1_000_000, 1); } -export function createGatewayEventLoopHealthMonitor(): GatewayEventLoopHealthMonitor { +function resolveGatewayEventLoopHealthReasons(params: { + intervalMs: number; + delayP99Ms: number; + delayMaxMs: number; + utilization: number; + cpuCoreRatio: number; +}): GatewayEventLoopHealthReason[] { + const reasons: GatewayEventLoopHealthReason[] = []; + const hasSustainedLoadWindow = params.intervalMs >= SUSTAINED_LOAD_SAMPLE_MIN_INTERVAL_MS; + + if ( + params.delayP99Ms >= EVENT_LOOP_DELAY_WARN_MS || + params.delayMaxMs >= EVENT_LOOP_DELAY_WARN_MS + ) { + reasons.push("event_loop_delay"); + } + if (hasSustainedLoadWindow && params.utilization >= EVENT_LOOP_UTILIZATION_WARN) { + reasons.push("event_loop_utilization"); + } + if (hasSustainedLoadWindow && params.cpuCoreRatio >= CPU_CORE_RATIO_WARN) { + reasons.push("cpu"); + } + + return reasons; +} + +export function createGatewayEventLoopHealthMonitor( + deps: GatewayEventLoopHealthMonitorDeps = {}, +): GatewayEventLoopHealthMonitor { + const nowMs = deps.now ?? Date.now; + const readCpuUsage = deps.cpuUsage ?? process.cpuUsage.bind(process); + const readEventLoopUtilization = + deps.eventLoopUtilization ?? performance.eventLoopUtilization.bind(performance); + const createDelayMonitor = + deps.createDelayMonitor ?? + ((resolutionMs: number) => monitorEventLoopDelay({ resolution: resolutionMs })); let monitor: EventLoopDelayMonitor | null = null; - let lastWallAt = Date.now(); - let lastCpuUsage: CpuUsage | null = process.cpuUsage(); - let lastEventLoopUtilization: EventLoopUtilization | null = performance.eventLoopUtilization(); + let lastWallAt = nowMs(); + let lastCpuUsage: CpuUsage | null = readCpuUsage(); + let lastEventLoopUtilization: EventLoopUtilization | null = readEventLoopUtilization(); try { - monitor = monitorEventLoopDelay({ resolution: EVENT_LOOP_MONITOR_RESOLUTION_MS }); + monitor = createDelayMonitor(EVENT_LOOP_MONITOR_RESOLUTION_MS); monitor.enable(); monitor.reset(); } catch { @@ -58,33 +105,36 @@ export function createGatewayEventLoopHealthMonitor(): GatewayEventLoopHealthMon return undefined; } - const now = Date.now(); + const now = nowMs(); const intervalMs = Math.max(1, now - lastWallAt); - const cpuUsage = process.cpuUsage(lastCpuUsage); - const currentEventLoopUtilization = performance.eventLoopUtilization(); - const utilization = roundMetric( - performance.eventLoopUtilization(currentEventLoopUtilization, lastEventLoopUtilization) - .utilization, - ); const delayP99Ms = nanosecondsToMilliseconds(monitor.percentile(99)); const delayMaxMs = nanosecondsToMilliseconds(monitor.max); + const hasDelayWarning = + delayP99Ms >= EVENT_LOOP_DELAY_WARN_MS || delayMaxMs >= EVENT_LOOP_DELAY_WARN_MS; + + if (!hasDelayWarning && intervalMs < SUSTAINED_LOAD_SAMPLE_MIN_INTERVAL_MS) { + monitor.reset(); + return undefined; + } + + const cpuUsage = readCpuUsage(lastCpuUsage); + const currentEventLoopUtilization = readEventLoopUtilization(); + const utilization = roundMetric( + readEventLoopUtilization(currentEventLoopUtilization, lastEventLoopUtilization).utilization, + ); const cpuTotalMs = roundMetric((cpuUsage.user + cpuUsage.system) / 1_000, 1); const cpuCoreRatio = roundMetric(cpuTotalMs / intervalMs); - const reasons: GatewayEventLoopHealthReason[] = []; - - if (delayP99Ms >= EVENT_LOOP_DELAY_WARN_MS || delayMaxMs >= EVENT_LOOP_DELAY_WARN_MS) { - reasons.push("event_loop_delay"); - } - if (utilization >= EVENT_LOOP_UTILIZATION_WARN) { - reasons.push("event_loop_utilization"); - } - if (cpuCoreRatio >= CPU_CORE_RATIO_WARN) { - reasons.push("cpu"); - } + const reasons = resolveGatewayEventLoopHealthReasons({ + intervalMs, + delayP99Ms, + delayMaxMs, + utilization, + cpuCoreRatio, + }); monitor.reset(); lastWallAt = now; - lastCpuUsage = process.cpuUsage(); + lastCpuUsage = readCpuUsage(); lastEventLoopUtilization = currentEventLoopUtilization; return {