From c89da2a60679a87ae8590f4cd9c590cfd08acc62 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 2 May 2026 04:55:59 +0100 Subject: [PATCH] fix: reduce idle liveness warning noise --- CHANGELOG.md | 1 + docs/gateway/diagnostics.md | 8 ++-- src/logging/diagnostic-stability.ts | 2 +- src/logging/diagnostic.test.ts | 72 ++++++++++++++++++++++++++++- src/logging/diagnostic.ts | 57 ++++++++++++++++------- 5 files changed, 117 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e6b0836438..a04b1c364e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ Docs: https://docs.openclaw.ai - Media: trim serialized JSON suffixes after local `MEDIA:` directive file extensions, so generated-image metadata cannot pollute the parsed media path and cause false `ENOENT` delivery failures. Fixes #75182. Thanks @TnzGit and @hclsys. - Cron: make scheduler reload schedule comparison tolerate malformed persisted jobs, so one bad cron entry no longer aborts the whole tick. Fixes #75886. Thanks @samfox-ai. - Doctor/channels: warn after migrations when default Telegram or Discord accounts have no configured token and their env fallback (`TELEGRAM_BOT_TOKEN` or `DISCORD_BOT_TOKEN`) is unavailable, with secret-safe migration docs for checking state-dir `.env`. Fixes #74298. Thanks @lolaopenclaw. +- Gateway/diagnostics: keep idle liveness samples in telemetry instead of visible warning logs unless diagnostic work is active, waiting, or queued. Thanks @vincentkoc. - Control UI/chat: keep live replies visible when a raw session alias such as `main` sends the chat turn but Gateway emits events under the canonical session key for the same run. Fixes #73716. Thanks @teebes. - CLI/models: reject `--agent` on `openclaw models set` and `set-image` instead of silently writing agent-scoped requests to global model defaults. Fixes #68391. Thanks @derrickabellard. - CLI: stop treating the legacy singular `openclaw tool ...` token as a plugin id under restrictive `plugins.allow`, so it falls through as a normal unknown/reserved command instead of suggesting a stale allowlist entry. Fixes #64732. Thanks @efe-arv, @SweetSophia, and @hashtag1974. diff --git a/docs/gateway/diagnostics.md b/docs/gateway/diagnostics.md index 10cef77ffee..8136483c7bc 100644 --- a/docs/gateway/diagnostics.md +++ b/docs/gateway/diagnostics.md @@ -114,11 +114,13 @@ export keeps only that a message was omitted and the byte count. The Gateway records a bounded, payload-free stability stream by default when diagnostics are enabled. It is for operational facts, not content. -The same diagnostic heartbeat records liveness warnings when the Gateway keeps +The same diagnostic heartbeat records liveness samples when the Gateway keeps running but the Node.js event loop or CPU looks saturated. These `diagnostic.liveness.warning` events include event-loop delay, event-loop -utilization, CPU-core ratio, and active/waiting/queued session counts. They do -not restart the Gateway by themselves. +utilization, CPU-core ratio, and active/waiting/queued session counts. Idle +samples stay in telemetry at `info` level; they are only logged as Gateway +warnings when diagnostic work is active, waiting, or queued. They do not +restart the Gateway by themselves. Inspect the live recorder: diff --git a/src/logging/diagnostic-stability.ts b/src/logging/diagnostic-stability.ts index 37d97c3fb25..dd4c361111b 100644 --- a/src/logging/diagnostic-stability.ts +++ b/src/logging/diagnostic-stability.ts @@ -281,7 +281,7 @@ function sanitizeDiagnosticEvent(event: DiagnosticEventPayload): DiagnosticStabi record.queued = event.queued; break; case "diagnostic.liveness.warning": - record.level = "warning"; + record.level = event.active > 0 || event.waiting > 0 || event.queued > 0 ? "warning" : "info"; record.durationMs = event.intervalMs; record.count = event.reasons.length; assignReasonCode(record, event.reasons[0]); diff --git a/src/logging/diagnostic.test.ts b/src/logging/diagnostic.test.ts index 5eff8fb922b..2893cccadf6 100644 --- a/src/logging/diagnostic.test.ts +++ b/src/logging/diagnostic.test.ts @@ -25,6 +25,7 @@ import { import { logSessionStateChange, logMessageQueued, + diagnosticLogger, markDiagnosticSessionProgress, resetDiagnosticStateForTest, resolveStuckSessionWarnMs, @@ -124,6 +125,7 @@ describe("stuck session diagnostics threshold", () => { afterEach(() => { resetDiagnosticEventsForTest(); resetDiagnosticStateForTest(); + vi.restoreAllMocks(); vi.useRealTimers(); }); @@ -427,8 +429,9 @@ describe("stuck session diagnostics threshold", () => { expect(emitMemorySample).toHaveBeenLastCalledWith({ emitSample: true }); }); - it("emits idle liveness warnings into the stability recorder", () => { + it("records idle liveness samples without warning in the gateway log", () => { const emitMemorySample = createEmitMemorySampleMock(); + const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined); const events: string[] = []; const unsubscribe = onDiagnosticEvent((event) => events.push(event.type)); @@ -461,11 +464,12 @@ describe("stuck session diagnostics threshold", () => { } expect(events).toContain("diagnostic.liveness.warning"); + expect(warnSpy).not.toHaveBeenCalledWith(expect.stringContaining("liveness warning:")); expect(emitMemorySample).toHaveBeenLastCalledWith({ emitSample: true }); expect(getDiagnosticStabilitySnapshot({ limit: 10 }).events).toContainEqual( expect.objectContaining({ type: "diagnostic.liveness.warning", - level: "warning", + level: "info", reason: "cpu", durationMs: 30_000, count: 1, @@ -480,6 +484,70 @@ describe("stuck session diagnostics threshold", () => { ); }); + it("warns for liveness samples when diagnostic work is open", () => { + const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined); + + startDiagnosticHeartbeat( + { + diagnostics: { + enabled: true, + }, + }, + { + emitMemorySample: createEmitMemorySampleMock(), + sampleLiveness: () => ({ + reasons: ["event_loop_delay"], + intervalMs: 30_000, + eventLoopDelayP99Ms: 1_500, + eventLoopDelayMaxMs: 2_000, + }), + }, + ); + + logMessageQueued({ sessionId: "s1", sessionKey: "main", source: "test" }); + vi.advanceTimersByTime(30_000); + + expect(warnSpy).toHaveBeenCalledWith(expect.stringContaining("liveness warning:")); + expect(getDiagnosticStabilitySnapshot({ limit: 10 }).events).toContainEqual( + expect.objectContaining({ + type: "diagnostic.liveness.warning", + level: "warning", + active: 0, + waiting: 0, + queued: 1, + }), + ); + }); + + it("does not let idle liveness samples suppress later active-work warnings", () => { + const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined); + + startDiagnosticHeartbeat( + { + diagnostics: { + enabled: true, + }, + }, + { + emitMemorySample: createEmitMemorySampleMock(), + sampleLiveness: () => ({ + reasons: ["event_loop_delay"], + intervalMs: 30_000, + eventLoopDelayP99Ms: 1_500, + eventLoopDelayMaxMs: 2_000, + }), + }, + ); + + vi.advanceTimersByTime(30_000); + expect(warnSpy).not.toHaveBeenCalled(); + + logMessageQueued({ sessionId: "s1", sessionKey: "main", source: "test" }); + vi.advanceTimersByTime(30_000); + + expect(warnSpy).toHaveBeenCalledWith(expect.stringContaining("liveness warning:")); + }); + it("throttles repeated liveness warnings", () => { const events: string[] = []; const unsubscribe = onDiagnosticEvent((event) => events.push(event.type)); diff --git a/src/logging/diagnostic.ts b/src/logging/diagnostic.ts index 87f5fccc124..0898dbf294f 100644 --- a/src/logging/diagnostic.ts +++ b/src/logging/diagnostic.ts @@ -112,6 +112,7 @@ let diagnosticLivenessMonitor: EventLoopDelayMonitor | null = null; let lastDiagnosticLivenessWallAt = 0; let lastDiagnosticLivenessCpuUsage: CpuUsage | null = null; let lastDiagnosticLivenessEventLoopUtilization: EventLoopUtilization | null = null; +let lastDiagnosticLivenessEventAt = 0; let lastDiagnosticLivenessWarnAt = 0; function loadCommandPollBackoffRuntime() { @@ -179,6 +180,7 @@ function startDiagnosticLivenessSampler(): void { lastDiagnosticLivenessWallAt = Date.now(); lastDiagnosticLivenessCpuUsage = process.cpuUsage(); lastDiagnosticLivenessEventLoopUtilization = performance.eventLoopUtilization(); + lastDiagnosticLivenessEventAt = 0; lastDiagnosticLivenessWarnAt = 0; if (diagnosticLivenessMonitor) { @@ -202,6 +204,7 @@ function stopDiagnosticLivenessSampler(): void { lastDiagnosticLivenessWallAt = 0; lastDiagnosticLivenessCpuUsage = null; lastDiagnosticLivenessEventLoopUtilization = null; + lastDiagnosticLivenessEventAt = 0; lastDiagnosticLivenessWarnAt = 0; } @@ -266,7 +269,21 @@ function sampleDiagnosticLiveness(now: number): DiagnosticLivenessSample | null }; } -function shouldEmitDiagnosticLivenessWarning(now: number): boolean { +function shouldEmitDiagnosticLivenessEvent(now: number): boolean { + if ( + lastDiagnosticLivenessEventAt > 0 && + now - lastDiagnosticLivenessEventAt < DEFAULT_LIVENESS_WARN_COOLDOWN_MS + ) { + return false; + } + lastDiagnosticLivenessEventAt = now; + return true; +} + +function shouldEmitDiagnosticLivenessWarning(now: number, work: DiagnosticWorkSnapshot): boolean { + if (!hasOpenDiagnosticWork(work)) { + return false; + } if ( lastDiagnosticLivenessWarnAt > 0 && now - lastDiagnosticLivenessWarnAt < DEFAULT_LIVENESS_WARN_COOLDOWN_MS @@ -281,19 +298,22 @@ function emitDiagnosticLivenessWarning( sample: DiagnosticLivenessSample, work: DiagnosticWorkSnapshot, ): void { - diag.warn( - `liveness warning: reasons=${sample.reasons.join(",")} interval=${Math.round( - sample.intervalMs / 1000, - )}s eventLoopDelayP99Ms=${formatOptionalDiagnosticMetric( - sample.eventLoopDelayP99Ms, - )} eventLoopDelayMaxMs=${formatOptionalDiagnosticMetric( - sample.eventLoopDelayMaxMs, - )} eventLoopUtilization=${formatOptionalDiagnosticMetric( - sample.eventLoopUtilization, - )} cpuCoreRatio=${formatOptionalDiagnosticMetric(sample.cpuCoreRatio)} active=${ - work.activeCount - } waiting=${work.waitingCount} queued=${work.queuedCount}`, - ); + const message = `liveness warning: reasons=${sample.reasons.join(",")} interval=${Math.round( + sample.intervalMs / 1000, + )}s eventLoopDelayP99Ms=${formatOptionalDiagnosticMetric( + sample.eventLoopDelayP99Ms, + )} eventLoopDelayMaxMs=${formatOptionalDiagnosticMetric( + sample.eventLoopDelayMaxMs, + )} eventLoopUtilization=${formatOptionalDiagnosticMetric( + sample.eventLoopUtilization, + )} cpuCoreRatio=${formatOptionalDiagnosticMetric(sample.cpuCoreRatio)} active=${ + work.activeCount + } waiting=${work.waitingCount} queued=${work.queuedCount}`; + if (hasOpenDiagnosticWork(work)) { + diag.warn(message); + } else { + diag.debug(message); + } emitDiagnosticEvent({ type: "diagnostic.liveness.warning", reasons: sample.reasons, @@ -735,10 +755,13 @@ export function startDiagnosticHeartbeat( pruneDiagnosticSessionStates(now, true); const work = getDiagnosticWorkSnapshot(); const livenessSample = (opts?.sampleLiveness ?? sampleDiagnosticLiveness)(now, work); + const shouldEmitLivenessEvent = + livenessSample !== null && shouldEmitDiagnosticLivenessEvent(now); const shouldEmitLivenessWarning = - livenessSample !== null && shouldEmitDiagnosticLivenessWarning(now); + livenessSample !== null && shouldEmitDiagnosticLivenessWarning(now, work); + const shouldEmitLivenessReport = shouldEmitLivenessEvent || shouldEmitLivenessWarning; const shouldRecordMemorySample = - shouldEmitLivenessWarning || hasRecentDiagnosticActivity(now) || hasOpenDiagnosticWork(work); + shouldEmitLivenessReport || hasRecentDiagnosticActivity(now) || hasOpenDiagnosticWork(work); (opts?.emitMemorySample ?? emitDiagnosticMemorySample)({ emitSample: shouldRecordMemorySample, }); @@ -747,7 +770,7 @@ export function startDiagnosticHeartbeat( return; } - if (shouldEmitLivenessWarning && livenessSample) { + if (shouldEmitLivenessReport && livenessSample) { emitDiagnosticLivenessWarning(livenessSample, work); }