From 669786595d64c2b9c8e8bea3ea908975c540f395 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 17 May 2026 07:15:02 +0100 Subject: [PATCH] fix(logging): avoid false liveness backlog warnings --- CHANGELOG.md | 1 + src/logging/diagnostic-session-state.ts | 2 + src/logging/diagnostic.test.ts | 80 +++++++++++++++++++++++++ src/logging/diagnostic.ts | 13 +++- 4 files changed, 94 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 543cf1ab8df..92663cb44e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,7 @@ Docs: https://docs.openclaw.ai - WhatsApp: honor forced document delivery for outbound image, GIF, and video media so `forceDocument`/`asDocument` sends preserve original media bytes instead of using compressed media payloads. (#79272) Thanks @itsuzef. - WhatsApp: name outbound document attachments from their MIME type when no filename is provided, so PDF and CSV sends arrive as `file.pdf` and `file.csv` instead of an extensionless `file`. Thanks @mcaxtr. - Process/diagnostics: report active lane blockers in lane wait warnings so `queueAhead=0` no longer hides commands waiting behind active work. Fixes #82791. (#82792) Thanks @galiniliev. +- Process/diagnostics: stop counting the active processing turn as queued backlog in liveness warnings so transient max-only event-loop spikes do not surface as gateway warnings. ## 2026.5.17 diff --git a/src/logging/diagnostic-session-state.ts b/src/logging/diagnostic-session-state.ts index 3679f4e7fd7..1b2d03d935e 100644 --- a/src/logging/diagnostic-session-state.ts +++ b/src/logging/diagnostic-session-state.ts @@ -9,6 +9,7 @@ export type SessionState = { lastLongRunningWarnAgeMs?: number; state: SessionStateValue; queueDepth: number; + activeQueuedTurn?: boolean; toolCallHistory?: ToolCallRecord[]; toolLoopWarningBuckets?: Map; commandPollCounts?: Map; @@ -104,6 +105,7 @@ function mergeSessionState(target: SessionState, source: SessionState): void { target.generation = Math.max(target.generation ?? 0, source.generation ?? 0); target.lastActivity = Math.max(target.lastActivity, source.lastActivity); target.queueDepth += source.queueDepth; + target.activeQueuedTurn ||= source.activeQueuedTurn; target.lastStuckWarnAgeMs = target.lastStuckWarnAgeMs === undefined || source.lastStuckWarnAgeMs === undefined ? undefined diff --git a/src/logging/diagnostic.test.ts b/src/logging/diagnostic.test.ts index b748265e2c0..30b9fae1637 100644 --- a/src/logging/diagnostic.test.ts +++ b/src/logging/diagnostic.test.ts @@ -1280,6 +1280,86 @@ describe("stuck session diagnostics threshold", () => { ); }); + it("does not count the active processing message as queued liveness backlog", () => { + const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined); + + startDiagnosticHeartbeat( + { + diagnostics: { + enabled: true, + }, + }, + { + emitMemorySample: createEmitMemorySampleMock(), + sampleLiveness: () => ({ + reasons: ["event_loop_delay"], + intervalMs: 30_000, + eventLoopDelayP99Ms: 53.6, + eventLoopDelayMaxMs: 2_761.9, + eventLoopUtilization: 0.785, + cpuCoreRatio: 0.378, + }), + }, + ); + + logMessageQueued({ sessionId: "s1", sessionKey: "main", source: "discord" }); + logSessionStateChange({ sessionId: "s1", sessionKey: "main", state: "processing" }); + vi.advanceTimersByTime(30_000); + + expectNoLoggerMessageContaining(warnSpy, "liveness warning:"); + requireMatchingRecord( + getDiagnosticStabilitySnapshot({ limit: 10 }).events, + { + type: "diagnostic.liveness.warning", + level: "info", + active: 1, + waiting: 0, + queued: 0, + }, + "active processing liveness stability event", + ); + }); + + it("counts messages queued behind already active work as liveness backlog", () => { + const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined); + + startDiagnosticHeartbeat( + { + diagnostics: { + enabled: true, + }, + }, + { + emitMemorySample: createEmitMemorySampleMock(), + sampleLiveness: () => ({ + reasons: ["event_loop_delay"], + intervalMs: 30_000, + eventLoopDelayP99Ms: 53.6, + eventLoopDelayMaxMs: 2_761.9, + eventLoopUtilization: 0.785, + cpuCoreRatio: 0.378, + }), + }, + ); + + logSessionStateChange({ sessionId: "s1", sessionKey: "main", state: "processing" }); + logMessageQueued({ sessionId: "s1", sessionKey: "main", source: "discord" }); + vi.advanceTimersByTime(30_000); + + expectLoggerMessageContaining(warnSpy, "liveness warning:"); + requireMatchingRecord( + getDiagnosticStabilitySnapshot({ limit: 10 }).events, + { + type: "diagnostic.liveness.warning", + level: "warning", + active: 1, + waiting: 0, + queued: 1, + }, + "queued backlog liveness stability event", + ); + }); + it("does not let idle liveness samples suppress later active-work warnings", () => { const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined); diff --git a/src/logging/diagnostic.ts b/src/logging/diagnostic.ts index 6ebd98a5ea7..8a25c856ace 100644 --- a/src/logging/diagnostic.ts +++ b/src/logging/diagnostic.ts @@ -180,6 +180,7 @@ function formatDiagnosticWorkLabel( sessionKey?: string; state: SessionStateValue; queueDepth: number; + activeQueuedTurn?: boolean; lastActivity: number; }, now: number, @@ -217,10 +218,14 @@ function getDiagnosticWorkSnapshot(now = Date.now()): DiagnosticWorkSnapshot { waitingCount += 1; pushLimitedDiagnosticLabel(waitingLabels, formatDiagnosticWorkLabel(state, now)); } - if (state.queueDepth > 0) { + const queuedBacklog = Math.max( + 0, + state.queueDepth - (state.state === "processing" && state.activeQueuedTurn ? 1 : 0), + ); + if (queuedBacklog > 0) { pushLimitedDiagnosticLabel(queuedLabels, formatDiagnosticWorkLabel(state, now)); } - queuedCount += state.queueDepth; + queuedCount += queuedBacklog; } return { activeCount, waitingCount, queuedCount, activeLabels, waitingLabels, queuedLabels }; @@ -686,8 +691,12 @@ export function logSessionStateChange( state.generation = (state.generation ?? 0) + 1; state.lastStuckWarnAgeMs = undefined; state.lastLongRunningWarnAgeMs = undefined; + if (params.state === "processing" && prevState !== "processing") { + state.activeQueuedTurn = state.queueDepth > 0; + } if (params.state === "idle") { state.queueDepth = Math.max(0, state.queueDepth - 1); + state.activeQueuedTurn = false; } if (!isProbeSession && diag.isEnabled("debug")) { diag.debug(