fix(logging): avoid false liveness backlog warnings

This commit is contained in:
Peter Steinberger
2026-05-17 07:15:02 +01:00
parent 9a063e38d1
commit 669786595d
4 changed files with 94 additions and 2 deletions

View File

@@ -37,6 +37,7 @@ Docs: https://docs.openclaw.ai
- WhatsApp: honor forced document delivery for outbound image, GIF, and video media so `forceDocument`/`asDocument` sends preserve original media bytes instead of using compressed media payloads. (#79272) Thanks @itsuzef.
- WhatsApp: name outbound document attachments from their MIME type when no filename is provided, so PDF and CSV sends arrive as `file.pdf` and `file.csv` instead of an extensionless `file`. Thanks @mcaxtr.
- Process/diagnostics: report active lane blockers in lane wait warnings so `queueAhead=0` no longer hides commands waiting behind active work. Fixes #82791. (#82792) Thanks @galiniliev.
- Process/diagnostics: stop counting the active processing turn as queued backlog in liveness warnings so transient max-only event-loop spikes do not surface as gateway warnings.
## 2026.5.17

View File

@@ -9,6 +9,7 @@ export type SessionState = {
lastLongRunningWarnAgeMs?: number;
state: SessionStateValue;
queueDepth: number;
activeQueuedTurn?: boolean;
toolCallHistory?: ToolCallRecord[];
toolLoopWarningBuckets?: Map<string, number>;
commandPollCounts?: Map<string, { count: number; lastPollAt: number }>;
@@ -104,6 +105,7 @@ function mergeSessionState(target: SessionState, source: SessionState): void {
target.generation = Math.max(target.generation ?? 0, source.generation ?? 0);
target.lastActivity = Math.max(target.lastActivity, source.lastActivity);
target.queueDepth += source.queueDepth;
target.activeQueuedTurn ||= source.activeQueuedTurn;
target.lastStuckWarnAgeMs =
target.lastStuckWarnAgeMs === undefined || source.lastStuckWarnAgeMs === undefined
? undefined

View File

@@ -1280,6 +1280,86 @@ describe("stuck session diagnostics threshold", () => {
);
});
it("does not count the active processing message as queued liveness backlog", () => {
const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined);
startDiagnosticHeartbeat(
{
diagnostics: {
enabled: true,
},
},
{
emitMemorySample: createEmitMemorySampleMock(),
sampleLiveness: () => ({
reasons: ["event_loop_delay"],
intervalMs: 30_000,
eventLoopDelayP99Ms: 53.6,
eventLoopDelayMaxMs: 2_761.9,
eventLoopUtilization: 0.785,
cpuCoreRatio: 0.378,
}),
},
);
logMessageQueued({ sessionId: "s1", sessionKey: "main", source: "discord" });
logSessionStateChange({ sessionId: "s1", sessionKey: "main", state: "processing" });
vi.advanceTimersByTime(30_000);
expectNoLoggerMessageContaining(warnSpy, "liveness warning:");
requireMatchingRecord(
getDiagnosticStabilitySnapshot({ limit: 10 }).events,
{
type: "diagnostic.liveness.warning",
level: "info",
active: 1,
waiting: 0,
queued: 0,
},
"active processing liveness stability event",
);
});
it("counts messages queued behind already active work as liveness backlog", () => {
const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined);
startDiagnosticHeartbeat(
{
diagnostics: {
enabled: true,
},
},
{
emitMemorySample: createEmitMemorySampleMock(),
sampleLiveness: () => ({
reasons: ["event_loop_delay"],
intervalMs: 30_000,
eventLoopDelayP99Ms: 53.6,
eventLoopDelayMaxMs: 2_761.9,
eventLoopUtilization: 0.785,
cpuCoreRatio: 0.378,
}),
},
);
logSessionStateChange({ sessionId: "s1", sessionKey: "main", state: "processing" });
logMessageQueued({ sessionId: "s1", sessionKey: "main", source: "discord" });
vi.advanceTimersByTime(30_000);
expectLoggerMessageContaining(warnSpy, "liveness warning:");
requireMatchingRecord(
getDiagnosticStabilitySnapshot({ limit: 10 }).events,
{
type: "diagnostic.liveness.warning",
level: "warning",
active: 1,
waiting: 0,
queued: 1,
},
"queued backlog liveness stability event",
);
});
it("does not let idle liveness samples suppress later active-work warnings", () => {
const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined);

View File

@@ -180,6 +180,7 @@ function formatDiagnosticWorkLabel(
sessionKey?: string;
state: SessionStateValue;
queueDepth: number;
activeQueuedTurn?: boolean;
lastActivity: number;
},
now: number,
@@ -217,10 +218,14 @@ function getDiagnosticWorkSnapshot(now = Date.now()): DiagnosticWorkSnapshot {
waitingCount += 1;
pushLimitedDiagnosticLabel(waitingLabels, formatDiagnosticWorkLabel(state, now));
}
if (state.queueDepth > 0) {
const queuedBacklog = Math.max(
0,
state.queueDepth - (state.state === "processing" && state.activeQueuedTurn ? 1 : 0),
);
if (queuedBacklog > 0) {
pushLimitedDiagnosticLabel(queuedLabels, formatDiagnosticWorkLabel(state, now));
}
queuedCount += state.queueDepth;
queuedCount += queuedBacklog;
}
return { activeCount, waitingCount, queuedCount, activeLabels, waitingLabels, queuedLabels };
@@ -686,8 +691,12 @@ export function logSessionStateChange(
state.generation = (state.generation ?? 0) + 1;
state.lastStuckWarnAgeMs = undefined;
state.lastLongRunningWarnAgeMs = undefined;
if (params.state === "processing" && prevState !== "processing") {
state.activeQueuedTurn = state.queueDepth > 0;
}
if (params.state === "idle") {
state.queueDepth = Math.max(0, state.queueDepth - 1);
state.activeQueuedTurn = false;
}
if (!isProbeSession && diag.isEnabled("debug")) {
diag.debug(