diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a4254538bb..e4420d710c5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -51,6 +51,7 @@ Docs: https://docs.openclaw.ai - Memory/status: keep plain `openclaw memory status` and `openclaw memory status --json` on the cheap read-only path by reserving vector and embedding provider probes for `--deep` or `--index`. Fixes #76769. Thanks @daruire. - Telegram: suppress stale same-session replies when a newer accepted message arrives before an older in-flight Telegram dispatch finalizes. Fixes #76642. Thanks @chinar-amrutkar. - Gateway/diagnostics: throttle repeated long-running active-work session warnings so healthy cron or subagent runs no longer print the same `recovery=none` line every heartbeat. +- Gateway/diagnostics: keep non-blocking active-work and transient event-loop max-spike liveness diagnostics out of the default gateway console while preserving structured diagnostic events and warnings for queued, stalled, and recovery-eligible work. - Slack: collapse routine Socket Mode pong-timeout reconnects into one OpenClaw reconnect line and suppress the duplicate Slack SDK pong warning. - Gateway/diagnostics: abort-drain embedded runs after an extended no-progress stall so a single dead session no longer leaves queued Discord/channel turns blocked behind repeated `recovery=none` liveness warnings. - Plugins/ClawHub: accept the live artifact resolver `kind`/`sha256` field names alongside the typed `artifactKind`/`artifactSha256` form so `clawhub:` installs of npm-pack and legacy ZIP packages no longer miss downloadable artifacts. Thanks @romneyda. diff --git a/docs/gateway/diagnostics.md b/docs/gateway/diagnostics.md index 8136483c7bc..bf3b0c600d4 100644 --- a/docs/gateway/diagnostics.md +++ b/docs/gateway/diagnostics.md @@ -118,9 +118,11 @@ The same diagnostic heartbeat records liveness samples when the Gateway keeps running but the Node.js event loop or CPU looks saturated. These `diagnostic.liveness.warning` events include event-loop delay, event-loop utilization, CPU-core ratio, and active/waiting/queued session counts. Idle -samples stay in telemetry at `info` level; they are only logged as Gateway -warnings when diagnostic work is active, waiting, or queued. They do not -restart the Gateway by themselves. +samples stay in telemetry at `info` level. Liveness samples become Gateway +warnings only when work is waiting or queued, or when active work overlaps with +sustained event-loop delay. Transient max-delay spikes during otherwise healthy +background work stay in debug logs. They do not restart the Gateway by +themselves. Inspect the live recorder: diff --git a/src/logging/diagnostic.test.ts b/src/logging/diagnostic.test.ts index fb5aa6ff2cb..f67830e321f 100644 --- a/src/logging/diagnostic.test.ts +++ b/src/logging/diagnostic.test.ts @@ -409,6 +409,7 @@ describe("stuck session diagnostics threshold", () => { it("reports long-running sessions separately when active work is making progress", () => { const events: DiagnosticEventPayload[] = []; const recoverStuckSession = vi.fn(); + const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined); const unsubscribe = onDiagnosticEvent((event) => { events.push(event); }); @@ -439,6 +440,7 @@ describe("stuck session diagnostics threshold", () => { reason: "active_work", activeWorkKind: "embedded_run", }); + expect(warnSpy).not.toHaveBeenCalledWith(expect.stringContaining("long-running session:")); expect(recoverStuckSession).not.toHaveBeenCalled(); }); @@ -671,6 +673,41 @@ describe("stuck session diagnostics threshold", () => { ); }); + it("keeps transient event-loop max spikes debug-only when only background work is active", () => { + const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined); + + startDiagnosticHeartbeat( + { + diagnostics: { + enabled: true, + }, + }, + { + emitMemorySample: createEmitMemorySampleMock(), + sampleLiveness: () => ({ + reasons: ["event_loop_delay"], + intervalMs: 30_000, + eventLoopDelayP99Ms: 21, + eventLoopDelayMaxMs: 1_500, + }), + }, + ); + + logSessionStateChange({ sessionId: "s1", sessionKey: "main", state: "processing" }); + vi.advanceTimersByTime(30_000); + + expect(warnSpy).not.toHaveBeenCalledWith(expect.stringContaining("liveness warning:")); + expect(getDiagnosticStabilitySnapshot({ limit: 10 }).events).toContainEqual( + expect.objectContaining({ + type: "diagnostic.liveness.warning", + level: "warning", + active: 1, + waiting: 0, + queued: 0, + }), + ); + }); + it("does not let idle liveness samples suppress later active-work warnings", () => { const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined); diff --git a/src/logging/diagnostic.ts b/src/logging/diagnostic.ts index e45fc303708..63ed3a9d931 100644 --- a/src/logging/diagnostic.ts +++ b/src/logging/diagnostic.ts @@ -313,7 +313,10 @@ function emitDiagnosticLivenessWarning( )} cpuCoreRatio=${formatOptionalDiagnosticMetric(sample.cpuCoreRatio)} active=${ work.activeCount } waiting=${work.waitingCount} queued=${work.queuedCount}`; - if (hasOpenDiagnosticWork(work)) { + const hasBlockingWork = work.waitingCount > 0 || work.queuedCount > 0; + const hasSustainedEventLoopDelay = + (sample.eventLoopDelayP99Ms ?? 0) >= DEFAULT_LIVENESS_EVENT_LOOP_DELAY_WARN_MS; + if (hasBlockingWork || (hasOpenDiagnosticWork(work) && hasSustainedEventLoopDelay)) { diag.warn(message); } else { diag.debug(message); @@ -657,15 +660,18 @@ export function logSessionAttention( : classification.eventType === "session.stalled" ? "stalled session" : "long-running session"; - diag.warn( - `${label}: sessionId=${state.sessionId ?? "unknown"} sessionKey=${ - state.sessionKey ?? "unknown" - } state=${params.state} age=${Math.round(params.ageMs / 1000)}s queueDepth=${ - state.queueDepth - } reason=${classification.reason} classification=${classification.classification}${ - classification.activeWorkKind ? ` activeWorkKind=${classification.activeWorkKind}` : "" - } recovery=${recoveryEligible ? "checking" : "none"}`, - ); + const message = `${label}: sessionId=${state.sessionId ?? "unknown"} sessionKey=${ + state.sessionKey ?? "unknown" + } state=${params.state} age=${Math.round(params.ageMs / 1000)}s queueDepth=${ + state.queueDepth + } reason=${classification.reason} classification=${classification.classification}${ + classification.activeWorkKind ? ` activeWorkKind=${classification.activeWorkKind}` : "" + } recovery=${recoveryEligible ? "checking" : "none"}`; + if (classification.eventType === "session.long_running" && state.queueDepth <= 0) { + diag.debug(message); + } else { + diag.warn(message); + } const baseEvent = { sessionId: state.sessionId, sessionKey: state.sessionKey,