From a989d248e9374c18a9cc5265c824be36793bfd76 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 3 May 2026 18:39:50 +0100 Subject: [PATCH] fix: throttle long-running diagnostic warnings --- CHANGELOG.md | 1 + src/logging/diagnostic-session-state.ts | 5 +++ src/logging/diagnostic.test.ts | 44 ++++++++++++++++++++++++- src/logging/diagnostic.ts | 16 +++++++++ 4 files changed, 65 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d1e7b81b781..84ef8610e67 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,7 @@ Docs: https://docs.openclaw.ai - CLI/doctor: trust a ready gateway memory probe when CLI-side active memory backend resolution is unavailable, preventing false "No active memory plugin is registered" warnings for healthy runtime setups. Fixes #76792. Thanks @som-686. - Memory/status: keep plain `openclaw memory status` and `openclaw memory status --json` on the cheap read-only path by reserving vector and embedding provider probes for `--deep` or `--index`. Fixes #76769. Thanks @daruire. - Telegram: suppress stale same-session replies when a newer accepted message arrives before an older in-flight Telegram dispatch finalizes. Fixes #76642. Thanks @chinar-amrutkar. +- Gateway/diagnostics: throttle repeated long-running active-work session warnings so healthy cron or subagent runs no longer print the same `recovery=none` line every heartbeat. - Slack: collapse routine Socket Mode pong-timeout reconnects into one OpenClaw reconnect line and suppress the duplicate Slack SDK pong warning. - Gateway/diagnostics: abort-drain embedded runs after an extended no-progress stall so a single dead session no longer leaves queued Discord/channel turns blocked behind repeated `recovery=none` liveness warnings. - Plugins/ClawHub: accept the live artifact resolver `kind`/`sha256` field names alongside the typed `artifactKind`/`artifactSha256` form so `clawhub:` installs of npm-pack and legacy ZIP packages no longer miss downloadable artifacts. Thanks @romneyda. diff --git a/src/logging/diagnostic-session-state.ts b/src/logging/diagnostic-session-state.ts index edc2e98d14c..964915bccaa 100644 --- a/src/logging/diagnostic-session-state.ts +++ b/src/logging/diagnostic-session-state.ts @@ -5,6 +5,7 @@ export type SessionState = { sessionKey?: string; lastActivity: number; lastStuckWarnAgeMs?: number; + lastLongRunningWarnAgeMs?: number; state: SessionStateValue; queueDepth: number; toolCallHistory?: ToolCallRecord[]; @@ -105,6 +106,10 @@ function mergeSessionState(target: SessionState, source: SessionState): void { target.lastStuckWarnAgeMs === undefined || source.lastStuckWarnAgeMs === undefined ? undefined : Math.max(target.lastStuckWarnAgeMs, source.lastStuckWarnAgeMs); + target.lastLongRunningWarnAgeMs = + target.lastLongRunningWarnAgeMs === undefined || source.lastLongRunningWarnAgeMs === undefined + ? undefined + : Math.max(target.lastLongRunningWarnAgeMs, source.lastLongRunningWarnAgeMs); if (source.toolCallHistory?.length) { target.toolCallHistory = [...(target.toolCallHistory ?? []), ...source.toolCallHistory]; } diff --git a/src/logging/diagnostic.test.ts b/src/logging/diagnostic.test.ts index fdbd26e0eb5..fb5aa6ff2cb 100644 --- a/src/logging/diagnostic.test.ts +++ b/src/logging/diagnostic.test.ts @@ -320,7 +320,7 @@ describe("stuck session diagnostics threshold", () => { expect(events).toHaveLength(1); expect(recoverStuckSession).toHaveBeenCalledTimes(1); - vi.advanceTimersByTime(30_000); + vi.advanceTimersByTime(31_000); } finally { unsubscribe(); } @@ -442,6 +442,48 @@ describe("stuck session diagnostics threshold", () => { expect(recoverStuckSession).not.toHaveBeenCalled(); }); + it("throttles repeated long-running active-work warnings", () => { + const events: DiagnosticEventPayload[] = []; + const recoverStuckSession = vi.fn(); + const unsubscribe = onDiagnosticEvent((event) => { + events.push(event); + }); + try { + startDiagnosticHeartbeat( + { + diagnostics: { + enabled: true, + stuckSessionWarnMs: 30_000, + }, + }, + { recoverStuckSession }, + ); + logSessionStateChange({ sessionId: "s1", sessionKey: "main", state: "processing" }); + vi.advanceTimersByTime(45_000); + markDiagnosticEmbeddedRunStarted({ sessionId: "s1", sessionKey: "main" }); + vi.advanceTimersByTime(16_000); + + expect(events.filter((event) => event.type === "session.long_running")).toHaveLength(1); + + vi.advanceTimersByTime(28_000); + emitDiagnosticEvent({ + type: "run.progress", + sessionId: "s1", + sessionKey: "main", + reason: "stream", + }); + vi.advanceTimersByTime(2_000); + + expect(events.filter((event) => event.type === "session.long_running")).toHaveLength(1); + } finally { + unsubscribe(); + } + + const longRunningEvents = events.filter((event) => event.type === "session.long_running"); + expect(longRunningEvents).toHaveLength(1); + expect(recoverStuckSession).not.toHaveBeenCalled(); + }); + it("keeps queued sessions non-recoverable while active work is making progress", () => { const events: DiagnosticEventPayload[] = []; const recoverStuckSession = vi.fn(); diff --git a/src/logging/diagnostic.ts b/src/logging/diagnostic.ts index 623326182fd..e45fc303708 100644 --- a/src/logging/diagnostic.ts +++ b/src/logging/diagnostic.ts @@ -461,6 +461,7 @@ export function logMessageQueued(params: { state.queueDepth += 1; state.lastActivity = Date.now(); state.lastStuckWarnAgeMs = undefined; + state.lastLongRunningWarnAgeMs = undefined; if (diag.isEnabled("debug")) { diag.debug( `message queued: sessionId=${state.sessionId ?? "unknown"} sessionKey=${ @@ -540,6 +541,7 @@ export function logSessionStateChange( state.state = params.state; state.lastActivity = Date.now(); state.lastStuckWarnAgeMs = undefined; + state.lastLongRunningWarnAgeMs = undefined; if (params.state === "idle") { state.queueDepth = Math.max(0, state.queueDepth - 1); } @@ -571,6 +573,7 @@ export function markDiagnosticSessionProgress(params: SessionRef) { const state = getDiagnosticSessionState(params); state.lastActivity = Date.now(); state.lastStuckWarnAgeMs = undefined; + state.lastLongRunningWarnAgeMs = undefined; markActivity(); } @@ -635,6 +638,19 @@ export function logSessionAttention( } state.lastStuckWarnAgeMs = params.ageMs; } + if (classification.eventType === "session.long_running") { + const nextWarnAgeMs = + state.lastLongRunningWarnAgeMs === undefined + ? params.thresholdMs + : Math.max( + state.lastLongRunningWarnAgeMs + params.thresholdMs, + state.lastLongRunningWarnAgeMs * 2, + ); + if (params.ageMs < nextWarnAgeMs) { + return undefined; + } + state.lastLongRunningWarnAgeMs = params.ageMs; + } const label = classification.eventType === "session.stuck" ? "stuck session"