fix: throttle long-running diagnostic warnings

This commit is contained in:
Peter Steinberger
2026-05-03 18:39:50 +01:00
parent 0d97fa3f3a
commit a989d248e9
4 changed files with 65 additions and 1 deletions

View File

@@ -39,6 +39,7 @@ Docs: https://docs.openclaw.ai
- CLI/doctor: trust a ready gateway memory probe when CLI-side active memory backend resolution is unavailable, preventing false "No active memory plugin is registered" warnings for healthy runtime setups. Fixes #76792. Thanks @som-686.
- Memory/status: keep plain `openclaw memory status` and `openclaw memory status --json` on the cheap read-only path by reserving vector and embedding provider probes for `--deep` or `--index`. Fixes #76769. Thanks @daruire.
- Telegram: suppress stale same-session replies when a newer accepted message arrives before an older in-flight Telegram dispatch finalizes. Fixes #76642. Thanks @chinar-amrutkar.
- Gateway/diagnostics: throttle repeated long-running active-work session warnings so healthy cron or subagent runs no longer print the same `recovery=none` line every heartbeat.
- Slack: collapse routine Socket Mode pong-timeout reconnects into one OpenClaw reconnect line and suppress the duplicate Slack SDK pong warning.
- Gateway/diagnostics: abort-drain embedded runs after an extended no-progress stall so a single dead session no longer leaves queued Discord/channel turns blocked behind repeated `recovery=none` liveness warnings.
- Plugins/ClawHub: accept the live artifact resolver `kind`/`sha256` field names alongside the typed `artifactKind`/`artifactSha256` form so `clawhub:` installs of npm-pack and legacy ZIP packages no longer miss downloadable artifacts. Thanks @romneyda.

View File

@@ -5,6 +5,7 @@ export type SessionState = {
sessionKey?: string;
lastActivity: number;
lastStuckWarnAgeMs?: number;
lastLongRunningWarnAgeMs?: number;
state: SessionStateValue;
queueDepth: number;
toolCallHistory?: ToolCallRecord[];
@@ -105,6 +106,10 @@ function mergeSessionState(target: SessionState, source: SessionState): void {
target.lastStuckWarnAgeMs === undefined || source.lastStuckWarnAgeMs === undefined
? undefined
: Math.max(target.lastStuckWarnAgeMs, source.lastStuckWarnAgeMs);
target.lastLongRunningWarnAgeMs =
target.lastLongRunningWarnAgeMs === undefined || source.lastLongRunningWarnAgeMs === undefined
? undefined
: Math.max(target.lastLongRunningWarnAgeMs, source.lastLongRunningWarnAgeMs);
if (source.toolCallHistory?.length) {
target.toolCallHistory = [...(target.toolCallHistory ?? []), ...source.toolCallHistory];
}

View File

@@ -320,7 +320,7 @@ describe("stuck session diagnostics threshold", () => {
expect(events).toHaveLength(1);
expect(recoverStuckSession).toHaveBeenCalledTimes(1);
vi.advanceTimersByTime(30_000);
vi.advanceTimersByTime(31_000);
} finally {
unsubscribe();
}
@@ -442,6 +442,48 @@ describe("stuck session diagnostics threshold", () => {
expect(recoverStuckSession).not.toHaveBeenCalled();
});
it("throttles repeated long-running active-work warnings", () => {
  // Every diagnostic event observed while the heartbeat runs is captured here.
  const captured: DiagnosticEventPayload[] = [];
  const recoverStuckSession = vi.fn();
  // Narrows the capture to the one warning kind this test is about.
  const longRunningWarnings = () =>
    captured.filter((event) => event.type === "session.long_running");
  const stopListening = onDiagnosticEvent((event) => {
    captured.push(event);
  });

  try {
    startDiagnosticHeartbeat(
      { diagnostics: { enabled: true, stuckSessionWarnMs: 30_000 } },
      { recoverStuckSession },
    );

    // Move the session into "processing" and let 45s pass, which exceeds the
    // configured 30s stuckSessionWarnMs.
    logSessionStateChange({ sessionId: "s1", sessionKey: "main", state: "processing" });
    vi.advanceTimersByTime(45_000);

    // Once an embedded run is marked as started, a further 16s is expected to
    // produce exactly one long-running warning.
    markDiagnosticEmbeddedRunStarted({ sessionId: "s1", sessionKey: "main" });
    vi.advanceTimersByTime(16_000);
    expect(longRunningWarnings()).toHaveLength(1);

    // Another 28s, then a progress event, then 2s more: the warning must not
    // have been emitted a second time.
    vi.advanceTimersByTime(28_000);
    emitDiagnosticEvent({
      type: "run.progress",
      sessionId: "s1",
      sessionKey: "main",
      reason: "stream",
    });
    vi.advanceTimersByTime(2_000);
    expect(longRunningWarnings()).toHaveLength(1);
  } finally {
    stopListening();
  }

  // Final state after unsubscribing: still a single warning, and the recovery
  // hook was never invoked.
  expect(longRunningWarnings()).toHaveLength(1);
  expect(recoverStuckSession).not.toHaveBeenCalled();
});
it("keeps queued sessions non-recoverable while active work is making progress", () => {
const events: DiagnosticEventPayload[] = [];
const recoverStuckSession = vi.fn();

View File

@@ -461,6 +461,7 @@ export function logMessageQueued(params: {
state.queueDepth += 1;
state.lastActivity = Date.now();
state.lastStuckWarnAgeMs = undefined;
state.lastLongRunningWarnAgeMs = undefined;
if (diag.isEnabled("debug")) {
diag.debug(
`message queued: sessionId=${state.sessionId ?? "unknown"} sessionKey=${
@@ -540,6 +541,7 @@ export function logSessionStateChange(
state.state = params.state;
state.lastActivity = Date.now();
state.lastStuckWarnAgeMs = undefined;
state.lastLongRunningWarnAgeMs = undefined;
if (params.state === "idle") {
state.queueDepth = Math.max(0, state.queueDepth - 1);
}
@@ -571,6 +573,7 @@ export function markDiagnosticSessionProgress(params: SessionRef) {
const state = getDiagnosticSessionState(params);
state.lastActivity = Date.now();
state.lastStuckWarnAgeMs = undefined;
state.lastLongRunningWarnAgeMs = undefined;
markActivity();
}
@@ -635,6 +638,19 @@ export function logSessionAttention(
}
state.lastStuckWarnAgeMs = params.ageMs;
}
if (classification.eventType === "session.long_running") {
const nextWarnAgeMs =
state.lastLongRunningWarnAgeMs === undefined
? params.thresholdMs
: Math.max(
state.lastLongRunningWarnAgeMs + params.thresholdMs,
state.lastLongRunningWarnAgeMs * 2,
);
if (params.ageMs < nextWarnAgeMs) {
return undefined;
}
state.lastLongRunningWarnAgeMs = params.ageMs;
}
const label =
classification.eventType === "session.stuck"
? "stuck session"