fix: quiet non-blocking diagnostic logs

Peter Steinberger
2026-05-03 19:24:23 +01:00
parent 01e2755dc3
commit 2b7e8dacd3
4 changed files with 59 additions and 13 deletions

View File

@@ -51,6 +51,7 @@ Docs: https://docs.openclaw.ai
 - Memory/status: keep plain `openclaw memory status` and `openclaw memory status --json` on the cheap read-only path by reserving vector and embedding provider probes for `--deep` or `--index`. Fixes #76769. Thanks @daruire.
 - Telegram: suppress stale same-session replies when a newer accepted message arrives before an older in-flight Telegram dispatch finalizes. Fixes #76642. Thanks @chinar-amrutkar.
 - Gateway/diagnostics: throttle repeated long-running active-work session warnings so healthy cron or subagent runs no longer print the same `recovery=none` line every heartbeat.
+- Gateway/diagnostics: keep non-blocking active-work and transient event-loop max-spike liveness diagnostics out of the default gateway console while preserving structured diagnostic events and warnings for queued, stalled, and recovery-eligible work.
 - Slack: collapse routine Socket Mode pong-timeout reconnects into one OpenClaw reconnect line and suppress the duplicate Slack SDK pong warning.
 - Gateway/diagnostics: abort-drain embedded runs after an extended no-progress stall so a single dead session no longer leaves queued Discord/channel turns blocked behind repeated `recovery=none` liveness warnings.
 - Plugins/ClawHub: accept the live artifact resolver `kind`/`sha256` field names alongside the typed `artifactKind`/`artifactSha256` form so `clawhub:` installs of npm-pack and legacy ZIP packages no longer miss downloadable artifacts. Thanks @romneyda.

View File

@@ -118,9 +118,11 @@ The same diagnostic heartbeat records liveness samples when the Gateway keeps
 running but the Node.js event loop or CPU looks saturated. These
 `diagnostic.liveness.warning` events include event-loop delay, event-loop
 utilization, CPU-core ratio, and active/waiting/queued session counts. Idle
-samples stay in telemetry at `info` level; they are only logged as Gateway
-warnings when diagnostic work is active, waiting, or queued. They do not
-restart the Gateway by themselves.
+samples stay in telemetry at `info` level. Liveness samples become Gateway
+warnings only when work is waiting or queued, or when active work overlaps with
+sustained event-loop delay. Transient max-delay spikes during otherwise healthy
+background work stay in debug logs. They do not restart the Gateway by
+themselves.
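
For reference, a liveness event recorded under these rules might look like the sketch below. The `type`, `level`, and count fields are the ones asserted by this commit's tests; the delay fields mirror the injected liveness sample, and any payload shape beyond that is an assumption.

```ts
// Hypothetical diagnostic.liveness.warning payload. Field names are taken from
// the docs and tests in this commit; values and overall shape are illustrative.
const livenessEvent = {
  type: "diagnostic.liveness.warning",
  level: "warning", // preserved in the structured event stream...
  eventLoopDelayP99Ms: 21, // ...even when the console line is debug-only
  eventLoopDelayMaxMs: 1_500, // transient max spike during background work
  active: 1,
  waiting: 0,
  queued: 0,
};
```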
Inspect the live recorder:

View File

@@ -409,6 +409,7 @@ describe("stuck session diagnostics threshold", () => {
it("reports long-running sessions separately when active work is making progress", () => {
const events: DiagnosticEventPayload[] = [];
const recoverStuckSession = vi.fn();
const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined);
const unsubscribe = onDiagnosticEvent((event) => {
events.push(event);
});
@@ -439,6 +440,7 @@ describe("stuck session diagnostics threshold", () => {
reason: "active_work",
activeWorkKind: "embedded_run",
});
expect(warnSpy).not.toHaveBeenCalledWith(expect.stringContaining("long-running session:"));
expect(recoverStuckSession).not.toHaveBeenCalled();
});
@@ -671,6 +673,41 @@ describe("stuck session diagnostics threshold", () => {
     );
   });
 
+  it("keeps transient event-loop max spikes debug-only when only background work is active", () => {
+    const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined);
+    startDiagnosticHeartbeat(
+      {
+        diagnostics: {
+          enabled: true,
+        },
+      },
+      {
+        emitMemorySample: createEmitMemorySampleMock(),
+        sampleLiveness: () => ({
+          reasons: ["event_loop_delay"],
+          intervalMs: 30_000,
+          eventLoopDelayP99Ms: 21,
+          eventLoopDelayMaxMs: 1_500,
+        }),
+      },
+    );
+    logSessionStateChange({ sessionId: "s1", sessionKey: "main", state: "processing" });
+    vi.advanceTimersByTime(30_000);
+    expect(warnSpy).not.toHaveBeenCalledWith(expect.stringContaining("liveness warning:"));
+    expect(getDiagnosticStabilitySnapshot({ limit: 10 }).events).toContainEqual(
+      expect.objectContaining({
+        type: "diagnostic.liveness.warning",
+        level: "warning",
+        active: 1,
+        waiting: 0,
+        queued: 0,
+      }),
+    );
+  });
 
   it("does not let idle liveness samples suppress later active-work warnings", () => {
     const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined);

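The tests above drive a 30-second heartbeat with vitest fake timers, so `vi.advanceTimersByTime(30_000)` fires exactly one sampling tick. The setup wiring sits outside this hunk; the harness below is a sketch of the assumed plumbing, not the file's actual code.

```ts
import { afterEach, beforeEach, vi } from "vitest";

beforeEach(() => {
  // Fake timers let a test advance the heartbeat interval synchronously.
  vi.useFakeTimers();
});

afterEach(() => {
  // Drop the diagnosticLogger spies and return to real timers between tests.
  vi.restoreAllMocks();
  vi.useRealTimers();
});
```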
View File

@@ -313,7 +313,10 @@ function emitDiagnosticLivenessWarning(
   )} cpuCoreRatio=${formatOptionalDiagnosticMetric(sample.cpuCoreRatio)} active=${
     work.activeCount
   } waiting=${work.waitingCount} queued=${work.queuedCount}`;
-  if (hasOpenDiagnosticWork(work)) {
+  const hasBlockingWork = work.waitingCount > 0 || work.queuedCount > 0;
+  const hasSustainedEventLoopDelay =
+    (sample.eventLoopDelayP99Ms ?? 0) >= DEFAULT_LIVENESS_EVENT_LOOP_DELAY_WARN_MS;
+  if (hasBlockingWork || (hasOpenDiagnosticWork(work) && hasSustainedEventLoopDelay)) {
     diag.warn(message);
   } else {
     diag.debug(message);
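Restated as a standalone predicate, the gate above routes samples as sketched below. The 200 ms threshold and the expansion of `hasOpenDiagnosticWork` are assumptions, since the real constant and helper are defined outside this hunk; only the boolean shape mirrors the diff.

```ts
// Sketch of the warn-vs-debug routing. The threshold value and the "open work"
// expansion are assumed; the combination logic follows the change above.
const EVENT_LOOP_DELAY_WARN_MS = 200;

interface WorkCounts {
  activeCount: number;
  waitingCount: number;
  queuedCount: number;
}

function shouldWarnOnLiveness(work: WorkCounts, eventLoopDelayP99Ms?: number): boolean {
  const hasBlockingWork = work.waitingCount > 0 || work.queuedCount > 0;
  const hasOpenWork = hasBlockingWork || work.activeCount > 0; // assumed hasOpenDiagnosticWork
  const hasSustainedDelay = (eventLoopDelayP99Ms ?? 0) >= EVENT_LOOP_DELAY_WARN_MS;
  return hasBlockingWork || (hasOpenWork && hasSustainedDelay);
}

// Mirrors the test above: active-only work with p99 at 21 ms stays debug-only,
// even when eventLoopDelayMaxMs spikes; queued or waiting work always warns.
shouldWarnOnLiveness({ activeCount: 1, waitingCount: 0, queuedCount: 0 }, 21); // false
shouldWarnOnLiveness({ activeCount: 0, waitingCount: 0, queuedCount: 2 }, 0); // true
```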
@@ -657,15 +660,18 @@ export function logSessionAttention(
     : classification.eventType === "session.stalled"
       ? "stalled session"
       : "long-running session";
-  diag.warn(
-    `${label}: sessionId=${state.sessionId ?? "unknown"} sessionKey=${
-      state.sessionKey ?? "unknown"
-    } state=${params.state} age=${Math.round(params.ageMs / 1000)}s queueDepth=${
-      state.queueDepth
-    } reason=${classification.reason} classification=${classification.classification}${
-      classification.activeWorkKind ? ` activeWorkKind=${classification.activeWorkKind}` : ""
-    } recovery=${recoveryEligible ? "checking" : "none"}`,
-  );
+  const message = `${label}: sessionId=${state.sessionId ?? "unknown"} sessionKey=${
+    state.sessionKey ?? "unknown"
+  } state=${params.state} age=${Math.round(params.ageMs / 1000)}s queueDepth=${
+    state.queueDepth
+  } reason=${classification.reason} classification=${classification.classification}${
+    classification.activeWorkKind ? ` activeWorkKind=${classification.activeWorkKind}` : ""
+  } recovery=${recoveryEligible ? "checking" : "none"}`;
+  if (classification.eventType === "session.long_running" && state.queueDepth <= 0) {
+    diag.debug(message);
+  } else {
+    diag.warn(message);
+  }
   const baseEvent = {
     sessionId: state.sessionId,
     sessionKey: state.sessionKey,