mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 06:50:43 +00:00
fix: quiet nonblocking diagnostic logs
This commit is contained in:
@@ -51,6 +51,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Memory/status: keep plain `openclaw memory status` and `openclaw memory status --json` on the cheap read-only path by reserving vector and embedding provider probes for `--deep` or `--index`. Fixes #76769. Thanks @daruire.
|
||||
- Telegram: suppress stale same-session replies when a newer accepted message arrives before an older in-flight Telegram dispatch finalizes. Fixes #76642. Thanks @chinar-amrutkar.
|
||||
- Gateway/diagnostics: throttle repeated long-running active-work session warnings so healthy cron or subagent runs no longer print the same `recovery=none` line every heartbeat.
|
||||
- Gateway/diagnostics: keep non-blocking active-work and transient event-loop max-spike liveness diagnostics out of the default gateway console while preserving structured diagnostic events and warnings for queued, stalled, and recovery-eligible work.
|
||||
- Slack: collapse routine Socket Mode pong-timeout reconnects into one OpenClaw reconnect line and suppress the duplicate Slack SDK pong warning.
|
||||
- Gateway/diagnostics: abort-drain embedded runs after an extended no-progress stall so a single dead session no longer leaves queued Discord/channel turns blocked behind repeated `recovery=none` liveness warnings.
|
||||
- Plugins/ClawHub: accept the live artifact resolver `kind`/`sha256` field names alongside the typed `artifactKind`/`artifactSha256` form so `clawhub:` installs of npm-pack and legacy ZIP packages no longer miss downloadable artifacts. Thanks @romneyda.
|
||||
|
||||
@@ -118,9 +118,11 @@ The same diagnostic heartbeat records liveness samples when the Gateway keeps
|
||||
running but the Node.js event loop or CPU looks saturated. These
|
||||
`diagnostic.liveness.warning` events include event-loop delay, event-loop
|
||||
utilization, CPU-core ratio, and active/waiting/queued session counts. Idle
|
||||
samples stay in telemetry at `info` level; they are only logged as Gateway
|
||||
warnings when diagnostic work is active, waiting, or queued. They do not
|
||||
restart the Gateway by themselves.
|
||||
samples stay in telemetry at `info` level. Liveness samples become Gateway
|
||||
warnings only when work is waiting or queued, or when active work overlaps with
|
||||
sustained event-loop delay. Transient max-delay spikes during otherwise healthy
|
||||
background work stay in debug logs. They do not restart the Gateway by
|
||||
themselves.
|
||||
|
||||
Inspect the live recorder:
|
||||
|
||||
|
||||
@@ -409,6 +409,7 @@ describe("stuck session diagnostics threshold", () => {
|
||||
it("reports long-running sessions separately when active work is making progress", () => {
|
||||
const events: DiagnosticEventPayload[] = [];
|
||||
const recoverStuckSession = vi.fn();
|
||||
const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined);
|
||||
const unsubscribe = onDiagnosticEvent((event) => {
|
||||
events.push(event);
|
||||
});
|
||||
@@ -439,6 +440,7 @@ describe("stuck session diagnostics threshold", () => {
|
||||
reason: "active_work",
|
||||
activeWorkKind: "embedded_run",
|
||||
});
|
||||
expect(warnSpy).not.toHaveBeenCalledWith(expect.stringContaining("long-running session:"));
|
||||
expect(recoverStuckSession).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
@@ -671,6 +673,41 @@ describe("stuck session diagnostics threshold", () => {
|
||||
);
|
||||
});
|
||||
|
||||
it("keeps transient event-loop max spikes debug-only when only background work is active", () => {
|
||||
const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined);
|
||||
|
||||
startDiagnosticHeartbeat(
|
||||
{
|
||||
diagnostics: {
|
||||
enabled: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
emitMemorySample: createEmitMemorySampleMock(),
|
||||
sampleLiveness: () => ({
|
||||
reasons: ["event_loop_delay"],
|
||||
intervalMs: 30_000,
|
||||
eventLoopDelayP99Ms: 21,
|
||||
eventLoopDelayMaxMs: 1_500,
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
logSessionStateChange({ sessionId: "s1", sessionKey: "main", state: "processing" });
|
||||
vi.advanceTimersByTime(30_000);
|
||||
|
||||
expect(warnSpy).not.toHaveBeenCalledWith(expect.stringContaining("liveness warning:"));
|
||||
expect(getDiagnosticStabilitySnapshot({ limit: 10 }).events).toContainEqual(
|
||||
expect.objectContaining({
|
||||
type: "diagnostic.liveness.warning",
|
||||
level: "warning",
|
||||
active: 1,
|
||||
waiting: 0,
|
||||
queued: 0,
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("does not let idle liveness samples suppress later active-work warnings", () => {
|
||||
const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined);
|
||||
|
||||
|
||||
@@ -313,7 +313,10 @@ function emitDiagnosticLivenessWarning(
|
||||
)} cpuCoreRatio=${formatOptionalDiagnosticMetric(sample.cpuCoreRatio)} active=${
|
||||
work.activeCount
|
||||
} waiting=${work.waitingCount} queued=${work.queuedCount}`;
|
||||
if (hasOpenDiagnosticWork(work)) {
|
||||
const hasBlockingWork = work.waitingCount > 0 || work.queuedCount > 0;
|
||||
const hasSustainedEventLoopDelay =
|
||||
(sample.eventLoopDelayP99Ms ?? 0) >= DEFAULT_LIVENESS_EVENT_LOOP_DELAY_WARN_MS;
|
||||
if (hasBlockingWork || (hasOpenDiagnosticWork(work) && hasSustainedEventLoopDelay)) {
|
||||
diag.warn(message);
|
||||
} else {
|
||||
diag.debug(message);
|
||||
@@ -657,15 +660,18 @@ export function logSessionAttention(
|
||||
: classification.eventType === "session.stalled"
|
||||
? "stalled session"
|
||||
: "long-running session";
|
||||
diag.warn(
|
||||
`${label}: sessionId=${state.sessionId ?? "unknown"} sessionKey=${
|
||||
state.sessionKey ?? "unknown"
|
||||
} state=${params.state} age=${Math.round(params.ageMs / 1000)}s queueDepth=${
|
||||
state.queueDepth
|
||||
} reason=${classification.reason} classification=${classification.classification}${
|
||||
classification.activeWorkKind ? ` activeWorkKind=${classification.activeWorkKind}` : ""
|
||||
} recovery=${recoveryEligible ? "checking" : "none"}`,
|
||||
);
|
||||
const message = `${label}: sessionId=${state.sessionId ?? "unknown"} sessionKey=${
|
||||
state.sessionKey ?? "unknown"
|
||||
} state=${params.state} age=${Math.round(params.ageMs / 1000)}s queueDepth=${
|
||||
state.queueDepth
|
||||
} reason=${classification.reason} classification=${classification.classification}${
|
||||
classification.activeWorkKind ? ` activeWorkKind=${classification.activeWorkKind}` : ""
|
||||
} recovery=${recoveryEligible ? "checking" : "none"}`;
|
||||
if (classification.eventType === "session.long_running" && state.queueDepth <= 0) {
|
||||
diag.debug(message);
|
||||
} else {
|
||||
diag.warn(message);
|
||||
}
|
||||
const baseEvent = {
|
||||
sessionId: state.sessionId,
|
||||
sessionKey: state.sessionKey,
|
||||
|
||||
Reference in New Issue
Block a user