fix: reduce idle liveness warning noise

This commit is contained in:
Peter Steinberger
2026-05-02 04:55:59 +01:00
parent 16d8dcbcfc
commit c89da2a606
5 changed files with 117 additions and 23 deletions

View File

@@ -30,6 +30,7 @@ Docs: https://docs.openclaw.ai
- Media: trim serialized JSON suffixes after local `MEDIA:` directive file extensions, so generated-image metadata cannot pollute the parsed media path and cause false `ENOENT` delivery failures. Fixes #75182. Thanks @TnzGit and @hclsys.
- Cron: make scheduler reload schedule comparison tolerate malformed persisted jobs, so one bad cron entry no longer aborts the whole tick. Fixes #75886. Thanks @samfox-ai.
- Doctor/channels: warn after migrations when default Telegram or Discord accounts have no configured token and their env fallback (`TELEGRAM_BOT_TOKEN` or `DISCORD_BOT_TOKEN`) is unavailable, with secret-safe migration docs for checking state-dir `.env`. Fixes #74298. Thanks @lolaopenclaw.
- Gateway/diagnostics: keep idle liveness samples in telemetry instead of visible warning logs unless diagnostic work is active, waiting, or queued. Thanks @vincentkoc.
- Control UI/chat: keep live replies visible when a raw session alias such as `main` sends the chat turn but Gateway emits events under the canonical session key for the same run. Fixes #73716. Thanks @teebes.
- CLI/models: reject `--agent` on `openclaw models set` and `set-image` instead of silently writing agent-scoped requests to global model defaults. Fixes #68391. Thanks @derrickabellard.
- CLI: stop treating the legacy singular `openclaw tool ...` token as a plugin id under restrictive `plugins.allow`, so it falls through as a normal unknown/reserved command instead of suggesting a stale allowlist entry. Fixes #64732. Thanks @efe-arv, @SweetSophia, and @hashtag1974.

View File

@@ -114,11 +114,13 @@ export keeps only that a message was omitted and the byte count.
The Gateway records a bounded, payload-free stability stream by default when
diagnostics are enabled. It is for operational facts, not content.
The same diagnostic heartbeat records liveness warnings when the Gateway keeps
The same diagnostic heartbeat records liveness samples when the Gateway keeps
running but the Node.js event loop or CPU looks saturated. These
`diagnostic.liveness.warning` events include event-loop delay, event-loop
utilization, CPU-core ratio, and active/waiting/queued session counts. They do
not restart the Gateway by themselves.
utilization, CPU-core ratio, and active/waiting/queued session counts. Idle
samples stay in telemetry at `info` level; they are only logged as Gateway
warnings when diagnostic work is active, waiting, or queued. They do not
restart the Gateway by themselves.
Inspect the live recorder:

View File

@@ -281,7 +281,7 @@ function sanitizeDiagnosticEvent(event: DiagnosticEventPayload): DiagnosticStabi
record.queued = event.queued;
break;
case "diagnostic.liveness.warning":
record.level = "warning";
record.level = event.active > 0 || event.waiting > 0 || event.queued > 0 ? "warning" : "info";
record.durationMs = event.intervalMs;
record.count = event.reasons.length;
assignReasonCode(record, event.reasons[0]);

View File

@@ -25,6 +25,7 @@ import {
import {
logSessionStateChange,
logMessageQueued,
diagnosticLogger,
markDiagnosticSessionProgress,
resetDiagnosticStateForTest,
resolveStuckSessionWarnMs,
@@ -124,6 +125,7 @@ describe("stuck session diagnostics threshold", () => {
afterEach(() => {
resetDiagnosticEventsForTest();
resetDiagnosticStateForTest();
vi.restoreAllMocks();
vi.useRealTimers();
});
@@ -427,8 +429,9 @@ describe("stuck session diagnostics threshold", () => {
expect(emitMemorySample).toHaveBeenLastCalledWith({ emitSample: true });
});
it("emits idle liveness warnings into the stability recorder", () => {
it("records idle liveness samples without warning in the gateway log", () => {
const emitMemorySample = createEmitMemorySampleMock();
const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined);
const events: string[] = [];
const unsubscribe = onDiagnosticEvent((event) => events.push(event.type));
@@ -461,11 +464,12 @@ describe("stuck session diagnostics threshold", () => {
}
expect(events).toContain("diagnostic.liveness.warning");
expect(warnSpy).not.toHaveBeenCalledWith(expect.stringContaining("liveness warning:"));
expect(emitMemorySample).toHaveBeenLastCalledWith({ emitSample: true });
expect(getDiagnosticStabilitySnapshot({ limit: 10 }).events).toContainEqual(
expect.objectContaining({
type: "diagnostic.liveness.warning",
level: "warning",
level: "info",
reason: "cpu",
durationMs: 30_000,
count: 1,
@@ -480,6 +484,70 @@ describe("stuck session diagnostics threshold", () => {
);
});
it("warns for liveness samples when diagnostic work is open", () => {
const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined);
startDiagnosticHeartbeat(
{
diagnostics: {
enabled: true,
},
},
{
emitMemorySample: createEmitMemorySampleMock(),
sampleLiveness: () => ({
reasons: ["event_loop_delay"],
intervalMs: 30_000,
eventLoopDelayP99Ms: 1_500,
eventLoopDelayMaxMs: 2_000,
}),
},
);
logMessageQueued({ sessionId: "s1", sessionKey: "main", source: "test" });
vi.advanceTimersByTime(30_000);
expect(warnSpy).toHaveBeenCalledWith(expect.stringContaining("liveness warning:"));
expect(getDiagnosticStabilitySnapshot({ limit: 10 }).events).toContainEqual(
expect.objectContaining({
type: "diagnostic.liveness.warning",
level: "warning",
active: 0,
waiting: 0,
queued: 1,
}),
);
});
it("does not let idle liveness samples suppress later active-work warnings", () => {
const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined);
startDiagnosticHeartbeat(
{
diagnostics: {
enabled: true,
},
},
{
emitMemorySample: createEmitMemorySampleMock(),
sampleLiveness: () => ({
reasons: ["event_loop_delay"],
intervalMs: 30_000,
eventLoopDelayP99Ms: 1_500,
eventLoopDelayMaxMs: 2_000,
}),
},
);
vi.advanceTimersByTime(30_000);
expect(warnSpy).not.toHaveBeenCalled();
logMessageQueued({ sessionId: "s1", sessionKey: "main", source: "test" });
vi.advanceTimersByTime(30_000);
expect(warnSpy).toHaveBeenCalledWith(expect.stringContaining("liveness warning:"));
});
it("throttles repeated liveness warnings", () => {
const events: string[] = [];
const unsubscribe = onDiagnosticEvent((event) => events.push(event.type));

View File

@@ -112,6 +112,7 @@ let diagnosticLivenessMonitor: EventLoopDelayMonitor | null = null;
let lastDiagnosticLivenessWallAt = 0;
let lastDiagnosticLivenessCpuUsage: CpuUsage | null = null;
let lastDiagnosticLivenessEventLoopUtilization: EventLoopUtilization | null = null;
let lastDiagnosticLivenessEventAt = 0;
let lastDiagnosticLivenessWarnAt = 0;
function loadCommandPollBackoffRuntime() {
@@ -179,6 +180,7 @@ function startDiagnosticLivenessSampler(): void {
lastDiagnosticLivenessWallAt = Date.now();
lastDiagnosticLivenessCpuUsage = process.cpuUsage();
lastDiagnosticLivenessEventLoopUtilization = performance.eventLoopUtilization();
lastDiagnosticLivenessEventAt = 0;
lastDiagnosticLivenessWarnAt = 0;
if (diagnosticLivenessMonitor) {
@@ -202,6 +204,7 @@ function stopDiagnosticLivenessSampler(): void {
lastDiagnosticLivenessWallAt = 0;
lastDiagnosticLivenessCpuUsage = null;
lastDiagnosticLivenessEventLoopUtilization = null;
lastDiagnosticLivenessEventAt = 0;
lastDiagnosticLivenessWarnAt = 0;
}
@@ -266,7 +269,21 @@ function sampleDiagnosticLiveness(now: number): DiagnosticLivenessSample | null
};
}
function shouldEmitDiagnosticLivenessWarning(now: number): boolean {
function shouldEmitDiagnosticLivenessEvent(now: number): boolean {
if (
lastDiagnosticLivenessEventAt > 0 &&
now - lastDiagnosticLivenessEventAt < DEFAULT_LIVENESS_WARN_COOLDOWN_MS
) {
return false;
}
lastDiagnosticLivenessEventAt = now;
return true;
}
function shouldEmitDiagnosticLivenessWarning(now: number, work: DiagnosticWorkSnapshot): boolean {
if (!hasOpenDiagnosticWork(work)) {
return false;
}
if (
lastDiagnosticLivenessWarnAt > 0 &&
now - lastDiagnosticLivenessWarnAt < DEFAULT_LIVENESS_WARN_COOLDOWN_MS
@@ -281,19 +298,22 @@ function emitDiagnosticLivenessWarning(
sample: DiagnosticLivenessSample,
work: DiagnosticWorkSnapshot,
): void {
diag.warn(
`liveness warning: reasons=${sample.reasons.join(",")} interval=${Math.round(
sample.intervalMs / 1000,
)}s eventLoopDelayP99Ms=${formatOptionalDiagnosticMetric(
sample.eventLoopDelayP99Ms,
)} eventLoopDelayMaxMs=${formatOptionalDiagnosticMetric(
sample.eventLoopDelayMaxMs,
)} eventLoopUtilization=${formatOptionalDiagnosticMetric(
sample.eventLoopUtilization,
)} cpuCoreRatio=${formatOptionalDiagnosticMetric(sample.cpuCoreRatio)} active=${
work.activeCount
} waiting=${work.waitingCount} queued=${work.queuedCount}`,
);
const message = `liveness warning: reasons=${sample.reasons.join(",")} interval=${Math.round(
sample.intervalMs / 1000,
)}s eventLoopDelayP99Ms=${formatOptionalDiagnosticMetric(
sample.eventLoopDelayP99Ms,
)} eventLoopDelayMaxMs=${formatOptionalDiagnosticMetric(
sample.eventLoopDelayMaxMs,
)} eventLoopUtilization=${formatOptionalDiagnosticMetric(
sample.eventLoopUtilization,
)} cpuCoreRatio=${formatOptionalDiagnosticMetric(sample.cpuCoreRatio)} active=${
work.activeCount
} waiting=${work.waitingCount} queued=${work.queuedCount}`;
if (hasOpenDiagnosticWork(work)) {
diag.warn(message);
} else {
diag.debug(message);
}
emitDiagnosticEvent({
type: "diagnostic.liveness.warning",
reasons: sample.reasons,
@@ -735,10 +755,13 @@ export function startDiagnosticHeartbeat(
pruneDiagnosticSessionStates(now, true);
const work = getDiagnosticWorkSnapshot();
const livenessSample = (opts?.sampleLiveness ?? sampleDiagnosticLiveness)(now, work);
const shouldEmitLivenessEvent =
livenessSample !== null && shouldEmitDiagnosticLivenessEvent(now);
const shouldEmitLivenessWarning =
livenessSample !== null && shouldEmitDiagnosticLivenessWarning(now);
livenessSample !== null && shouldEmitDiagnosticLivenessWarning(now, work);
const shouldEmitLivenessReport = shouldEmitLivenessEvent || shouldEmitLivenessWarning;
const shouldRecordMemorySample =
shouldEmitLivenessWarning || hasRecentDiagnosticActivity(now) || hasOpenDiagnosticWork(work);
shouldEmitLivenessReport || hasRecentDiagnosticActivity(now) || hasOpenDiagnosticWork(work);
(opts?.emitMemorySample ?? emitDiagnosticMemorySample)({
emitSample: shouldRecordMemorySample,
});
@@ -747,7 +770,7 @@ export function startDiagnosticHeartbeat(
return;
}
if (shouldEmitLivenessWarning && livenessSample) {
if (shouldEmitLivenessReport && livenessSample) {
emitDiagnosticLivenessWarning(livenessSample, work);
}