mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 09:40:43 +00:00
fix: reduce idle liveness warning noise
This commit is contained in:
@@ -30,6 +30,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Media: trim serialized JSON suffixes after local `MEDIA:` directive file extensions, so generated-image metadata cannot pollute the parsed media path and cause false `ENOENT` delivery failures. Fixes #75182. Thanks @TnzGit and @hclsys.
|
||||
- Cron: make scheduler reload schedule comparison tolerate malformed persisted jobs, so one bad cron entry no longer aborts the whole tick. Fixes #75886. Thanks @samfox-ai.
|
||||
- Doctor/channels: warn after migrations when default Telegram or Discord accounts have no configured token and their env fallback (`TELEGRAM_BOT_TOKEN` or `DISCORD_BOT_TOKEN`) is unavailable, with secret-safe migration docs for checking state-dir `.env`. Fixes #74298. Thanks @lolaopenclaw.
|
||||
- Gateway/diagnostics: keep idle liveness samples in telemetry instead of visible warning logs unless diagnostic work is active, waiting, or queued. Thanks @vincentkoc.
|
||||
- Control UI/chat: keep live replies visible when a raw session alias such as `main` sends the chat turn but Gateway emits events under the canonical session key for the same run. Fixes #73716. Thanks @teebes.
|
||||
- CLI/models: reject `--agent` on `openclaw models set` and `set-image` instead of silently writing agent-scoped requests to global model defaults. Fixes #68391. Thanks @derrickabellard.
|
||||
- CLI: stop treating the legacy singular `openclaw tool ...` token as a plugin id under restrictive `plugins.allow`, so it falls through as a normal unknown/reserved command instead of suggesting a stale allowlist entry. Fixes #64732. Thanks @efe-arv, @SweetSophia, and @hashtag1974.
|
||||
|
||||
@@ -114,11 +114,13 @@ export keeps only that a message was omitted and the byte count.
|
||||
The Gateway records a bounded, payload-free stability stream by default when
|
||||
diagnostics are enabled. It is for operational facts, not content.
|
||||
|
||||
The same diagnostic heartbeat records liveness warnings when the Gateway keeps
|
||||
The same diagnostic heartbeat records liveness samples when the Gateway keeps
|
||||
running but the Node.js event loop or CPU looks saturated. These
|
||||
`diagnostic.liveness.warning` events include event-loop delay, event-loop
|
||||
utilization, CPU-core ratio, and active/waiting/queued session counts. They do
|
||||
not restart the Gateway by themselves.
|
||||
utilization, CPU-core ratio, and active/waiting/queued session counts. Idle
|
||||
samples stay in telemetry at `info` level; they are only logged as Gateway
|
||||
warnings when diagnostic work is active, waiting, or queued. They do not
|
||||
restart the Gateway by themselves.
|
||||
|
||||
Inspect the live recorder:
|
||||
|
||||
|
||||
@@ -281,7 +281,7 @@ function sanitizeDiagnosticEvent(event: DiagnosticEventPayload): DiagnosticStabi
|
||||
record.queued = event.queued;
|
||||
break;
|
||||
case "diagnostic.liveness.warning":
|
||||
record.level = "warning";
|
||||
record.level = event.active > 0 || event.waiting > 0 || event.queued > 0 ? "warning" : "info";
|
||||
record.durationMs = event.intervalMs;
|
||||
record.count = event.reasons.length;
|
||||
assignReasonCode(record, event.reasons[0]);
|
||||
|
||||
@@ -25,6 +25,7 @@ import {
|
||||
import {
|
||||
logSessionStateChange,
|
||||
logMessageQueued,
|
||||
diagnosticLogger,
|
||||
markDiagnosticSessionProgress,
|
||||
resetDiagnosticStateForTest,
|
||||
resolveStuckSessionWarnMs,
|
||||
@@ -124,6 +125,7 @@ describe("stuck session diagnostics threshold", () => {
|
||||
afterEach(() => {
|
||||
resetDiagnosticEventsForTest();
|
||||
resetDiagnosticStateForTest();
|
||||
vi.restoreAllMocks();
|
||||
vi.useRealTimers();
|
||||
});
|
||||
|
||||
@@ -427,8 +429,9 @@ describe("stuck session diagnostics threshold", () => {
|
||||
expect(emitMemorySample).toHaveBeenLastCalledWith({ emitSample: true });
|
||||
});
|
||||
|
||||
it("emits idle liveness warnings into the stability recorder", () => {
|
||||
it("records idle liveness samples without warning in the gateway log", () => {
|
||||
const emitMemorySample = createEmitMemorySampleMock();
|
||||
const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined);
|
||||
const events: string[] = [];
|
||||
const unsubscribe = onDiagnosticEvent((event) => events.push(event.type));
|
||||
|
||||
@@ -461,11 +464,12 @@ describe("stuck session diagnostics threshold", () => {
|
||||
}
|
||||
|
||||
expect(events).toContain("diagnostic.liveness.warning");
|
||||
expect(warnSpy).not.toHaveBeenCalledWith(expect.stringContaining("liveness warning:"));
|
||||
expect(emitMemorySample).toHaveBeenLastCalledWith({ emitSample: true });
|
||||
expect(getDiagnosticStabilitySnapshot({ limit: 10 }).events).toContainEqual(
|
||||
expect.objectContaining({
|
||||
type: "diagnostic.liveness.warning",
|
||||
level: "warning",
|
||||
level: "info",
|
||||
reason: "cpu",
|
||||
durationMs: 30_000,
|
||||
count: 1,
|
||||
@@ -480,6 +484,70 @@ describe("stuck session diagnostics threshold", () => {
|
||||
);
|
||||
});
|
||||
|
||||
it("warns for liveness samples when diagnostic work is open", () => {
|
||||
const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined);
|
||||
|
||||
startDiagnosticHeartbeat(
|
||||
{
|
||||
diagnostics: {
|
||||
enabled: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
emitMemorySample: createEmitMemorySampleMock(),
|
||||
sampleLiveness: () => ({
|
||||
reasons: ["event_loop_delay"],
|
||||
intervalMs: 30_000,
|
||||
eventLoopDelayP99Ms: 1_500,
|
||||
eventLoopDelayMaxMs: 2_000,
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
logMessageQueued({ sessionId: "s1", sessionKey: "main", source: "test" });
|
||||
vi.advanceTimersByTime(30_000);
|
||||
|
||||
expect(warnSpy).toHaveBeenCalledWith(expect.stringContaining("liveness warning:"));
|
||||
expect(getDiagnosticStabilitySnapshot({ limit: 10 }).events).toContainEqual(
|
||||
expect.objectContaining({
|
||||
type: "diagnostic.liveness.warning",
|
||||
level: "warning",
|
||||
active: 0,
|
||||
waiting: 0,
|
||||
queued: 1,
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("does not let idle liveness samples suppress later active-work warnings", () => {
|
||||
const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined);
|
||||
|
||||
startDiagnosticHeartbeat(
|
||||
{
|
||||
diagnostics: {
|
||||
enabled: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
emitMemorySample: createEmitMemorySampleMock(),
|
||||
sampleLiveness: () => ({
|
||||
reasons: ["event_loop_delay"],
|
||||
intervalMs: 30_000,
|
||||
eventLoopDelayP99Ms: 1_500,
|
||||
eventLoopDelayMaxMs: 2_000,
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
vi.advanceTimersByTime(30_000);
|
||||
expect(warnSpy).not.toHaveBeenCalled();
|
||||
|
||||
logMessageQueued({ sessionId: "s1", sessionKey: "main", source: "test" });
|
||||
vi.advanceTimersByTime(30_000);
|
||||
|
||||
expect(warnSpy).toHaveBeenCalledWith(expect.stringContaining("liveness warning:"));
|
||||
});
|
||||
|
||||
it("throttles repeated liveness warnings", () => {
|
||||
const events: string[] = [];
|
||||
const unsubscribe = onDiagnosticEvent((event) => events.push(event.type));
|
||||
|
||||
@@ -112,6 +112,7 @@ let diagnosticLivenessMonitor: EventLoopDelayMonitor | null = null;
|
||||
let lastDiagnosticLivenessWallAt = 0;
|
||||
let lastDiagnosticLivenessCpuUsage: CpuUsage | null = null;
|
||||
let lastDiagnosticLivenessEventLoopUtilization: EventLoopUtilization | null = null;
|
||||
let lastDiagnosticLivenessEventAt = 0;
|
||||
let lastDiagnosticLivenessWarnAt = 0;
|
||||
|
||||
function loadCommandPollBackoffRuntime() {
|
||||
@@ -179,6 +180,7 @@ function startDiagnosticLivenessSampler(): void {
|
||||
lastDiagnosticLivenessWallAt = Date.now();
|
||||
lastDiagnosticLivenessCpuUsage = process.cpuUsage();
|
||||
lastDiagnosticLivenessEventLoopUtilization = performance.eventLoopUtilization();
|
||||
lastDiagnosticLivenessEventAt = 0;
|
||||
lastDiagnosticLivenessWarnAt = 0;
|
||||
|
||||
if (diagnosticLivenessMonitor) {
|
||||
@@ -202,6 +204,7 @@ function stopDiagnosticLivenessSampler(): void {
|
||||
lastDiagnosticLivenessWallAt = 0;
|
||||
lastDiagnosticLivenessCpuUsage = null;
|
||||
lastDiagnosticLivenessEventLoopUtilization = null;
|
||||
lastDiagnosticLivenessEventAt = 0;
|
||||
lastDiagnosticLivenessWarnAt = 0;
|
||||
}
|
||||
|
||||
@@ -266,7 +269,21 @@ function sampleDiagnosticLiveness(now: number): DiagnosticLivenessSample | null
|
||||
};
|
||||
}
|
||||
|
||||
function shouldEmitDiagnosticLivenessWarning(now: number): boolean {
|
||||
function shouldEmitDiagnosticLivenessEvent(now: number): boolean {
|
||||
if (
|
||||
lastDiagnosticLivenessEventAt > 0 &&
|
||||
now - lastDiagnosticLivenessEventAt < DEFAULT_LIVENESS_WARN_COOLDOWN_MS
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
lastDiagnosticLivenessEventAt = now;
|
||||
return true;
|
||||
}
|
||||
|
||||
function shouldEmitDiagnosticLivenessWarning(now: number, work: DiagnosticWorkSnapshot): boolean {
|
||||
if (!hasOpenDiagnosticWork(work)) {
|
||||
return false;
|
||||
}
|
||||
if (
|
||||
lastDiagnosticLivenessWarnAt > 0 &&
|
||||
now - lastDiagnosticLivenessWarnAt < DEFAULT_LIVENESS_WARN_COOLDOWN_MS
|
||||
@@ -281,19 +298,22 @@ function emitDiagnosticLivenessWarning(
|
||||
sample: DiagnosticLivenessSample,
|
||||
work: DiagnosticWorkSnapshot,
|
||||
): void {
|
||||
diag.warn(
|
||||
`liveness warning: reasons=${sample.reasons.join(",")} interval=${Math.round(
|
||||
sample.intervalMs / 1000,
|
||||
)}s eventLoopDelayP99Ms=${formatOptionalDiagnosticMetric(
|
||||
sample.eventLoopDelayP99Ms,
|
||||
)} eventLoopDelayMaxMs=${formatOptionalDiagnosticMetric(
|
||||
sample.eventLoopDelayMaxMs,
|
||||
)} eventLoopUtilization=${formatOptionalDiagnosticMetric(
|
||||
sample.eventLoopUtilization,
|
||||
)} cpuCoreRatio=${formatOptionalDiagnosticMetric(sample.cpuCoreRatio)} active=${
|
||||
work.activeCount
|
||||
} waiting=${work.waitingCount} queued=${work.queuedCount}`,
|
||||
);
|
||||
const message = `liveness warning: reasons=${sample.reasons.join(",")} interval=${Math.round(
|
||||
sample.intervalMs / 1000,
|
||||
)}s eventLoopDelayP99Ms=${formatOptionalDiagnosticMetric(
|
||||
sample.eventLoopDelayP99Ms,
|
||||
)} eventLoopDelayMaxMs=${formatOptionalDiagnosticMetric(
|
||||
sample.eventLoopDelayMaxMs,
|
||||
)} eventLoopUtilization=${formatOptionalDiagnosticMetric(
|
||||
sample.eventLoopUtilization,
|
||||
)} cpuCoreRatio=${formatOptionalDiagnosticMetric(sample.cpuCoreRatio)} active=${
|
||||
work.activeCount
|
||||
} waiting=${work.waitingCount} queued=${work.queuedCount}`;
|
||||
if (hasOpenDiagnosticWork(work)) {
|
||||
diag.warn(message);
|
||||
} else {
|
||||
diag.debug(message);
|
||||
}
|
||||
emitDiagnosticEvent({
|
||||
type: "diagnostic.liveness.warning",
|
||||
reasons: sample.reasons,
|
||||
@@ -735,10 +755,13 @@ export function startDiagnosticHeartbeat(
|
||||
pruneDiagnosticSessionStates(now, true);
|
||||
const work = getDiagnosticWorkSnapshot();
|
||||
const livenessSample = (opts?.sampleLiveness ?? sampleDiagnosticLiveness)(now, work);
|
||||
const shouldEmitLivenessEvent =
|
||||
livenessSample !== null && shouldEmitDiagnosticLivenessEvent(now);
|
||||
const shouldEmitLivenessWarning =
|
||||
livenessSample !== null && shouldEmitDiagnosticLivenessWarning(now);
|
||||
livenessSample !== null && shouldEmitDiagnosticLivenessWarning(now, work);
|
||||
const shouldEmitLivenessReport = shouldEmitLivenessEvent || shouldEmitLivenessWarning;
|
||||
const shouldRecordMemorySample =
|
||||
shouldEmitLivenessWarning || hasRecentDiagnosticActivity(now) || hasOpenDiagnosticWork(work);
|
||||
shouldEmitLivenessReport || hasRecentDiagnosticActivity(now) || hasOpenDiagnosticWork(work);
|
||||
(opts?.emitMemorySample ?? emitDiagnosticMemorySample)({
|
||||
emitSample: shouldRecordMemorySample,
|
||||
});
|
||||
@@ -747,7 +770,7 @@ export function startDiagnosticHeartbeat(
|
||||
return;
|
||||
}
|
||||
|
||||
if (shouldEmitLivenessWarning && livenessSample) {
|
||||
if (shouldEmitLivenessReport && livenessSample) {
|
||||
emitDiagnosticLivenessWarning(livenessSample, work);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user