From 4e1f59010ef62b78a749c5da0921841b73a89add Mon Sep 17 00:00:00 2001 From: Josh Avant <830519+joshavant@users.noreply.github.com> Date: Thu, 14 May 2026 01:39:46 -0500 Subject: [PATCH] fix(gateway): suppress startup liveness warnings (#81699) * fix(gateway): suppress startup liveness warnings * docs(changelog): note diagnostic startup grace fix --- CHANGELOG.md | 1 + src/gateway/server.impl.ts | 5 +++- src/logging/diagnostic.test.ts | 42 ++++++++++++++++++++++++++++++++++ src/logging/diagnostic.ts | 8 ++++++- 4 files changed, 54 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 25c63bd1073..e1f1a467fe0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ Docs: https://docs.openclaw.ai - Hooks: load workspace-relative legacy hook modules from dot-dot-prefixed directories without treating the filename prefix as parent traversal. - Plugins: preserve installed package metadata and persisted registry freshness checks for plugin package paths under dot-dot-prefixed directories. - Agents: allow dot-dot-prefixed filenames such as `..note.txt` through sandbox FS bridge, remote sandbox reads, and apply_patch summaries without mistaking the name for parent traversal. +- Gateway/diagnostics: suppress cold-start liveness warnings during the startup grace window while still sampling liveness metrics. Fixes #79915. (#81699) Thanks @joshavant. - CLI/migrate: hide per-item source/plugin hints on non-conflicting Codex skill and plugin selection prompts, keeping the hint text reserved for rows that actually need attention. Thanks @sjf. - Codex harness: treat high-confidence app-server OAuth refresh invalidation as a terminal auth-profile failure, stopping repeated raw token-refresh errors without turning entitlement or usage-limit payloads into re-auth prompts. - CLI/migrate: humanize Codex conflict-status messaging across the migrate UI so selection prompts and plan/result rows say "Codex skill already installed in workspace" instead of surfacing internal `MIGRATION_REASON_*` codes. Thanks @sjf. diff --git a/src/gateway/server.impl.ts b/src/gateway/server.impl.ts index 7997105bb09..5c6ade78775 100644 --- a/src/gateway/server.impl.ts +++ b/src/gateway/server.impl.ts @@ -593,7 +593,10 @@ export async function startGatewayServer( const diagnosticsEnabled = isDiagnosticsEnabled(cfgAtStart); setDiagnosticsEnabledForProcess(diagnosticsEnabled); if (diagnosticsEnabled) { - startDiagnosticHeartbeat(undefined, { getConfig: getRuntimeConfig }); + startDiagnosticHeartbeat(undefined, { + getConfig: getRuntimeConfig, + startupGraceMs: 60_000, + }); } setGatewaySigusr1RestartPolicy({ allowExternal: isRestartEnabled(cfgAtStart) }); let getActiveTaskCount = () => 0; diff --git a/src/logging/diagnostic.test.ts b/src/logging/diagnostic.test.ts index 89bea0240e6..8d47c8c0b3b 100644 --- a/src/logging/diagnostic.test.ts +++ b/src/logging/diagnostic.test.ts @@ -1055,6 +1055,48 @@ describe("stuck session diagnostics threshold", () => { ); }); + it("suppresses liveness warnings during startupGraceMs while still sampling", () => { + const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined); + const events: string[] = []; + const sampleLiveness = vi.fn(() => ({ + reasons: ["event_loop_delay" as const], + intervalMs: 30_000, + eventLoopDelayP99Ms: 1_500, + eventLoopDelayMaxMs: 2_000, + })); + const unsubscribe = onDiagnosticEvent((event) => events.push(event.type)); + + try { + startDiagnosticHeartbeat( + { + diagnostics: { + enabled: true, + }, + }, + { + emitMemorySample: createEmitMemorySampleMock(), + sampleLiveness, + startupGraceMs: 60_000, + }, + ); + + logMessageQueued({ sessionId: "s1", sessionKey: "main", source: "test" }); + vi.advanceTimersByTime(30_000); + + expect(sampleLiveness).toHaveBeenCalledTimes(1); + expectNoLoggerMessageContaining(warnSpy, "liveness warning:"); + expect(events).not.toContain("diagnostic.liveness.warning"); + + vi.advanceTimersByTime(30_000); + + expect(sampleLiveness).toHaveBeenCalledTimes(2); + expectLoggerMessageContaining(warnSpy, "liveness warning:"); + expect(events).toContain("diagnostic.liveness.warning"); + } finally { + unsubscribe(); + } + }); + it("warns for liveness samples when diagnostic work is open", () => { const warnSpy = vi.spyOn(diagnosticLogger, "warn").mockImplementation(() => undefined); diff --git a/src/logging/diagnostic.ts b/src/logging/diagnostic.ts index 9f29fc7d6a5..58bce7a0517 100644 --- a/src/logging/diagnostic.ts +++ b/src/logging/diagnostic.ts @@ -123,6 +123,7 @@ type StartDiagnosticHeartbeatOptions = { emitMemorySample?: EmitDiagnosticMemorySample; sampleLiveness?: SampleDiagnosticLiveness; recoverStuckSession?: RecoverStuckSession; + startupGraceMs?: number; }; let diagnosticLivenessMonitor: EventLoopDelayMonitor | null = null; @@ -939,6 +940,8 @@ export function startDiagnosticHeartbeat( return; } startDiagnosticLivenessSampler(); + const livenessGraceUntil = + opts?.startupGraceMs != null && opts.startupGraceMs > 0 ? Date.now() + opts.startupGraceMs : 0; heartbeatInterval = setInterval(() => { let heartbeatConfig = config; if (!heartbeatConfig) { @@ -953,7 +956,10 @@ export function startDiagnosticHeartbeat( const now = Date.now(); pruneDiagnosticSessionStates(now, true); const work = getDiagnosticWorkSnapshot(now); - const livenessSample = (opts?.sampleLiveness ?? sampleDiagnosticLiveness)(now, work); + const inStartupGrace = livenessGraceUntil > 0 && now < livenessGraceUntil; + const rawLivenessSample = (opts?.sampleLiveness ?? sampleDiagnosticLiveness)(now, work); + // Keep sampling during grace so event-loop delay baselines reset, but suppress startup-only reports. + const livenessSample = inStartupGrace ? null : rawLivenessSample; const shouldEmitLivenessEvent = livenessSample !== null && shouldEmitDiagnosticLivenessEvent(now); const shouldEmitLivenessWarning =