From cb1bca1a16c5e7a8e75e6402a95d1fc45ce47a71 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 28 Apr 2026 05:26:45 +0100 Subject: [PATCH] fix(diagnostics): export liveness warning telemetry --- .../diagnostics-otel/src/service.test.ts | 54 +++++++++++ extensions/diagnostics-otel/src/service.ts | 97 +++++++++++++++++++ 2 files changed, 151 insertions(+) diff --git a/extensions/diagnostics-otel/src/service.test.ts b/extensions/diagnostics-otel/src/service.test.ts index 7a0672e32eb..2d32a0322ea 100644 --- a/extensions/diagnostics-otel/src/service.test.ts +++ b/extensions/diagnostics-otel/src/service.test.ts @@ -492,6 +492,60 @@ describe("diagnostics-otel service", () => { await service.stop?.(ctx); }); + test("records liveness warning diagnostics", async () => { + /* Traces and metrics are both enabled because the span emitted by recordLivenessWarning is skipped when tracing is off. */ const service = createDiagnosticsOtelService(); + const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true }); + + await service.start(ctx); + /* Emit one liveness warning that carries two reasons and every optional event-loop and CPU measurement. */ emitDiagnosticEvent({ + type: "diagnostic.liveness.warning", + reasons: ["event_loop_delay", "cpu"], + intervalMs: 30_000, + eventLoopDelayP99Ms: 250, + eventLoopDelayMaxMs: 900, + eventLoopUtilization: 0.95, + cpuUserMs: 1200, + cpuSystemMs: 300, + cpuTotalMs: 1500, + cpuCoreRatio: 1.4, + active: 2, + waiting: 1, + queued: 4, + }); + await flushDiagnosticEvents(); + + /* Every metric is expected to be tagged with the reasons joined by a colon, in the order given on the event. */ expect(telemetryState.counters.get("openclaw.liveness.warning")?.add).toHaveBeenCalledWith(1, { + "openclaw.liveness.reason": "event_loop_delay:cpu", + }); + expect( + telemetryState.histograms.get("openclaw.liveness.event_loop_delay_p99_ms")?.record, + ).toHaveBeenCalledWith(250, { + "openclaw.liveness.reason": "event_loop_delay:cpu", + }); + expect( + telemetryState.histograms.get("openclaw.liveness.cpu_core_ratio")?.record, + ).toHaveBeenCalledWith(1.4, { + "openclaw.liveness.reason": "event_loop_delay:cpu", + }); + const livenessSpan = telemetryState.tracer.startSpan.mock.calls.find( + (call) => call[0] === "openclaw.liveness.warning", + ); + 
/* The second startSpan argument is expected to carry the span attributes; only a subset of them is asserted here. */ expect(livenessSpan?.[1]).toMatchObject({ + attributes: { + "openclaw.liveness.reason": "event_loop_delay:cpu", + "openclaw.liveness.active": 2, + "openclaw.liveness.queued": 4, + }, + }); + const span = telemetryState.spans.find((item) => item.name === "openclaw.liveness.warning"); + /* NOTE(review): code 2 is presumably SpanStatusCode.ERROR, which recordLivenessWarning sets - confirm against the OpenTelemetry API enum. */ expect(span?.setStatus).toHaveBeenCalledWith({ + code: 2, + message: "event_loop_delay:cpu", + }); + + await service.stop?.(ctx); + }); + test("reports log exporter emit failures without exporting raw error text", async () => { const events: Array[0]>[0]> = []; const unsubscribe = onInternalDiagnosticEvent((event) => { diff --git a/extensions/diagnostics-otel/src/service.ts b/extensions/diagnostics-otel/src/service.ts index b8f67b54d6e..527f1f20b1b 100644 --- a/extensions/diagnostics-otel/src/service.ts +++ b/extensions/diagnostics-otel/src/service.ts @@ -888,6 +888,38 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { unit: "1", description: "Diagnostic memory pressure events", }); + /* Liveness instruments: one event counter plus histograms for P99 and maximum event-loop delay, event-loop utilization, and CPU core ratio. */ const livenessWarningCounter = meter.createCounter("openclaw.liveness.warning", { + unit: "1", + description: "Diagnostic liveness warning events", + }); + const livenessEventLoopDelayP99Histogram = meter.createHistogram( + "openclaw.liveness.event_loop_delay_p99_ms", + { + unit: "ms", + description: "P99 event-loop delay reported by diagnostic liveness warnings", + }, + ); + const livenessEventLoopDelayMaxHistogram = meter.createHistogram( + "openclaw.liveness.event_loop_delay_max_ms", + { + unit: "ms", + description: "Maximum event-loop delay reported by diagnostic liveness warnings", + }, + ); + const livenessEventLoopUtilizationHistogram = meter.createHistogram( + "openclaw.liveness.event_loop_utilization", + { + unit: "1", + description: "Event-loop utilization reported by diagnostic liveness warnings", + }, + ); + const livenessCpuCoreRatioHistogram = meter.createHistogram( + "openclaw.liveness.cpu_core_ratio", + { + unit: "1", + description: "CPU core ratio reported by 
diagnostic liveness warnings", + }, + ); const telemetryExporterCounter = meter.createCounter("openclaw.telemetry.exporter.events", { unit: "1", description: "Diagnostic telemetry exporter lifecycle and failure events", @@ -2058,6 +2090,68 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { queueDepthHistogram.record(evt.queued, { "openclaw.channel": "heartbeat" }); }; + /* Records a diagnostic.liveness.warning event: always bumps the warning counter and the queue-depth histogram, records each optional measurement only when it is defined, and emits an error-status span when traces are enabled. */ const recordLivenessWarning = ( + evt: Extract, + ) => { + /* Reasons are joined in event order, so the same reasons in a different order produce a different attribute value. */ const reason = evt.reasons.join(":"); + const attrs = { + "openclaw.liveness.reason": lowCardinalityAttr(reason, "unknown"), + }; + livenessWarningCounter.add(1, attrs); + /* Queue depth goes to the same histogram the heartbeat handler uses, distinguished by the liveness channel attribute. */ queueDepthHistogram.record(evt.queued, { "openclaw.channel": "liveness" }); + if (evt.eventLoopDelayP99Ms !== undefined) { + livenessEventLoopDelayP99Histogram.record(evt.eventLoopDelayP99Ms, attrs); + } + if (evt.eventLoopDelayMaxMs !== undefined) { + livenessEventLoopDelayMaxHistogram.record(evt.eventLoopDelayMaxMs, attrs); + } + if (evt.eventLoopUtilization !== undefined) { + livenessEventLoopUtilizationHistogram.record(evt.eventLoopUtilization, attrs); + } + if (evt.cpuCoreRatio !== undefined) { + livenessCpuCoreRatioHistogram.record(evt.cpuCoreRatio, attrs); + } + /* The metrics above are recorded unconditionally; everything below builds the span and is skipped when tracing is disabled. */ if (!tracesEnabled) { + return; + } + /* Span attributes extend the metric attributes with the active, waiting and queued values, the interval, and whichever optional measurements are defined. */ const spanAttrs: Record = { + ...attrs, + "openclaw.liveness.active": evt.active, + "openclaw.liveness.waiting": evt.waiting, + "openclaw.liveness.queued": evt.queued, + "openclaw.liveness.interval_ms": evt.intervalMs, + ...(evt.eventLoopDelayP99Ms !== undefined + ? { "openclaw.liveness.event_loop_delay_p99_ms": evt.eventLoopDelayP99Ms } + : {}), + ...(evt.eventLoopDelayMaxMs !== undefined + ? { "openclaw.liveness.event_loop_delay_max_ms": evt.eventLoopDelayMaxMs } + : {}), + ...(evt.eventLoopUtilization !== undefined + ? { "openclaw.liveness.event_loop_utilization": evt.eventLoopUtilization } + : {}), + ...(evt.cpuUserMs !== undefined + ? { "openclaw.liveness.cpu_user_ms": evt.cpuUserMs } + : {}), + ...(evt.cpuSystemMs !== undefined + ? 
{ "openclaw.liveness.cpu_system_ms": evt.cpuSystemMs } + : {}), + ...(evt.cpuTotalMs !== undefined + ? { "openclaw.liveness.cpu_total_ms": evt.cpuTotalMs } + : {}), + ...(evt.cpuCoreRatio !== undefined + ? { "openclaw.liveness.cpu_core_ratio": evt.cpuCoreRatio } + : {}), + }; + /* NOTE(review): the third argument 0 is presumably a zero duration ending at evt.ts - confirm against spanWithDuration. */ const span = spanWithDuration("openclaw.liveness.warning", spanAttrs, 0, { + endTimeMs: evt.ts, + }); + /* NOTE(review): the status message uses the raw joined reason, not the lowCardinalityAttr result used for the attribute - confirm this is intended. */ span.setStatus({ + code: SpanStatusCode.ERROR, + message: reason, + }); + /* NOTE(review): confirm spanWithDuration does not already end the span at endTimeMs, otherwise this is a second end call. */ span.end(evt.ts); + }; + const recordTelemetryExporter = ( evt: TelemetryExporterDiagnosticEvent, metadata: DiagnosticEventMetadata, @@ -2130,6 +2224,9 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { case "diagnostic.heartbeat": recordHeartbeat(evt); return; + /* Route liveness warning events to recordLivenessWarning. */ case "diagnostic.liveness.warning": + recordLivenessWarning(evt); + return; case "run.started": recordRunStarted(evt, metadata); return;