fix(diagnostics): export liveness warning telemetry

This commit is contained in:
Peter Steinberger
2026-04-28 05:26:45 +01:00
parent 001bf47727
commit cb1bca1a16
2 changed files with 151 additions and 0 deletions

View File

@@ -492,6 +492,60 @@ describe("diagnostics-otel service", () => {
await service.stop?.(ctx);
});
test("records liveness warning diagnostics", async () => {
  // The warning's reasons are expected to surface joined with ":" as a single
  // attribute value shared by the counter, the histograms, and the span.
  const expectedReason = "event_loop_delay:cpu";
  const reasonAttrs = { "openclaw.liveness.reason": expectedReason };
  const warningName = "openclaw.liveness.warning";

  const otelService = createDiagnosticsOtelService();
  const otelCtx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
  await otelService.start(otelCtx);

  emitDiagnosticEvent({
    type: "diagnostic.liveness.warning",
    reasons: ["event_loop_delay", "cpu"],
    intervalMs: 30_000,
    eventLoopDelayP99Ms: 250,
    eventLoopDelayMaxMs: 900,
    eventLoopUtilization: 0.95,
    cpuUserMs: 1200,
    cpuSystemMs: 300,
    cpuTotalMs: 1500,
    cpuCoreRatio: 1.4,
    active: 2,
    waiting: 1,
    queued: 4,
  });
  await flushDiagnosticEvents();

  // Metrics: one counter increment plus one sample per checked histogram.
  expect(telemetryState.counters.get(warningName)?.add).toHaveBeenCalledWith(1, reasonAttrs);
  const histogramExpectations: Array<[string, number]> = [
    ["openclaw.liveness.event_loop_delay_p99_ms", 250],
    ["openclaw.liveness.cpu_core_ratio", 1.4],
  ];
  for (const [metricName, sample] of histogramExpectations) {
    expect(telemetryState.histograms.get(metricName)?.record).toHaveBeenCalledWith(
      sample,
      reasonAttrs,
    );
  }

  // Trace: the span is started with the reason and queue-state attributes...
  const startSpanCall = telemetryState.tracer.startSpan.mock.calls.find(
    (args) => args[0] === warningName,
  );
  expect(startSpanCall?.[1]).toMatchObject({
    attributes: {
      ...reasonAttrs,
      "openclaw.liveness.active": 2,
      "openclaw.liveness.queued": 4,
    },
  });

  // ...and is given status code 2 with the joined reasons as the message.
  const recordedSpan = telemetryState.spans.find((item) => item.name === warningName);
  expect(recordedSpan?.setStatus).toHaveBeenCalledWith({
    code: 2,
    message: expectedReason,
  });

  await otelService.stop?.(otelCtx);
});
test("reports log exporter emit failures without exporting raw error text", async () => {
const events: Array<Parameters<Parameters<typeof onInternalDiagnosticEvent>[0]>[0]> = [];
const unsubscribe = onInternalDiagnosticEvent((event) => {

View File

@@ -888,6 +888,38 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
unit: "1",
description: "Diagnostic memory pressure events",
});
// Instruments for "diagnostic.liveness.warning" events. recordLivenessWarning
// feeds all of them, tagging every data point with the single
// "openclaw.liveness.reason" attribute.

// Incremented by 1 for each liveness warning event.
const livenessWarningCounter = meter.createCounter("openclaw.liveness.warning", {
unit: "1",
description: "Diagnostic liveness warning events",
});
// Receives evt.eventLoopDelayP99Ms; skipped when the event omits that field.
const livenessEventLoopDelayP99Histogram = meter.createHistogram(
"openclaw.liveness.event_loop_delay_p99_ms",
{
unit: "ms",
description: "P99 event-loop delay reported by diagnostic liveness warnings",
},
);
// Receives evt.eventLoopDelayMaxMs; skipped when the event omits that field.
const livenessEventLoopDelayMaxHistogram = meter.createHistogram(
"openclaw.liveness.event_loop_delay_max_ms",
{
unit: "ms",
description: "Maximum event-loop delay reported by diagnostic liveness warnings",
},
);
// Receives evt.eventLoopUtilization; skipped when the event omits that field.
const livenessEventLoopUtilizationHistogram = meter.createHistogram(
"openclaw.liveness.event_loop_utilization",
{
unit: "1",
description: "Event-loop utilization reported by diagnostic liveness warnings",
},
);
// Receives evt.cpuCoreRatio; skipped when the event omits that field.
const livenessCpuCoreRatioHistogram = meter.createHistogram(
"openclaw.liveness.cpu_core_ratio",
{
unit: "1",
description: "CPU core ratio reported by diagnostic liveness warnings",
},
);
const telemetryExporterCounter = meter.createCounter("openclaw.telemetry.exporter.events", {
unit: "1",
description: "Diagnostic telemetry exporter lifecycle and failure events",
@@ -2058,6 +2090,68 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
queueDepthHistogram.record(evt.queued, { "openclaw.channel": "heartbeat" });
};
/**
 * Exports a "diagnostic.liveness.warning" event as metrics and, when tracing
 * is enabled, as a span marked with an error status.
 *
 * Metrics: increments the liveness warning counter, records `evt.queued` on
 * the shared queue-depth histogram under the "liveness" channel, and records
 * each optional event-loop / CPU measurement that is present on the event.
 * All liveness data points carry only the joined-reasons attribute.
 *
 * Trace: the span carries the same reason attribute plus the queue-state and
 * interval fields and any optional measurements, and is ended at `evt.ts`.
 *
 * @param evt - The liveness warning payload to export.
 */
const recordLivenessWarning = (
  evt: Extract<DiagnosticEventPayload, { type: "diagnostic.liveness.warning" }>,
) => {
  const joinedReasons = evt.reasons.join(":");
  const metricAttrs = {
    "openclaw.liveness.reason": lowCardinalityAttr(joinedReasons, "unknown"),
  };

  livenessWarningCounter.add(1, metricAttrs);
  queueDepthHistogram.record(evt.queued, { "openclaw.channel": "liveness" });

  // Optional measurements only produce a sample when the event carries them.
  const recordIfPresent = (
    histogram: typeof livenessCpuCoreRatioHistogram,
    sample: number | undefined,
  ) => {
    if (sample !== undefined) {
      histogram.record(sample, metricAttrs);
    }
  };
  recordIfPresent(livenessEventLoopDelayP99Histogram, evt.eventLoopDelayP99Ms);
  recordIfPresent(livenessEventLoopDelayMaxHistogram, evt.eventLoopDelayMaxMs);
  recordIfPresent(livenessEventLoopUtilizationHistogram, evt.eventLoopUtilization);
  recordIfPresent(livenessCpuCoreRatioHistogram, evt.cpuCoreRatio);

  if (!tracesEnabled) {
    return;
  }

  // Required fields first, then optional ones in a fixed order so the
  // attribute set is built the same way on every call.
  const spanAttrs: Record<string, string | number> = {
    ...metricAttrs,
    "openclaw.liveness.active": evt.active,
    "openclaw.liveness.waiting": evt.waiting,
    "openclaw.liveness.queued": evt.queued,
    "openclaw.liveness.interval_ms": evt.intervalMs,
  };
  const optionalSpanFields: Array<[string, number | undefined]> = [
    ["openclaw.liveness.event_loop_delay_p99_ms", evt.eventLoopDelayP99Ms],
    ["openclaw.liveness.event_loop_delay_max_ms", evt.eventLoopDelayMaxMs],
    ["openclaw.liveness.event_loop_utilization", evt.eventLoopUtilization],
    ["openclaw.liveness.cpu_user_ms", evt.cpuUserMs],
    ["openclaw.liveness.cpu_system_ms", evt.cpuSystemMs],
    ["openclaw.liveness.cpu_total_ms", evt.cpuTotalMs],
    ["openclaw.liveness.cpu_core_ratio", evt.cpuCoreRatio],
  ];
  for (const [attrName, attrValue] of optionalSpanFields) {
    if (attrValue !== undefined) {
      spanAttrs[attrName] = attrValue;
    }
  }

  // The status message uses the raw joined reasons, not the normalized attribute.
  const warningSpan = spanWithDuration("openclaw.liveness.warning", spanAttrs, 0, {
    endTimeMs: evt.ts,
  });
  warningSpan.setStatus({
    code: SpanStatusCode.ERROR,
    message: joinedReasons,
  });
  warningSpan.end(evt.ts);
};
const recordTelemetryExporter = (
evt: TelemetryExporterDiagnosticEvent,
metadata: DiagnosticEventMetadata,
@@ -2130,6 +2224,9 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
case "diagnostic.heartbeat":
recordHeartbeat(evt);
return;
// Liveness warnings are exported as liveness metrics and, when tracing is
// enabled, as a span with error status.
case "diagnostic.liveness.warning":
recordLivenessWarning(evt);
return;
case "run.started":
recordRunStarted(evt, metadata);
return;