mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 07:10:43 +00:00
fix(diagnostics): export liveness warning telemetry
This commit is contained in:
@@ -492,6 +492,60 @@ describe("diagnostics-otel service", () => {
|
||||
await service.stop?.(ctx);
|
||||
});
|
||||
|
||||
// Emits one "diagnostic.liveness.warning" event with traces and metrics enabled, then
// checks the counter increment, two histogram samples, the span attributes, and the
// span status recorded by the service.
test("records liveness warning diagnostics", async () => {
  const warningName = "openclaw.liveness.warning";
  // Every liveness metric in this test is expected to carry the colon-joined reasons.
  const expectedReasonAttrs = { "openclaw.liveness.reason": "event_loop_delay:cpu" };

  const otelService = createDiagnosticsOtelService();
  const otelCtx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
  await otelService.start(otelCtx);

  emitDiagnosticEvent({
    type: "diagnostic.liveness.warning",
    reasons: ["event_loop_delay", "cpu"],
    intervalMs: 30_000,
    eventLoopDelayP99Ms: 250,
    eventLoopDelayMaxMs: 900,
    eventLoopUtilization: 0.95,
    cpuUserMs: 1200,
    cpuSystemMs: 300,
    cpuTotalMs: 1500,
    cpuCoreRatio: 1.4,
    active: 2,
    waiting: 1,
    queued: 4,
  });
  await flushDiagnosticEvents();

  // Metrics: one counter increment plus the P99 delay and CPU core ratio samples.
  const warningCounter = telemetryState.counters.get(warningName);
  expect(warningCounter?.add).toHaveBeenCalledWith(1, expectedReasonAttrs);

  const p99Histogram = telemetryState.histograms.get("openclaw.liveness.event_loop_delay_p99_ms");
  expect(p99Histogram?.record).toHaveBeenCalledWith(250, expectedReasonAttrs);

  const cpuRatioHistogram = telemetryState.histograms.get("openclaw.liveness.cpu_core_ratio");
  expect(cpuRatioHistogram?.record).toHaveBeenCalledWith(1.4, expectedReasonAttrs);

  // Trace: the startSpan call for the warning carries the reason and queue attributes.
  const startSpanArgs = telemetryState.tracer.startSpan.mock.calls.find(
    (args) => args[0] === warningName,
  );
  expect(startSpanArgs?.[1]).toMatchObject({
    attributes: {
      "openclaw.liveness.reason": "event_loop_delay:cpu",
      "openclaw.liveness.active": 2,
      "openclaw.liveness.queued": 4,
    },
  });

  // The recorded span is given status code 2 with the joined reasons as its message.
  const warningSpan = telemetryState.spans.find((candidate) => candidate.name === warningName);
  expect(warningSpan?.setStatus).toHaveBeenCalledWith({
    code: 2,
    message: "event_loop_delay:cpu",
  });

  await otelService.stop?.(otelCtx);
});
|
||||
|
||||
test("reports log exporter emit failures without exporting raw error text", async () => {
|
||||
const events: Array<Parameters<Parameters<typeof onInternalDiagnosticEvent>[0]>[0]> = [];
|
||||
const unsubscribe = onInternalDiagnosticEvent((event) => {
|
||||
|
||||
@@ -888,6 +888,38 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
unit: "1",
|
||||
description: "Diagnostic memory pressure events",
|
||||
});
|
||||
// Instruments for "diagnostic.liveness.warning" events: one event counter plus one
// histogram for each sampled signal that the warning may report.
const livenessWarningCounter = meter.createCounter("openclaw.liveness.warning", {
  unit: "1",
  description: "Diagnostic liveness warning events",
});

// Event-loop delay samples, declared in milliseconds.
const livenessEventLoopDelayP99Histogram = meter.createHistogram(
  "openclaw.liveness.event_loop_delay_p99_ms",
  { unit: "ms", description: "P99 event-loop delay reported by diagnostic liveness warnings" },
);
const livenessEventLoopDelayMaxHistogram = meter.createHistogram(
  "openclaw.liveness.event_loop_delay_max_ms",
  { unit: "ms", description: "Maximum event-loop delay reported by diagnostic liveness warnings" },
);

// Dimensionless samples (unit "1").
const livenessEventLoopUtilizationHistogram = meter.createHistogram(
  "openclaw.liveness.event_loop_utilization",
  { unit: "1", description: "Event-loop utilization reported by diagnostic liveness warnings" },
);
const livenessCpuCoreRatioHistogram = meter.createHistogram("openclaw.liveness.cpu_core_ratio", {
  unit: "1",
  description: "CPU core ratio reported by diagnostic liveness warnings",
});
|
||||
const telemetryExporterCounter = meter.createCounter("openclaw.telemetry.exporter.events", {
|
||||
unit: "1",
|
||||
description: "Diagnostic telemetry exporter lifecycle and failure events",
|
||||
@@ -2058,6 +2090,68 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
queueDepthHistogram.record(evt.queued, { "openclaw.channel": "heartbeat" });
|
||||
};
|
||||
|
||||
/**
 * Exports a liveness warning diagnostic event as metrics and, when traces are enabled,
 * as a span.
 *
 * Metrics: increments the liveness warning counter, records `evt.queued` on the queue
 * depth histogram under the "liveness" channel, and records each optional sample
 * (P99 / max event-loop delay, event-loop utilization, CPU core ratio) that is present.
 *
 * Trace: starts an "openclaw.liveness.warning" span via `spanWithDuration`, sets its
 * status to ERROR with the joined reasons as the message, and ends it at `evt.ts`.
 *
 * @param evt - The "diagnostic.liveness.warning" event payload.
 */
const recordLivenessWarning = (
  evt: Extract<DiagnosticEventPayload, { type: "diagnostic.liveness.warning" }>,
) => {
  const {
    eventLoopDelayP99Ms,
    eventLoopDelayMaxMs,
    eventLoopUtilization,
    cpuUserMs,
    cpuSystemMs,
    cpuTotalMs,
    cpuCoreRatio,
  } = evt;

  // All reasons collapse into a single colon-separated attribute value.
  const joinedReasons = evt.reasons.join(":");
  const metricAttrs = {
    "openclaw.liveness.reason": lowCardinalityAttr(joinedReasons, "unknown"),
  };

  livenessWarningCounter.add(1, metricAttrs);
  queueDepthHistogram.record(evt.queued, { "openclaw.channel": "liveness" });

  // Optional samples are recorded only when the event carries them.
  if (eventLoopDelayP99Ms !== undefined) {
    livenessEventLoopDelayP99Histogram.record(eventLoopDelayP99Ms, metricAttrs);
  }
  if (eventLoopDelayMaxMs !== undefined) {
    livenessEventLoopDelayMaxHistogram.record(eventLoopDelayMaxMs, metricAttrs);
  }
  if (eventLoopUtilization !== undefined) {
    livenessEventLoopUtilizationHistogram.record(eventLoopUtilization, metricAttrs);
  }
  if (cpuCoreRatio !== undefined) {
    livenessCpuCoreRatioHistogram.record(cpuCoreRatio, metricAttrs);
  }

  if (tracesEnabled) {
    const spanAttrs: Record<string, string | number> = {
      ...metricAttrs,
      "openclaw.liveness.active": evt.active,
      "openclaw.liveness.waiting": evt.waiting,
      "openclaw.liveness.queued": evt.queued,
      "openclaw.liveness.interval_ms": evt.intervalMs,
    };

    // Attribute name paired with its optional value; undefined entries are left off the span.
    const optionalSpanValues: Array<[string, string | number | undefined]> = [
      ["openclaw.liveness.event_loop_delay_p99_ms", eventLoopDelayP99Ms],
      ["openclaw.liveness.event_loop_delay_max_ms", eventLoopDelayMaxMs],
      ["openclaw.liveness.event_loop_utilization", eventLoopUtilization],
      ["openclaw.liveness.cpu_user_ms", cpuUserMs],
      ["openclaw.liveness.cpu_system_ms", cpuSystemMs],
      ["openclaw.liveness.cpu_total_ms", cpuTotalMs],
      ["openclaw.liveness.cpu_core_ratio", cpuCoreRatio],
    ];
    for (const [attrName, attrValue] of optionalSpanValues) {
      if (attrValue !== undefined) {
        spanAttrs[attrName] = attrValue;
      }
    }

    // 0 is passed as the duration argument, with the event timestamp as the end time.
    const warningSpan = spanWithDuration("openclaw.liveness.warning", spanAttrs, 0, {
      endTimeMs: evt.ts,
    });
    // The status message uses the raw joined reasons, not the low-cardinality attribute value.
    warningSpan.setStatus({ code: SpanStatusCode.ERROR, message: joinedReasons });
    warningSpan.end(evt.ts);
  }
};
|
||||
|
||||
const recordTelemetryExporter = (
|
||||
evt: TelemetryExporterDiagnosticEvent,
|
||||
metadata: DiagnosticEventMetadata,
|
||||
@@ -2130,6 +2224,9 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
case "diagnostic.heartbeat":
|
||||
recordHeartbeat(evt);
|
||||
return;
|
||||
case "diagnostic.liveness.warning":
|
||||
recordLivenessWarning(evt);
|
||||
return;
|
||||
case "run.started":
|
||||
recordRunStarted(evt, metadata);
|
||||
return;
|
||||
|
||||
Reference in New Issue
Block a user