feat(diagnostics-otel): add genai operation duration metric

This commit is contained in:
Vincent Koc
2026-04-25 11:41:37 -07:00
parent 81307fc11d
commit dc19069d71
3 changed files with 92 additions and 3 deletions

View File

@@ -32,6 +32,7 @@ Docs: https://docs.openclaw.ai
- Plugins/install: move managed plugin install metadata from `plugins.installs`
to the state-managed `plugins/installs.json` ledger, with legacy config reads
kept as a deprecated compatibility fallback. Thanks @vincentkoc.
- Diagnostics/OTEL: add the GenAI `gen_ai.client.operation.duration` histogram for model-call latency in seconds with bounded provider/model/API and error attributes. Thanks @vincentkoc.
- Diagnostics/OTEL: add bounded outbound message delivery lifecycle diagnostics and export them as low-cardinality delivery spans/metrics without message body, recipient, room, or media-path data. (#71471) Thanks @vincentkoc and @jlapenna.
- Diagnostics/OTEL: emit bounded exec-process diagnostics and export them as `openclaw.exec` spans without exposing command text, working directories, or container identifiers. (#71451) Thanks @vincentkoc and @jlapenna.
- Diagnostics/OTEL: support `OPENCLAW_OTEL_PRELOADED=1` so the plugin can reuse an already-registered OpenTelemetry SDK while keeping OpenClaw diagnostic listeners wired. (#71450) Thanks @vincentkoc and @jlapenna.

View File

@@ -740,6 +740,63 @@ describe("diagnostics-otel service", () => {
await service.stop?.(ctx);
});
test("exports GenAI client operation duration histogram without diagnostic identifiers", async () => {
  const service = createDiagnosticsOtelService();
  const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { metrics: true });
  await service.start(ctx);

  // Emit one successful and one failed model call. Both carry run, call, and
  // session identifiers that must never appear in the exported metric attributes.
  emitDiagnosticEvent({
    type: "model.call.completed",
    runId: "run-1",
    callId: "call-1",
    sessionKey: "session-key",
    provider: "openai",
    model: "gpt-5.4",
    api: "openai-completions",
    durationMs: 250,
  });
  emitDiagnosticEvent({
    type: "model.call.error",
    runId: "run-1",
    callId: "call-2",
    sessionKey: "session-key",
    provider: "google",
    model: "gemini-2.5-flash",
    api: "google-generative-ai",
    durationMs: 1250,
    errorCategory: "TimeoutError",
  });
  await flushDiagnosticEvents();

  // The histogram is registered in seconds with explicit bucket boundaries.
  expect(telemetryState.meter.createHistogram).toHaveBeenCalledWith(
    "gen_ai.client.operation.duration",
    expect.objectContaining({
      unit: "s",
      advice: {
        explicitBucketBoundaries: expect.arrayContaining([0.01, 0.32, 2.56, 81.92]),
      },
    }),
  );

  const genAiOperationDuration = telemetryState.histograms.get(
    "gen_ai.client.operation.duration",
  );
  expect(genAiOperationDuration?.record).toHaveBeenCalledTimes(2);
  // durationMs is exported in seconds: 250 ms -> 0.25 s, 1250 ms -> 1.25 s.
  expect(genAiOperationDuration?.record).toHaveBeenCalledWith(0.25, {
    "gen_ai.operation.name": "text_completion",
    "gen_ai.provider.name": "openai",
    "gen_ai.request.model": "gpt-5.4",
  });
  // Only the failed call carries an "error.type" attribute.
  expect(genAiOperationDuration?.record).toHaveBeenCalledWith(1.25, {
    "gen_ai.operation.name": "generate_content",
    "gen_ai.provider.name": "google",
    "gen_ai.request.model": "gemini-2.5-flash",
    "error.type": "TimeoutError",
  });

  // Serialize the recorded calls once and check every identifier emitted above,
  // including both call ids, so none of them can leak into the metric.
  const recordedCalls = JSON.stringify(genAiOperationDuration?.record.mock.calls);
  for (const identifier of ["session-key", "run-1", "call-1", "call-2"]) {
    expect(recordedCalls).not.toContain(identifier);
  }

  await service.stop?.(ctx);
});
test("exports run, model call, and tool execution lifecycle spans", async () => {
const service = createDiagnosticsOtelService();
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });

View File

@@ -55,6 +55,9 @@ const GEN_AI_LATEST_EXPERIMENTAL_OPT_IN = "gen_ai_latest_experimental";
// Explicit histogram bucket boundaries for token usage: the 14 successive
// powers of four from 4^0 (1) through 4^13 (67108864).
const GEN_AI_TOKEN_USAGE_BUCKETS = Array.from({ length: 14 }, (_, exponent) => 4 ** exponent);
// Explicit histogram bucket boundaries for operation duration: 14 values that
// start at 0.01 and double each step, ending at 81.92. Scaling by a power of
// two is exact in binary floating point, so each entry equals the
// corresponding decimal literal (0.01, 0.02, 0.04, ..., 40.96, 81.92).
const GEN_AI_OPERATION_DURATION_BUCKETS = Array.from(
  { length: 14 },
  (_, doublings) => 0.01 * 2 ** doublings,
);
type OtelContentCapturePolicy = {
inputMessages: boolean;
@@ -585,6 +588,16 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
explicitBucketBoundaries: GEN_AI_TOKEN_USAGE_BUCKETS,
},
});
// Histogram for GenAI client operation duration. Values are recorded in
// seconds (unit "s") against the explicit duration bucket boundaries.
const genAiOperationDurationHistogram = meter.createHistogram("gen_ai.client.operation.duration", {
  description: "GenAI client operation duration",
  unit: "s",
  advice: { explicitBucketBoundaries: GEN_AI_OPERATION_DURATION_BUCKETS },
});
const costCounter = meter.createCounter("openclaw.cost.usd", {
unit: "1",
description: "Estimated model cost (USD)",
@@ -1307,12 +1320,25 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
"openclaw.api": lowCardinalityAttr(evt.api),
"openclaw.transport": lowCardinalityAttr(evt.transport),
});
/**
 * Builds the metric attributes for a model-call event under the `gen_ai.*` keys.
 *
 * @param evt - Model-call lifecycle event; its `api`, `provider`, and `model` fields are read.
 * @param errorType - Optional error type; when truthy it is attached as `error.type`.
 * @returns Attributes with the operation name, provider, and request model, plus
 *   `error.type` only when `errorType` was supplied and non-empty.
 */
const genAiModelCallMetricAttrs = (evt: ModelCallLifecycleDiagnosticEvent, errorType?: string) => {
  const baseAttrs = {
    "gen_ai.operation.name": genAiOperationName(evt.api),
    "gen_ai.provider.name": lowCardinalityAttr(evt.provider),
    "gen_ai.request.model": lowCardinalityAttr(evt.model),
  };
  if (!errorType) {
    // Calls without an error type carry no "error.type" key at all.
    return baseAttrs;
  }
  return { ...baseAttrs, "error.type": errorType };
};
const recordModelCallCompleted = (
evt: Extract<DiagnosticEventPayload, { type: "model.call.completed" }>,
metadata: DiagnosticEventMetadata,
) => {
modelCallDurationHistogram.record(evt.durationMs, modelCallMetricAttrs(evt));
genAiOperationDurationHistogram.record(
evt.durationMs / 1000,
genAiModelCallMetricAttrs(evt),
);
if (!tracesEnabled) {
return;
}
@@ -1344,18 +1370,23 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
evt: Extract<DiagnosticEventPayload, { type: "model.call.error" }>,
metadata: DiagnosticEventMetadata,
) => {
const errorType = lowCardinalityAttr(evt.errorCategory, "other");
modelCallDurationHistogram.record(evt.durationMs, {
...modelCallMetricAttrs(evt),
"openclaw.errorCategory": lowCardinalityAttr(evt.errorCategory, "other"),
"openclaw.errorCategory": errorType,
});
genAiOperationDurationHistogram.record(
evt.durationMs / 1000,
genAiModelCallMetricAttrs(evt, errorType),
);
if (!tracesEnabled) {
return;
}
const spanAttrs: Record<string, string | number | boolean> = {
"openclaw.provider": evt.provider,
"openclaw.model": evt.model,
"openclaw.errorCategory": lowCardinalityAttr(evt.errorCategory, "other"),
"error.type": lowCardinalityAttr(evt.errorCategory, "other"),
"openclaw.errorCategory": errorType,
"error.type": errorType,
};
assignGenAiModelCallAttrs(spanAttrs, evt);
if (evt.api) {