fix(diagnostics): defer OTEL run span finalization (#72260)

This commit is contained in:
Sally O'Malley
2026-04-26 14:29:05 -04:00
committed by GitHub
parent e53c068d78
commit 637bd33e69
2 changed files with 590 additions and 95 deletions

View File

@@ -7,14 +7,24 @@ const telemetryState = vi.hoisted(() => {
name: string;
addEvent: ReturnType<typeof vi.fn>;
end: ReturnType<typeof vi.fn>;
setAttributes: ReturnType<typeof vi.fn>;
setStatus: ReturnType<typeof vi.fn>;
spanContext: ReturnType<typeof vi.fn>;
}> = [];
const tracer = {
startSpan: vi.fn((name: string, _opts?: unknown, _ctx?: unknown) => {
const spanNumber = spans.length + 1;
const spanId = spanNumber.toString(16).padStart(16, "0");
const span = {
addEvent: vi.fn(),
end: vi.fn(),
setAttributes: vi.fn(),
setStatus: vi.fn(),
spanContext: vi.fn(() => ({
traceId: "4bf92f3577b34da6a3ce929d0e0e4736",
spanId,
traceFlags: 1,
})),
};
spans.push({ name, ...span });
return span;
@@ -122,6 +132,7 @@ vi.mock("@opentelemetry/semantic-conventions", () => ({
import {
emitTrustedDiagnosticEvent,
onInternalDiagnosticEvent,
resetDiagnosticEventsForTest,
} from "../../../src/infra/diagnostic-events.js";
import type { OpenClawPluginServiceContext } from "../api.js";
import { emitDiagnosticEvent } from "../api.js";
@@ -219,6 +230,7 @@ function flushDiagnosticEvents() {
describe("diagnostics-otel service", () => {
beforeEach(() => {
resetDiagnosticEventsForTest();
delete process.env.OPENCLAW_OTEL_PRELOADED;
delete process.env.OTEL_SEMCONV_STABILITY_OPT_IN;
telemetryState.counters.clear();
@@ -241,6 +253,7 @@ describe("diagnostics-otel service", () => {
});
afterEach(() => {
resetDiagnosticEventsForTest();
if (ORIGINAL_OPENCLAW_OTEL_PRELOADED === undefined) {
delete process.env.OPENCLAW_OTEL_PRELOADED;
} else {
@@ -561,6 +574,7 @@ describe("diagnostics-otel service", () => {
outcome: "completed",
durationMs: 100,
});
await flushDiagnosticEvents();
expect(sdkStart).not.toHaveBeenCalled();
expect(telemetryState.histograms.get("openclaw.run.duration_ms")?.record).toHaveBeenCalledWith(
@@ -1506,6 +1520,17 @@ describe("diagnostics-otel service", () => {
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
await service.start(ctx);
emitTrustedDiagnosticEvent({
type: "run.started",
runId: "run-1",
provider: "openai",
model: "gpt-5.4",
trace: {
traceId: TRACE_ID,
spanId: SPAN_ID,
traceFlags: "01",
},
});
emitTrustedDiagnosticEvent({
type: "context.assembled",
runId: "run-1",
@@ -1536,6 +1561,8 @@ describe("diagnostics-otel service", () => {
const contextCall = telemetryState.tracer.startSpan.mock.calls.find(
(call) => call[0] === "openclaw.context.assembled",
);
const runSpan = telemetryState.spans.find((span) => span.name === "openclaw.run");
const runSpanId = runSpan?.spanContext.mock.results[0]?.value?.spanId;
expect(contextCall?.[1]).toMatchObject({
attributes: {
"openclaw.provider": "openai",
@@ -1553,12 +1580,19 @@ describe("diagnostics-otel service", () => {
"openclaw.context.reserve_tokens": 4096,
},
});
expect(contextCall?.[1]).toEqual({
attributes: expect.any(Object),
startTime: expect.any(Number),
});
expect(JSON.stringify(contextCall)).not.toContain("session-key");
expect(JSON.stringify(contextCall)).not.toContain("prompt text");
expect(telemetryState.tracer.setSpanContext).toHaveBeenCalledWith(
expect.anything(),
expect.objectContaining({ traceId: TRACE_ID, spanId: SPAN_ID }),
expect.objectContaining({ traceId: TRACE_ID, spanId: runSpanId }),
);
expect(
(contextCall?.[2] as { spanContext?: { spanId?: string } } | undefined)?.spanContext?.spanId,
).toBe(runSpanId);
await service.stop?.(ctx);
});
@@ -1688,7 +1722,185 @@ describe("diagnostics-otel service", () => {
await service.stop?.(ctx);
});
test("parents trusted diagnostic lifecycle spans from explicit parent ids", async () => {
// Emits a complete trusted run -> model call -> tool execution lifecycle and
// verifies that each child span is parented to the live span object created by
// the matching "*.started" event, not to the span ids carried in `evt.trace`.
test("parents trusted diagnostic lifecycle spans from active started spans", async () => {
const service = createDiagnosticsOtelService();
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
await service.start(ctx);
// Start events, outermost first: run -> model call -> tool execution. Each
// event's parentSpanId names the spanId of the event emitted just before it.
emitTrustedDiagnosticEvent({
type: "run.started",
runId: "run-1",
provider: "openai",
model: "gpt-5.4",
trace: {
traceId: TRACE_ID,
spanId: CHILD_SPAN_ID,
parentSpanId: SPAN_ID,
traceFlags: "01",
},
});
emitTrustedDiagnosticEvent({
type: "model.call.started",
runId: "run-1",
callId: "call-1",
provider: "openai",
model: "gpt-5.4",
trace: {
traceId: TRACE_ID,
spanId: GRANDCHILD_SPAN_ID,
parentSpanId: CHILD_SPAN_ID,
traceFlags: "01",
},
});
emitTrustedDiagnosticEvent({
type: "tool.execution.started",
runId: "run-1",
toolName: "read",
trace: {
traceId: TRACE_ID,
spanId: TOOL_SPAN_ID,
parentSpanId: GRANDCHILD_SPAN_ID,
traceFlags: "01",
},
});
// Terminal events, innermost first, reusing the same span ids as the starts.
emitTrustedDiagnosticEvent({
type: "tool.execution.error",
runId: "run-1",
toolName: "read",
durationMs: 20,
errorCategory: "TypeError",
trace: {
traceId: TRACE_ID,
spanId: TOOL_SPAN_ID,
parentSpanId: GRANDCHILD_SPAN_ID,
traceFlags: "01",
},
});
emitTrustedDiagnosticEvent({
type: "model.call.completed",
runId: "run-1",
callId: "call-1",
provider: "openai",
model: "gpt-5.4",
durationMs: 80,
trace: {
traceId: TRACE_ID,
spanId: GRANDCHILD_SPAN_ID,
parentSpanId: CHILD_SPAN_ID,
traceFlags: "01",
},
});
emitTrustedDiagnosticEvent({
type: "run.completed",
runId: "run-1",
provider: "openai",
model: "gpt-5.4",
outcome: "completed",
durationMs: 100,
trace: {
traceId: TRACE_ID,
spanId: CHILD_SPAN_ID,
parentSpanId: SPAN_ID,
traceFlags: "01",
},
});
await flushDiagnosticEvents();
// Span ids below are the ones generated by the mocked tracer, read back from
// the first spanContext() call recorded on each mock span.
const runSpan = telemetryState.spans.find((span) => span.name === "openclaw.run");
const modelSpan = telemetryState.spans.find((span) => span.name === "openclaw.model.call");
const toolSpan = telemetryState.spans.find((span) => span.name === "openclaw.tool.execution");
const runSpanId = runSpan?.spanContext.mock.results[0]?.value?.spanId;
const modelSpanId = modelSpan?.spanContext.mock.results[0]?.value?.spanId;
// Exactly two parent contexts are built: model call under run, tool under
// model call. The run itself has no tracked parent, so none is built for it.
expect(telemetryState.tracer.setSpanContext).toHaveBeenCalledTimes(2);
expect(telemetryState.tracer.setSpanContext.mock.calls.map((call) => call[1])).toEqual([
expect.objectContaining({ traceId: TRACE_ID, spanId: runSpanId }),
expect.objectContaining({ traceId: TRACE_ID, spanId: modelSpanId }),
]);
// Map span name -> spanId of the parent context handed to startSpan.
const parentBySpanName = Object.fromEntries(
telemetryState.tracer.startSpan.mock.calls.map((call) => [
call[0],
(call[2] as { spanContext?: { spanId?: string } } | undefined)?.spanContext?.spanId,
]),
);
expect(parentBySpanName).toMatchObject({
"openclaw.run": undefined,
"openclaw.model.call": runSpanId,
"openclaw.tool.execution": modelSpanId,
});
// NOTE(review): code 2 is presumably SpanStatusCode.ERROR in the mocked
// OpenTelemetry API — confirm against the mock definition.
expect(toolSpan?.setStatus).toHaveBeenCalledWith({
code: 2,
message: "TypeError",
});
await service.stop?.(ctx);
});
// Verifies that a model.usage event emitted after run.completed is still
// parented to the run span, and that the run span is nevertheless ended once
// diagnostics have been flushed.
test("keeps trusted run spans alive long enough for post-completion usage parenting", async () => {
const service = createDiagnosticsOtelService();
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
await service.start(ctx);
emitTrustedDiagnosticEvent({
type: "run.started",
runId: "run-1",
provider: "openai",
model: "gpt-5.4",
trace: {
traceId: TRACE_ID,
spanId: CHILD_SPAN_ID,
parentSpanId: SPAN_ID,
traceFlags: "01",
},
});
emitTrustedDiagnosticEvent({
type: "run.completed",
runId: "run-1",
provider: "openai",
model: "gpt-5.4",
outcome: "completed",
durationMs: 100,
trace: {
traceId: TRACE_ID,
spanId: CHILD_SPAN_ID,
parentSpanId: SPAN_ID,
traceFlags: "01",
},
});
// The usage event names SPAN_ID (the run's own parent) as its parent, so it
// can only land under the run span if that id still resolves to the run span
// when this event is processed.
emitTrustedDiagnosticEvent({
type: "model.usage",
provider: "openai",
model: "gpt-5.4",
usage: { input: 3, output: 2, total: 5 },
durationMs: 10,
trace: {
traceId: TRACE_ID,
spanId: GRANDCHILD_SPAN_ID,
parentSpanId: SPAN_ID,
traceFlags: "01",
},
});
await flushDiagnosticEvents();
const runSpan = telemetryState.spans.find((span) => span.name === "openclaw.run");
// Span id generated by the mocked tracer for the run span.
const runSpanId = runSpan?.spanContext.mock.results[0]?.value?.spanId;
const modelUsageCall = telemetryState.tracer.startSpan.mock.calls.find(
(call) => call[0] === "openclaw.model.usage",
);
expect(telemetryState.tracer.setSpanContext).toHaveBeenCalledWith(
expect.anything(),
expect.objectContaining({ traceId: TRACE_ID, spanId: runSpanId }),
);
expect(
(modelUsageCall?.[2] as { spanContext?: { spanId?: string } } | undefined)?.spanContext
?.spanId,
).toBe(runSpanId);
// The run span must have been ended (with a numeric end time) by now.
// NOTE(review): relies on flushDiagnosticEvents() letting the deferred
// finalizer run before returning — confirm against its implementation.
expect(runSpan?.end).toHaveBeenCalledWith(expect.any(Number));
await service.stop?.(ctx);
});
test("does not force remote parents for completed-only trusted lifecycle spans", async () => {
const service = createDiagnosticsOtelService();
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
await service.start(ctx);
@@ -1721,38 +1933,15 @@ describe("diagnostics-otel service", () => {
traceFlags: "01",
},
});
emitTrustedDiagnosticEvent({
type: "tool.execution.error",
runId: "run-1",
toolName: "read",
durationMs: 20,
errorCategory: "TypeError",
trace: {
traceId: TRACE_ID,
spanId: TOOL_SPAN_ID,
parentSpanId: GRANDCHILD_SPAN_ID,
traceFlags: "01",
},
});
await flushDiagnosticEvents();
expect(telemetryState.tracer.setSpanContext).toHaveBeenCalledTimes(3);
expect(telemetryState.tracer.setSpanContext.mock.calls.map((call) => call[1])).toEqual([
expect.objectContaining({ traceId: TRACE_ID, spanId: SPAN_ID }),
expect.objectContaining({ traceId: TRACE_ID, spanId: CHILD_SPAN_ID }),
expect.objectContaining({ traceId: TRACE_ID, spanId: GRANDCHILD_SPAN_ID }),
]);
expect(telemetryState.tracer.setSpanContext).not.toHaveBeenCalled();
const parentBySpanName = Object.fromEntries(
telemetryState.tracer.startSpan.mock.calls.map((call) => [
call[0],
(call[2] as { spanContext?: { spanId?: string } } | undefined)?.spanContext?.spanId,
]),
telemetryState.tracer.startSpan.mock.calls.map((call) => [call[0], call[2]]),
);
expect(parentBySpanName).toMatchObject({
"openclaw.run": SPAN_ID,
"openclaw.model.call": CHILD_SPAN_ID,
"openclaw.tool.execution": GRANDCHILD_SPAN_ID,
"openclaw.run": undefined,
"openclaw.model.call": undefined,
});
await service.stop?.(ctx);
});
@@ -1860,6 +2049,93 @@ describe("diagnostics-otel service", () => {
await service.stop?.(ctx);
});
// Sends started + terminal events for run, model call, tool execution and
// harness run through emitDiagnosticEvent and checks that each span name is
// started exactly once, i.e. the "*.started" events did not open spans.
// NOTE(review): presumably emitDiagnosticEvent delivers events as untrusted
// (metadata.trusted false) — confirm against ../api.js.
test("does not create live started spans for untrusted lifecycle diagnostics", async () => {
const service = createDiagnosticsOtelService();
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
await service.start(ctx);
// None of these events carry a `trace` context.
emitDiagnosticEvent({
type: "run.started",
runId: "run-1",
provider: "openai",
model: "gpt-5.4",
});
emitDiagnosticEvent({
type: "run.completed",
runId: "run-1",
provider: "openai",
model: "gpt-5.4",
outcome: "completed",
durationMs: 100,
});
emitDiagnosticEvent({
type: "model.call.started",
runId: "run-1",
callId: "call-1",
provider: "openai",
model: "gpt-5.4",
});
emitDiagnosticEvent({
type: "model.call.completed",
runId: "run-1",
callId: "call-1",
provider: "openai",
model: "gpt-5.4",
durationMs: 80,
});
emitDiagnosticEvent({
type: "tool.execution.started",
runId: "run-1",
toolName: "read",
});
emitDiagnosticEvent({
type: "tool.execution.error",
runId: "run-1",
toolName: "read",
durationMs: 20,
errorCategory: "TypeError",
});
emitDiagnosticEvent({
type: "harness.run.started",
runId: "run-1",
provider: "codex",
model: "gpt-5.4",
harnessId: "codex",
pluginId: "codex-plugin",
});
emitDiagnosticEvent({
type: "harness.run.completed",
runId: "run-1",
provider: "codex",
model: "gpt-5.4",
harnessId: "codex",
pluginId: "codex-plugin",
outcome: "completed",
durationMs: 90,
});
await flushDiagnosticEvents();
// One startSpan call per span name: two events were emitted per lifecycle,
// so a count of 1 means only one of them produced a span.
expect(
telemetryState.tracer.startSpan.mock.calls.filter((call) => call[0] === "openclaw.run"),
).toHaveLength(1);
expect(
telemetryState.tracer.startSpan.mock.calls.filter(
(call) => call[0] === "openclaw.model.call",
),
).toHaveLength(1);
expect(
telemetryState.tracer.startSpan.mock.calls.filter(
(call) => call[0] === "openclaw.tool.execution",
),
).toHaveLength(1);
expect(
telemetryState.tracer.startSpan.mock.calls.filter(
(call) => call[0] === "openclaw.harness.run",
),
).toHaveLength(1);
await service.stop?.(ctx);
});
test("exports exec process spans without command text", async () => {
const service = createDiagnosticsOtelService();
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });

View File

@@ -81,9 +81,9 @@ type ModelCallLifecycleDiagnosticEvent = Extract<
DiagnosticEventPayload,
{ type: "model.call.completed" | "model.call.error" }
>;
type HarnessRunLifecycleDiagnosticEvent = Extract<
type HarnessRunDiagnosticEvent = Extract<
DiagnosticEventPayload,
{ type: "harness.run.completed" | "harness.run.error" }
{ type: "harness.run.started" | "harness.run.completed" | "harness.run.error" }
>;
type TelemetryExporterDiagnosticEvent = Extract<
DiagnosticEventPayload,
@@ -244,7 +244,7 @@ function assignGenAiSpanIdentityAttrs(
function assignGenAiModelCallAttrs(
attrs: Record<string, string | number | boolean>,
evt: ModelCallLifecycleDiagnosticEvent,
evt: { api?: string; model?: string; provider?: string },
): void {
assignGenAiSpanIdentityAttrs(attrs, evt);
}
@@ -467,19 +467,6 @@ function contextForTraceContext(traceContext: DiagnosticTraceContext | undefined
});
}
/**
 * Builds an OTEL context whose span context points at the parent span id named
 * in `traceContext`, flagged as a remote span.
 *
 * @param traceContext Diagnostic trace context to read the parent id from.
 * @returns The derived context, or `undefined` when the normalized trace
 *   context is missing or carries no `parentSpanId`.
 */
function contextForDiagnosticSpanParent(traceContext: DiagnosticTraceContext | undefined) {
  const parentTrace = normalizeTraceContext(traceContext);
  const parentSpanId = parentTrace?.parentSpanId;
  if (!parentTrace || !parentSpanId) {
    return undefined;
  }
  const baseContext = otelContextApi.active();
  const remoteParent = {
    traceId: parentTrace.traceId,
    spanId: parentSpanId,
    traceFlags: traceFlagsToOtel(parentTrace.traceFlags),
    isRemote: true,
  };
  return trace.setSpanContext(baseContext, remoteParent);
}
function contextForTrustedTraceContext(
evt: DiagnosticEventPayload,
metadata: DiagnosticEventMetadata,
@@ -487,13 +474,6 @@ function contextForTrustedTraceContext(
return metadata.trusted ? contextForTraceContext(evt.trace) : undefined;
}
/**
 * Trust-gated wrapper around `contextForDiagnosticSpanParent`.
 *
 * @param evt Diagnostic event whose `trace` field is consulted.
 * @param metadata Event metadata; only `trusted` is read.
 * @returns The parent context for trusted events, otherwise `undefined`.
 */
function contextForTrustedDiagnosticSpanParent(
  evt: DiagnosticEventPayload,
  metadata: DiagnosticEventMetadata,
) {
  if (!metadata.trusted) {
    return undefined;
  }
  return contextForDiagnosticSpanParent(evt.trace);
}
function addTraceAttributes(
attributes: Record<string, string | number | boolean>,
traceContext: DiagnosticTraceContext | undefined,
@@ -518,17 +498,21 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
let sdk: NodeSDK | null = null;
let logProvider: LoggerProvider | null = null;
let unsubscribe: (() => void) | null = null;
let stopActiveTrustedSpans: (() => void) | null = null;
const stopStarted = async () => {
const currentUnsubscribe = unsubscribe;
const currentLogProvider = logProvider;
const currentSdk = sdk;
const currentStopActiveTrustedSpans = stopActiveTrustedSpans;
unsubscribe = null;
logProvider = null;
sdk = null;
stopActiveTrustedSpans = null;
currentUnsubscribe?.();
currentStopActiveTrustedSpans?.();
if (currentLogProvider) {
await currentLogProvider.shutdown().catch(() => undefined);
}
@@ -694,6 +678,24 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
const meter = metrics.getMeter("openclaw");
const tracer = trace.getTracer("openclaw");
const activeTrustedSpans = new Map<string, ReturnType<typeof tracer.startSpan>>();
const activeTrustedSpanAliases = new Map<string, ReturnType<typeof tracer.startSpan>>();
const pendingTrustedRunFinalizers = new Map<string, ReturnType<typeof setImmediate>>();
// Shutdown hook: cancels every pending deferred run finalizer, ends each
// still-tracked span once (a span may be reachable through both maps) using a
// single shared timestamp, then empties both tracking maps.
stopActiveTrustedSpans = () => {
  const stopAt = Date.now();
  pendingTrustedRunFinalizers.forEach((handle) => {
    clearImmediate(handle);
  });
  pendingTrustedRunFinalizers.clear();
  // A Set de-duplicates spans registered under both a span id and an alias.
  const distinctSpans = new Set<ReturnType<typeof tracer.startSpan>>();
  for (const span of activeTrustedSpans.values()) {
    distinctSpans.add(span);
  }
  for (const span of activeTrustedSpanAliases.values()) {
    distinctSpans.add(span);
  }
  distinctSpans.forEach((span) => {
    span.end(stopAt);
  });
  activeTrustedSpans.clear();
  activeTrustedSpanAliases.clear();
};
const tokensCounter = meter.createCounter("openclaw.tokens", {
unit: "1",
@@ -942,11 +944,16 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
options: {
parentContext?: ReturnType<typeof contextForTraceContext> | null;
endTimeMs?: number;
startTimeMs?: number;
} = {},
) => {
const endTimeMs = options.endTimeMs ?? Date.now();
const startTime =
typeof durationMs === "number" ? endTimeMs - Math.max(0, durationMs) : undefined;
typeof options.startTimeMs === "number"
? options.startTimeMs
: typeof durationMs === "number" && durationMs >= 0
? endTimeMs - durationMs
: undefined;
const parentContext =
"parentContext" in options ? (options.parentContext ?? undefined) : undefined;
const span = tracer.startSpan(
@@ -959,6 +966,78 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
);
return span;
};
/**
 * Returns the normalized trace context of `evt`, but only for trusted events.
 * Untrusted events always yield `undefined`.
 */
const trustedTraceContext = (
  evt: DiagnosticEventPayload,
  metadata: DiagnosticEventMetadata,
) => {
  if (!metadata.trusted) {
    return undefined;
  }
  return normalizeTraceContext(evt.trace);
};
/**
 * Resolves the parent OTEL context for a trusted event from the spans that are
 * currently tracked, looking the event's `parentSpanId` up first among tracked
 * span ids and then among aliases.
 *
 * @returns A context carrying the live parent span's own span context, or
 *   `undefined` when the event is untrusted, names no parent, or the named
 *   parent is not tracked.
 */
const activeTrustedParentContext = (
  evt: DiagnosticEventPayload,
  metadata: DiagnosticEventMetadata,
) => {
  const requestedParentId = trustedTraceContext(evt, metadata)?.parentSpanId;
  const liveParent = requestedParentId
    ? (activeTrustedSpans.get(requestedParentId) ??
      activeTrustedSpanAliases.get(requestedParentId))
    : undefined;
  return liveParent
    ? trace.setSpanContext(otelContextApi.active(), liveParent.spanContext())
    : undefined;
};
/**
 * Registers `span` under the trusted event's span id so later events can find
 * it. Events without a trusted span id leave the registry untouched.
 *
 * @returns The same `span`, to allow call chaining.
 */
const trackTrustedSpan = (
  evt: DiagnosticEventPayload,
  metadata: DiagnosticEventMetadata,
  span: ReturnType<typeof tracer.startSpan>,
) => {
  const trackingKey = trustedTraceContext(evt, metadata)?.spanId;
  if (!trackingKey) {
    return span;
  }
  activeTrustedSpans.set(trackingKey, span);
  return span;
};
/**
 * Looks up the span tracked under the trusted event's span id and removes it
 * from the registry, handing ownership to the caller.
 *
 * @returns The tracked span, or `undefined` when the event has no trusted span
 *   id or nothing is tracked under it.
 */
const takeTrackedTrustedSpan = (
  evt: DiagnosticEventPayload,
  metadata: DiagnosticEventMetadata,
) => {
  const trackingKey = trustedTraceContext(evt, metadata)?.spanId;
  const tracked = trackingKey ? activeTrustedSpans.get(trackingKey) : undefined;
  if (trackingKey && tracked) {
    activeTrustedSpans.delete(trackingKey);
  }
  return tracked;
};
/**
 * Applies `attributes` to `span` after passing them through
 * `redactOtelAttributes`. The optional call tolerates span objects that do not
 * expose `setAttributes`; in that case nothing is evaluated or set.
 */
const setSpanAttrs = (
  span: ReturnType<typeof tracer.startSpan>,
  attributes: Record<string, string | number | boolean>,
): void => {
  span.setAttributes?.(
    redactOtelAttributes(attributes),
  );
};
/**
 * Defers ending a tracked run span to a `setImmediate` callback so that events
 * handled before that callback fires can still resolve the span as a parent.
 *
 * When the callback runs it removes the span from the tracking maps and ends
 * it at `endTimeMs`. Scheduling again for the same `spanId` cancels the
 * previously scheduled callback first.
 *
 * @param spanId Key the span is tracked under; also keys the pending finalizer.
 * @param parentSpanId Parent id reported by the completion event, if any.
 * @param span The tracked span to end.
 * @param endTimeMs End timestamp passed to `span.end`.
 */
const scheduleTrackedRunSpanFinalize = (
  spanId: string,
  parentSpanId: string | undefined,
  span: ReturnType<typeof tracer.startSpan>,
  endTimeMs: number,
) => {
  const existingHandle = pendingTrustedRunFinalizers.get(spanId);
  if (existingHandle) {
    clearImmediate(existingHandle);
  }
  const handle = setImmediate(() => {
    pendingTrustedRunFinalizers.delete(spanId);
    // Identity checks keep a newer span registered under the same key intact.
    if (activeTrustedSpans.get(spanId) === span) {
      activeTrustedSpans.delete(spanId);
    }
    if (parentSpanId && activeTrustedSpanAliases.get(parentSpanId) === span) {
      activeTrustedSpanAliases.delete(parentSpanId);
    }
    // The completion event may not repeat the parent id the alias was
    // registered under. Sweep any alias still resolving to this span so an
    // ended span is never handed out as a parent or ended again on shutdown.
    for (const [aliasId, aliasedSpan] of activeTrustedSpanAliases) {
      if (aliasedSpan === span) {
        activeTrustedSpanAliases.delete(aliasId);
      }
    }
    span.end(endTimeMs);
  });
  pendingTrustedRunFinalizers.set(spanId, handle);
};
const addRunAttrs = (
spanAttrs: Record<string, string | number | boolean>,
@@ -1093,7 +1172,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
);
const span = spanWithDuration("openclaw.model.usage", spanAttrs, evt.durationMs, {
parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
span.end(evt.ts);
@@ -1258,6 +1337,29 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
span.end(evt.ts);
};
/**
 * Opens a live "openclaw.run" span for a trusted `run.started` event and
 * tracks it under the event's span id. Does nothing when traces are disabled
 * or the event is untrusted.
 *
 * If the event names a parent span id that is not itself tracked, that id is
 * registered as an alias of the new run span, so later events naming the same
 * parent resolve to this run span.
 */
const recordRunStarted = (
  evt: Extract<DiagnosticEventPayload, { type: "run.started" }>,
  metadata: DiagnosticEventMetadata,
) => {
  if (!(tracesEnabled && metadata.trusted)) {
    return;
  }
  const runAttrs: Record<string, string | number | boolean> = {};
  addRunAttrs(runAttrs, evt);
  const runSpan = spanWithDuration("openclaw.run", runAttrs, undefined, {
    parentContext: activeTrustedParentContext(evt, metadata),
    startTimeMs: evt.ts,
  });
  trackTrustedSpan(evt, metadata, runSpan);
  const parentSpanId = trustedTraceContext(evt, metadata)?.parentSpanId;
  if (!parentSpanId || activeTrustedSpans.has(parentSpanId)) {
    return;
  }
  activeTrustedSpanAliases.set(parentSpanId, runSpan);
};
const recordLaneEnqueue = (
evt: Extract<DiagnosticEventPayload, { type: "queue.lane.enqueue" }>,
) => {
@@ -1421,28 +1523,65 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
if (evt.errorCategory) {
spanAttrs["openclaw.errorCategory"] = lowCardinalityAttr(evt.errorCategory, "other");
}
const span = spanWithDuration("openclaw.run", spanAttrs, evt.durationMs, {
parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
endTimeMs: evt.ts,
});
const trustedTrace = trustedTraceContext(evt, metadata);
const trackedSpan = trustedTrace?.spanId
? activeTrustedSpans.get(trustedTrace.spanId)
: undefined;
const span =
trackedSpan ??
spanWithDuration("openclaw.run", spanAttrs, evt.durationMs, {
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
setSpanAttrs(span, spanAttrs);
if (evt.outcome === "error") {
span.setStatus({
code: SpanStatusCode.ERROR,
...(evt.errorCategory ? { message: redactSensitiveText(evt.errorCategory) } : {}),
});
}
if (trackedSpan && trustedTrace?.spanId) {
scheduleTrackedRunSpanFinalize(
trustedTrace.spanId,
trustedTrace.parentSpanId,
trackedSpan,
evt.ts,
);
return;
}
span.end(evt.ts);
};
const harnessRunMetricAttrs = (evt: HarnessRunLifecycleDiagnosticEvent) => ({
const harnessRunMetricAttrs = (evt: HarnessRunDiagnosticEvent) => ({
"openclaw.harness.id": lowCardinalityAttr(evt.harnessId, "unknown"),
"openclaw.harness.plugin": lowCardinalityAttr(evt.pluginId),
"openclaw.outcome": evt.type === "harness.run.error" ? "error" : evt.outcome,
...(evt.type === "harness.run.started"
? {}
: {
"openclaw.outcome": evt.type === "harness.run.error" ? "error" : evt.outcome,
}),
"openclaw.provider": lowCardinalityAttr(evt.provider, "unknown"),
"openclaw.model": lowCardinalityAttr(evt.model, "unknown"),
...(evt.channel ? { "openclaw.channel": lowCardinalityAttr(evt.channel) } : {}),
});
/**
 * Opens a live "openclaw.harness.run" span for a trusted `harness.run.started`
 * event, starting at the event timestamp, and tracks it under the event's span
 * id. Does nothing when traces are disabled or the event is untrusted.
 */
const recordHarnessRunStarted = (
  evt: Extract<DiagnosticEventPayload, { type: "harness.run.started" }>,
  metadata: DiagnosticEventMetadata,
) => {
  if (!(tracesEnabled && metadata.trusted)) {
    return;
  }
  const harnessSpan = spanWithDuration(
    "openclaw.harness.run",
    harnessRunMetricAttrs(evt),
    undefined,
    {
      parentContext: activeTrustedParentContext(evt, metadata),
      startTimeMs: evt.ts,
    },
  );
  trackTrustedSpan(evt, metadata, harnessSpan);
};
const recordHarnessRunCompleted = (
evt: Extract<DiagnosticEventPayload, { type: "harness.run.completed" }>,
metadata: DiagnosticEventMetadata,
@@ -1467,10 +1606,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
spanAttrs["openclaw.harness.items.completed"] = evt.itemLifecycle.completedCount;
spanAttrs["openclaw.harness.items.active"] = evt.itemLifecycle.activeCount;
}
const span = spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, {
parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
endTimeMs: evt.ts,
});
const span =
takeTrackedTrustedSpan(evt, metadata) ??
spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, {
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
setSpanAttrs(span, spanAttrs);
if (evt.outcome === "error") {
span.setStatus({
code: SpanStatusCode.ERROR,
@@ -1499,10 +1641,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
"error.type": errorType,
...(evt.cleanupFailed ? { "openclaw.harness.cleanup_failed": true } : {}),
};
const span = spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, {
parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
endTimeMs: evt.ts,
});
const span =
takeTrackedTrustedSpan(evt, metadata) ??
spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, {
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
setSpanAttrs(span, spanAttrs);
span.setStatus({
code: SpanStatusCode.ERROR,
message: errorType,
@@ -1534,7 +1679,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
spanAttrs["openclaw.context.reserve_tokens"] = evt.reserveTokens;
}
const span = spanWithDuration("openclaw.context.assembled", spanAttrs, 0, {
parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
span.end(evt.ts);
@@ -1556,6 +1701,34 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
...(errorType ? { "error.type": errorType } : {}),
});
/**
 * Opens a live "openclaw.model.call" span for a trusted `model.call.started`
 * event and tracks it under the event's span id. Does nothing when traces are
 * disabled or the event is untrusted.
 *
 * Initial attributes are provider and model, the GenAI identity attributes,
 * and `api` / `transport` when the event supplies them.
 */
const recordModelCallStarted = (
  evt: Extract<DiagnosticEventPayload, { type: "model.call.started" }>,
  metadata: DiagnosticEventMetadata,
) => {
  if (!(tracesEnabled && metadata.trusted)) {
    return;
  }
  const callAttrs: Record<string, string | number | boolean> = {
    "openclaw.provider": evt.provider,
    "openclaw.model": evt.model,
  };
  assignGenAiModelCallAttrs(callAttrs, evt);
  const { api, transport } = evt;
  if (api) {
    callAttrs["openclaw.api"] = api;
  }
  if (transport) {
    callAttrs["openclaw.transport"] = transport;
  }
  const callSpan = spanWithDuration("openclaw.model.call", callAttrs, undefined, {
    parentContext: activeTrustedParentContext(evt, metadata),
    startTimeMs: evt.ts,
  });
  trackTrustedSpan(evt, metadata, callSpan);
};
const recordModelCallCompleted = (
evt: Extract<DiagnosticEventPayload, { type: "model.call.completed" }>,
metadata: DiagnosticEventMetadata,
@@ -1584,10 +1757,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
evt as unknown as Record<string, unknown>,
contentCapturePolicy,
);
const span = spanWithDuration("openclaw.model.call", spanAttrs, evt.durationMs, {
parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
endTimeMs: evt.ts,
});
const span =
takeTrackedTrustedSpan(evt, metadata) ??
spanWithDuration("openclaw.model.call", spanAttrs, evt.durationMs, {
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
setSpanAttrs(span, spanAttrs);
addUpstreamRequestIdSpanEvent(span, evt.upstreamRequestIdHash);
span.end(evt.ts);
};
@@ -1626,10 +1802,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
evt as unknown as Record<string, unknown>,
contentCapturePolicy,
);
const span = spanWithDuration("openclaw.model.call", spanAttrs, evt.durationMs, {
parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
endTimeMs: evt.ts,
});
const span =
takeTrackedTrustedSpan(evt, metadata) ??
spanWithDuration("openclaw.model.call", spanAttrs, evt.durationMs, {
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
setSpanAttrs(span, spanAttrs);
addUpstreamRequestIdSpanEvent(span, evt.upstreamRequestIdHash);
span.setStatus({
code: SpanStatusCode.ERROR,
@@ -1638,6 +1817,36 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
span.end(evt.ts);
};
/**
 * Attributes shared by every tool-execution span: the tool name under both the
 * openclaw and gen_ai keys, followed by whatever `paramsSummaryAttrs` derives
 * from the event's `paramsSummary`.
 */
const toolExecutionBaseAttrs = (
  evt: Extract<
    DiagnosticEventPayload,
    {
      type: "tool.execution.started" | "tool.execution.completed" | "tool.execution.error";
    }
  >,
): Record<string, string | number | boolean> => {
  const toolName = evt.toolName;
  return {
    "openclaw.toolName": toolName,
    "gen_ai.tool.name": toolName,
    ...paramsSummaryAttrs(evt.paramsSummary),
  };
};
/**
 * Opens a live "openclaw.tool.execution" span for a trusted
 * `tool.execution.started` event, starting at the event timestamp, and tracks
 * it under the event's span id. Does nothing when traces are disabled or the
 * event is untrusted.
 */
const recordToolExecutionStarted = (
  evt: Extract<DiagnosticEventPayload, { type: "tool.execution.started" }>,
  metadata: DiagnosticEventMetadata,
) => {
  if (!(tracesEnabled && metadata.trusted)) {
    return;
  }
  const toolSpan = spanWithDuration(
    "openclaw.tool.execution",
    toolExecutionBaseAttrs(evt),
    undefined,
    {
      parentContext: activeTrustedParentContext(evt, metadata),
      startTimeMs: evt.ts,
    },
  );
  trackTrustedSpan(evt, metadata, toolSpan);
};
const recordToolExecutionCompleted = (
evt: Extract<DiagnosticEventPayload, { type: "tool.execution.completed" }>,
metadata: DiagnosticEventMetadata,
@@ -1651,9 +1860,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
return;
}
const spanAttrs: Record<string, string | number | boolean> = {
"openclaw.toolName": evt.toolName,
"gen_ai.tool.name": evt.toolName,
...paramsSummaryAttrs(evt.paramsSummary),
...toolExecutionBaseAttrs(evt),
};
addRunAttrs(spanAttrs, evt);
assignOtelToolContentAttributes(
@@ -1661,10 +1868,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
evt as unknown as Record<string, unknown>,
contentCapturePolicy,
);
const span = spanWithDuration("openclaw.tool.execution", spanAttrs, evt.durationMs, {
parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
endTimeMs: evt.ts,
});
const span =
takeTrackedTrustedSpan(evt, metadata) ??
spanWithDuration("openclaw.tool.execution", spanAttrs, evt.durationMs, {
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
setSpanAttrs(span, spanAttrs);
span.end(evt.ts);
};
@@ -1682,10 +1892,8 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
return;
}
const spanAttrs: Record<string, string | number | boolean> = {
"openclaw.toolName": evt.toolName,
...toolExecutionBaseAttrs(evt),
"openclaw.errorCategory": lowCardinalityAttr(evt.errorCategory, "other"),
"gen_ai.tool.name": evt.toolName,
...paramsSummaryAttrs(evt.paramsSummary),
};
addRunAttrs(spanAttrs, evt);
if (evt.errorCode) {
@@ -1696,10 +1904,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
evt as unknown as Record<string, unknown>,
contentCapturePolicy,
);
const span = spanWithDuration("openclaw.tool.execution", spanAttrs, evt.durationMs, {
parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
endTimeMs: evt.ts,
});
const span =
takeTrackedTrustedSpan(evt, metadata) ??
spanWithDuration("openclaw.tool.execution", spanAttrs, evt.durationMs, {
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
setSpanAttrs(span, spanAttrs);
span.setStatus({
code: SpanStatusCode.ERROR,
message: redactSensitiveText(evt.errorCategory),
@@ -1827,9 +2038,15 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
case "diagnostic.heartbeat":
recordHeartbeat(evt);
return;
case "run.started":
recordRunStarted(evt, metadata);
return;
case "run.completed":
recordRunCompleted(evt, metadata);
return;
case "harness.run.started":
recordHarnessRunStarted(evt, metadata);
return;
case "harness.run.completed":
recordHarnessRunCompleted(evt, metadata);
return;
@@ -1839,12 +2056,18 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
case "context.assembled":
recordContextAssembled(evt, metadata);
return;
case "model.call.started":
recordModelCallStarted(evt, metadata);
return;
case "model.call.completed":
recordModelCallCompleted(evt, metadata);
return;
case "model.call.error":
recordModelCallError(evt, metadata);
return;
case "tool.execution.started":
recordToolExecutionStarted(evt, metadata);
return;
case "tool.execution.completed":
recordToolExecutionCompleted(evt, metadata);
return;
@@ -1869,10 +2092,6 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
case "telemetry.exporter":
recordTelemetryExporter(evt, metadata);
return;
case "tool.execution.started":
case "run.started":
case "harness.run.started":
case "model.call.started":
case "payload.large":
return;
}