diff --git a/extensions/diagnostics-otel/src/service.test.ts b/extensions/diagnostics-otel/src/service.test.ts index 0cf3b373e71..e96524982e6 100644 --- a/extensions/diagnostics-otel/src/service.test.ts +++ b/extensions/diagnostics-otel/src/service.test.ts @@ -7,14 +7,24 @@ const telemetryState = vi.hoisted(() => { name: string; addEvent: ReturnType; end: ReturnType; + setAttributes: ReturnType; setStatus: ReturnType; + spanContext: ReturnType; }> = []; const tracer = { startSpan: vi.fn((name: string, _opts?: unknown, _ctx?: unknown) => { + const spanNumber = spans.length + 1; + const spanId = spanNumber.toString(16).padStart(16, "0"); const span = { addEvent: vi.fn(), end: vi.fn(), + setAttributes: vi.fn(), setStatus: vi.fn(), + spanContext: vi.fn(() => ({ + traceId: "4bf92f3577b34da6a3ce929d0e0e4736", + spanId, + traceFlags: 1, + })), }; spans.push({ name, ...span }); return span; @@ -122,6 +132,7 @@ vi.mock("@opentelemetry/semantic-conventions", () => ({ import { emitTrustedDiagnosticEvent, onInternalDiagnosticEvent, + resetDiagnosticEventsForTest, } from "../../../src/infra/diagnostic-events.js"; import type { OpenClawPluginServiceContext } from "../api.js"; import { emitDiagnosticEvent } from "../api.js"; @@ -219,6 +230,7 @@ function flushDiagnosticEvents() { describe("diagnostics-otel service", () => { beforeEach(() => { + resetDiagnosticEventsForTest(); delete process.env.OPENCLAW_OTEL_PRELOADED; delete process.env.OTEL_SEMCONV_STABILITY_OPT_IN; telemetryState.counters.clear(); @@ -241,6 +253,7 @@ describe("diagnostics-otel service", () => { }); afterEach(() => { + resetDiagnosticEventsForTest(); if (ORIGINAL_OPENCLAW_OTEL_PRELOADED === undefined) { delete process.env.OPENCLAW_OTEL_PRELOADED; } else { @@ -561,6 +574,7 @@ describe("diagnostics-otel service", () => { outcome: "completed", durationMs: 100, }); + await flushDiagnosticEvents(); expect(sdkStart).not.toHaveBeenCalled(); expect(telemetryState.histograms.get("openclaw.run.duration_ms")?.record).toHaveBeenCalledWith( @@ -1506,6 +1520,17 @@ describe("diagnostics-otel service", () => { const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true }); await service.start(ctx); + emitTrustedDiagnosticEvent({ + type: "run.started", + runId: "run-1", + provider: "openai", + model: "gpt-5.4", + trace: { + traceId: TRACE_ID, + spanId: SPAN_ID, + traceFlags: "01", + }, + }); emitTrustedDiagnosticEvent({ type: "context.assembled", runId: "run-1", @@ -1536,6 +1561,8 @@ describe("diagnostics-otel service", () => { const contextCall = telemetryState.tracer.startSpan.mock.calls.find( (call) => call[0] === "openclaw.context.assembled", ); + const runSpan = telemetryState.spans.find((span) => span.name === "openclaw.run"); + const runSpanId = runSpan?.spanContext.mock.results[0]?.value?.spanId; expect(contextCall?.[1]).toMatchObject({ attributes: { "openclaw.provider": "openai", @@ -1553,12 +1580,19 @@ describe("diagnostics-otel service", () => { "openclaw.context.reserve_tokens": 4096, }, }); + expect(contextCall?.[1]).toEqual({ + attributes: expect.any(Object), + startTime: expect.any(Number), + }); expect(JSON.stringify(contextCall)).not.toContain("session-key"); expect(JSON.stringify(contextCall)).not.toContain("prompt text"); expect(telemetryState.tracer.setSpanContext).toHaveBeenCalledWith( expect.anything(), - expect.objectContaining({ traceId: TRACE_ID, spanId: SPAN_ID }), + expect.objectContaining({ traceId: TRACE_ID, spanId: runSpanId }), ); + expect( + (contextCall?.[2] as { spanContext?: { spanId?: string } } | undefined)?.spanContext?.spanId, + ).toBe(runSpanId); await service.stop?.(ctx); }); @@ -1688,7 +1722,185 @@ describe("diagnostics-otel service", () => { await service.stop?.(ctx); }); - test("parents trusted diagnostic lifecycle spans from explicit parent ids", async () => { + test("parents trusted diagnostic lifecycle spans from active started spans", async () => { + const service = createDiagnosticsOtelService(); + const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true }); + await service.start(ctx); + + emitTrustedDiagnosticEvent({ + type: "run.started", + runId: "run-1", + provider: "openai", + model: "gpt-5.4", + trace: { + traceId: TRACE_ID, + spanId: CHILD_SPAN_ID, + parentSpanId: SPAN_ID, + traceFlags: "01", + }, + }); + emitTrustedDiagnosticEvent({ + type: "model.call.started", + runId: "run-1", + callId: "call-1", + provider: "openai", + model: "gpt-5.4", + trace: { + traceId: TRACE_ID, + spanId: GRANDCHILD_SPAN_ID, + parentSpanId: CHILD_SPAN_ID, + traceFlags: "01", + }, + }); + emitTrustedDiagnosticEvent({ + type: "tool.execution.started", + runId: "run-1", + toolName: "read", + trace: { + traceId: TRACE_ID, + spanId: TOOL_SPAN_ID, + parentSpanId: GRANDCHILD_SPAN_ID, + traceFlags: "01", + }, + }); + emitTrustedDiagnosticEvent({ + type: "tool.execution.error", + runId: "run-1", + toolName: "read", + durationMs: 20, + errorCategory: "TypeError", + trace: { + traceId: TRACE_ID, + spanId: TOOL_SPAN_ID, + parentSpanId: GRANDCHILD_SPAN_ID, + traceFlags: "01", + }, + }); + emitTrustedDiagnosticEvent({ + type: "model.call.completed", + runId: "run-1", + callId: "call-1", + provider: "openai", + model: "gpt-5.4", + durationMs: 80, + trace: { + traceId: TRACE_ID, + spanId: GRANDCHILD_SPAN_ID, + parentSpanId: CHILD_SPAN_ID, + traceFlags: "01", + }, + }); + emitTrustedDiagnosticEvent({ + type: "run.completed", + runId: "run-1", + provider: "openai", + model: "gpt-5.4", + outcome: "completed", + durationMs: 100, + trace: { + traceId: TRACE_ID, + spanId: CHILD_SPAN_ID, + parentSpanId: SPAN_ID, + traceFlags: "01", + }, + }); + await flushDiagnosticEvents(); + + const runSpan = telemetryState.spans.find((span) => span.name === "openclaw.run"); + const modelSpan = telemetryState.spans.find((span) => span.name === "openclaw.model.call"); + const toolSpan = telemetryState.spans.find((span) => span.name === "openclaw.tool.execution"); + const runSpanId = runSpan?.spanContext.mock.results[0]?.value?.spanId; + const modelSpanId = modelSpan?.spanContext.mock.results[0]?.value?.spanId; + + expect(telemetryState.tracer.setSpanContext).toHaveBeenCalledTimes(2); + expect(telemetryState.tracer.setSpanContext.mock.calls.map((call) => call[1])).toEqual([ + expect.objectContaining({ traceId: TRACE_ID, spanId: runSpanId }), + expect.objectContaining({ traceId: TRACE_ID, spanId: modelSpanId }), + ]); + + const parentBySpanName = Object.fromEntries( + telemetryState.tracer.startSpan.mock.calls.map((call) => [ + call[0], + (call[2] as { spanContext?: { spanId?: string } } | undefined)?.spanContext?.spanId, + ]), + ); + expect(parentBySpanName).toMatchObject({ + "openclaw.run": undefined, + "openclaw.model.call": runSpanId, + "openclaw.tool.execution": modelSpanId, + }); + expect(toolSpan?.setStatus).toHaveBeenCalledWith({ + code: 2, + message: "TypeError", + }); + await service.stop?.(ctx); + }); + + test("keeps trusted run spans alive long enough for post-completion usage parenting", async () => { + const service = createDiagnosticsOtelService(); + const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true }); + await service.start(ctx); + + emitTrustedDiagnosticEvent({ + type: "run.started", + runId: "run-1", + provider: "openai", + model: "gpt-5.4", + trace: { + traceId: TRACE_ID, + spanId: CHILD_SPAN_ID, + parentSpanId: SPAN_ID, + traceFlags: "01", + }, + }); + emitTrustedDiagnosticEvent({ + type: "run.completed", + runId: "run-1", + provider: "openai", + model: "gpt-5.4", + outcome: "completed", + durationMs: 100, + trace: { + traceId: TRACE_ID, + spanId: CHILD_SPAN_ID, + parentSpanId: SPAN_ID, + traceFlags: "01", + }, + }); + emitTrustedDiagnosticEvent({ + type: "model.usage", + provider: "openai", + model: "gpt-5.4", + usage: { input: 3, output: 2, total: 5 }, + durationMs: 10, + trace: { + traceId: TRACE_ID, + spanId: GRANDCHILD_SPAN_ID, + parentSpanId: SPAN_ID, + traceFlags: "01", + }, + }); + await flushDiagnosticEvents(); + + const runSpan = telemetryState.spans.find((span) => span.name === "openclaw.run"); + const runSpanId = runSpan?.spanContext.mock.results[0]?.value?.spanId; + const modelUsageCall = telemetryState.tracer.startSpan.mock.calls.find( + (call) => call[0] === "openclaw.model.usage", + ); + + expect(telemetryState.tracer.setSpanContext).toHaveBeenCalledWith( + expect.anything(), + expect.objectContaining({ traceId: TRACE_ID, spanId: runSpanId }), + ); + expect( + (modelUsageCall?.[2] as { spanContext?: { spanId?: string } } | undefined)?.spanContext + ?.spanId, + ).toBe(runSpanId); + expect(runSpan?.end).toHaveBeenCalledWith(expect.any(Number)); + await service.stop?.(ctx); + }); + + test("does not force remote parents for completed-only trusted lifecycle spans", async () => { const service = createDiagnosticsOtelService(); const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true }); await service.start(ctx); @@ -1721,38 +1933,15 @@ describe("diagnostics-otel service", () => { traceFlags: "01", }, }); - emitTrustedDiagnosticEvent({ - type: "tool.execution.error", - runId: "run-1", - toolName: "read", - durationMs: 20, - errorCategory: "TypeError", - trace: { - traceId: TRACE_ID, - spanId: TOOL_SPAN_ID, - parentSpanId: GRANDCHILD_SPAN_ID, - traceFlags: "01", - }, - }); await flushDiagnosticEvents(); - expect(telemetryState.tracer.setSpanContext).toHaveBeenCalledTimes(3); - expect(telemetryState.tracer.setSpanContext.mock.calls.map((call) => call[1])).toEqual([ - expect.objectContaining({ traceId: TRACE_ID, spanId: SPAN_ID }), - expect.objectContaining({ traceId: TRACE_ID, spanId: CHILD_SPAN_ID }), - expect.objectContaining({ traceId: TRACE_ID, spanId: GRANDCHILD_SPAN_ID }), - ]); - + expect(telemetryState.tracer.setSpanContext).not.toHaveBeenCalled(); const parentBySpanName = Object.fromEntries( - telemetryState.tracer.startSpan.mock.calls.map((call) => [ - call[0], - (call[2] as { spanContext?: { spanId?: string } } | undefined)?.spanContext?.spanId, - ]), + telemetryState.tracer.startSpan.mock.calls.map((call) => [call[0], call[2]]), ); expect(parentBySpanName).toMatchObject({ - "openclaw.run": SPAN_ID, - "openclaw.model.call": CHILD_SPAN_ID, - "openclaw.tool.execution": GRANDCHILD_SPAN_ID, + "openclaw.run": undefined, + "openclaw.model.call": undefined, }); await service.stop?.(ctx); }); @@ -1860,6 +2049,93 @@ describe("diagnostics-otel service", () => { await service.stop?.(ctx); }); + test("does not create live started spans for untrusted lifecycle diagnostics", async () => { + const service = createDiagnosticsOtelService(); + const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true }); + await service.start(ctx); + + emitDiagnosticEvent({ + type: "run.started", + runId: "run-1", + provider: "openai", + model: "gpt-5.4", + }); + emitDiagnosticEvent({ + type: "run.completed", + runId: "run-1", + provider: "openai", + model: "gpt-5.4", + outcome: "completed", + durationMs: 100, + }); + emitDiagnosticEvent({ + type: "model.call.started", + runId: "run-1", + callId: "call-1", + provider: "openai", + model: "gpt-5.4", + }); + emitDiagnosticEvent({ + type: "model.call.completed", + runId: "run-1", + callId: "call-1", + provider: "openai", + model: "gpt-5.4", + durationMs: 80, + }); + emitDiagnosticEvent({ + type: "tool.execution.started", + runId: "run-1", + toolName: "read", + }); + emitDiagnosticEvent({ + type: "tool.execution.error", + runId: "run-1", + toolName: "read", + durationMs: 20, + errorCategory: "TypeError", + }); + emitDiagnosticEvent({ + type: "harness.run.started", + runId: "run-1", + provider: "codex", + model: "gpt-5.4", + harnessId: "codex", + pluginId: "codex-plugin", + }); + emitDiagnosticEvent({ + type: "harness.run.completed", + runId: "run-1", + provider: "codex", + model: "gpt-5.4", + harnessId: "codex", + pluginId: "codex-plugin", + outcome: "completed", + durationMs: 90, + }); + await flushDiagnosticEvents(); + + expect( + telemetryState.tracer.startSpan.mock.calls.filter((call) => call[0] === "openclaw.run"), + ).toHaveLength(1); + expect( + telemetryState.tracer.startSpan.mock.calls.filter( + (call) => call[0] === "openclaw.model.call", + ), + ).toHaveLength(1); + expect( + telemetryState.tracer.startSpan.mock.calls.filter( + (call) => call[0] === "openclaw.tool.execution", + ), + ).toHaveLength(1); + expect( + telemetryState.tracer.startSpan.mock.calls.filter( + (call) => call[0] === "openclaw.harness.run", + ), + ).toHaveLength(1); + await service.stop?.(ctx); + }); + test("exports exec process spans without command text", async () => { const service = createDiagnosticsOtelService(); const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true }); diff --git a/extensions/diagnostics-otel/src/service.ts b/extensions/diagnostics-otel/src/service.ts index c34ab6f1c6c..5742de215c8 100644 --- a/extensions/diagnostics-otel/src/service.ts +++ b/extensions/diagnostics-otel/src/service.ts @@ -81,9 +81,9 @@ type ModelCallLifecycleDiagnosticEvent = Extract< DiagnosticEventPayload, { type: "model.call.completed" | "model.call.error" } >; -type HarnessRunLifecycleDiagnosticEvent = Extract< +type HarnessRunDiagnosticEvent = Extract< DiagnosticEventPayload, - { type: "harness.run.completed" | "harness.run.error" } + { type: "harness.run.started" | "harness.run.completed" | "harness.run.error" } >; type TelemetryExporterDiagnosticEvent = Extract< DiagnosticEventPayload, @@ -244,7 +244,7 @@ function assignGenAiSpanIdentityAttrs( function assignGenAiModelCallAttrs( attrs: Record, - evt: ModelCallLifecycleDiagnosticEvent, + evt: { api?: string; model?: string; provider?: string }, ): void { assignGenAiSpanIdentityAttrs(attrs, evt); } @@ -467,19 +467,6 @@ function contextForTraceContext(traceContext: DiagnosticTraceContext | undefined }); } -function contextForDiagnosticSpanParent(traceContext: DiagnosticTraceContext | undefined) { - const normalized = normalizeTraceContext(traceContext); - if (!normalized?.parentSpanId) { - return undefined; - } - return trace.setSpanContext(otelContextApi.active(), { - traceId: normalized.traceId, - spanId: normalized.parentSpanId, - traceFlags: traceFlagsToOtel(normalized.traceFlags), - isRemote: true, - }); -} - function contextForTrustedTraceContext( evt: DiagnosticEventPayload, metadata: DiagnosticEventMetadata, @@ -487,13 +474,6 @@ function contextForTrustedTraceContext( return metadata.trusted ? contextForTraceContext(evt.trace) : undefined; } -function contextForTrustedDiagnosticSpanParent( - evt: DiagnosticEventPayload, - metadata: DiagnosticEventMetadata, -) { - return metadata.trusted ? contextForDiagnosticSpanParent(evt.trace) : undefined; -} - function addTraceAttributes( attributes: Record, traceContext: DiagnosticTraceContext | undefined, @@ -518,17 +498,21 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { let sdk: NodeSDK | null = null; let logProvider: LoggerProvider | null = null; let unsubscribe: (() => void) | null = null; + let stopActiveTrustedSpans: (() => void) | null = null; const stopStarted = async () => { const currentUnsubscribe = unsubscribe; const currentLogProvider = logProvider; const currentSdk = sdk; + const currentStopActiveTrustedSpans = stopActiveTrustedSpans; unsubscribe = null; logProvider = null; sdk = null; + stopActiveTrustedSpans = null; currentUnsubscribe?.(); + currentStopActiveTrustedSpans?.(); if (currentLogProvider) { await currentLogProvider.shutdown().catch(() => undefined); } @@ -694,6 +678,24 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { const meter = metrics.getMeter("openclaw"); const tracer = trace.getTracer("openclaw"); + const activeTrustedSpans = new Map>(); + const activeTrustedSpanAliases = new Map>(); + const pendingTrustedRunFinalizers = new Map>(); + stopActiveTrustedSpans = () => { + const stopAt = Date.now(); + for (const handle of pendingTrustedRunFinalizers.values()) { + clearImmediate(handle); + } + pendingTrustedRunFinalizers.clear(); + for (const span of new Set([ + ...activeTrustedSpans.values(), + ...activeTrustedSpanAliases.values(), + ])) { + span.end(stopAt); + } + activeTrustedSpans.clear(); + activeTrustedSpanAliases.clear(); + }; const tokensCounter = meter.createCounter("openclaw.tokens", { unit: "1", @@ -942,11 +944,16 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { options: { parentContext?: ReturnType | null; endTimeMs?: number; + startTimeMs?: number; } = {}, ) => { const endTimeMs = options.endTimeMs ?? Date.now(); const startTime = - typeof durationMs === "number" ? endTimeMs - Math.max(0, durationMs) : undefined; + typeof options.startTimeMs === "number" + ? options.startTimeMs + : typeof durationMs === "number" && durationMs >= 0 + ? endTimeMs - durationMs + : undefined; const parentContext = "parentContext" in options ? (options.parentContext ?? undefined) : undefined; const span = tracer.startSpan( @@ -959,6 +966,78 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { ); return span; }; + const trustedTraceContext = ( + evt: DiagnosticEventPayload, + metadata: DiagnosticEventMetadata, + ) => (metadata.trusted ? normalizeTraceContext(evt.trace) : undefined); + const activeTrustedParentContext = ( + evt: DiagnosticEventPayload, + metadata: DiagnosticEventMetadata, + ) => { + const parentSpanId = trustedTraceContext(evt, metadata)?.parentSpanId; + if (!parentSpanId) { + return undefined; + } + const activeParentSpan = + activeTrustedSpans.get(parentSpanId) ?? activeTrustedSpanAliases.get(parentSpanId); + if (!activeParentSpan) { + return undefined; + } + return trace.setSpanContext(otelContextApi.active(), activeParentSpan.spanContext()); + }; + const trackTrustedSpan = ( + evt: DiagnosticEventPayload, + metadata: DiagnosticEventMetadata, + span: ReturnType, + ) => { + const spanId = trustedTraceContext(evt, metadata)?.spanId; + if (spanId) { + activeTrustedSpans.set(spanId, span); + } + return span; + }; + const takeTrackedTrustedSpan = ( + evt: DiagnosticEventPayload, + metadata: DiagnosticEventMetadata, + ) => { + const spanId = trustedTraceContext(evt, metadata)?.spanId; + if (!spanId) { + return undefined; + } + const span = activeTrustedSpans.get(spanId); + if (span) { + activeTrustedSpans.delete(spanId); + } + return span; + }; + const setSpanAttrs = ( + span: ReturnType, + attributes: Record, + ) => { + span.setAttributes?.(redactOtelAttributes(attributes)); + }; + const scheduleTrackedRunSpanFinalize = ( + spanId: string, + parentSpanId: string | undefined, + span: ReturnType, + endTimeMs: number, + ) => { + const existingHandle = pendingTrustedRunFinalizers.get(spanId); + if (existingHandle) { + clearImmediate(existingHandle); + } + const handle = setImmediate(() => { + pendingTrustedRunFinalizers.delete(spanId); + if (activeTrustedSpans.get(spanId) === span) { + activeTrustedSpans.delete(spanId); + } + if (parentSpanId && activeTrustedSpanAliases.get(parentSpanId) === span) { + activeTrustedSpanAliases.delete(parentSpanId); + } + span.end(endTimeMs); + }); + pendingTrustedRunFinalizers.set(spanId, handle); + }; const addRunAttrs = ( spanAttrs: Record, @@ -1093,7 +1172,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { ); const span = spanWithDuration("openclaw.model.usage", spanAttrs, evt.durationMs, { - parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata), + parentContext: activeTrustedParentContext(evt, metadata), endTimeMs: evt.ts, }); span.end(evt.ts); @@ -1258,6 +1337,29 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { span.end(evt.ts); }; + const recordRunStarted = ( + evt: Extract, + metadata: DiagnosticEventMetadata, + ) => { + if (!tracesEnabled || !metadata.trusted) { + return; + } + const spanAttrs: Record = {}; + addRunAttrs(spanAttrs, evt); + const span = trackTrustedSpan( + evt, + metadata, + spanWithDuration("openclaw.run", spanAttrs, undefined, { + parentContext: activeTrustedParentContext(evt, metadata), + startTimeMs: evt.ts, + }), + ); + const parentSpanId = trustedTraceContext(evt, metadata)?.parentSpanId; + if (parentSpanId && !activeTrustedSpans.has(parentSpanId)) { + activeTrustedSpanAliases.set(parentSpanId, span); + } + }; + const recordLaneEnqueue = ( evt: Extract, ) => { @@ -1421,28 +1523,65 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { if (evt.errorCategory) { spanAttrs["openclaw.errorCategory"] = lowCardinalityAttr(evt.errorCategory, "other"); } - const span = spanWithDuration("openclaw.run", spanAttrs, evt.durationMs, { - parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata), - endTimeMs: evt.ts, - }); + const trustedTrace = trustedTraceContext(evt, metadata); + const trackedSpan = trustedTrace?.spanId + ? activeTrustedSpans.get(trustedTrace.spanId) + : undefined; + const span = + trackedSpan ?? + spanWithDuration("openclaw.run", spanAttrs, evt.durationMs, { + parentContext: activeTrustedParentContext(evt, metadata), + endTimeMs: evt.ts, + }); + setSpanAttrs(span, spanAttrs); if (evt.outcome === "error") { span.setStatus({ code: SpanStatusCode.ERROR, ...(evt.errorCategory ? { message: redactSensitiveText(evt.errorCategory) } : {}), }); } + if (trackedSpan && trustedTrace?.spanId) { + scheduleTrackedRunSpanFinalize( + trustedTrace.spanId, + trustedTrace.parentSpanId, + trackedSpan, + evt.ts, + ); + return; + } span.end(evt.ts); }; - const harnessRunMetricAttrs = (evt: HarnessRunLifecycleDiagnosticEvent) => ({ + const harnessRunMetricAttrs = (evt: HarnessRunDiagnosticEvent) => ({ "openclaw.harness.id": lowCardinalityAttr(evt.harnessId, "unknown"), "openclaw.harness.plugin": lowCardinalityAttr(evt.pluginId), - "openclaw.outcome": evt.type === "harness.run.error" ? "error" : evt.outcome, + ...(evt.type === "harness.run.started" + ? {} + : { + "openclaw.outcome": evt.type === "harness.run.error" ? "error" : evt.outcome, + }), "openclaw.provider": lowCardinalityAttr(evt.provider, "unknown"), "openclaw.model": lowCardinalityAttr(evt.model, "unknown"), ...(evt.channel ? { "openclaw.channel": lowCardinalityAttr(evt.channel) } : {}), }); + const recordHarnessRunStarted = ( + evt: Extract, + metadata: DiagnosticEventMetadata, + ) => { + if (!tracesEnabled || !metadata.trusted) { + return; + } + trackTrustedSpan( + evt, + metadata, + spanWithDuration("openclaw.harness.run", harnessRunMetricAttrs(evt), undefined, { + parentContext: activeTrustedParentContext(evt, metadata), + startTimeMs: evt.ts, + }), + ); + }; + const recordHarnessRunCompleted = ( evt: Extract, metadata: DiagnosticEventMetadata, @@ -1467,10 +1606,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { spanAttrs["openclaw.harness.items.completed"] = evt.itemLifecycle.completedCount; spanAttrs["openclaw.harness.items.active"] = evt.itemLifecycle.activeCount; } - const span = spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, { - parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata), - endTimeMs: evt.ts, - }); + const span = + takeTrackedTrustedSpan(evt, metadata) ?? + spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, { + parentContext: activeTrustedParentContext(evt, metadata), + endTimeMs: evt.ts, + }); + setSpanAttrs(span, spanAttrs); if (evt.outcome === "error") { span.setStatus({ code: SpanStatusCode.ERROR, @@ -1499,10 +1641,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { "error.type": errorType, ...(evt.cleanupFailed ? { "openclaw.harness.cleanup_failed": true } : {}), }; - const span = spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, { - parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata), - endTimeMs: evt.ts, - }); + const span = + takeTrackedTrustedSpan(evt, metadata) ?? + spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, { + parentContext: activeTrustedParentContext(evt, metadata), + endTimeMs: evt.ts, + }); + setSpanAttrs(span, spanAttrs); span.setStatus({ code: SpanStatusCode.ERROR, message: errorType, @@ -1534,7 +1679,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { spanAttrs["openclaw.context.reserve_tokens"] = evt.reserveTokens; } const span = spanWithDuration("openclaw.context.assembled", spanAttrs, 0, { - parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata), + parentContext: activeTrustedParentContext(evt, metadata), endTimeMs: evt.ts, }); span.end(evt.ts); @@ -1556,6 +1701,34 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { ...(errorType ? { "error.type": errorType } : {}), }); + const recordModelCallStarted = ( + evt: Extract, + metadata: DiagnosticEventMetadata, + ) => { + if (!tracesEnabled || !metadata.trusted) { + return; + } + const spanAttrs: Record = { + "openclaw.provider": evt.provider, + "openclaw.model": evt.model, + }; + assignGenAiModelCallAttrs(spanAttrs, evt); + if (evt.api) { + spanAttrs["openclaw.api"] = evt.api; + } + if (evt.transport) { + spanAttrs["openclaw.transport"] = evt.transport; + } + trackTrustedSpan( + evt, + metadata, + spanWithDuration("openclaw.model.call", spanAttrs, undefined, { + parentContext: activeTrustedParentContext(evt, metadata), + startTimeMs: evt.ts, + }), + ); + }; + const recordModelCallCompleted = ( evt: Extract, metadata: DiagnosticEventMetadata, @@ -1584,10 +1757,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { evt as unknown as Record, contentCapturePolicy, ); - const span = spanWithDuration("openclaw.model.call", spanAttrs, evt.durationMs, { - parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata), - endTimeMs: evt.ts, - }); + const span = + takeTrackedTrustedSpan(evt, metadata) ?? + spanWithDuration("openclaw.model.call", spanAttrs, evt.durationMs, { + parentContext: activeTrustedParentContext(evt, metadata), + endTimeMs: evt.ts, + }); + setSpanAttrs(span, spanAttrs); addUpstreamRequestIdSpanEvent(span, evt.upstreamRequestIdHash); span.end(evt.ts); }; @@ -1626,10 +1802,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { evt as unknown as Record, contentCapturePolicy, ); - const span = spanWithDuration("openclaw.model.call", spanAttrs, evt.durationMs, { - parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata), - endTimeMs: evt.ts, - }); + const span = + takeTrackedTrustedSpan(evt, metadata) ?? + spanWithDuration("openclaw.model.call", spanAttrs, evt.durationMs, { + parentContext: activeTrustedParentContext(evt, metadata), + endTimeMs: evt.ts, + }); + setSpanAttrs(span, spanAttrs); addUpstreamRequestIdSpanEvent(span, evt.upstreamRequestIdHash); span.setStatus({ code: SpanStatusCode.ERROR, @@ -1638,6 +1817,36 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { span.end(evt.ts); }; + const toolExecutionBaseAttrs = ( + evt: Extract< + DiagnosticEventPayload, + { + type: "tool.execution.started" | "tool.execution.completed" | "tool.execution.error"; + } + >, + ): Record => ({ + "openclaw.toolName": evt.toolName, + "gen_ai.tool.name": evt.toolName, + ...paramsSummaryAttrs(evt.paramsSummary), + }); + + const recordToolExecutionStarted = ( + evt: Extract, + metadata: DiagnosticEventMetadata, + ) => { + if (!tracesEnabled || !metadata.trusted) { + return; + } + trackTrustedSpan( + evt, + metadata, + spanWithDuration("openclaw.tool.execution", toolExecutionBaseAttrs(evt), undefined, { + parentContext: activeTrustedParentContext(evt, metadata), + startTimeMs: evt.ts, + }), + ); + }; + const recordToolExecutionCompleted = ( evt: Extract, metadata: DiagnosticEventMetadata, @@ -1651,9 +1860,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { return; } const spanAttrs: Record = { - "openclaw.toolName": evt.toolName, - "gen_ai.tool.name": evt.toolName, - ...paramsSummaryAttrs(evt.paramsSummary), + ...toolExecutionBaseAttrs(evt), }; addRunAttrs(spanAttrs, evt); assignOtelToolContentAttributes( @@ -1661,10 +1868,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { evt as unknown as Record, contentCapturePolicy, ); - const span = spanWithDuration("openclaw.tool.execution", spanAttrs, evt.durationMs, { - parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata), - endTimeMs: evt.ts, - }); + const span = + takeTrackedTrustedSpan(evt, metadata) ?? + spanWithDuration("openclaw.tool.execution", spanAttrs, evt.durationMs, { + parentContext: activeTrustedParentContext(evt, metadata), + endTimeMs: evt.ts, + }); + setSpanAttrs(span, spanAttrs); span.end(evt.ts); }; @@ -1682,10 +1892,8 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { return; } const spanAttrs: Record = { - "openclaw.toolName": evt.toolName, + ...toolExecutionBaseAttrs(evt), "openclaw.errorCategory": lowCardinalityAttr(evt.errorCategory, "other"), - "gen_ai.tool.name": evt.toolName, - ...paramsSummaryAttrs(evt.paramsSummary), }; addRunAttrs(spanAttrs, evt); if (evt.errorCode) { @@ -1696,10 +1904,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { evt as unknown as Record, contentCapturePolicy, ); - const span = spanWithDuration("openclaw.tool.execution", spanAttrs, evt.durationMs, { - parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata), - endTimeMs: evt.ts, - }); + const span = + takeTrackedTrustedSpan(evt, metadata) ?? + spanWithDuration("openclaw.tool.execution", spanAttrs, evt.durationMs, { + parentContext: activeTrustedParentContext(evt, metadata), + endTimeMs: evt.ts, + }); + setSpanAttrs(span, spanAttrs); span.setStatus({ code: SpanStatusCode.ERROR, message: redactSensitiveText(evt.errorCategory), @@ -1827,9 +2038,15 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { case "diagnostic.heartbeat": recordHeartbeat(evt); return; + case "run.started": + recordRunStarted(evt, metadata); + return; case "run.completed": recordRunCompleted(evt, metadata); return; + case "harness.run.started": + recordHarnessRunStarted(evt, metadata); + return; case "harness.run.completed": recordHarnessRunCompleted(evt, metadata); return; @@ -1839,12 +2056,18 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { case "context.assembled": recordContextAssembled(evt, metadata); return; + case "model.call.started": + recordModelCallStarted(evt, metadata); + return; case "model.call.completed": recordModelCallCompleted(evt, metadata); return; case "model.call.error": recordModelCallError(evt, metadata); return; + case "tool.execution.started": + recordToolExecutionStarted(evt, metadata); + return; case "tool.execution.completed": recordToolExecutionCompleted(evt, metadata); return; @@ -1869,10 +2092,6 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { case "telemetry.exporter": recordTelemetryExporter(evt, metadata); return; - case "tool.execution.started": - case "run.started": - case "harness.run.started": - case "model.call.started": case "payload.large": return; }