mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:40:44 +00:00
fix(diagnostics): defer OTEL run span finalization (#72260)
This commit is contained in:
@@ -7,14 +7,24 @@ const telemetryState = vi.hoisted(() => {
|
||||
name: string;
|
||||
addEvent: ReturnType<typeof vi.fn>;
|
||||
end: ReturnType<typeof vi.fn>;
|
||||
setAttributes: ReturnType<typeof vi.fn>;
|
||||
setStatus: ReturnType<typeof vi.fn>;
|
||||
spanContext: ReturnType<typeof vi.fn>;
|
||||
}> = [];
|
||||
const tracer = {
|
||||
startSpan: vi.fn((name: string, _opts?: unknown, _ctx?: unknown) => {
|
||||
const spanNumber = spans.length + 1;
|
||||
const spanId = spanNumber.toString(16).padStart(16, "0");
|
||||
const span = {
|
||||
addEvent: vi.fn(),
|
||||
end: vi.fn(),
|
||||
setAttributes: vi.fn(),
|
||||
setStatus: vi.fn(),
|
||||
spanContext: vi.fn(() => ({
|
||||
traceId: "4bf92f3577b34da6a3ce929d0e0e4736",
|
||||
spanId,
|
||||
traceFlags: 1,
|
||||
})),
|
||||
};
|
||||
spans.push({ name, ...span });
|
||||
return span;
|
||||
@@ -122,6 +132,7 @@ vi.mock("@opentelemetry/semantic-conventions", () => ({
|
||||
import {
|
||||
emitTrustedDiagnosticEvent,
|
||||
onInternalDiagnosticEvent,
|
||||
resetDiagnosticEventsForTest,
|
||||
} from "../../../src/infra/diagnostic-events.js";
|
||||
import type { OpenClawPluginServiceContext } from "../api.js";
|
||||
import { emitDiagnosticEvent } from "../api.js";
|
||||
@@ -219,6 +230,7 @@ function flushDiagnosticEvents() {
|
||||
|
||||
describe("diagnostics-otel service", () => {
|
||||
beforeEach(() => {
|
||||
resetDiagnosticEventsForTest();
|
||||
delete process.env.OPENCLAW_OTEL_PRELOADED;
|
||||
delete process.env.OTEL_SEMCONV_STABILITY_OPT_IN;
|
||||
telemetryState.counters.clear();
|
||||
@@ -241,6 +253,7 @@ describe("diagnostics-otel service", () => {
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
resetDiagnosticEventsForTest();
|
||||
if (ORIGINAL_OPENCLAW_OTEL_PRELOADED === undefined) {
|
||||
delete process.env.OPENCLAW_OTEL_PRELOADED;
|
||||
} else {
|
||||
@@ -561,6 +574,7 @@ describe("diagnostics-otel service", () => {
|
||||
outcome: "completed",
|
||||
durationMs: 100,
|
||||
});
|
||||
await flushDiagnosticEvents();
|
||||
|
||||
expect(sdkStart).not.toHaveBeenCalled();
|
||||
expect(telemetryState.histograms.get("openclaw.run.duration_ms")?.record).toHaveBeenCalledWith(
|
||||
@@ -1506,6 +1520,17 @@ describe("diagnostics-otel service", () => {
|
||||
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
|
||||
await service.start(ctx);
|
||||
|
||||
emitTrustedDiagnosticEvent({
|
||||
type: "run.started",
|
||||
runId: "run-1",
|
||||
provider: "openai",
|
||||
model: "gpt-5.4",
|
||||
trace: {
|
||||
traceId: TRACE_ID,
|
||||
spanId: SPAN_ID,
|
||||
traceFlags: "01",
|
||||
},
|
||||
});
|
||||
emitTrustedDiagnosticEvent({
|
||||
type: "context.assembled",
|
||||
runId: "run-1",
|
||||
@@ -1536,6 +1561,8 @@ describe("diagnostics-otel service", () => {
|
||||
const contextCall = telemetryState.tracer.startSpan.mock.calls.find(
|
||||
(call) => call[0] === "openclaw.context.assembled",
|
||||
);
|
||||
const runSpan = telemetryState.spans.find((span) => span.name === "openclaw.run");
|
||||
const runSpanId = runSpan?.spanContext.mock.results[0]?.value?.spanId;
|
||||
expect(contextCall?.[1]).toMatchObject({
|
||||
attributes: {
|
||||
"openclaw.provider": "openai",
|
||||
@@ -1553,12 +1580,19 @@ describe("diagnostics-otel service", () => {
|
||||
"openclaw.context.reserve_tokens": 4096,
|
||||
},
|
||||
});
|
||||
expect(contextCall?.[1]).toEqual({
|
||||
attributes: expect.any(Object),
|
||||
startTime: expect.any(Number),
|
||||
});
|
||||
expect(JSON.stringify(contextCall)).not.toContain("session-key");
|
||||
expect(JSON.stringify(contextCall)).not.toContain("prompt text");
|
||||
expect(telemetryState.tracer.setSpanContext).toHaveBeenCalledWith(
|
||||
expect.anything(),
|
||||
expect.objectContaining({ traceId: TRACE_ID, spanId: SPAN_ID }),
|
||||
expect.objectContaining({ traceId: TRACE_ID, spanId: runSpanId }),
|
||||
);
|
||||
expect(
|
||||
(contextCall?.[2] as { spanContext?: { spanId?: string } } | undefined)?.spanContext?.spanId,
|
||||
).toBe(runSpanId);
|
||||
await service.stop?.(ctx);
|
||||
});
|
||||
|
||||
@@ -1688,7 +1722,185 @@ describe("diagnostics-otel service", () => {
|
||||
await service.stop?.(ctx);
|
||||
});
|
||||
|
||||
test("parents trusted diagnostic lifecycle spans from explicit parent ids", async () => {
|
||||
test("parents trusted diagnostic lifecycle spans from active started spans", async () => {
  const service = createDiagnosticsOtelService();
  const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
  await service.start(ctx);

  // Each event gets its own trace-context object so no reference is shared between emits.
  const traceOf = (spanId: string, parentSpanId: string) => ({
    traceId: TRACE_ID,
    spanId,
    parentSpanId,
    traceFlags: "01" as const,
  });
  const runTrace = () => traceOf(CHILD_SPAN_ID, SPAN_ID);
  const modelTrace = () => traceOf(GRANDCHILD_SPAN_ID, CHILD_SPAN_ID);
  const toolTrace = () => traceOf(TOOL_SPAN_ID, GRANDCHILD_SPAN_ID);
  const openAiModel = { provider: "openai", model: "gpt-5.4" };

  // Lifecycle order: run starts, model call starts, tool starts and fails,
  // model call completes, run completes.
  emitTrustedDiagnosticEvent({
    type: "run.started",
    runId: "run-1",
    ...openAiModel,
    trace: runTrace(),
  });
  emitTrustedDiagnosticEvent({
    type: "model.call.started",
    runId: "run-1",
    callId: "call-1",
    ...openAiModel,
    trace: modelTrace(),
  });
  emitTrustedDiagnosticEvent({
    type: "tool.execution.started",
    runId: "run-1",
    toolName: "read",
    trace: toolTrace(),
  });
  emitTrustedDiagnosticEvent({
    type: "tool.execution.error",
    runId: "run-1",
    toolName: "read",
    durationMs: 20,
    errorCategory: "TypeError",
    trace: toolTrace(),
  });
  emitTrustedDiagnosticEvent({
    type: "model.call.completed",
    runId: "run-1",
    callId: "call-1",
    ...openAiModel,
    durationMs: 80,
    trace: modelTrace(),
  });
  emitTrustedDiagnosticEvent({
    type: "run.completed",
    runId: "run-1",
    ...openAiModel,
    outcome: "completed",
    durationMs: 100,
    trace: runTrace(),
  });
  await flushDiagnosticEvents();

  const findSpan = (name: string) => telemetryState.spans.find((span) => span.name === name);
  const runSpanId = findSpan("openclaw.run")?.spanContext.mock.results[0]?.value?.spanId;
  const modelSpanId = findSpan("openclaw.model.call")?.spanContext.mock.results[0]?.value
    ?.spanId;
  const toolSpan = findSpan("openclaw.tool.execution");

  // Only the model call and the tool execution have a live parent, so exactly two
  // parent contexts are built, from the mock-generated span ids rather than the event ids.
  expect(telemetryState.tracer.setSpanContext).toHaveBeenCalledTimes(2);
  const parentSpanContexts = telemetryState.tracer.setSpanContext.mock.calls.map(
    (call) => call[1],
  );
  expect(parentSpanContexts).toEqual([
    expect.objectContaining({ traceId: TRACE_ID, spanId: runSpanId }),
    expect.objectContaining({ traceId: TRACE_ID, spanId: modelSpanId }),
  ]);

  // Map each started span name to the span id carried by the parent context it was given.
  const parentBySpanName: Record<string, string | undefined> = {};
  for (const [spanName, , parentCtx] of telemetryState.tracer.startSpan.mock.calls) {
    parentBySpanName[spanName] = (
      parentCtx as { spanContext?: { spanId?: string } } | undefined
    )?.spanContext?.spanId;
  }
  expect(parentBySpanName).toMatchObject({
    "openclaw.run": undefined,
    "openclaw.model.call": runSpanId,
    "openclaw.tool.execution": modelSpanId,
  });
  expect(toolSpan?.setStatus).toHaveBeenCalledWith({
    code: 2,
    message: "TypeError",
  });
  await service.stop?.(ctx);
});
|
||||
|
||||
test("keeps trusted run spans alive long enough for post-completion usage parenting", async () => {
  const service = createDiagnosticsOtelService();
  const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
  await service.start(ctx);

  // Fresh trace-context object per event; the usage event names SPAN_ID as its parent,
  // the same parent id the run was started under.
  const traceOf = (spanId: string) => ({
    traceId: TRACE_ID,
    spanId,
    parentSpanId: SPAN_ID,
    traceFlags: "01" as const,
  });
  const openAiModel = { provider: "openai", model: "gpt-5.4" };

  emitTrustedDiagnosticEvent({
    type: "run.started",
    runId: "run-1",
    ...openAiModel,
    trace: traceOf(CHILD_SPAN_ID),
  });
  emitTrustedDiagnosticEvent({
    type: "run.completed",
    runId: "run-1",
    ...openAiModel,
    outcome: "completed",
    durationMs: 100,
    trace: traceOf(CHILD_SPAN_ID),
  });
  // Emitted after run.completed but before the flush below.
  emitTrustedDiagnosticEvent({
    type: "model.usage",
    ...openAiModel,
    usage: { input: 3, output: 2, total: 5 },
    durationMs: 10,
    trace: traceOf(GRANDCHILD_SPAN_ID),
  });
  await flushDiagnosticEvents();

  const runSpan = telemetryState.spans.find((span) => span.name === "openclaw.run");
  const runSpanId = runSpan?.spanContext.mock.results[0]?.value?.spanId;
  const usageStartArgs = telemetryState.tracer.startSpan.mock.calls.find(
    ([spanName]) => spanName === "openclaw.model.usage",
  );
  const usageParentSpanId = (
    usageStartArgs?.[2] as { spanContext?: { spanId?: string } } | undefined
  )?.spanContext?.spanId;

  expect(telemetryState.tracer.setSpanContext).toHaveBeenCalledWith(
    expect.anything(),
    expect.objectContaining({ traceId: TRACE_ID, spanId: runSpanId }),
  );
  expect(usageParentSpanId).toBe(runSpanId);
  // The run span must have been ended with a numeric timestamp by this point.
  expect(runSpan?.end).toHaveBeenCalledWith(expect.any(Number));
  await service.stop?.(ctx);
});
|
||||
|
||||
test("does not force remote parents for completed-only trusted lifecycle spans", async () => {
|
||||
const service = createDiagnosticsOtelService();
|
||||
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
|
||||
await service.start(ctx);
|
||||
@@ -1721,38 +1933,15 @@ describe("diagnostics-otel service", () => {
|
||||
traceFlags: "01",
|
||||
},
|
||||
});
|
||||
emitTrustedDiagnosticEvent({
|
||||
type: "tool.execution.error",
|
||||
runId: "run-1",
|
||||
toolName: "read",
|
||||
durationMs: 20,
|
||||
errorCategory: "TypeError",
|
||||
trace: {
|
||||
traceId: TRACE_ID,
|
||||
spanId: TOOL_SPAN_ID,
|
||||
parentSpanId: GRANDCHILD_SPAN_ID,
|
||||
traceFlags: "01",
|
||||
},
|
||||
});
|
||||
await flushDiagnosticEvents();
|
||||
|
||||
expect(telemetryState.tracer.setSpanContext).toHaveBeenCalledTimes(3);
|
||||
expect(telemetryState.tracer.setSpanContext.mock.calls.map((call) => call[1])).toEqual([
|
||||
expect.objectContaining({ traceId: TRACE_ID, spanId: SPAN_ID }),
|
||||
expect.objectContaining({ traceId: TRACE_ID, spanId: CHILD_SPAN_ID }),
|
||||
expect.objectContaining({ traceId: TRACE_ID, spanId: GRANDCHILD_SPAN_ID }),
|
||||
]);
|
||||
|
||||
expect(telemetryState.tracer.setSpanContext).not.toHaveBeenCalled();
|
||||
const parentBySpanName = Object.fromEntries(
|
||||
telemetryState.tracer.startSpan.mock.calls.map((call) => [
|
||||
call[0],
|
||||
(call[2] as { spanContext?: { spanId?: string } } | undefined)?.spanContext?.spanId,
|
||||
]),
|
||||
telemetryState.tracer.startSpan.mock.calls.map((call) => [call[0], call[2]]),
|
||||
);
|
||||
expect(parentBySpanName).toMatchObject({
|
||||
"openclaw.run": SPAN_ID,
|
||||
"openclaw.model.call": CHILD_SPAN_ID,
|
||||
"openclaw.tool.execution": GRANDCHILD_SPAN_ID,
|
||||
"openclaw.run": undefined,
|
||||
"openclaw.model.call": undefined,
|
||||
});
|
||||
await service.stop?.(ctx);
|
||||
});
|
||||
@@ -1860,6 +2049,93 @@ describe("diagnostics-otel service", () => {
|
||||
await service.stop?.(ctx);
|
||||
});
|
||||
|
||||
test("does not create live started spans for untrusted lifecycle diagnostics", async () => {
  const service = createDiagnosticsOtelService();
  const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
  await service.start(ctx);

  const openAiModel = { provider: "openai", model: "gpt-5.4" };
  const codexHarness = {
    provider: "codex",
    model: "gpt-5.4",
    harnessId: "codex",
    pluginId: "codex-plugin",
  };

  // Every lifecycle pair is sent through the untrusted emitter and carries no trace context.
  emitDiagnosticEvent({ type: "run.started", runId: "run-1", ...openAiModel });
  emitDiagnosticEvent({
    type: "run.completed",
    runId: "run-1",
    ...openAiModel,
    outcome: "completed",
    durationMs: 100,
  });
  emitDiagnosticEvent({
    type: "model.call.started",
    runId: "run-1",
    callId: "call-1",
    ...openAiModel,
  });
  emitDiagnosticEvent({
    type: "model.call.completed",
    runId: "run-1",
    callId: "call-1",
    ...openAiModel,
    durationMs: 80,
  });
  emitDiagnosticEvent({ type: "tool.execution.started", runId: "run-1", toolName: "read" });
  emitDiagnosticEvent({
    type: "tool.execution.error",
    runId: "run-1",
    toolName: "read",
    durationMs: 20,
    errorCategory: "TypeError",
  });
  emitDiagnosticEvent({ type: "harness.run.started", runId: "run-1", ...codexHarness });
  emitDiagnosticEvent({
    type: "harness.run.completed",
    runId: "run-1",
    ...codexHarness,
    outcome: "completed",
    durationMs: 90,
  });
  await flushDiagnosticEvents();

  // Exactly one span per lifecycle kind: the started events must not have opened
  // an additional span of the same name.
  const lifecycleSpanNames = [
    "openclaw.run",
    "openclaw.model.call",
    "openclaw.tool.execution",
    "openclaw.harness.run",
  ];
  for (const spanName of lifecycleSpanNames) {
    expect(
      telemetryState.tracer.startSpan.mock.calls.filter((call) => call[0] === spanName),
    ).toHaveLength(1);
  }
  await service.stop?.(ctx);
});
|
||||
|
||||
test("exports exec process spans without command text", async () => {
|
||||
const service = createDiagnosticsOtelService();
|
||||
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
|
||||
|
||||
@@ -81,9 +81,9 @@ type ModelCallLifecycleDiagnosticEvent = Extract<
|
||||
DiagnosticEventPayload,
|
||||
{ type: "model.call.completed" | "model.call.error" }
|
||||
>;
|
||||
type HarnessRunLifecycleDiagnosticEvent = Extract<
|
||||
type HarnessRunDiagnosticEvent = Extract<
|
||||
DiagnosticEventPayload,
|
||||
{ type: "harness.run.completed" | "harness.run.error" }
|
||||
{ type: "harness.run.started" | "harness.run.completed" | "harness.run.error" }
|
||||
>;
|
||||
type TelemetryExporterDiagnosticEvent = Extract<
|
||||
DiagnosticEventPayload,
|
||||
@@ -244,7 +244,7 @@ function assignGenAiSpanIdentityAttrs(
|
||||
|
||||
function assignGenAiModelCallAttrs(
|
||||
attrs: Record<string, string | number | boolean>,
|
||||
evt: ModelCallLifecycleDiagnosticEvent,
|
||||
evt: { api?: string; model?: string; provider?: string },
|
||||
): void {
|
||||
assignGenAiSpanIdentityAttrs(attrs, evt);
|
||||
}
|
||||
@@ -467,19 +467,6 @@ function contextForTraceContext(traceContext: DiagnosticTraceContext | undefined
|
||||
});
|
||||
}
|
||||
|
||||
function contextForDiagnosticSpanParent(traceContext: DiagnosticTraceContext | undefined) {
|
||||
const normalized = normalizeTraceContext(traceContext);
|
||||
if (!normalized?.parentSpanId) {
|
||||
return undefined;
|
||||
}
|
||||
return trace.setSpanContext(otelContextApi.active(), {
|
||||
traceId: normalized.traceId,
|
||||
spanId: normalized.parentSpanId,
|
||||
traceFlags: traceFlagsToOtel(normalized.traceFlags),
|
||||
isRemote: true,
|
||||
});
|
||||
}
|
||||
|
||||
function contextForTrustedTraceContext(
|
||||
evt: DiagnosticEventPayload,
|
||||
metadata: DiagnosticEventMetadata,
|
||||
@@ -487,13 +474,6 @@ function contextForTrustedTraceContext(
|
||||
return metadata.trusted ? contextForTraceContext(evt.trace) : undefined;
|
||||
}
|
||||
|
||||
function contextForTrustedDiagnosticSpanParent(
|
||||
evt: DiagnosticEventPayload,
|
||||
metadata: DiagnosticEventMetadata,
|
||||
) {
|
||||
return metadata.trusted ? contextForDiagnosticSpanParent(evt.trace) : undefined;
|
||||
}
|
||||
|
||||
function addTraceAttributes(
|
||||
attributes: Record<string, string | number | boolean>,
|
||||
traceContext: DiagnosticTraceContext | undefined,
|
||||
@@ -518,17 +498,21 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
let sdk: NodeSDK | null = null;
|
||||
let logProvider: LoggerProvider | null = null;
|
||||
let unsubscribe: (() => void) | null = null;
|
||||
let stopActiveTrustedSpans: (() => void) | null = null;
|
||||
|
||||
const stopStarted = async () => {
|
||||
const currentUnsubscribe = unsubscribe;
|
||||
const currentLogProvider = logProvider;
|
||||
const currentSdk = sdk;
|
||||
const currentStopActiveTrustedSpans = stopActiveTrustedSpans;
|
||||
|
||||
unsubscribe = null;
|
||||
logProvider = null;
|
||||
sdk = null;
|
||||
stopActiveTrustedSpans = null;
|
||||
|
||||
currentUnsubscribe?.();
|
||||
currentStopActiveTrustedSpans?.();
|
||||
if (currentLogProvider) {
|
||||
await currentLogProvider.shutdown().catch(() => undefined);
|
||||
}
|
||||
@@ -694,6 +678,24 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
|
||||
const meter = metrics.getMeter("openclaw");
|
||||
const tracer = trace.getTracer("openclaw");
|
||||
const activeTrustedSpans = new Map<string, ReturnType<typeof tracer.startSpan>>();
|
||||
const activeTrustedSpanAliases = new Map<string, ReturnType<typeof tracer.startSpan>>();
|
||||
const pendingTrustedRunFinalizers = new Map<string, ReturnType<typeof setImmediate>>();
|
||||
// Shutdown hook: cancels every deferred run-span finalizer, then ends each span that is
// still tracked (by span id or by parent alias) exactly once, using a single timestamp.
stopActiveTrustedSpans = () => {
  const stopAt = Date.now();

  pendingTrustedRunFinalizers.forEach((handle) => {
    clearImmediate(handle);
  });
  pendingTrustedRunFinalizers.clear();

  // A run span can be present in both maps; the Set keeps it from being ended twice.
  const openSpans = new Set(activeTrustedSpans.values());
  for (const aliasedSpan of activeTrustedSpanAliases.values()) {
    openSpans.add(aliasedSpan);
  }
  for (const openSpan of openSpans) {
    openSpan.end(stopAt);
  }

  activeTrustedSpans.clear();
  activeTrustedSpanAliases.clear();
};
|
||||
|
||||
const tokensCounter = meter.createCounter("openclaw.tokens", {
|
||||
unit: "1",
|
||||
@@ -942,11 +944,16 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
options: {
|
||||
parentContext?: ReturnType<typeof contextForTraceContext> | null;
|
||||
endTimeMs?: number;
|
||||
startTimeMs?: number;
|
||||
} = {},
|
||||
) => {
|
||||
const endTimeMs = options.endTimeMs ?? Date.now();
|
||||
const startTime =
|
||||
typeof durationMs === "number" ? endTimeMs - Math.max(0, durationMs) : undefined;
|
||||
typeof options.startTimeMs === "number"
|
||||
? options.startTimeMs
|
||||
: typeof durationMs === "number" && durationMs >= 0
|
||||
? endTimeMs - durationMs
|
||||
: undefined;
|
||||
const parentContext =
|
||||
"parentContext" in options ? (options.parentContext ?? undefined) : undefined;
|
||||
const span = tracer.startSpan(
|
||||
@@ -959,6 +966,78 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
);
|
||||
return span;
|
||||
};
|
||||
/**
 * Returns the normalized trace context of an event, but only when the event's
 * metadata marks it as trusted; untrusted events always yield `undefined`.
 */
const trustedTraceContext = (
  evt: DiagnosticEventPayload,
  metadata: DiagnosticEventMetadata,
) => {
  if (!metadata.trusted) {
    return undefined;
  }
  return normalizeTraceContext(evt.trace);
};
|
||||
/**
 * Builds an OTEL parent context for an event from a span that is currently tracked
 * under the event's trusted `parentSpanId` (directly, or through a parent alias).
 * Returns `undefined` when the event is untrusted, names no parent, or the parent
 * is not tracked.
 */
const activeTrustedParentContext = (
  evt: DiagnosticEventPayload,
  metadata: DiagnosticEventMetadata,
) => {
  const parentSpanId = trustedTraceContext(evt, metadata)?.parentSpanId;
  // Spans tracked by their own id take precedence over alias entries.
  const liveParent = parentSpanId
    ? (activeTrustedSpans.get(parentSpanId) ?? activeTrustedSpanAliases.get(parentSpanId))
    : undefined;
  return liveParent
    ? trace.setSpanContext(otelContextApi.active(), liveParent.spanContext())
    : undefined;
};
|
||||
/**
 * Registers `span` in `activeTrustedSpans` under the event's trusted span id, when
 * there is one, and hands the same span back so the call can be used inline.
 */
const trackTrustedSpan = (
  evt: DiagnosticEventPayload,
  metadata: DiagnosticEventMetadata,
  span: ReturnType<typeof tracer.startSpan>,
) => {
  const trackedId = trustedTraceContext(evt, metadata)?.spanId;
  if (!trackedId) {
    return span;
  }
  activeTrustedSpans.set(trackedId, span);
  return span;
};
|
||||
/**
 * Looks up the span tracked under the event's trusted span id and, if found,
 * removes it from `activeTrustedSpans` before returning it. Returns `undefined`
 * when there is no trusted span id or nothing is tracked under it.
 */
const takeTrackedTrustedSpan = (
  evt: DiagnosticEventPayload,
  metadata: DiagnosticEventMetadata,
) => {
  const trackedId = trustedTraceContext(evt, metadata)?.spanId;
  const trackedSpan = trackedId ? activeTrustedSpans.get(trackedId) : undefined;
  if (trackedId && trackedSpan) {
    activeTrustedSpans.delete(trackedId);
  }
  return trackedSpan;
};
|
||||
/**
 * Applies `attributes` to `span` after passing them through `redactOtelAttributes`.
 * Does nothing (and does not run the redaction) when the span has no `setAttributes`.
 */
const setSpanAttrs = (
  span: ReturnType<typeof tracer.startSpan>,
  attributes: Record<string, string | number | boolean>,
) => {
  if (span.setAttributes == null) {
    return;
  }
  span.setAttributes(redactOtelAttributes(attributes));
};
|
||||
/**
 * Defers ending a tracked run span to a `setImmediate` callback, so the span remains
 * available as a parent until that callback runs. Any finalizer already pending for
 * the same span id is cancelled and replaced.
 *
 * When the callback runs it drops its own pending entry, removes the span from the
 * active and alias maps only if those entries still point at this exact span, and
 * ends the span at `endTimeMs`.
 */
const scheduleTrackedRunSpanFinalize = (
  spanId: string,
  parentSpanId: string | undefined,
  span: ReturnType<typeof tracer.startSpan>,
  endTimeMs: number,
) => {
  const previousHandle = pendingTrustedRunFinalizers.get(spanId);
  if (previousHandle) {
    clearImmediate(previousHandle);
  }

  const finalize = () => {
    pendingTrustedRunFinalizers.delete(spanId);
    // Identity checks: leave entries alone if they were re-registered for another span.
    if (activeTrustedSpans.get(spanId) === span) {
      activeTrustedSpans.delete(spanId);
    }
    if (parentSpanId && activeTrustedSpanAliases.get(parentSpanId) === span) {
      activeTrustedSpanAliases.delete(parentSpanId);
    }
    span.end(endTimeMs);
  };

  pendingTrustedRunFinalizers.set(spanId, setImmediate(finalize));
};
|
||||
|
||||
const addRunAttrs = (
|
||||
spanAttrs: Record<string, string | number | boolean>,
|
||||
@@ -1093,7 +1172,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
);
|
||||
|
||||
const span = spanWithDuration("openclaw.model.usage", spanAttrs, evt.durationMs, {
|
||||
parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
|
||||
parentContext: activeTrustedParentContext(evt, metadata),
|
||||
endTimeMs: evt.ts,
|
||||
});
|
||||
span.end(evt.ts);
|
||||
@@ -1258,6 +1337,29 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
span.end(evt.ts);
|
||||
};
|
||||
|
||||
/**
 * Opens a live `openclaw.run` span for a trusted `run.started` event and tracks it
 * under the event's span id so the matching `run.completed` handler can finish it.
 *
 * No span is created when traces are disabled, when the event is untrusted, or when
 * the trusted trace context carries no span id. In the last case the span could not
 * be stored in `activeTrustedSpans`, so the completion handler would never find it
 * and would start a second `openclaw.run` span, leaving this one open.
 */
const recordRunStarted = (
  evt: Extract<DiagnosticEventPayload, { type: "run.started" }>,
  metadata: DiagnosticEventMetadata,
) => {
  if (!tracesEnabled) {
    return;
  }
  // trustedTraceContext is undefined for untrusted events, so this also enforces trust.
  const trustedTrace = trustedTraceContext(evt, metadata);
  if (!trustedTrace?.spanId) {
    return;
  }
  const spanAttrs: Record<string, string | number | boolean> = {};
  addRunAttrs(spanAttrs, evt);
  const span = trackTrustedSpan(
    evt,
    metadata,
    spanWithDuration("openclaw.run", spanAttrs, undefined, {
      parentContext: activeTrustedParentContext(evt, metadata),
      startTimeMs: evt.ts,
    }),
  );
  // If the run's declared parent is not itself a tracked span, register the run span
  // as an alias for that parent id so events naming the same parent attach to the run.
  const parentSpanId = trustedTrace.parentSpanId;
  if (parentSpanId && !activeTrustedSpans.has(parentSpanId)) {
    activeTrustedSpanAliases.set(parentSpanId, span);
  }
};
|
||||
|
||||
const recordLaneEnqueue = (
|
||||
evt: Extract<DiagnosticEventPayload, { type: "queue.lane.enqueue" }>,
|
||||
) => {
|
||||
@@ -1421,28 +1523,65 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
if (evt.errorCategory) {
|
||||
spanAttrs["openclaw.errorCategory"] = lowCardinalityAttr(evt.errorCategory, "other");
|
||||
}
|
||||
const span = spanWithDuration("openclaw.run", spanAttrs, evt.durationMs, {
|
||||
parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
|
||||
endTimeMs: evt.ts,
|
||||
});
|
||||
const trustedTrace = trustedTraceContext(evt, metadata);
|
||||
const trackedSpan = trustedTrace?.spanId
|
||||
? activeTrustedSpans.get(trustedTrace.spanId)
|
||||
: undefined;
|
||||
const span =
|
||||
trackedSpan ??
|
||||
spanWithDuration("openclaw.run", spanAttrs, evt.durationMs, {
|
||||
parentContext: activeTrustedParentContext(evt, metadata),
|
||||
endTimeMs: evt.ts,
|
||||
});
|
||||
setSpanAttrs(span, spanAttrs);
|
||||
if (evt.outcome === "error") {
|
||||
span.setStatus({
|
||||
code: SpanStatusCode.ERROR,
|
||||
...(evt.errorCategory ? { message: redactSensitiveText(evt.errorCategory) } : {}),
|
||||
});
|
||||
}
|
||||
if (trackedSpan && trustedTrace?.spanId) {
|
||||
scheduleTrackedRunSpanFinalize(
|
||||
trustedTrace.spanId,
|
||||
trustedTrace.parentSpanId,
|
||||
trackedSpan,
|
||||
evt.ts,
|
||||
);
|
||||
return;
|
||||
}
|
||||
span.end(evt.ts);
|
||||
};
|
||||
|
||||
const harnessRunMetricAttrs = (evt: HarnessRunLifecycleDiagnosticEvent) => ({
|
||||
const harnessRunMetricAttrs = (evt: HarnessRunDiagnosticEvent) => ({
|
||||
"openclaw.harness.id": lowCardinalityAttr(evt.harnessId, "unknown"),
|
||||
"openclaw.harness.plugin": lowCardinalityAttr(evt.pluginId),
|
||||
"openclaw.outcome": evt.type === "harness.run.error" ? "error" : evt.outcome,
|
||||
...(evt.type === "harness.run.started"
|
||||
? {}
|
||||
: {
|
||||
"openclaw.outcome": evt.type === "harness.run.error" ? "error" : evt.outcome,
|
||||
}),
|
||||
"openclaw.provider": lowCardinalityAttr(evt.provider, "unknown"),
|
||||
"openclaw.model": lowCardinalityAttr(evt.model, "unknown"),
|
||||
...(evt.channel ? { "openclaw.channel": lowCardinalityAttr(evt.channel) } : {}),
|
||||
});
|
||||
|
||||
/**
 * Opens a live `openclaw.harness.run` span for a trusted `harness.run.started` event
 * and tracks it under the event's span id for the completed/error handlers to finish.
 *
 * No span is created when traces are disabled, when the event is untrusted, or when
 * the trusted trace context carries no span id: such a span could not be tracked, so
 * it would never be ended and the completion handler would create a duplicate.
 */
const recordHarnessRunStarted = (
  evt: Extract<DiagnosticEventPayload, { type: "harness.run.started" }>,
  metadata: DiagnosticEventMetadata,
) => {
  if (!tracesEnabled) {
    return;
  }
  // trustedTraceContext is undefined for untrusted events, so this also enforces trust.
  if (!trustedTraceContext(evt, metadata)?.spanId) {
    return;
  }
  trackTrustedSpan(
    evt,
    metadata,
    spanWithDuration("openclaw.harness.run", harnessRunMetricAttrs(evt), undefined, {
      parentContext: activeTrustedParentContext(evt, metadata),
      startTimeMs: evt.ts,
    }),
  );
};
|
||||
|
||||
const recordHarnessRunCompleted = (
|
||||
evt: Extract<DiagnosticEventPayload, { type: "harness.run.completed" }>,
|
||||
metadata: DiagnosticEventMetadata,
|
||||
@@ -1467,10 +1606,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
spanAttrs["openclaw.harness.items.completed"] = evt.itemLifecycle.completedCount;
|
||||
spanAttrs["openclaw.harness.items.active"] = evt.itemLifecycle.activeCount;
|
||||
}
|
||||
const span = spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, {
|
||||
parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
|
||||
endTimeMs: evt.ts,
|
||||
});
|
||||
const span =
|
||||
takeTrackedTrustedSpan(evt, metadata) ??
|
||||
spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, {
|
||||
parentContext: activeTrustedParentContext(evt, metadata),
|
||||
endTimeMs: evt.ts,
|
||||
});
|
||||
setSpanAttrs(span, spanAttrs);
|
||||
if (evt.outcome === "error") {
|
||||
span.setStatus({
|
||||
code: SpanStatusCode.ERROR,
|
||||
@@ -1499,10 +1641,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
"error.type": errorType,
|
||||
...(evt.cleanupFailed ? { "openclaw.harness.cleanup_failed": true } : {}),
|
||||
};
|
||||
const span = spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, {
|
||||
parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
|
||||
endTimeMs: evt.ts,
|
||||
});
|
||||
const span =
|
||||
takeTrackedTrustedSpan(evt, metadata) ??
|
||||
spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, {
|
||||
parentContext: activeTrustedParentContext(evt, metadata),
|
||||
endTimeMs: evt.ts,
|
||||
});
|
||||
setSpanAttrs(span, spanAttrs);
|
||||
span.setStatus({
|
||||
code: SpanStatusCode.ERROR,
|
||||
message: errorType,
|
||||
@@ -1534,7 +1679,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
spanAttrs["openclaw.context.reserve_tokens"] = evt.reserveTokens;
|
||||
}
|
||||
const span = spanWithDuration("openclaw.context.assembled", spanAttrs, 0, {
|
||||
parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
|
||||
parentContext: activeTrustedParentContext(evt, metadata),
|
||||
endTimeMs: evt.ts,
|
||||
});
|
||||
span.end(evt.ts);
|
||||
@@ -1556,6 +1701,34 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
...(errorType ? { "error.type": errorType } : {}),
|
||||
});
|
||||
|
||||
/**
 * Opens a live `openclaw.model.call` span for a trusted `model.call.started` event
 * and tracks it under the event's span id for the completed/error handlers to finish.
 *
 * No span is created when traces are disabled, when the event is untrusted, or when
 * the trusted trace context carries no span id: such a span could not be tracked, so
 * it would never be ended and the completion handler would create a duplicate.
 */
const recordModelCallStarted = (
  evt: Extract<DiagnosticEventPayload, { type: "model.call.started" }>,
  metadata: DiagnosticEventMetadata,
) => {
  if (!tracesEnabled) {
    return;
  }
  // trustedTraceContext is undefined for untrusted events, so this also enforces trust.
  if (!trustedTraceContext(evt, metadata)?.spanId) {
    return;
  }
  const spanAttrs: Record<string, string | number | boolean> = {
    "openclaw.provider": evt.provider,
    "openclaw.model": evt.model,
  };
  assignGenAiModelCallAttrs(spanAttrs, evt);
  // Optional descriptors are only attached when the event supplies them.
  if (evt.api) {
    spanAttrs["openclaw.api"] = evt.api;
  }
  if (evt.transport) {
    spanAttrs["openclaw.transport"] = evt.transport;
  }
  trackTrustedSpan(
    evt,
    metadata,
    spanWithDuration("openclaw.model.call", spanAttrs, undefined, {
      parentContext: activeTrustedParentContext(evt, metadata),
      startTimeMs: evt.ts,
    }),
  );
};
|
||||
|
||||
const recordModelCallCompleted = (
|
||||
evt: Extract<DiagnosticEventPayload, { type: "model.call.completed" }>,
|
||||
metadata: DiagnosticEventMetadata,
|
||||
@@ -1584,10 +1757,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
evt as unknown as Record<string, unknown>,
|
||||
contentCapturePolicy,
|
||||
);
|
||||
const span = spanWithDuration("openclaw.model.call", spanAttrs, evt.durationMs, {
|
||||
parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
|
||||
endTimeMs: evt.ts,
|
||||
});
|
||||
const span =
|
||||
takeTrackedTrustedSpan(evt, metadata) ??
|
||||
spanWithDuration("openclaw.model.call", spanAttrs, evt.durationMs, {
|
||||
parentContext: activeTrustedParentContext(evt, metadata),
|
||||
endTimeMs: evt.ts,
|
||||
});
|
||||
setSpanAttrs(span, spanAttrs);
|
||||
addUpstreamRequestIdSpanEvent(span, evt.upstreamRequestIdHash);
|
||||
span.end(evt.ts);
|
||||
};
|
||||
@@ -1626,10 +1802,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
evt as unknown as Record<string, unknown>,
|
||||
contentCapturePolicy,
|
||||
);
|
||||
const span = spanWithDuration("openclaw.model.call", spanAttrs, evt.durationMs, {
|
||||
parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
|
||||
endTimeMs: evt.ts,
|
||||
});
|
||||
const span =
|
||||
takeTrackedTrustedSpan(evt, metadata) ??
|
||||
spanWithDuration("openclaw.model.call", spanAttrs, evt.durationMs, {
|
||||
parentContext: activeTrustedParentContext(evt, metadata),
|
||||
endTimeMs: evt.ts,
|
||||
});
|
||||
setSpanAttrs(span, spanAttrs);
|
||||
addUpstreamRequestIdSpanEvent(span, evt.upstreamRequestIdHash);
|
||||
span.setStatus({
|
||||
code: SpanStatusCode.ERROR,
|
||||
@@ -1638,6 +1817,36 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
span.end(evt.ts);
|
||||
};
|
||||
|
||||
/**
 * Builds the span attributes common to every tool-execution diagnostic event
 * (started, completed and error).
 *
 * The tool name is written under both `openclaw.toolName` and
 * `gen_ai.tool.name`; the output of `paramsSummaryAttrs` is spread in last,
 * so any key it returns takes precedence over the two name keys.
 *
 * @param evt - A `tool.execution.*` diagnostic event payload.
 * @returns A flat attribute record for the span.
 */
const toolExecutionBaseAttrs = (
  evt: Extract<
    DiagnosticEventPayload,
    {
      type: "tool.execution.started" | "tool.execution.completed" | "tool.execution.error";
    }
  >,
): Record<string, string | number | boolean> => {
  const { toolName } = evt;
  return {
    "openclaw.toolName": toolName,
    "gen_ai.tool.name": toolName,
    ...paramsSummaryAttrs(evt.paramsSummary),
  };
};
|
||||
|
||||
/**
 * Handles a `tool.execution.started` event by opening an
 * `openclaw.tool.execution` span and registering it with `trackTrustedSpan`.
 *
 * Does nothing when tracing is disabled or the event metadata is not trusted.
 * The span is created with no duration and with `evt.ts` as its start time.
 * NOTE(review): presumably the matching completed/error handler later
 * retrieves this span through `takeTrackedTrustedSpan` — confirm against the
 * tracking helpers.
 *
 * @param evt - The `tool.execution.started` payload.
 * @param metadata - Diagnostic metadata carrying the `trusted` flag.
 */
const recordToolExecutionStarted = (
  evt: Extract<DiagnosticEventPayload, { type: "tool.execution.started" }>,
  metadata: DiagnosticEventMetadata,
) => {
  const canTrace = tracesEnabled && metadata.trusted;
  if (!canTrace) {
    return;
  }
  // Base attributes are computed before the parent context, as in the
  // original inline call.
  const startedSpan = spanWithDuration(
    "openclaw.tool.execution",
    toolExecutionBaseAttrs(evt),
    undefined,
    {
      parentContext: activeTrustedParentContext(evt, metadata),
      startTimeMs: evt.ts,
    },
  );
  trackTrustedSpan(evt, metadata, startedSpan);
};
|
||||
|
||||
const recordToolExecutionCompleted = (
|
||||
evt: Extract<DiagnosticEventPayload, { type: "tool.execution.completed" }>,
|
||||
metadata: DiagnosticEventMetadata,
|
||||
@@ -1651,9 +1860,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
return;
|
||||
}
|
||||
const spanAttrs: Record<string, string | number | boolean> = {
|
||||
"openclaw.toolName": evt.toolName,
|
||||
"gen_ai.tool.name": evt.toolName,
|
||||
...paramsSummaryAttrs(evt.paramsSummary),
|
||||
...toolExecutionBaseAttrs(evt),
|
||||
};
|
||||
addRunAttrs(spanAttrs, evt);
|
||||
assignOtelToolContentAttributes(
|
||||
@@ -1661,10 +1868,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
evt as unknown as Record<string, unknown>,
|
||||
contentCapturePolicy,
|
||||
);
|
||||
const span = spanWithDuration("openclaw.tool.execution", spanAttrs, evt.durationMs, {
|
||||
parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
|
||||
endTimeMs: evt.ts,
|
||||
});
|
||||
const span =
|
||||
takeTrackedTrustedSpan(evt, metadata) ??
|
||||
spanWithDuration("openclaw.tool.execution", spanAttrs, evt.durationMs, {
|
||||
parentContext: activeTrustedParentContext(evt, metadata),
|
||||
endTimeMs: evt.ts,
|
||||
});
|
||||
setSpanAttrs(span, spanAttrs);
|
||||
span.end(evt.ts);
|
||||
};
|
||||
|
||||
@@ -1682,10 +1892,8 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
return;
|
||||
}
|
||||
const spanAttrs: Record<string, string | number | boolean> = {
|
||||
"openclaw.toolName": evt.toolName,
|
||||
...toolExecutionBaseAttrs(evt),
|
||||
"openclaw.errorCategory": lowCardinalityAttr(evt.errorCategory, "other"),
|
||||
"gen_ai.tool.name": evt.toolName,
|
||||
...paramsSummaryAttrs(evt.paramsSummary),
|
||||
};
|
||||
addRunAttrs(spanAttrs, evt);
|
||||
if (evt.errorCode) {
|
||||
@@ -1696,10 +1904,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
evt as unknown as Record<string, unknown>,
|
||||
contentCapturePolicy,
|
||||
);
|
||||
const span = spanWithDuration("openclaw.tool.execution", spanAttrs, evt.durationMs, {
|
||||
parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
|
||||
endTimeMs: evt.ts,
|
||||
});
|
||||
const span =
|
||||
takeTrackedTrustedSpan(evt, metadata) ??
|
||||
spanWithDuration("openclaw.tool.execution", spanAttrs, evt.durationMs, {
|
||||
parentContext: activeTrustedParentContext(evt, metadata),
|
||||
endTimeMs: evt.ts,
|
||||
});
|
||||
setSpanAttrs(span, spanAttrs);
|
||||
span.setStatus({
|
||||
code: SpanStatusCode.ERROR,
|
||||
message: redactSensitiveText(evt.errorCategory),
|
||||
@@ -1827,9 +2038,15 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
case "diagnostic.heartbeat":
|
||||
recordHeartbeat(evt);
|
||||
return;
|
||||
case "run.started":
|
||||
recordRunStarted(evt, metadata);
|
||||
return;
|
||||
case "run.completed":
|
||||
recordRunCompleted(evt, metadata);
|
||||
return;
|
||||
case "harness.run.started":
|
||||
recordHarnessRunStarted(evt, metadata);
|
||||
return;
|
||||
case "harness.run.completed":
|
||||
recordHarnessRunCompleted(evt, metadata);
|
||||
return;
|
||||
@@ -1839,12 +2056,18 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
case "context.assembled":
|
||||
recordContextAssembled(evt, metadata);
|
||||
return;
|
||||
case "model.call.started":
|
||||
recordModelCallStarted(evt, metadata);
|
||||
return;
|
||||
case "model.call.completed":
|
||||
recordModelCallCompleted(evt, metadata);
|
||||
return;
|
||||
case "model.call.error":
|
||||
recordModelCallError(evt, metadata);
|
||||
return;
|
||||
case "tool.execution.started":
|
||||
recordToolExecutionStarted(evt, metadata);
|
||||
return;
|
||||
case "tool.execution.completed":
|
||||
recordToolExecutionCompleted(evt, metadata);
|
||||
return;
|
||||
@@ -1869,10 +2092,6 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
case "telemetry.exporter":
|
||||
recordTelemetryExporter(evt, metadata);
|
||||
return;
|
||||
case "tool.execution.started":
|
||||
case "run.started":
|
||||
case "harness.run.started":
|
||||
case "model.call.started":
|
||||
case "payload.large":
|
||||
return;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user