diff --git a/CHANGELOG.md b/CHANGELOG.md index 37a7733e6a0..1e90ffb1090 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ Docs: https://docs.openclaw.ai - Plugins/chat commands: refresh the persisted plugin registry after `/plugins enable` and `/plugins disable`, matching the CLI mutation path. Thanks @vincentkoc. - Plugins/compat: mark `OPENCLAW_DISABLE_PERSISTED_PLUGIN_REGISTRY` as a deprecated break-glass switch and point operators at registry repair instead. Thanks @vincentkoc. - Plugins/registry: ignore stale persisted registry reads when plugin policy no longer matches current config, and stamp generated registry files with a do-not-edit warning. Thanks @vincentkoc. +- Diagnostics/OTEL: surface provider request identifiers as bounded hashes on model-call diagnostics and span events, without exporting raw request IDs or metric labels. Thanks @Lidang-Jiang and @vincentkoc. - Diagnostics/OTEL: add bounded outbound message delivery lifecycle diagnostics and export them as low-cardinality delivery spans/metrics without message body, recipient, room, or media-path data. (#71471) Thanks @vincentkoc and @jlapenna. - Diagnostics/OTEL: emit bounded exec-process diagnostics and export them as `openclaw.exec` spans without exposing command text, working directories, or container identifiers. (#71451) Thanks @vincentkoc and @jlapenna. - Diagnostics/OTEL: support `OPENCLAW_OTEL_PRELOADED=1` so the plugin can reuse an already-registered OpenTelemetry SDK while keeping OpenClaw diagnostic listeners wired. (#71450) Thanks @vincentkoc and @jlapenna. diff --git a/extensions/diagnostics-otel/src/service.test.ts b/extensions/diagnostics-otel/src/service.test.ts index e954a9e9374..558214fcff2 100644 --- a/extensions/diagnostics-otel/src/service.test.ts +++ b/extensions/diagnostics-otel/src/service.test.ts @@ -5,12 +5,14 @@ const telemetryState = vi.hoisted(() => { const histograms = new Map }>(); const spans: Array<{ name: string; + addEvent: ReturnType; end: ReturnType; setStatus: ReturnType; }> = []; const tracer = { startSpan: vi.fn((name: string, _opts?: unknown, _ctx?: unknown) => { const span = { + addEvent: vi.fn(), end: vi.fn(), setStatus: vi.fn(), }; @@ -945,6 +947,48 @@ describe("diagnostics-otel service", () => { await service.stop?.(ctx); }); + test("records upstream request id hashes as model call span events only", async () => { + const service = createDiagnosticsOtelService(); + const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true }); + await service.start(ctx); + + emitDiagnosticEvent({ + type: "model.call.error", + runId: "run-1", + callId: "call-1", + provider: "openai", + model: "gpt-5.4", + api: "openai-responses", + durationMs: 40, + errorCategory: "ProviderError", + upstreamRequestIdHash: "sha256:123456abcdef", + }); + await flushDiagnosticEvents(); + + const modelCall = telemetryState.tracer.startSpan.mock.calls.find( + (call) => call[0] === "openclaw.model.call", + ); + expect(modelCall?.[1]).toEqual({ + attributes: expect.not.objectContaining({ + "openclaw.upstreamRequestIdHash": expect.anything(), + }), + startTime: expect.any(Number), + }); + const span = telemetryState.spans.find((candidate) => candidate.name === "openclaw.model.call"); + expect(span?.addEvent).toHaveBeenCalledWith("openclaw.provider.request", { + "openclaw.upstreamRequestIdHash": "sha256:123456abcdef", + }); + expect( + telemetryState.histograms.get("openclaw.model_call.duration_ms")?.record, + ).toHaveBeenCalledWith( + 40, + expect.not.objectContaining({ + "openclaw.upstreamRequestIdHash": expect.anything(), + }), + ); + await service.stop?.(ctx); + }); + test("parents trusted diagnostic lifecycle spans from explicit parent ids", async () => { const service = createDiagnosticsOtelService(); const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true }); diff --git a/extensions/diagnostics-otel/src/service.ts b/extensions/diagnostics-otel/src/service.ts index 14f234a2d3f..d07009ff152 100644 --- a/extensions/diagnostics-otel/src/service.ts +++ b/extensions/diagnostics-otel/src/service.ts @@ -184,6 +184,22 @@ function assignGenAiModelCallAttrs( attrs["gen_ai.operation.name"] = genAiOperationName(evt.api); } +function addUpstreamRequestIdSpanEvent( + span: { addEvent?: (name: string, attributes?: Record) => void }, + upstreamRequestIdHash: string | undefined, +): void { + if (!upstreamRequestIdHash) { + return; + } + const boundedHash = lowCardinalityAttr(upstreamRequestIdHash); + if (boundedHash === "unknown") { + return; + } + span.addEvent?.("openclaw.provider.request", { + "openclaw.upstreamRequestIdHash": boundedHash, + }); +} + function clampOtelLogText(value: string, maxChars: number): string { return value.length > maxChars ? `${value.slice(0, maxChars)}...(truncated)` : value; } @@ -1148,6 +1164,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata), endTimeMs: evt.ts, }); + addUpstreamRequestIdSpanEvent(span, evt.upstreamRequestIdHash); span.end(evt.ts); }; @@ -1184,6 +1201,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata), endTimeMs: evt.ts, }); + addUpstreamRequestIdSpanEvent(span, evt.upstreamRequestIdHash); span.setStatus({ code: SpanStatusCode.ERROR, message: redactSensitiveText(evt.errorCategory), diff --git a/src/agents/pi-embedded-runner/run/attempt.model-diagnostic-events.test.ts b/src/agents/pi-embedded-runner/run/attempt.model-diagnostic-events.test.ts index 5ef758b2a50..8fc0c66d0b6 100644 --- a/src/agents/pi-embedded-runner/run/attempt.model-diagnostic-events.test.ts +++ b/src/agents/pi-embedded-runner/run/attempt.model-diagnostic-events.test.ts @@ -1,7 +1,7 @@ import type { StreamFn } from "@mariozechner/pi-agent-core"; import { beforeEach, describe, expect, it } from "vitest"; import { - onDiagnosticEvent, + onInternalDiagnosticEvent, resetDiagnosticEventsForTest, type DiagnosticEventPayload, } from "../../../infra/diagnostic-events.js"; @@ -10,7 +10,7 @@ import { wrapStreamFnWithDiagnosticModelCallEvents } from "./attempt.model-diagn async function collectModelCallEvents(run: () => Promise): Promise { const events: DiagnosticEventPayload[] = []; - const stop = onDiagnosticEvent((event) => { + const stop = onInternalDiagnosticEvent((event) => { if (event.type.startsWith("model.call.")) { events.push(event); } @@ -96,11 +96,12 @@ describe("wrapStreamFnWithDiagnosticModelCallEvents", () => { }); it("emits error events when stream iteration fails", async () => { + const requestId = "req_provider_123"; const stream = { [Symbol.asyncIterator]() { return { async next(): Promise> { - throw new TypeError("provider failed"); + throw new TypeError(`provider failed [request_id=${requestId}]`); }, }; }, @@ -127,8 +128,10 @@ describe("wrapStreamFnWithDiagnosticModelCallEvents", () => { type: "model.call.error", callId: "call-err", errorCategory: "TypeError", + upstreamRequestIdHash: expect.stringMatching(/^sha256:[a-f0-9]{12}$/), durationMs: expect.any(Number), }); + expect(JSON.stringify(events[1])).not.toContain(requestId); }); it("does not mutate non-configurable provider streams", async () => { diff --git a/src/agents/pi-embedded-runner/run/attempt.model-diagnostic-events.ts b/src/agents/pi-embedded-runner/run/attempt.model-diagnostic-events.ts index b535004b437..6f3c02800ff 100644 --- a/src/agents/pi-embedded-runner/run/attempt.model-diagnostic-events.ts +++ b/src/agents/pi-embedded-runner/run/attempt.model-diagnostic-events.ts @@ -1,5 +1,8 @@ import type { StreamFn } from "@mariozechner/pi-agent-core"; -import { diagnosticErrorCategory } from "../../../infra/diagnostic-error-metadata.js"; +import { + diagnosticErrorCategory, + diagnosticProviderRequestIdHash, +} from "../../../infra/diagnostic-error-metadata.js"; import { emitTrustedDiagnosticEvent, type DiagnosticEventInput, @@ -28,6 +31,10 @@ type ModelCallEventBase = Omit< Extract, "type" >; +type ModelCallErrorFields = Pick< + Extract, + "errorCategory" | "upstreamRequestIdHash" +>; const MODEL_CALL_STREAM_RETURN_TIMEOUT_MS = 1000; @@ -75,6 +82,14 @@ function baseModelCallEvent( }; } +function modelCallErrorFields(err: unknown): ModelCallErrorFields { + const upstreamRequestIdHash = diagnosticProviderRequestIdHash(err); + return { + errorCategory: diagnosticErrorCategory(err), + ...(upstreamRequestIdHash ? { upstreamRequestIdHash } : {}), + }; +} + async function safeReturnIterator(iterator: AsyncIterator): Promise { let returnResult: unknown; try { @@ -133,7 +148,7 @@ async function* observeModelCallIterator( type: "model.call.error", ...eventBase, durationMs: Date.now() - startedAt, - errorCategory: diagnosticErrorCategory(err), + ...modelCallErrorFields(err), }); throw err; } finally { @@ -226,7 +241,7 @@ export function wrapStreamFnWithDiagnosticModelCallEvents( type: "model.call.error", ...eventBase, durationMs: Date.now() - startedAt, - errorCategory: diagnosticErrorCategory(err), + ...modelCallErrorFields(err), }); throw err; }, @@ -238,7 +253,7 @@ export function wrapStreamFnWithDiagnosticModelCallEvents( type: "model.call.error", ...eventBase, durationMs: Date.now() - startedAt, - errorCategory: diagnosticErrorCategory(err), + ...modelCallErrorFields(err), }); throw err; } diff --git a/src/infra/diagnostic-error-metadata.test.ts b/src/infra/diagnostic-error-metadata.test.ts index d44ec12c523..e88b97bf0aa 100644 --- a/src/infra/diagnostic-error-metadata.test.ts +++ b/src/infra/diagnostic-error-metadata.test.ts @@ -1,5 +1,9 @@ import { describe, expect, it } from "vitest"; -import { diagnosticErrorCategory, diagnosticHttpStatusCode } from "./diagnostic-error-metadata.js"; +import { + diagnosticErrorCategory, + diagnosticHttpStatusCode, + diagnosticProviderRequestIdHash, +} from "./diagnostic-error-metadata.js"; describe("diagnostic error metadata", () => { it("returns stable categories without reading mutable Error.name", () => { @@ -47,4 +51,29 @@ describe("diagnostic error metadata", () => { expect(diagnosticHttpStatusCode(errorLike)).toBeUndefined(); }); + + it("extracts bounded provider request id hashes without exposing raw ids", () => { + expect(diagnosticProviderRequestIdHash({ requestId: "req_123" })).toMatch( + /^sha256:[a-f0-9]{12}$/, + ); + expect( + diagnosticProviderRequestIdHash( + new Error("Provider API error (429): quota [request_id=req_456]"), + ), + ).toMatch(/^sha256:[a-f0-9]{12}$/); + expect( + diagnosticProviderRequestIdHash({ requestId: "https://example.invalid/secret" }), + ).toBeUndefined(); + }); + + it("does not invoke throwing getters while extracting provider request ids", () => { + const errorLike = {}; + Object.defineProperty(errorLike, "requestId", { + get() { + throw new Error("should not read getter"); + }, + }); + + expect(diagnosticProviderRequestIdHash(errorLike)).toBeUndefined(); + }); }); diff --git a/src/infra/diagnostic-error-metadata.ts b/src/infra/diagnostic-error-metadata.ts index 4394090de49..043a94f32bb 100644 --- a/src/infra/diagnostic-error-metadata.ts +++ b/src/infra/diagnostic-error-metadata.ts @@ -1,5 +1,19 @@ +import crypto from "node:crypto"; + const HTTP_STATUS_MIN = 100; const HTTP_STATUS_MAX = 599; +const REQUEST_ID_HASH_PREFIX_LEN = 12; +const PROVIDER_REQUEST_ID_KEYS = [ + "upstreamRequestId", + "providerRequestId", + "requestId", + "request_id", +] as const; +const PROVIDER_REQUEST_ID_RE = /^[A-Za-z0-9._:-]{1,128}$/u; +const PROVIDER_REQUEST_ID_TEXT_PATTERNS = [ + /\b(?:x-request-id|request-id|request_id|requestId|trace-id|trace_id)\b["'\s:=([]+([A-Za-z0-9._:-]{1,128})/i, + /\((?:request_id|trace_id)\s*:\s*([A-Za-z0-9._:-]{1,128})\)/i, +] as const; function isObjectLike(value: unknown): value is object { return (typeof value === "object" || typeof value === "function") && value !== null; @@ -17,6 +31,25 @@ function readOwnDataProperty(value: unknown, key: string): unknown { } } +function findDiagnosticErrorProperty( + err: unknown, + reader: (candidate: unknown) => T | undefined, + seen: Set = new Set(), +): T | undefined { + const direct = reader(err); + if (direct !== undefined) { + return direct; + } + if (!isObjectLike(err) || seen.has(err)) { + return undefined; + } + seen.add(err); + return ( + findDiagnosticErrorProperty(readOwnDataProperty(err, "error"), reader, seen) ?? + findDiagnosticErrorProperty(readOwnDataProperty(err, "cause"), reader, seen) + ); +} + function isHttpStatusCode(value: unknown): value is number { return ( typeof value === "number" && @@ -26,6 +59,61 @@ function isHttpStatusCode(value: unknown): value is number { ); } +function normalizeProviderRequestId(value: unknown): string | undefined { + if (typeof value === "string") { + const trimmed = value.trim(); + return PROVIDER_REQUEST_ID_RE.test(trimmed) ? trimmed : undefined; + } + if (typeof value === "number" && Number.isFinite(value)) { + const normalized = String(value); + return PROVIDER_REQUEST_ID_RE.test(normalized) ? normalized : undefined; + } + if (typeof value === "bigint") { + const normalized = String(value); + return PROVIDER_REQUEST_ID_RE.test(normalized) ? normalized : undefined; + } + return undefined; +} + +function hashDiagnosticIdentifier(value: string): string { + return `sha256:${crypto + .createHash("sha256") + .update(value) + .digest("hex") + .slice(0, REQUEST_ID_HASH_PREFIX_LEN)}`; +} + +function readDirectProviderRequestId(err: unknown): string | undefined { + for (const key of PROVIDER_REQUEST_ID_KEYS) { + const normalized = normalizeProviderRequestId(readOwnDataProperty(err, key)); + if (normalized) { + return normalized; + } + } + return undefined; +} + +function readDirectMessage(err: unknown): string | undefined { + if (typeof err === "string") { + return err; + } + const message = readOwnDataProperty(err, "message"); + return typeof message === "string" ? message : undefined; +} + +function extractProviderRequestIdFromText(text: string | undefined): string | undefined { + if (!text) { + return undefined; + } + for (const pattern of PROVIDER_REQUEST_ID_TEXT_PATTERNS) { + const normalized = normalizeProviderRequestId(text.match(pattern)?.[1]); + if (normalized) { + return normalized; + } + } + return undefined; +} + export function diagnosticErrorCategory(err: unknown): string { try { if (err instanceof TypeError) { @@ -69,3 +157,14 @@ export function diagnosticHttpStatusCode(err: unknown): string | undefined { } return undefined; } + +export function diagnosticProviderRequestIdHash(err: unknown): string | undefined { + const fromProperty = findDiagnosticErrorProperty(err, readDirectProviderRequestId); + if (fromProperty) { + return hashDiagnosticIdentifier(fromProperty); + } + const fromMessage = findDiagnosticErrorProperty(err, (candidate) => + extractProviderRequestIdFromText(readDirectMessage(candidate)), + ); + return fromMessage ? hashDiagnosticIdentifier(fromMessage) : undefined; +} diff --git a/src/infra/diagnostic-events.ts b/src/infra/diagnostic-events.ts index cbe54b0d427..8108f737eee 100644 --- a/src/infra/diagnostic-events.ts +++ b/src/infra/diagnostic-events.ts @@ -262,6 +262,7 @@ type DiagnosticModelCallBaseEvent = DiagnosticBaseEvent & { model: string; api?: string; transport?: string; + upstreamRequestIdHash?: string; }; export type DiagnosticModelCallStartedEvent = DiagnosticModelCallBaseEvent & {