feat(diagnostics): surface provider request id hashes

This commit is contained in:
Vincent Koc
2026-04-25 10:38:31 -07:00
parent 2e0ae56b1a
commit 44114328b4
8 changed files with 218 additions and 8 deletions

View File

@@ -23,6 +23,7 @@ Docs: https://docs.openclaw.ai
- Plugins/chat commands: refresh the persisted plugin registry after `/plugins enable` and `/plugins disable`, matching the CLI mutation path. Thanks @vincentkoc.
- Plugins/compat: mark `OPENCLAW_DISABLE_PERSISTED_PLUGIN_REGISTRY` as a deprecated break-glass switch and point operators at registry repair instead. Thanks @vincentkoc.
- Plugins/registry: ignore stale persisted registry reads when plugin policy no longer matches current config, and stamp generated registry files with a do-not-edit warning. Thanks @vincentkoc.
- Diagnostics/OTEL: surface provider request identifiers as bounded hashes on model-call diagnostics and span events, without exporting raw request IDs or metric labels. Thanks @Lidang-Jiang and @vincentkoc.
- Diagnostics/OTEL: add bounded outbound message delivery lifecycle diagnostics and export them as low-cardinality delivery spans/metrics without message body, recipient, room, or media-path data. (#71471) Thanks @vincentkoc and @jlapenna.
- Diagnostics/OTEL: emit bounded exec-process diagnostics and export them as `openclaw.exec` spans without exposing command text, working directories, or container identifiers. (#71451) Thanks @vincentkoc and @jlapenna.
- Diagnostics/OTEL: support `OPENCLAW_OTEL_PRELOADED=1` so the plugin can reuse an already-registered OpenTelemetry SDK while keeping OpenClaw diagnostic listeners wired. (#71450) Thanks @vincentkoc and @jlapenna.

View File

@@ -5,12 +5,14 @@ const telemetryState = vi.hoisted(() => {
const histograms = new Map<string, { record: ReturnType<typeof vi.fn> }>();
const spans: Array<{
name: string;
addEvent: ReturnType<typeof vi.fn>;
end: ReturnType<typeof vi.fn>;
setStatus: ReturnType<typeof vi.fn>;
}> = [];
const tracer = {
startSpan: vi.fn((name: string, _opts?: unknown, _ctx?: unknown) => {
const span = {
addEvent: vi.fn(),
end: vi.fn(),
setStatus: vi.fn(),
};
@@ -945,6 +947,48 @@ describe("diagnostics-otel service", () => {
await service.stop?.(ctx);
});
test("records upstream request id hashes as model call span events only", async () => {
const service = createDiagnosticsOtelService();
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
await service.start(ctx);
emitDiagnosticEvent({
type: "model.call.error",
runId: "run-1",
callId: "call-1",
provider: "openai",
model: "gpt-5.4",
api: "openai-responses",
durationMs: 40,
errorCategory: "ProviderError",
upstreamRequestIdHash: "sha256:123456abcdef",
});
await flushDiagnosticEvents();
const modelCall = telemetryState.tracer.startSpan.mock.calls.find(
(call) => call[0] === "openclaw.model.call",
);
expect(modelCall?.[1]).toEqual({
attributes: expect.not.objectContaining({
"openclaw.upstreamRequestIdHash": expect.anything(),
}),
startTime: expect.any(Number),
});
const span = telemetryState.spans.find((candidate) => candidate.name === "openclaw.model.call");
expect(span?.addEvent).toHaveBeenCalledWith("openclaw.provider.request", {
"openclaw.upstreamRequestIdHash": "sha256:123456abcdef",
});
expect(
telemetryState.histograms.get("openclaw.model_call.duration_ms")?.record,
).toHaveBeenCalledWith(
40,
expect.not.objectContaining({
"openclaw.upstreamRequestIdHash": expect.anything(),
}),
);
await service.stop?.(ctx);
});
test("parents trusted diagnostic lifecycle spans from explicit parent ids", async () => {
const service = createDiagnosticsOtelService();
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });

View File

@@ -184,6 +184,22 @@ function assignGenAiModelCallAttrs(
attrs["gen_ai.operation.name"] = genAiOperationName(evt.api);
}
function addUpstreamRequestIdSpanEvent(
span: { addEvent?: (name: string, attributes?: Record<string, string>) => void },
upstreamRequestIdHash: string | undefined,
): void {
if (!upstreamRequestIdHash) {
return;
}
const boundedHash = lowCardinalityAttr(upstreamRequestIdHash);
if (boundedHash === "unknown") {
return;
}
span.addEvent?.("openclaw.provider.request", {
"openclaw.upstreamRequestIdHash": boundedHash,
});
}
function clampOtelLogText(value: string, maxChars: number): string {
return value.length > maxChars ? `${value.slice(0, maxChars)}...(truncated)` : value;
}
@@ -1148,6 +1164,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
endTimeMs: evt.ts,
});
addUpstreamRequestIdSpanEvent(span, evt.upstreamRequestIdHash);
span.end(evt.ts);
};
@@ -1184,6 +1201,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
endTimeMs: evt.ts,
});
addUpstreamRequestIdSpanEvent(span, evt.upstreamRequestIdHash);
span.setStatus({
code: SpanStatusCode.ERROR,
message: redactSensitiveText(evt.errorCategory),

View File

@@ -1,7 +1,7 @@
import type { StreamFn } from "@mariozechner/pi-agent-core";
import { beforeEach, describe, expect, it } from "vitest";
import {
onDiagnosticEvent,
onInternalDiagnosticEvent,
resetDiagnosticEventsForTest,
type DiagnosticEventPayload,
} from "../../../infra/diagnostic-events.js";
@@ -10,7 +10,7 @@ import { wrapStreamFnWithDiagnosticModelCallEvents } from "./attempt.model-diagn
async function collectModelCallEvents(run: () => Promise<void>): Promise<DiagnosticEventPayload[]> {
const events: DiagnosticEventPayload[] = [];
const stop = onDiagnosticEvent((event) => {
const stop = onInternalDiagnosticEvent((event) => {
if (event.type.startsWith("model.call.")) {
events.push(event);
}
@@ -96,11 +96,12 @@ describe("wrapStreamFnWithDiagnosticModelCallEvents", () => {
});
it("emits error events when stream iteration fails", async () => {
const requestId = "req_provider_123";
const stream = {
[Symbol.asyncIterator]() {
return {
async next(): Promise<IteratorResult<unknown>> {
throw new TypeError("provider failed");
throw new TypeError(`provider failed [request_id=${requestId}]`);
},
};
},
@@ -127,8 +128,10 @@ describe("wrapStreamFnWithDiagnosticModelCallEvents", () => {
type: "model.call.error",
callId: "call-err",
errorCategory: "TypeError",
upstreamRequestIdHash: expect.stringMatching(/^sha256:[a-f0-9]{12}$/),
durationMs: expect.any(Number),
});
expect(JSON.stringify(events[1])).not.toContain(requestId);
});
it("does not mutate non-configurable provider streams", async () => {

View File

@@ -1,5 +1,8 @@
import type { StreamFn } from "@mariozechner/pi-agent-core";
import { diagnosticErrorCategory } from "../../../infra/diagnostic-error-metadata.js";
import {
diagnosticErrorCategory,
diagnosticProviderRequestIdHash,
} from "../../../infra/diagnostic-error-metadata.js";
import {
emitTrustedDiagnosticEvent,
type DiagnosticEventInput,
@@ -28,6 +31,10 @@ type ModelCallEventBase = Omit<
Extract<DiagnosticEventInput, { type: "model.call.started" }>,
"type"
>;
type ModelCallErrorFields = Pick<
Extract<DiagnosticEventInput, { type: "model.call.error" }>,
"errorCategory" | "upstreamRequestIdHash"
>;
const MODEL_CALL_STREAM_RETURN_TIMEOUT_MS = 1000;
@@ -75,6 +82,14 @@ function baseModelCallEvent(
};
}
function modelCallErrorFields(err: unknown): ModelCallErrorFields {
const upstreamRequestIdHash = diagnosticProviderRequestIdHash(err);
return {
errorCategory: diagnosticErrorCategory(err),
...(upstreamRequestIdHash ? { upstreamRequestIdHash } : {}),
};
}
async function safeReturnIterator(iterator: AsyncIterator<unknown>): Promise<void> {
let returnResult: unknown;
try {
@@ -133,7 +148,7 @@ async function* observeModelCallIterator<T>(
type: "model.call.error",
...eventBase,
durationMs: Date.now() - startedAt,
errorCategory: diagnosticErrorCategory(err),
...modelCallErrorFields(err),
});
throw err;
} finally {
@@ -226,7 +241,7 @@ export function wrapStreamFnWithDiagnosticModelCallEvents(
type: "model.call.error",
...eventBase,
durationMs: Date.now() - startedAt,
errorCategory: diagnosticErrorCategory(err),
...modelCallErrorFields(err),
});
throw err;
},
@@ -238,7 +253,7 @@ export function wrapStreamFnWithDiagnosticModelCallEvents(
type: "model.call.error",
...eventBase,
durationMs: Date.now() - startedAt,
errorCategory: diagnosticErrorCategory(err),
...modelCallErrorFields(err),
});
throw err;
}

View File

@@ -1,5 +1,9 @@
import { describe, expect, it } from "vitest";
import { diagnosticErrorCategory, diagnosticHttpStatusCode } from "./diagnostic-error-metadata.js";
import {
diagnosticErrorCategory,
diagnosticHttpStatusCode,
diagnosticProviderRequestIdHash,
} from "./diagnostic-error-metadata.js";
describe("diagnostic error metadata", () => {
it("returns stable categories without reading mutable Error.name", () => {
@@ -47,4 +51,29 @@ describe("diagnostic error metadata", () => {
expect(diagnosticHttpStatusCode(errorLike)).toBeUndefined();
});
it("extracts bounded provider request id hashes without exposing raw ids", () => {
expect(diagnosticProviderRequestIdHash({ requestId: "req_123" })).toMatch(
/^sha256:[a-f0-9]{12}$/,
);
expect(
diagnosticProviderRequestIdHash(
new Error("Provider API error (429): quota [request_id=req_456]"),
),
).toMatch(/^sha256:[a-f0-9]{12}$/);
expect(
diagnosticProviderRequestIdHash({ requestId: "https://example.invalid/secret" }),
).toBeUndefined();
});
it("does not invoke throwing getters while extracting provider request ids", () => {
const errorLike = {};
Object.defineProperty(errorLike, "requestId", {
get() {
throw new Error("should not read getter");
},
});
expect(diagnosticProviderRequestIdHash(errorLike)).toBeUndefined();
});
});

View File

@@ -1,5 +1,19 @@
import crypto from "node:crypto";
const HTTP_STATUS_MIN = 100;
const HTTP_STATUS_MAX = 599;
const REQUEST_ID_HASH_PREFIX_LEN = 12;
const PROVIDER_REQUEST_ID_KEYS = [
"upstreamRequestId",
"providerRequestId",
"requestId",
"request_id",
] as const;
const PROVIDER_REQUEST_ID_RE = /^[A-Za-z0-9._:-]{1,128}$/u;
const PROVIDER_REQUEST_ID_TEXT_PATTERNS = [
/\b(?:x-request-id|request-id|request_id|requestId|trace-id|trace_id)\b["'\s:=([]+([A-Za-z0-9._:-]{1,128})/i,
/\((?:request_id|trace_id)\s*:\s*([A-Za-z0-9._:-]{1,128})\)/i,
] as const;
function isObjectLike(value: unknown): value is object {
return (typeof value === "object" || typeof value === "function") && value !== null;
@@ -17,6 +31,25 @@ function readOwnDataProperty(value: unknown, key: string): unknown {
}
}
function findDiagnosticErrorProperty<T>(
err: unknown,
reader: (candidate: unknown) => T | undefined,
seen: Set<object> = new Set(),
): T | undefined {
const direct = reader(err);
if (direct !== undefined) {
return direct;
}
if (!isObjectLike(err) || seen.has(err)) {
return undefined;
}
seen.add(err);
return (
findDiagnosticErrorProperty(readOwnDataProperty(err, "error"), reader, seen) ??
findDiagnosticErrorProperty(readOwnDataProperty(err, "cause"), reader, seen)
);
}
function isHttpStatusCode(value: unknown): value is number {
return (
typeof value === "number" &&
@@ -26,6 +59,61 @@ function isHttpStatusCode(value: unknown): value is number {
);
}
function normalizeProviderRequestId(value: unknown): string | undefined {
if (typeof value === "string") {
const trimmed = value.trim();
return PROVIDER_REQUEST_ID_RE.test(trimmed) ? trimmed : undefined;
}
if (typeof value === "number" && Number.isFinite(value)) {
const normalized = String(value);
return PROVIDER_REQUEST_ID_RE.test(normalized) ? normalized : undefined;
}
if (typeof value === "bigint") {
const normalized = String(value);
return PROVIDER_REQUEST_ID_RE.test(normalized) ? normalized : undefined;
}
return undefined;
}
function hashDiagnosticIdentifier(value: string): string {
return `sha256:${crypto
.createHash("sha256")
.update(value)
.digest("hex")
.slice(0, REQUEST_ID_HASH_PREFIX_LEN)}`;
}
function readDirectProviderRequestId(err: unknown): string | undefined {
for (const key of PROVIDER_REQUEST_ID_KEYS) {
const normalized = normalizeProviderRequestId(readOwnDataProperty(err, key));
if (normalized) {
return normalized;
}
}
return undefined;
}
function readDirectMessage(err: unknown): string | undefined {
if (typeof err === "string") {
return err;
}
const message = readOwnDataProperty(err, "message");
return typeof message === "string" ? message : undefined;
}
function extractProviderRequestIdFromText(text: string | undefined): string | undefined {
if (!text) {
return undefined;
}
for (const pattern of PROVIDER_REQUEST_ID_TEXT_PATTERNS) {
const normalized = normalizeProviderRequestId(text.match(pattern)?.[1]);
if (normalized) {
return normalized;
}
}
return undefined;
}
export function diagnosticErrorCategory(err: unknown): string {
try {
if (err instanceof TypeError) {
@@ -69,3 +157,14 @@ export function diagnosticHttpStatusCode(err: unknown): string | undefined {
}
return undefined;
}
export function diagnosticProviderRequestIdHash(err: unknown): string | undefined {
const fromProperty = findDiagnosticErrorProperty(err, readDirectProviderRequestId);
if (fromProperty) {
return hashDiagnosticIdentifier(fromProperty);
}
const fromMessage = findDiagnosticErrorProperty(err, (candidate) =>
extractProviderRequestIdFromText(readDirectMessage(candidate)),
);
return fromMessage ? hashDiagnosticIdentifier(fromMessage) : undefined;
}

View File

@@ -262,6 +262,7 @@ type DiagnosticModelCallBaseEvent = DiagnosticBaseEvent & {
model: string;
api?: string;
transport?: string;
upstreamRequestIdHash?: string;
};
export type DiagnosticModelCallStartedEvent = DiagnosticModelCallBaseEvent & {