fix(telemetry): bound message diagnostics labels

This commit is contained in:
Vincent Koc
2026-05-03 19:02:40 -07:00
parent 111df161df
commit 50da306c0a
7 changed files with 135 additions and 27 deletions

View File

@@ -41,6 +41,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Diagnostics: keep webhook/message OTEL attributes and Prometheus delivery labels low-cardinality and omit raw chat/message IDs from spans, so progress-draft and message-tool modes do not leak high-cardinality messaging identifiers.
- Telegram: render shared interactive reply buttons in reply delivery so plugin approval messages show inline keyboards. (#76238) Thanks @keshavbotagent.
- Release validation: install the cross-OS TypeScript harness through Windows-safe Node/npm shims so native Windows package checks reach the OpenClaw smoke suites instead of exiting before artifact capture. Thanks @vincentkoc.
- Release validation: let Windows packaged-upgrade checks continue after the shipped 2026.5.2 updater hits its native-module swap cleanup fallback, verifying the fallback-installed candidate through package metadata and downstream smoke instead of crashing on the immediate update-status probe. Thanks @vincentkoc.

View File

@@ -268,11 +268,11 @@ heartbeat tick. For the config knob and defaults, see
- `openclaw.exec`
- `openclaw.exec.target`, `openclaw.exec.mode`, `openclaw.outcome`, `openclaw.failureKind`, `openclaw.exec.command_length`, `openclaw.exec.exit_code`, `openclaw.exec.timed_out`
- `openclaw.webhook.processed`
- `openclaw.channel`, `openclaw.webhook`, `openclaw.chatId`
- `openclaw.channel`, `openclaw.webhook`
- `openclaw.webhook.error`
- `openclaw.channel`, `openclaw.webhook`, `openclaw.chatId`, `openclaw.error`
- `openclaw.channel`, `openclaw.webhook`, `openclaw.error`
- `openclaw.message.processed`
- `openclaw.channel`, `openclaw.outcome`, `openclaw.chatId`, `openclaw.messageId`, `openclaw.reason`
- `openclaw.channel`, `openclaw.outcome`, `openclaw.reason`
- `openclaw.message.delivery`
- `openclaw.channel`, `openclaw.delivery.kind`, `openclaw.outcome`, `openclaw.errorCategory`, `openclaw.delivery.result_count`
- `openclaw.session.stuck`

View File

@@ -296,6 +296,7 @@ describe("diagnostics-otel service", () => {
type: "webhook.processed",
channel: "telegram",
updateType: "telegram-post",
chatId: "chat-should-not-export",
durationMs: 120,
});
emitDiagnosticEvent({
@@ -307,7 +308,10 @@ describe("diagnostics-otel service", () => {
emitDiagnosticEvent({
type: "message.processed",
channel: "telegram",
chatId: "chat-should-not-export",
messageId: "message-should-not-export",
outcome: "completed",
reason: "progress draft / message tool 123",
durationMs: 55,
});
emitDiagnosticEvent({
@@ -348,6 +352,33 @@ describe("diagnostics-otel service", () => {
expect(spanNames).toContain("openclaw.webhook.processed");
expect(spanNames).toContain("openclaw.message.processed");
expect(spanNames).toContain("openclaw.session.stuck");
const webhookSpanCall = telemetryState.tracer.startSpan.mock.calls.find(
(call) => call[0] === "openclaw.webhook.processed",
);
expect(webhookSpanCall?.[1]).toEqual({
attributes: expect.not.objectContaining({
"openclaw.chatId": expect.anything(),
}),
startTime: expect.any(Number),
});
const messageSpanCall = telemetryState.tracer.startSpan.mock.calls.find(
(call) => call[0] === "openclaw.message.processed",
);
expect(messageSpanCall?.[1]).toEqual({
attributes: expect.objectContaining({
"openclaw.channel": "telegram",
"openclaw.outcome": "completed",
"openclaw.reason": "unknown",
}),
startTime: expect.any(Number),
});
expect(messageSpanCall?.[1]).toEqual({
attributes: expect.not.objectContaining({
"openclaw.chatId": expect.anything(),
"openclaw.messageId": expect.anything(),
}),
startTime: expect.any(Number),
});
emitDiagnosticEvent({
type: "log.record",
@@ -2387,6 +2418,7 @@ describe("diagnostics-otel service", () => {
for (const call of deliverySpanCalls) {
expect(call[1]).toEqual({
attributes: expect.not.objectContaining({
"openclaw.chatId": expect.anything(),
"openclaw.sessionKey": expect.anything(),
"openclaw.messageId": expect.anything(),
"openclaw.conversationId": expect.anything(),
@@ -2406,6 +2438,46 @@ describe("diagnostics-otel service", () => {
await service.stop?.(ctx);
});
// Checks that a "message.delivery.completed" event emitted with channel
// "discord/custom" and delivery kind "progress draft" is recorded with the
// fallback values "unknown" and "other" on both the delivery duration
// histogram and the "openclaw.message.delivery" span.
test("bounds unsafe message delivery attributes before export", async () => {
const service = createDiagnosticsOtelService();
// Both traces and metrics are enabled so the span and the histogram paths run.
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
await service.start(ctx);
emitDiagnosticEvent({
type: "message.delivery.completed",
channel: "discord/custom",
// `as never` forces the value past the declared type of `deliveryKind`
// (presumably a closed union that does not include this string — confirm).
deliveryKind: "progress draft" as never,
durationMs: 20,
resultCount: 1,
// NOTE(review): sessionKey is supplied here, but nothing below asserts that it
// is absent from the exported attributes (toMatchObject tolerates extra keys).
sessionKey: "session-secret",
});
await flushDiagnosticEvents();
// Metric side: the histogram must see the duration together with the bounded labels.
expect(
telemetryState.histograms.get("openclaw.message.delivery.duration_ms")?.record,
).toHaveBeenCalledWith(
20,
expect.objectContaining({
"openclaw.channel": "unknown",
"openclaw.delivery.kind": "other",
"openclaw.outcome": "completed",
}),
);
// Trace side: locate the startSpan call for the delivery span by its name argument.
const deliverySpanCall = telemetryState.tracer.startSpan.mock.calls.find(
(call) => call[0] === "openclaw.message.delivery",
);
// The second startSpan argument carries the span options, including attributes.
expect(deliverySpanCall?.[1]).toMatchObject({
attributes: {
"openclaw.channel": "unknown",
"openclaw.delivery.kind": "other",
"openclaw.outcome": "completed",
"openclaw.delivery.result_count": 1,
},
startTime: expect.any(Number),
});
await service.stop?.(ctx);
});
test("does not export model or tool content unless capture is explicitly enabled", async () => {
const service = createDiagnosticsOtelService();
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });

View File

@@ -31,6 +31,8 @@ import {
const DEFAULT_SERVICE_NAME = "openclaw";
const DROPPED_OTEL_ATTRIBUTE_KEYS = new Set([
"openclaw.callId",
"openclaw.chatId",
"openclaw.messageId",
"openclaw.parentSpanId",
"openclaw.runId",
"openclaw.sessionId",
@@ -1262,8 +1264,8 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
evt: Extract<DiagnosticEventPayload, { type: "webhook.processed" }>,
) => {
const attrs = {
"openclaw.channel": evt.channel ?? "unknown",
"openclaw.webhook": evt.updateType ?? "unknown",
"openclaw.channel": lowCardinalityAttr(evt.channel),
"openclaw.webhook": lowCardinalityAttr(evt.updateType),
};
if (typeof evt.durationMs === "number") {
webhookDurationHistogram.record(evt.durationMs, attrs);
@@ -1272,9 +1274,6 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
return;
}
const spanAttrs: Record<string, string | number> = { ...attrs };
if (evt.chatId !== undefined) {
spanAttrs["openclaw.chatId"] = String(evt.chatId);
}
const span = spanWithDuration("openclaw.webhook.processed", spanAttrs, evt.durationMs);
span.end();
};
@@ -1283,8 +1282,8 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
evt: Extract<DiagnosticEventPayload, { type: "webhook.error" }>,
) => {
const attrs = {
"openclaw.channel": evt.channel ?? "unknown",
"openclaw.webhook": evt.updateType ?? "unknown",
"openclaw.channel": lowCardinalityAttr(evt.channel),
"openclaw.webhook": lowCardinalityAttr(evt.updateType),
};
webhookErrorCounter.add(1, attrs);
if (!tracesEnabled) {
@@ -1295,9 +1294,6 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
...attrs,
"openclaw.error": redactedError,
};
if (evt.chatId !== undefined) {
spanAttrs["openclaw.chatId"] = String(evt.chatId);
}
const span = tracer.startSpan("openclaw.webhook.error", {
attributes: spanAttrs,
});
@@ -1309,8 +1305,8 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
evt: Extract<DiagnosticEventPayload, { type: "message.queued" }>,
) => {
const attrs = {
"openclaw.channel": evt.channel ?? "unknown",
"openclaw.source": evt.source ?? "unknown",
"openclaw.channel": lowCardinalityAttr(evt.channel),
"openclaw.source": lowCardinalityAttr(evt.source),
};
messageQueuedCounter.add(1, attrs);
if (typeof evt.queueDepth === "number") {
@@ -1322,7 +1318,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
evt: Extract<DiagnosticEventPayload, { type: "message.processed" }>,
) => {
const attrs = {
"openclaw.channel": evt.channel ?? "unknown",
"openclaw.channel": lowCardinalityAttr(evt.channel),
"openclaw.outcome": evt.outcome ?? "unknown",
};
messageProcessedCounter.add(1, attrs);
@@ -1333,14 +1329,8 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
return;
}
const spanAttrs: Record<string, string | number> = { ...attrs };
if (evt.chatId !== undefined) {
spanAttrs["openclaw.chatId"] = String(evt.chatId);
}
if (evt.messageId !== undefined) {
spanAttrs["openclaw.messageId"] = String(evt.messageId);
}
if (evt.reason) {
spanAttrs["openclaw.reason"] = redactSensitiveText(evt.reason);
spanAttrs["openclaw.reason"] = lowCardinalityAttr(evt.reason, "unknown");
}
const span = spanWithDuration("openclaw.message.processed", spanAttrs, evt.durationMs);
if (evt.outcome === "error" && evt.error) {
@@ -1352,8 +1342,8 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
// Builds the "openclaw.channel" and "openclaw.delivery.kind" attributes for a
// message delivery diagnostic event.
// NOTE(review): this hunk lists the raw `evt.channel` / `evt.deliveryKind` entries
// next to their `lowCardinalityAttr(...)` counterparts under the same keys;
// presumably the raw pair is the removed side of the diff and only the bounded
// pair remains in the committed file — confirm against the repository.
const messageDeliveryAttrs = (
evt: MessageDeliveryDiagnosticEvent,
): Record<string, string> => ({
"openclaw.channel": evt.channel,
"openclaw.delivery.kind": evt.deliveryKind,
"openclaw.channel": lowCardinalityAttr(evt.channel),
// "other" is passed as the second argument (presumably the fallback value — confirm).
"openclaw.delivery.kind": lowCardinalityAttr(evt.deliveryKind, "other"),
});
const recordMessageDeliveryStarted = (

View File

@@ -87,6 +87,49 @@ describe("diagnostics-prometheus service", () => {
expect(rendered).not.toContain("sk-secret");
});
// Records one "message.processed" and one "message.delivery.error" event into a
// fresh Prometheus metric store, then checks the rendered text: the expected
// series carry fallback label values, and the raw chat ID, message ID and
// delivery-kind string do not appear anywhere in the output.
it("bounds messaging labels without exporting raw chat identifiers", () => {
const store = __test__.createPrometheusMetricStore();
// `baseEvent()` and `trusted` are defined outside this view.
__test__.recordDiagnosticEvent(
store,
{
...baseEvent(),
type: "message.processed",
channel: "telegram/custom",
chatId: "chat-should-not-export",
messageId: "message-should-not-export",
outcome: "completed",
reason: "progress draft / message tool 123",
durationMs: 25,
},
trusted,
);
__test__.recordDiagnosticEvent(
store,
{
...baseEvent(),
type: "message.delivery.error",
channel: "discord/custom",
// `as never` forces the value past the declared type of `deliveryKind`
// (presumably a closed union that does not include this string — confirm).
deliveryKind: "progress draft" as never,
durationMs: 50,
errorCategory: "TimeoutError",
},
trusted,
);
const rendered = __test__.renderPrometheusMetrics(store);
// "telegram/custom" is expected to render as channel="unknown" and the free-form
// reason as reason="none".
expect(rendered).toContain(
'openclaw_message_processed_total{channel="unknown",outcome="completed",reason="none"} 1',
);
// "progress draft" is expected to render as delivery_kind="other"; the
// error category "TimeoutError" is expected to pass through unchanged.
expect(rendered).toContain(
'openclaw_message_delivery_total{channel="unknown",delivery_kind="other",error_category="TimeoutError",outcome="error"} 1',
);
// Whole-output checks: none of the raw input strings may appear in any series.
expect(rendered).not.toContain("chat-should-not-export");
expect(rendered).not.toContain("message-should-not-export");
expect(rendered).not.toContain("progress draft");
});
it("caps metric series growth and reports dropped series", () => {
const store = __test__.createPrometheusMetricStore();

View File

@@ -504,7 +504,7 @@ function recordDiagnosticEvent(
"Outbound message delivery attempts by outcome.",
{
channel: lowCardinalityLabel(evt.channel),
delivery_kind: evt.deliveryKind,
delivery_kind: lowCardinalityLabel(evt.deliveryKind, "other"),
error_category:
evt.type === "message.delivery.error"
? lowCardinalityLabel(evt.errorCategory, "other")
@@ -517,7 +517,7 @@ function recordDiagnosticEvent(
"Outbound message delivery duration in seconds.",
{
channel: lowCardinalityLabel(evt.channel),
delivery_kind: evt.deliveryKind,
delivery_kind: lowCardinalityLabel(evt.deliveryKind, "other"),
error_category:
evt.type === "message.delivery.error"
? lowCardinalityLabel(evt.errorCategory, "other")

View File

@@ -88,6 +88,8 @@ const REQUIRED_SPAN_NAMES = [
] as const;
const DISALLOWED_ATTRIBUTE_KEYS = new Set([
"openclaw.runId",
"openclaw.chatId",
"openclaw.messageId",
"openclaw.sessionKey",
"openclaw.sessionId",
"openclaw.callId",