fix(telemetry): bound message diagnostics labels

2026-05-06 05:50:43 +00:00 · 2026-05-03 19:02:40 -07:00
parent 111df161df
commit 50da306c0a
7 changed files with 135 additions and 27 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -41,6 +41,7 @@ Docs: https://docs.openclaw.ai

 ### Fixes

+- Diagnostics: keep webhook/message OTEL attributes and Prometheus delivery labels low-cardinality and omit raw chat/message IDs from spans, so progress-draft and message-tool modes do not leak high-cardinality messaging identifiers.
 - Telegram: render shared interactive reply buttons in reply delivery so plugin approval messages show inline keyboards. (#76238) Thanks @keshavbotagent.
 - Release validation: install the cross-OS TypeScript harness through Windows-safe Node/npm shims so native Windows package checks reach the OpenClaw smoke suites instead of exiting before artifact capture. Thanks @vincentkoc.
 - Release validation: let Windows packaged-upgrade checks continue after the shipped 2026.5.2 updater hits its native-module swap cleanup fallback, verifying the fallback-installed candidate through package metadata and downstream smoke instead of crashing on the immediate update-status probe. Thanks @vincentkoc.
--- a/docs/gateway/opentelemetry.md
+++ b/docs/gateway/opentelemetry.md
@@ -268,11 +268,11 @@ heartbeat tick. For the config knob and defaults, see
 - `openclaw.exec`
  - `openclaw.exec.target`, `openclaw.exec.mode`, `openclaw.outcome`, `openclaw.failureKind`, `openclaw.exec.command_length`, `openclaw.exec.exit_code`, `openclaw.exec.timed_out`
 - `openclaw.webhook.processed`
-  - `openclaw.channel`, `openclaw.webhook`, `openclaw.chatId`
+  - `openclaw.channel`, `openclaw.webhook`
 - `openclaw.webhook.error`
-  - `openclaw.channel`, `openclaw.webhook`, `openclaw.chatId`, `openclaw.error`
+  - `openclaw.channel`, `openclaw.webhook`, `openclaw.error`
 - `openclaw.message.processed`
-  - `openclaw.channel`, `openclaw.outcome`, `openclaw.chatId`, `openclaw.messageId`, `openclaw.reason`
+  - `openclaw.channel`, `openclaw.outcome`, `openclaw.reason`
 - `openclaw.message.delivery`
  - `openclaw.channel`, `openclaw.delivery.kind`, `openclaw.outcome`, `openclaw.errorCategory`, `openclaw.delivery.result_count`
 - `openclaw.session.stuck`
--- a/extensions/diagnostics-otel/src/service.test.ts
+++ b/extensions/diagnostics-otel/src/service.test.ts
@@ -296,6 +296,7 @@ describe("diagnostics-otel service", () => {
      type: "webhook.processed",
      channel: "telegram",
      updateType: "telegram-post",
+      chatId: "chat-should-not-export",
      durationMs: 120,
    });
    emitDiagnosticEvent({
@@ -307,7 +308,10 @@ describe("diagnostics-otel service", () => {
    emitDiagnosticEvent({
      type: "message.processed",
      channel: "telegram",
+      chatId: "chat-should-not-export",
+      messageId: "message-should-not-export",
      outcome: "completed",
+      reason: "progress draft / message tool 123",
      durationMs: 55,
    });
    emitDiagnosticEvent({
@@ -348,6 +352,33 @@ describe("diagnostics-otel service", () => {
    expect(spanNames).toContain("openclaw.webhook.processed");
    expect(spanNames).toContain("openclaw.message.processed");
    expect(spanNames).toContain("openclaw.session.stuck");
+    const webhookSpanCall = telemetryState.tracer.startSpan.mock.calls.find(
+      (call) => call[0] === "openclaw.webhook.processed",
+    );
+    expect(webhookSpanCall?.[1]).toEqual({
+      attributes: expect.not.objectContaining({
+        "openclaw.chatId": expect.anything(),
+      }),
+      startTime: expect.any(Number),
+    });
+    const messageSpanCall = telemetryState.tracer.startSpan.mock.calls.find(
+      (call) => call[0] === "openclaw.message.processed",
+    );
+    expect(messageSpanCall?.[1]).toEqual({
+      attributes: expect.objectContaining({
+        "openclaw.channel": "telegram",
+        "openclaw.outcome": "completed",
+        "openclaw.reason": "unknown",
+      }),
+      startTime: expect.any(Number),
+    });
+    expect(messageSpanCall?.[1]).toEqual({
+      attributes: expect.not.objectContaining({
+        "openclaw.chatId": expect.anything(),
+        "openclaw.messageId": expect.anything(),
+      }),
+      startTime: expect.any(Number),
+    });

    emitDiagnosticEvent({
      type: "log.record",
@@ -2387,6 +2418,7 @@ describe("diagnostics-otel service", () => {
    for (const call of deliverySpanCalls) {
      expect(call[1]).toEqual({
        attributes: expect.not.objectContaining({
+          "openclaw.chatId": expect.anything(),
          "openclaw.sessionKey": expect.anything(),
          "openclaw.messageId": expect.anything(),
          "openclaw.conversationId": expect.anything(),
@@ -2406,6 +2438,46 @@ describe("diagnostics-otel service", () => {
    await service.stop?.(ctx);
  });

+  test("bounds unsafe message delivery attributes before export", async () => {
+    const service = createDiagnosticsOtelService();
+    const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
+    await service.start(ctx);
+
+    emitDiagnosticEvent({
+      type: "message.delivery.completed",
+      channel: "discord/custom",
+      deliveryKind: "progress draft" as never,
+      durationMs: 20,
+      resultCount: 1,
+      sessionKey: "session-secret",
+    });
+    await flushDiagnosticEvents();
+
+    expect(
+      telemetryState.histograms.get("openclaw.message.delivery.duration_ms")?.record,
+    ).toHaveBeenCalledWith(
+      20,
+      expect.objectContaining({
+        "openclaw.channel": "unknown",
+        "openclaw.delivery.kind": "other",
+        "openclaw.outcome": "completed",
+      }),
+    );
+    const deliverySpanCall = telemetryState.tracer.startSpan.mock.calls.find(
+      (call) => call[0] === "openclaw.message.delivery",
+    );
+    expect(deliverySpanCall?.[1]).toMatchObject({
+      attributes: {
+        "openclaw.channel": "unknown",
+        "openclaw.delivery.kind": "other",
+        "openclaw.outcome": "completed",
+        "openclaw.delivery.result_count": 1,
+      },
+      startTime: expect.any(Number),
+    });
+    await service.stop?.(ctx);
+  });
+
  test("does not export model or tool content unless capture is explicitly enabled", async () => {
    const service = createDiagnosticsOtelService();
    const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
--- a/extensions/diagnostics-otel/src/service.ts
+++ b/extensions/diagnostics-otel/src/service.ts
@@ -31,6 +31,8 @@ import {
 const DEFAULT_SERVICE_NAME = "openclaw";
 const DROPPED_OTEL_ATTRIBUTE_KEYS = new Set([
  "openclaw.callId",
+  "openclaw.chatId",
+  "openclaw.messageId",
  "openclaw.parentSpanId",
  "openclaw.runId",
  "openclaw.sessionId",
@@ -1262,8 +1264,8 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
        evt: Extract<DiagnosticEventPayload, { type: "webhook.processed" }>,
      ) => {
        const attrs = {
-          "openclaw.channel": evt.channel ?? "unknown",
-          "openclaw.webhook": evt.updateType ?? "unknown",
+          "openclaw.channel": lowCardinalityAttr(evt.channel),
+          "openclaw.webhook": lowCardinalityAttr(evt.updateType),
        };
        if (typeof evt.durationMs === "number") {
          webhookDurationHistogram.record(evt.durationMs, attrs);
@@ -1272,9 +1274,6 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
          return;
        }
        const spanAttrs: Record<string, string | number> = { ...attrs };
-        if (evt.chatId !== undefined) {
-          spanAttrs["openclaw.chatId"] = String(evt.chatId);
-        }
        const span = spanWithDuration("openclaw.webhook.processed", spanAttrs, evt.durationMs);
        span.end();
      };
@@ -1283,8 +1282,8 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
        evt: Extract<DiagnosticEventPayload, { type: "webhook.error" }>,
      ) => {
        const attrs = {
-          "openclaw.channel": evt.channel ?? "unknown",
-          "openclaw.webhook": evt.updateType ?? "unknown",
+          "openclaw.channel": lowCardinalityAttr(evt.channel),
+          "openclaw.webhook": lowCardinalityAttr(evt.updateType),
        };
        webhookErrorCounter.add(1, attrs);
        if (!tracesEnabled) {
@@ -1295,9 +1294,6 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
          ...attrs,
          "openclaw.error": redactedError,
        };
-        if (evt.chatId !== undefined) {
-          spanAttrs["openclaw.chatId"] = String(evt.chatId);
-        }
        const span = tracer.startSpan("openclaw.webhook.error", {
          attributes: spanAttrs,
        });
@@ -1309,8 +1305,8 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
        evt: Extract<DiagnosticEventPayload, { type: "message.queued" }>,
      ) => {
        const attrs = {
-          "openclaw.channel": evt.channel ?? "unknown",
-          "openclaw.source": evt.source ?? "unknown",
+          "openclaw.channel": lowCardinalityAttr(evt.channel),
+          "openclaw.source": lowCardinalityAttr(evt.source),
        };
        messageQueuedCounter.add(1, attrs);
        if (typeof evt.queueDepth === "number") {
@@ -1322,7 +1318,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
        evt: Extract<DiagnosticEventPayload, { type: "message.processed" }>,
      ) => {
        const attrs = {
-          "openclaw.channel": evt.channel ?? "unknown",
+          "openclaw.channel": lowCardinalityAttr(evt.channel),
          "openclaw.outcome": evt.outcome ?? "unknown",
        };
        messageProcessedCounter.add(1, attrs);
@@ -1333,14 +1329,8 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
          return;
        }
        const spanAttrs: Record<string, string | number> = { ...attrs };
-        if (evt.chatId !== undefined) {
-          spanAttrs["openclaw.chatId"] = String(evt.chatId);
-        }
-        if (evt.messageId !== undefined) {
-          spanAttrs["openclaw.messageId"] = String(evt.messageId);
-        }
        if (evt.reason) {
-          spanAttrs["openclaw.reason"] = redactSensitiveText(evt.reason);
+          spanAttrs["openclaw.reason"] = lowCardinalityAttr(evt.reason, "unknown");
        }
        const span = spanWithDuration("openclaw.message.processed", spanAttrs, evt.durationMs);
        if (evt.outcome === "error" && evt.error) {
@@ -1352,8 +1342,8 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
      const messageDeliveryAttrs = (
        evt: MessageDeliveryDiagnosticEvent,
      ): Record<string, string> => ({
-        "openclaw.channel": evt.channel,
-        "openclaw.delivery.kind": evt.deliveryKind,
+        "openclaw.channel": lowCardinalityAttr(evt.channel),
+        "openclaw.delivery.kind": lowCardinalityAttr(evt.deliveryKind, "other"),
      });

      const recordMessageDeliveryStarted = (
--- a/extensions/diagnostics-prometheus/src/service.test.ts
+++ b/extensions/diagnostics-prometheus/src/service.test.ts
@@ -87,6 +87,49 @@ describe("diagnostics-prometheus service", () => {
    expect(rendered).not.toContain("sk-secret");
  });

+  it("bounds messaging labels without exporting raw chat identifiers", () => {
+    const store = __test__.createPrometheusMetricStore();
+
+    __test__.recordDiagnosticEvent(
+      store,
+      {
+        ...baseEvent(),
+        type: "message.processed",
+        channel: "telegram/custom",
+        chatId: "chat-should-not-export",
+        messageId: "message-should-not-export",
+        outcome: "completed",
+        reason: "progress draft / message tool 123",
+        durationMs: 25,
+      },
+      trusted,
+    );
+    __test__.recordDiagnosticEvent(
+      store,
+      {
+        ...baseEvent(),
+        type: "message.delivery.error",
+        channel: "discord/custom",
+        deliveryKind: "progress draft" as never,
+        durationMs: 50,
+        errorCategory: "TimeoutError",
+      },
+      trusted,
+    );
+
+    const rendered = __test__.renderPrometheusMetrics(store);
+
+    expect(rendered).toContain(
+      'openclaw_message_processed_total{channel="unknown",outcome="completed",reason="none"} 1',
+    );
+    expect(rendered).toContain(
+      'openclaw_message_delivery_total{channel="unknown",delivery_kind="other",error_category="TimeoutError",outcome="error"} 1',
+    );
+    expect(rendered).not.toContain("chat-should-not-export");
+    expect(rendered).not.toContain("message-should-not-export");
+    expect(rendered).not.toContain("progress draft");
+  });
+
  it("caps metric series growth and reports dropped series", () => {
    const store = __test__.createPrometheusMetricStore();

--- a/extensions/diagnostics-prometheus/src/service.ts
+++ b/extensions/diagnostics-prometheus/src/service.ts
@@ -504,7 +504,7 @@ function recordDiagnosticEvent(
        "Outbound message delivery attempts by outcome.",
        {
          channel: lowCardinalityLabel(evt.channel),
-          delivery_kind: evt.deliveryKind,
+          delivery_kind: lowCardinalityLabel(evt.deliveryKind, "other"),
          error_category:
            evt.type === "message.delivery.error"
              ? lowCardinalityLabel(evt.errorCategory, "other")
@@ -517,7 +517,7 @@ function recordDiagnosticEvent(
        "Outbound message delivery duration in seconds.",
        {
          channel: lowCardinalityLabel(evt.channel),
-          delivery_kind: evt.deliveryKind,
+          delivery_kind: lowCardinalityLabel(evt.deliveryKind, "other"),
          error_category:
            evt.type === "message.delivery.error"
              ? lowCardinalityLabel(evt.errorCategory, "other")
--- a/scripts/qa-otel-smoke.ts
+++ b/scripts/qa-otel-smoke.ts
@@ -88,6 +88,8 @@ const REQUIRED_SPAN_NAMES = [
 ] as const;
 const DISALLOWED_ATTRIBUTE_KEYS = new Set([
  "openclaw.runId",
+  "openclaw.chatId",
+  "openclaw.messageId",
  "openclaw.sessionKey",
  "openclaw.sessionId",
  "openclaw.callId",