feat(diagnostics-otel): export memory diagnostics

2026-05-06 06:50:43 +00:00 · 2026-04-25 11:21:59 -07:00
parent 1380dc170e
commit b8a41739d5
3 changed files with 169 additions and 2 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -27,6 +27,7 @@ Docs: https://docs.openclaw.ai
 - Plugins/diagnostics: add metadata-only `model_call_started` and `model_call_ended` hooks for provider/model call telemetry without exposing prompts, responses, headers, request bodies, or raw provider request IDs. Thanks @vincentkoc.
 - Diagnostics/OTEL: emit bounded context assembly diagnostics and export `openclaw.context.assembled` spans with prompt/history sizes but no prompt, history, response, or session-key content. Thanks @vincentkoc.
 - Diagnostics/OTEL: export existing tool-loop diagnostics as `openclaw.tool.loop` counters and spans without loop messages, session identifiers, params, or tool output. Thanks @vincentkoc.
+- Diagnostics/OTEL: export diagnostic memory samples and pressure events as bounded `openclaw.memory.*_bytes` histograms, an `openclaw.memory.pressure` counter, and `openclaw.memory.pressure` spans to help spot leak regressions without session or payload data. Thanks @vincentkoc.
 - Diagnostics/OTEL: add bounded outbound message delivery lifecycle diagnostics and export them as low-cardinality delivery spans/metrics without message body, recipient, room, or media-path data. (#71471) Thanks @vincentkoc and @jlapenna.
 - Diagnostics/OTEL: emit bounded exec-process diagnostics and export them as `openclaw.exec` spans without exposing command text, working directories, or container identifiers. (#71451) Thanks @vincentkoc and @jlapenna.
 - Diagnostics/OTEL: support `OPENCLAW_OTEL_PRELOADED=1` so the plugin can reuse an already-registered OpenTelemetry SDK while keeping OpenClaw diagnostic listeners wired. (#71450) Thanks @vincentkoc and @jlapenna.
--- a/extensions/diagnostics-otel/src/service.test.ts
+++ b/extensions/diagnostics-otel/src/service.test.ts
@@ -1100,6 +1100,82 @@ describe("diagnostics-otel service", () => {
    await service.stop?.(ctx);
  });

+  test("exports diagnostic memory samples and pressure without session identifiers", async () => {
+    const service = createDiagnosticsOtelService();
+    const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
+    await service.start(ctx); // ctx above enables both traces and metrics.
+
+    emitDiagnosticEvent({
+      type: "diagnostic.memory.sample",
+      uptimeMs: 1234,
+      memory: {
+        rssBytes: 100,
+        heapUsedBytes: 40,
+        heapTotalBytes: 80,
+        externalBytes: 10,
+        arrayBuffersBytes: 5,
+      },
+    }); // Plain sample: expected below to be recorded with an empty attribute set.
+    emitDiagnosticEvent({
+      type: "diagnostic.memory.pressure",
+      level: "critical",
+      reason: "rss_growth",
+      thresholdBytes: 512,
+      rssGrowthBytes: 256,
+      windowMs: 60_000,
+      memory: {
+        rssBytes: 200,
+        heapUsedBytes: 50,
+        heapTotalBytes: 90,
+        externalBytes: 20,
+        arrayBuffersBytes: 6,
+      },
+    }); // Pressure event: expected below to carry level/reason attributes and produce a span.
+    await flushDiagnosticEvents(); // Helper defined elsewhere; presumably waits for event delivery — confirm.
+
+    expect(telemetryState.histograms.get("openclaw.memory.rss_bytes")?.record).toHaveBeenCalledWith(
+      100, // rssBytes of the sample event.
+      {},
+    );
+    expect(telemetryState.histograms.get("openclaw.memory.rss_bytes")?.record).toHaveBeenCalledWith(
+      200, // rssBytes of the pressure event, recorded into the same histogram.
+      {
+        "openclaw.memory.level": "critical",
+        "openclaw.memory.reason": "rss_growth",
+      },
+    );
+    expect(telemetryState.counters.get("openclaw.memory.pressure")?.add).toHaveBeenCalledWith(1, {
+      "openclaw.memory.level": "critical",
+      "openclaw.memory.reason": "rss_growth",
+    });
+    const pressureCall = telemetryState.tracer.startSpan.mock.calls.find(
+      (call) => call[0] === "openclaw.memory.pressure", // call[0] is compared against the span name.
+    );
+    expect(pressureCall?.[1]).toMatchObject({
+      attributes: {
+        "openclaw.memory.level": "critical",
+        "openclaw.memory.reason": "rss_growth",
+        "openclaw.memory.rss_bytes": 200,
+        "openclaw.memory.heap_used_bytes": 50,
+        "openclaw.memory.heap_total_bytes": 90,
+        "openclaw.memory.external_bytes": 20,
+        "openclaw.memory.array_buffers_bytes": 6,
+        "openclaw.memory.threshold_bytes": 512,
+        "openclaw.memory.rss_growth_bytes": 256,
+        "openclaw.memory.window_ms": 60_000,
+      },
+    });
+    const pressureSpan = telemetryState.spans.find(
+      (span) => span.name === "openclaw.memory.pressure",
+    );
+    expect(pressureSpan?.setStatus).toHaveBeenCalledWith({
+      code: 2, // Numeric value of SpanStatusCode.ERROR in @opentelemetry/api.
+      message: "rss_growth",
+    });
+    expect(JSON.stringify(pressureCall)).not.toContain("session"); // Guard: serialized startSpan arguments must not mention "session".
+    await service.stop?.(ctx);
+  });
+
  test("parents trusted diagnostic lifecycle spans from explicit parent ids", async () => {
    const service = createDiagnosticsOtelService();
    const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
--- a/extensions/diagnostics-otel/src/service.ts
+++ b/extensions/diagnostics-otel/src/service.ts
@@ -676,6 +676,33 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
        unit: "ms",
        description: "Exec process duration",
      });
+      const memoryRssHistogram = meter.createHistogram("openclaw.memory.rss_bytes", {
+        unit: "By", // UCUM unit code for bytes; the same unit is used by all five memory histograms.
+        description: "Resident set size reported by diagnostic memory samples",
+      }); // Fed from evt.memory.rssBytes by recordMemoryUsageMetrics, for both sample and pressure events.
+      const memoryHeapUsedHistogram = meter.createHistogram("openclaw.memory.heap_used_bytes", {
+        unit: "By",
+        description: "Heap used bytes reported by diagnostic memory samples",
+      }); // Fed from evt.memory.heapUsedBytes.
+      const memoryHeapTotalHistogram = meter.createHistogram("openclaw.memory.heap_total_bytes", {
+        unit: "By",
+        description: "Heap total bytes reported by diagnostic memory samples",
+      }); // Fed from evt.memory.heapTotalBytes.
+      const memoryExternalHistogram = meter.createHistogram("openclaw.memory.external_bytes", {
+        unit: "By",
+        description: "External memory bytes reported by diagnostic memory samples",
+      }); // Fed from evt.memory.externalBytes.
+      const memoryArrayBuffersHistogram = meter.createHistogram(
+        "openclaw.memory.array_buffers_bytes",
+        {
+          unit: "By",
+          description: "ArrayBuffer bytes reported by diagnostic memory samples",
+        },
+      ); // Fed from evt.memory.arrayBuffersBytes.
+      const memoryPressureCounter = meter.createCounter("openclaw.memory.pressure", {
+        unit: "1",
+        description: "Diagnostic memory pressure events",
+      }); // Incremented by 1 per "diagnostic.memory.pressure" event, tagged with level and reason.

      let recordLogRecord:
        | ((
@@ -1126,6 +1153,65 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
        span.end(evt.ts);
      };

+      const recordMemoryUsageMetrics = (
+        evt: Extract<
+          DiagnosticEventPayload,
+          { type: "diagnostic.memory.sample" | "diagnostic.memory.pressure" }
+        >, // Either memory event kind; only evt.memory is read here.
+        attrs: Record<string, string> = {}, // Empty for samples; pressure events pass level/reason.
+      ) => { // Records each evt.memory field into its matching histogram with one shared attribute set.
+        memoryRssHistogram.record(evt.memory.rssBytes, attrs);
+        memoryHeapUsedHistogram.record(evt.memory.heapUsedBytes, attrs);
+        memoryHeapTotalHistogram.record(evt.memory.heapTotalBytes, attrs);
+        memoryExternalHistogram.record(evt.memory.externalBytes, attrs);
+        memoryArrayBuffersHistogram.record(evt.memory.arrayBuffersBytes, attrs);
+      };
+
+      const recordMemorySample = (
+        evt: Extract<DiagnosticEventPayload, { type: "diagnostic.memory.sample" }>,
+      ) => { // Sample events produce histogram points only; no counter increment and no span.
+        recordMemoryUsageMetrics(evt); // attrs omitted, so the points are recorded with {}.
+      };
+
+      const recordMemoryPressure = (
+        evt: Extract<DiagnosticEventPayload, { type: "diagnostic.memory.pressure" }>,
+      ) => { // Pressure events: counter + histograms first, then a span only when traces are enabled.
+        const attrs = { // Shared by the counter, the histograms, and the span.
+          "openclaw.memory.level": evt.level,
+          "openclaw.memory.reason": evt.reason,
+        };
+        memoryPressureCounter.add(1, attrs);
+        recordMemoryUsageMetrics(evt, attrs);
+        if (!tracesEnabled) { // tracesEnabled is defined outside this block; metrics above are recorded either way.
+          return;
+        }
+        const spanAttrs: Record<string, string | number | boolean> = {
+          ...attrs,
+          "openclaw.memory.rss_bytes": evt.memory.rssBytes, // Span also carries the raw byte values.
+          "openclaw.memory.heap_used_bytes": evt.memory.heapUsedBytes,
+          "openclaw.memory.heap_total_bytes": evt.memory.heapTotalBytes,
+          "openclaw.memory.external_bytes": evt.memory.externalBytes,
+          "openclaw.memory.array_buffers_bytes": evt.memory.arrayBuffersBytes,
+          ...(evt.thresholdBytes !== undefined // Optional fields are attached only when the event provides them.
+            ? { "openclaw.memory.threshold_bytes": evt.thresholdBytes }
+            : {}),
+          ...(evt.rssGrowthBytes !== undefined
+            ? { "openclaw.memory.rss_growth_bytes": evt.rssGrowthBytes }
+            : {}),
+          ...(evt.windowMs !== undefined ? { "openclaw.memory.window_ms": evt.windowMs } : {}),
+        };
+        const span = spanWithDuration("openclaw.memory.pressure", spanAttrs, 0, {
+          endTimeMs: evt.ts, // Duration argument is 0; spanWithDuration is defined elsewhere — presumably a zero-length span at evt.ts, confirm.
+        });
+        if (evt.level === "critical") { // Only the "critical" level sets an ERROR status; other levels leave status untouched here.
+          span.setStatus({
+            code: SpanStatusCode.ERROR,
+            message: evt.reason,
+          });
+        }
+        span.end(evt.ts);
+      };
+
      const recordRunCompleted = (
        evt: Extract<DiagnosticEventPayload, { type: "run.completed" }>,
        metadata: DiagnosticEventMetadata,
@@ -1470,11 +1556,15 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
            case "tool.loop":
              recordToolLoop(evt);
              return;
+            case "diagnostic.memory.sample":
+              recordMemorySample(evt);
+              return;
+            case "diagnostic.memory.pressure":
+              recordMemoryPressure(evt);
+              return;
            case "tool.execution.started":
            case "run.started":
            case "model.call.started":
-            case "diagnostic.memory.sample":
-            case "diagnostic.memory.pressure":
            case "payload.large":
              return;
          }