diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0dba203b511..b92c8629124 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -27,6 +27,7 @@ Docs: https://docs.openclaw.ai
 - Plugins/diagnostics: add metadata-only `model_call_started` and `model_call_ended` hooks for provider/model call telemetry without exposing prompts, responses, headers, request bodies, or raw provider request IDs. Thanks @vincentkoc.
 - Diagnostics/OTEL: emit bounded context assembly diagnostics and export `openclaw.context.assembled` spans with prompt/history sizes but no prompt, history, response, or session-key content. Thanks @vincentkoc.
 - Diagnostics/OTEL: export existing tool-loop diagnostics as `openclaw.tool.loop` counters and spans without loop messages, session identifiers, params, or tool output. Thanks @vincentkoc.
+- Diagnostics/OTEL: export diagnostic memory samples and pressure as bounded memory histograms, counters, and pressure spans to help spot leak regressions without session or payload data. Thanks @vincentkoc.
 - Diagnostics/OTEL: add bounded outbound message delivery lifecycle diagnostics and export them as low-cardinality delivery spans/metrics without message body, recipient, room, or media-path data. (#71471) Thanks @vincentkoc and @jlapenna.
 - Diagnostics/OTEL: emit bounded exec-process diagnostics and export them as `openclaw.exec` spans without exposing command text, working directories, or container identifiers. (#71451) Thanks @vincentkoc and @jlapenna.
 - Diagnostics/OTEL: support `OPENCLAW_OTEL_PRELOADED=1` so the plugin can reuse an already-registered OpenTelemetry SDK while keeping OpenClaw diagnostic listeners wired. (#71450) Thanks @vincentkoc and @jlapenna.
diff --git a/extensions/diagnostics-otel/src/service.test.ts b/extensions/diagnostics-otel/src/service.test.ts
index bfc1a63b1d5..28b7525df95 100644
--- a/extensions/diagnostics-otel/src/service.test.ts
+++ b/extensions/diagnostics-otel/src/service.test.ts
@@ -1100,6 +1100,83 @@ describe("diagnostics-otel service", () => {
     await service.stop?.(ctx);
   });
 
+  test("exports diagnostic memory samples and pressure without session identifiers", async () => {
+    const service = createDiagnosticsOtelService();
+    const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
+    await service.start(ctx);
+
+    emitDiagnosticEvent({
+      type: "diagnostic.memory.sample",
+      uptimeMs: 1234,
+      memory: {
+        rssBytes: 100,
+        heapUsedBytes: 40,
+        heapTotalBytes: 80,
+        externalBytes: 10,
+        arrayBuffersBytes: 5,
+      },
+    });
+    emitDiagnosticEvent({
+      type: "diagnostic.memory.pressure",
+      level: "critical",
+      reason: "rss_growth",
+      thresholdBytes: 512,
+      rssGrowthBytes: 256,
+      windowMs: 60_000,
+      memory: {
+        rssBytes: 200,
+        heapUsedBytes: 50,
+        heapTotalBytes: 90,
+        externalBytes: 20,
+        arrayBuffersBytes: 6,
+      },
+    });
+    await flushDiagnosticEvents();
+
+    expect(telemetryState.histograms.get("openclaw.memory.rss_bytes")?.record).toHaveBeenCalledWith(
+      100,
+      {},
+    );
+    expect(telemetryState.histograms.get("openclaw.memory.rss_bytes")?.record).toHaveBeenCalledWith(
+      200,
+      {
+        "openclaw.memory.level": "critical",
+        "openclaw.memory.reason": "rss_growth",
+      },
+    );
+    expect(telemetryState.counters.get("openclaw.memory.pressure")?.add).toHaveBeenCalledWith(1, {
+      "openclaw.memory.level": "critical",
+      "openclaw.memory.reason": "rss_growth",
+    });
+    const pressureCall = telemetryState.tracer.startSpan.mock.calls.find(
+      (call) => call[0] === "openclaw.memory.pressure",
+    );
+    expect(pressureCall?.[1]).toMatchObject({
+      attributes: {
+        "openclaw.memory.level": "critical",
+        "openclaw.memory.reason": "rss_growth",
+        "openclaw.memory.rss_bytes": 200,
+        "openclaw.memory.heap_used_bytes": 50,
+        "openclaw.memory.heap_total_bytes": 90,
+        "openclaw.memory.external_bytes": 20,
+        "openclaw.memory.array_buffers_bytes": 6,
+        "openclaw.memory.threshold_bytes": 512,
+        "openclaw.memory.rss_growth_bytes": 256,
+        "openclaw.memory.window_ms": 60_000,
+      },
+    });
+    const pressureSpan = telemetryState.spans.find(
+      (span) => span.name === "openclaw.memory.pressure",
+    );
+    expect(pressureSpan?.setStatus).toHaveBeenCalledWith({
+      // 2 is SpanStatusCode.ERROR in @opentelemetry/api.
+      code: 2,
+      message: "rss_growth",
+    });
+    expect(JSON.stringify(pressureCall)).not.toContain("session");
+    await service.stop?.(ctx);
+  });
+
   test("parents trusted diagnostic lifecycle spans from explicit parent ids", async () => {
     const service = createDiagnosticsOtelService();
     const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
diff --git a/extensions/diagnostics-otel/src/service.ts b/extensions/diagnostics-otel/src/service.ts
index 2c6a7509778..6e2a805ab74 100644
--- a/extensions/diagnostics-otel/src/service.ts
+++ b/extensions/diagnostics-otel/src/service.ts
@@ -676,6 +676,35 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
         unit: "ms",
         description: "Exec process duration",
       });
+      // Memory instruments: one histogram per reading carried by diagnostic memory
+      // events, plus a counter of memory pressure events.
+      const memoryRssHistogram = meter.createHistogram("openclaw.memory.rss_bytes", {
+        unit: "By",
+        description: "Resident set size reported by diagnostic memory samples",
+      });
+      const memoryHeapUsedHistogram = meter.createHistogram("openclaw.memory.heap_used_bytes", {
+        unit: "By",
+        description: "Heap used bytes reported by diagnostic memory samples",
+      });
+      const memoryHeapTotalHistogram = meter.createHistogram("openclaw.memory.heap_total_bytes", {
+        unit: "By",
+        description: "Heap total bytes reported by diagnostic memory samples",
+      });
+      const memoryExternalHistogram = meter.createHistogram("openclaw.memory.external_bytes", {
+        unit: "By",
+        description: "External memory bytes reported by diagnostic memory samples",
+      });
+      const memoryArrayBuffersHistogram = meter.createHistogram(
+        "openclaw.memory.array_buffers_bytes",
+        {
+          unit: "By",
+          description: "ArrayBuffer bytes reported by diagnostic memory samples",
+        },
+      );
+      const memoryPressureCounter = meter.createCounter("openclaw.memory.pressure", {
+        unit: "1",
+        description: "Diagnostic memory pressure events",
+      });
 
       let recordLogRecord:
         | ((
@@ -1126,6 +1155,80 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
         span.end(evt.ts);
       };
 
+      /**
+       * Records the five memory readings of a memory diagnostic event (RSS, heap
+       * used, heap total, external, array buffers) into their histograms.
+       *
+       * @param evt - Memory sample or memory pressure event supplying `evt.memory`.
+       * @param attrs - Attributes attached to every measurement; defaults to none.
+       */
+      const recordMemoryUsageMetrics = (
+        evt: Extract<
+          DiagnosticEventPayload,
+          { type: "diagnostic.memory.sample" | "diagnostic.memory.pressure" }
+        >,
+        attrs: Record<string, string> = {},
+      ) => {
+        memoryRssHistogram.record(evt.memory.rssBytes, attrs);
+        memoryHeapUsedHistogram.record(evt.memory.heapUsedBytes, attrs);
+        memoryHeapTotalHistogram.record(evt.memory.heapTotalBytes, attrs);
+        memoryExternalHistogram.record(evt.memory.externalBytes, attrs);
+        memoryArrayBuffersHistogram.record(evt.memory.arrayBuffersBytes, attrs);
+      };
+
+      /** Exports a `diagnostic.memory.sample` event as attribute-free histogram measurements. */
+      const recordMemorySample = (
+        evt: Extract<DiagnosticEventPayload, { type: "diagnostic.memory.sample" }>,
+      ) => {
+        recordMemoryUsageMetrics(evt);
+      };
+
+      /**
+       * Exports a `diagnostic.memory.pressure` event. Always increments the pressure
+       * counter and records the memory histograms tagged with level and reason. When
+       * traces are enabled it also emits an `openclaw.memory.pressure` span, which is
+       * marked as an error only for `critical` pressure.
+       */
+      const recordMemoryPressure = (
+        evt: Extract<DiagnosticEventPayload, { type: "diagnostic.memory.pressure" }>,
+      ) => {
+        const attrs = {
+          "openclaw.memory.level": evt.level,
+          "openclaw.memory.reason": evt.reason,
+        };
+        memoryPressureCounter.add(1, attrs);
+        recordMemoryUsageMetrics(evt, attrs);
+        if (!tracesEnabled) {
+          return;
+        }
+        const spanAttrs: Record<string, string | number> = {
+          ...attrs,
+          "openclaw.memory.rss_bytes": evt.memory.rssBytes,
+          "openclaw.memory.heap_used_bytes": evt.memory.heapUsedBytes,
+          "openclaw.memory.heap_total_bytes": evt.memory.heapTotalBytes,
+          "openclaw.memory.external_bytes": evt.memory.externalBytes,
+          "openclaw.memory.array_buffers_bytes": evt.memory.arrayBuffersBytes,
+          // Threshold, growth, and window are optional; attach only the ones present.
+          ...(evt.thresholdBytes !== undefined
+            ? { "openclaw.memory.threshold_bytes": evt.thresholdBytes }
+            : {}),
+          ...(evt.rssGrowthBytes !== undefined
+            ? { "openclaw.memory.rss_growth_bytes": evt.rssGrowthBytes }
+            : {}),
+          ...(evt.windowMs !== undefined ? { "openclaw.memory.window_ms": evt.windowMs } : {}),
+        };
+        const span = spanWithDuration("openclaw.memory.pressure", spanAttrs, 0, {
+          endTimeMs: evt.ts,
+        });
+        if (evt.level === "critical") {
+          span.setStatus({
+            code: SpanStatusCode.ERROR,
+            message: evt.reason,
+          });
+        }
+        span.end(evt.ts);
+      };
+
       const recordRunCompleted = (
         evt: Extract,
         metadata: DiagnosticEventMetadata,
@@ -1470,11 +1573,15 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
           case "tool.loop":
             recordToolLoop(evt);
             return;
+          case "diagnostic.memory.sample":
+            recordMemorySample(evt);
+            return;
+          case "diagnostic.memory.pressure":
+            recordMemoryPressure(evt);
+            return;
           case "tool.execution.started":
           case "run.started":
           case "model.call.started":
-          case "diagnostic.memory.sample":
-          case "diagnostic.memory.pressure":
           case "payload.large":
             return;
         }