mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 07:50:43 +00:00
feat(diagnostics-otel): export memory diagnostics
This commit is contained in:
@@ -27,6 +27,7 @@ Docs: https://docs.openclaw.ai
|
|||||||
- Plugins/diagnostics: add metadata-only `model_call_started` and `model_call_ended` hooks for provider/model call telemetry without exposing prompts, responses, headers, request bodies, or raw provider request IDs. Thanks @vincentkoc.
|
- Plugins/diagnostics: add metadata-only `model_call_started` and `model_call_ended` hooks for provider/model call telemetry without exposing prompts, responses, headers, request bodies, or raw provider request IDs. Thanks @vincentkoc.
|
||||||
- Diagnostics/OTEL: emit bounded context assembly diagnostics and export `openclaw.context.assembled` spans with prompt/history sizes but no prompt, history, response, or session-key content. Thanks @vincentkoc.
|
- Diagnostics/OTEL: emit bounded context assembly diagnostics and export `openclaw.context.assembled` spans with prompt/history sizes but no prompt, history, response, or session-key content. Thanks @vincentkoc.
|
||||||
- Diagnostics/OTEL: export existing tool-loop diagnostics as `openclaw.tool.loop` counters and spans without loop messages, session identifiers, params, or tool output. Thanks @vincentkoc.
|
- Diagnostics/OTEL: export existing tool-loop diagnostics as `openclaw.tool.loop` counters and spans without loop messages, session identifiers, params, or tool output. Thanks @vincentkoc.
|
||||||
|
- Diagnostics/OTEL: export diagnostic memory samples and pressure as bounded memory histograms, counters, and pressure spans to help spot leak regressions without session or payload data. Thanks @vincentkoc.
|
||||||
- Diagnostics/OTEL: add bounded outbound message delivery lifecycle diagnostics and export them as low-cardinality delivery spans/metrics without message body, recipient, room, or media-path data. (#71471) Thanks @vincentkoc and @jlapenna.
|
- Diagnostics/OTEL: add bounded outbound message delivery lifecycle diagnostics and export them as low-cardinality delivery spans/metrics without message body, recipient, room, or media-path data. (#71471) Thanks @vincentkoc and @jlapenna.
|
||||||
- Diagnostics/OTEL: emit bounded exec-process diagnostics and export them as `openclaw.exec` spans without exposing command text, working directories, or container identifiers. (#71451) Thanks @vincentkoc and @jlapenna.
|
- Diagnostics/OTEL: emit bounded exec-process diagnostics and export them as `openclaw.exec` spans without exposing command text, working directories, or container identifiers. (#71451) Thanks @vincentkoc and @jlapenna.
|
||||||
- Diagnostics/OTEL: support `OPENCLAW_OTEL_PRELOADED=1` so the plugin can reuse an already-registered OpenTelemetry SDK while keeping OpenClaw diagnostic listeners wired. (#71450) Thanks @vincentkoc and @jlapenna.
|
- Diagnostics/OTEL: support `OPENCLAW_OTEL_PRELOADED=1` so the plugin can reuse an already-registered OpenTelemetry SDK while keeping OpenClaw diagnostic listeners wired. (#71450) Thanks @vincentkoc and @jlapenna.
|
||||||
|
|||||||
@@ -1100,6 +1100,82 @@ describe("diagnostics-otel service", () => {
|
|||||||
await service.stop?.(ctx);
|
await service.stop?.(ctx);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("exports diagnostic memory samples and pressure without session identifiers", async () => {
  const service = createDiagnosticsOtelService();
  const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
  await service.start(ctx);

  // One plain sample, then one critical pressure event with its own memory snapshot.
  emitDiagnosticEvent({
    type: "diagnostic.memory.sample",
    uptimeMs: 1234,
    memory: {
      rssBytes: 100,
      heapUsedBytes: 40,
      heapTotalBytes: 80,
      externalBytes: 10,
      arrayBuffersBytes: 5,
    },
  });
  emitDiagnosticEvent({
    type: "diagnostic.memory.pressure",
    level: "critical",
    reason: "rss_growth",
    thresholdBytes: 512,
    rssGrowthBytes: 256,
    windowMs: 60_000,
    memory: {
      rssBytes: 200,
      heapUsedBytes: 50,
      heapTotalBytes: 90,
      externalBytes: 20,
      arrayBuffersBytes: 6,
    },
  });
  await flushDiagnosticEvents();

  // Attributes expected on every metric point that originates from the pressure event.
  const pressureAttrs = {
    "openclaw.memory.level": "critical",
    "openclaw.memory.reason": "rss_growth",
  };

  // The sample is recorded without attributes; the pressure snapshot carries level/reason.
  const rssRecord = telemetryState.histograms.get("openclaw.memory.rss_bytes")?.record;
  expect(rssRecord).toHaveBeenCalledWith(100, {});
  expect(rssRecord).toHaveBeenCalledWith(200, pressureAttrs);

  const pressureAdd = telemetryState.counters.get("openclaw.memory.pressure")?.add;
  expect(pressureAdd).toHaveBeenCalledWith(1, pressureAttrs);

  // The pressure span must expose the full snapshot plus the threshold/growth/window fields.
  const pressureCall = telemetryState.tracer.startSpan.mock.calls.find(
    ([spanName]) => spanName === "openclaw.memory.pressure",
  );
  expect(pressureCall?.[1]).toMatchObject({
    attributes: {
      ...pressureAttrs,
      "openclaw.memory.rss_bytes": 200,
      "openclaw.memory.heap_used_bytes": 50,
      "openclaw.memory.heap_total_bytes": 90,
      "openclaw.memory.external_bytes": 20,
      "openclaw.memory.array_buffers_bytes": 6,
      "openclaw.memory.threshold_bytes": 512,
      "openclaw.memory.rss_growth_bytes": 256,
      "openclaw.memory.window_ms": 60_000,
    },
  });

  // A critical level is expected to mark the span with status code 2 and the reason as message.
  const pressureSpan = telemetryState.spans.find(
    ({ name }) => name === "openclaw.memory.pressure",
  );
  expect(pressureSpan?.setStatus).toHaveBeenCalledWith({ code: 2, message: "rss_growth" });

  // Nothing session-related may appear in the span creation arguments.
  const serializedPressureCall = JSON.stringify(pressureCall);
  expect(serializedPressureCall).not.toContain("session");

  await service.stop?.(ctx);
});
|
||||||
|
|
||||||
test("parents trusted diagnostic lifecycle spans from explicit parent ids", async () => {
|
test("parents trusted diagnostic lifecycle spans from explicit parent ids", async () => {
|
||||||
const service = createDiagnosticsOtelService();
|
const service = createDiagnosticsOtelService();
|
||||||
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
|
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
|
||||||
|
|||||||
@@ -676,6 +676,33 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|||||||
unit: "ms",
|
unit: "ms",
|
||||||
description: "Exec process duration",
|
description: "Exec process duration",
|
||||||
});
|
});
|
||||||
|
// Every memory histogram shares the byte unit ("By"); only name and description vary.
const createMemoryBytesHistogram = (name: string, description: string) =>
  meter.createHistogram(name, { unit: "By", description });

const memoryRssHistogram = createMemoryBytesHistogram(
  "openclaw.memory.rss_bytes",
  "Resident set size reported by diagnostic memory samples",
);
const memoryHeapUsedHistogram = createMemoryBytesHistogram(
  "openclaw.memory.heap_used_bytes",
  "Heap used bytes reported by diagnostic memory samples",
);
const memoryHeapTotalHistogram = createMemoryBytesHistogram(
  "openclaw.memory.heap_total_bytes",
  "Heap total bytes reported by diagnostic memory samples",
);
const memoryExternalHistogram = createMemoryBytesHistogram(
  "openclaw.memory.external_bytes",
  "External memory bytes reported by diagnostic memory samples",
);
const memoryArrayBuffersHistogram = createMemoryBytesHistogram(
  "openclaw.memory.array_buffers_bytes",
  "ArrayBuffer bytes reported by diagnostic memory samples",
);

// Counts memory pressure events (unit "1" = dimensionless count).
const memoryPressureCounter = meter.createCounter("openclaw.memory.pressure", {
  unit: "1",
  description: "Diagnostic memory pressure events",
});
|
||||||
|
|
||||||
let recordLogRecord:
|
let recordLogRecord:
|
||||||
| ((
|
| ((
|
||||||
@@ -1126,6 +1153,65 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|||||||
span.end(evt.ts);
|
span.end(evt.ts);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * Records each memory reading carried by a diagnostic memory event into its
 * matching byte histogram.
 *
 * @param evt - Memory sample or memory pressure event whose `memory` readings are recorded.
 * @param attrs - Attributes attached to every recorded point; defaults to an empty object.
 */
const recordMemoryUsageMetrics = (
  evt: Extract<
    DiagnosticEventPayload,
    { type: "diagnostic.memory.sample" | "diagnostic.memory.pressure" }
  >,
  attrs: Record<string, string> = {},
) => {
  const { rssBytes, heapUsedBytes, heapTotalBytes, externalBytes, arrayBuffersBytes } =
    evt.memory;

  memoryRssHistogram.record(rssBytes, attrs);
  memoryHeapUsedHistogram.record(heapUsedBytes, attrs);
  memoryHeapTotalHistogram.record(heapTotalBytes, attrs);
  memoryExternalHistogram.record(externalBytes, attrs);
  memoryArrayBuffersHistogram.record(arrayBuffersBytes, attrs);
};
|
||||||
|
|
||||||
|
/**
 * Handles a `diagnostic.memory.sample` event by recording its memory readings
 * with no extra attributes.
 *
 * @param evt - The memory sample event to export.
 */
const recordMemorySample = (
  evt: Extract<DiagnosticEventPayload, { type: "diagnostic.memory.sample" }>,
) => recordMemoryUsageMetrics(evt);
|
||||||
|
|
||||||
|
/**
 * Handles a `diagnostic.memory.pressure` event: increments the pressure
 * counter, records the event's memory readings tagged with level/reason, and,
 * when traces are enabled, emits an `openclaw.memory.pressure` span.
 *
 * @param evt - The memory pressure event to export.
 */
const recordMemoryPressure = (
  evt: Extract<DiagnosticEventPayload, { type: "diagnostic.memory.pressure" }>,
) => {
  const pressureAttrs = {
    "openclaw.memory.level": evt.level,
    "openclaw.memory.reason": evt.reason,
  };
  memoryPressureCounter.add(1, pressureAttrs);
  recordMemoryUsageMetrics(evt, pressureAttrs);

  if (tracesEnabled) {
    const { memory } = evt;
    const spanAttrs: Record<string, string | number | boolean> = {
      ...pressureAttrs,
      "openclaw.memory.rss_bytes": memory.rssBytes,
      "openclaw.memory.heap_used_bytes": memory.heapUsedBytes,
      "openclaw.memory.heap_total_bytes": memory.heapTotalBytes,
      "openclaw.memory.external_bytes": memory.externalBytes,
      "openclaw.memory.array_buffers_bytes": memory.arrayBuffersBytes,
    };
    // Optional fields are added only when the event provides them, in a fixed order.
    if (evt.thresholdBytes !== undefined) {
      spanAttrs["openclaw.memory.threshold_bytes"] = evt.thresholdBytes;
    }
    if (evt.rssGrowthBytes !== undefined) {
      spanAttrs["openclaw.memory.rss_growth_bytes"] = evt.rssGrowthBytes;
    }
    if (evt.windowMs !== undefined) {
      spanAttrs["openclaw.memory.window_ms"] = evt.windowMs;
    }

    const pressureSpan = spanWithDuration("openclaw.memory.pressure", spanAttrs, 0, {
      endTimeMs: evt.ts,
    });
    // Only critical pressure is surfaced as an error status on the span.
    if (evt.level === "critical") {
      pressureSpan.setStatus({ code: SpanStatusCode.ERROR, message: evt.reason });
    }
    pressureSpan.end(evt.ts);
  }
};
|
||||||
|
|
||||||
const recordRunCompleted = (
|
const recordRunCompleted = (
|
||||||
evt: Extract<DiagnosticEventPayload, { type: "run.completed" }>,
|
evt: Extract<DiagnosticEventPayload, { type: "run.completed" }>,
|
||||||
metadata: DiagnosticEventMetadata,
|
metadata: DiagnosticEventMetadata,
|
||||||
@@ -1470,11 +1556,15 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|||||||
case "tool.loop":
|
case "tool.loop":
|
||||||
recordToolLoop(evt);
|
recordToolLoop(evt);
|
||||||
return;
|
return;
|
||||||
|
case "diagnostic.memory.sample":
|
||||||
|
recordMemorySample(evt);
|
||||||
|
return;
|
||||||
|
case "diagnostic.memory.pressure":
|
||||||
|
recordMemoryPressure(evt);
|
||||||
|
return;
|
||||||
case "tool.execution.started":
|
case "tool.execution.started":
|
||||||
case "run.started":
|
case "run.started":
|
||||||
case "model.call.started":
|
case "model.call.started":
|
||||||
case "diagnostic.memory.sample":
|
|
||||||
case "diagnostic.memory.pressure":
|
|
||||||
case "payload.large":
|
case "payload.large":
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user