fix(diagnostics-otel): support preloaded sdk mode (#71450)

This commit is contained in:
Vincent Koc
2026-04-24 23:55:34 -07:00
committed by GitHub
parent 417b1c5507
commit 56eb1ffabf
5 changed files with 128 additions and 24 deletions

View File

@@ -6,6 +6,7 @@ Docs: https://docs.openclaw.ai
### Changes
- Diagnostics/OTEL: support `OPENCLAW_OTEL_PRELOADED=1` so the plugin can reuse an already-registered OpenTelemetry SDK while keeping OpenClaw diagnostic listeners wired. (#70424) Thanks @jlapenna.
- Control UI: refine the agent Tool Access panel with compact live-tool chips,
collapsible tool groups, direct per-tool toggles, and clearer runtime/source
provenance. (#71405) Thanks @BunsDev.

View File

@@ -896,6 +896,7 @@ Notes:
- `otel.sampleRate`: trace sampling rate `0`–`1`.
- `otel.flushIntervalMs`: periodic telemetry flush interval in ms.
- `otel.captureContent`: opt-in raw content capture for OTEL span attributes. Defaults to off. Boolean `true` captures non-system message/tool content; the object form lets you enable `inputMessages`, `outputMessages`, `toolInputs`, `toolOutputs`, and `systemPrompt` explicitly.
- `OPENCLAW_OTEL_PRELOADED=1`: environment toggle for hosts that already registered a global OpenTelemetry SDK. OpenClaw then skips plugin-owned SDK startup/shutdown while keeping diagnostic listeners active.
- `cacheTrace.enabled`: log cache trace snapshots for embedded runs (default: `false`).
- `cacheTrace.filePath`: output path for cache trace JSONL (default: `$OPENCLAW_STATE_DIR/logs/cache-trace.jsonl`).
- `cacheTrace.includeMessages` / `includePrompt` / `includeSystem`: control what is included in cache trace output (all default: `true`).

View File

@@ -307,6 +307,10 @@ Notes:
- Set `headers` when your collector requires auth.
- Environment variables supported: `OTEL_EXPORTER_OTLP_ENDPOINT`,
`OTEL_SERVICE_NAME`, `OTEL_EXPORTER_OTLP_PROTOCOL`.
- Set `OPENCLAW_OTEL_PRELOADED=1` when another preload or host process already
registered the global OpenTelemetry SDK. In that mode the plugin does not start
or shut down its own SDK, but it still wires OpenClaw diagnostic listeners and
honors `diagnostics.otel.traces`, `metrics`, and `logs`.
### Exported metrics (names + types)
@@ -389,6 +393,8 @@ classes you opted into.
`OTEL_EXPORTER_OTLP_ENDPOINT`.
- If the endpoint already contains `/v1/traces` or `/v1/metrics`, it is used as-is.
- If the endpoint already contains `/v1/logs`, it is used as-is for logs.
- `OPENCLAW_OTEL_PRELOADED=1` reuses an externally registered OpenTelemetry SDK
for traces/metrics instead of starting a plugin-owned NodeSDK.
- `diagnostics.otel.logs` enables OTLP log export for the main logger output.
### Log export behavior

View File

@@ -1,4 +1,4 @@
import { beforeEach, describe, expect, test, vi } from "vitest";
import { afterEach, beforeEach, describe, expect, test, vi } from "vitest";
const telemetryState = vi.hoisted(() => {
const counters = new Map<string, { add: ReturnType<typeof vi.fn> }>();
@@ -125,6 +125,7 @@ const GRANDCHILD_SPAN_ID = "2222222222222222";
const PROTO_KEY = "__proto__";
const MAX_TEST_OTEL_CONTENT_ATTRIBUTE_CHARS = 4096;
const OTEL_TRUNCATED_SUFFIX_MAX_CHARS = 20;
// Value of OPENCLAW_OTEL_PRELOADED when this test module was loaded. Tests
// overwrite or delete the variable, and the suite's afterEach restores it
// from this snapshot.
const ORIGINAL_OPENCLAW_OTEL_PRELOADED = process.env.OPENCLAW_OTEL_PRELOADED;
function createLogger() {
return {
@@ -194,6 +195,7 @@ function flushDiagnosticEvents() {
describe("diagnostics-otel service", () => {
beforeEach(() => {
delete process.env.OPENCLAW_OTEL_PRELOADED;
telemetryState.counters.clear();
telemetryState.histograms.clear();
telemetryState.spans.length = 0;
@@ -208,6 +210,14 @@ describe("diagnostics-otel service", () => {
traceExporterCtor.mockClear();
});
// Restore OPENCLAW_OTEL_PRELOADED to the value captured at module load so a
// test that sets the flag cannot leak it into later tests.
afterEach(() => {
  if (ORIGINAL_OPENCLAW_OTEL_PRELOADED !== undefined) {
    process.env.OPENCLAW_OTEL_PRELOADED = ORIGINAL_OPENCLAW_OTEL_PRELOADED;
    return;
  }
  // The variable was unset originally, so remove it instead of assigning
  // undefined (which would be stored as the string "undefined").
  delete process.env.OPENCLAW_OTEL_PRELOADED;
});
test("records message-flow metrics and spans", async () => {
const service = createDiagnosticsOtelService();
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true, logs: true });
@@ -318,6 +328,84 @@ describe("diagnostics-otel service", () => {
expect(telemetryState.tracer.startSpan).not.toHaveBeenCalled();
});
// Preloaded mode with traces, metrics, and logs all enabled: start() must not
// start a plugin-owned SDK, yet run events and log records must still reach
// the mocked histogram, tracer, and log emitter.
test("uses a preloaded OpenTelemetry SDK without dropping diagnostic listeners", async () => {
// Opt in to preloaded mode; the suite's beforeEach deletes this variable before every test.
process.env.OPENCLAW_OTEL_PRELOADED = "1";
const service = createDiagnosticsOtelService();
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true, logs: true });
await service.start(ctx);
// No SDK start and no trace exporter construction is expected; only the info log is.
expect(sdkStart).not.toHaveBeenCalled();
expect(traceExporterCtor).not.toHaveBeenCalled();
expect(ctx.logger.info).toHaveBeenCalledWith(
"diagnostics-otel: using preloaded OpenTelemetry SDK",
);
// Emit one run event and one log record to exercise the diagnostic listeners.
emitDiagnosticEvent({
type: "run.completed",
runId: "run-1",
provider: "openai",
model: "gpt-5.4",
outcome: "completed",
durationMs: 100,
});
emitDiagnosticEvent({
type: "log.record",
level: "INFO",
message: "preloaded log",
});
// NOTE(review): presumably waits for pending diagnostic event delivery — confirm against flushDiagnosticEvents.
await flushDiagnosticEvents();
// The run event must have been recorded as a duration metric and as an "openclaw.run" span.
expect(telemetryState.histograms.get("openclaw.run.duration_ms")?.record).toHaveBeenCalledWith(
100,
expect.objectContaining({
"openclaw.provider": "openai",
"openclaw.model": "gpt-5.4",
}),
);
expect(telemetryState.tracer.startSpan).toHaveBeenCalledWith(
"openclaw.run",
expect.objectContaining({
attributes: expect.objectContaining({
"openclaw.outcome": "completed",
}),
}),
undefined,
);
// The log record must have been emitted through the mocked log emitter.
expect(logEmit).toHaveBeenCalled();
await service.stop?.(ctx);
// On stop, the SDK shutdown mock must stay untouched while the log shutdown mock runs exactly once.
expect(sdkShutdown).not.toHaveBeenCalled();
expect(logShutdown).toHaveBeenCalledTimes(1);
});
// Preloaded mode with traces disabled and metrics enabled: the run event must
// still be recorded as a duration metric, no span may be started, and the
// plugin must neither start nor shut down an SDK.
test("honors disabled traces when an OpenTelemetry SDK is preloaded", async () => {
  // Opt in to preloaded mode; the suite's beforeEach deletes this variable before every test.
  process.env.OPENCLAW_OTEL_PRELOADED = "1";
  const service = createDiagnosticsOtelService();
  const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: false, metrics: true });
  await service.start(ctx);

  emitDiagnosticEvent({
    type: "run.completed",
    runId: "run-1",
    provider: "openai",
    model: "gpt-5.4",
    outcome: "completed",
    durationMs: 100,
  });
  // Flush before asserting, as the sibling preloaded-SDK test does. Without it
  // the negative startSpan assertion below could pass simply because the
  // event has not been delivered yet.
  await flushDiagnosticEvents();

  expect(sdkStart).not.toHaveBeenCalled();
  expect(telemetryState.histograms.get("openclaw.run.duration_ms")?.record).toHaveBeenCalledWith(
    100,
    expect.objectContaining({
      "openclaw.provider": "openai",
    }),
  );
  expect(telemetryState.tracer.startSpan).not.toHaveBeenCalled();

  await service.stop?.(ctx);
  expect(sdkShutdown).not.toHaveBeenCalled();
});
test("tears down active handles when restarted with diagnostics disabled", async () => {
const service = createDiagnosticsOtelService();
const enabledCtx = createOtelContext(OTEL_TEST_ENDPOINT, {

View File

@@ -49,6 +49,7 @@ const LOG_RECORD_EXPORT_FAILURE_REPORT_INTERVAL_MS = 60_000;
const OTEL_LOG_RAW_ATTRIBUTE_KEY_RE = /^[A-Za-z0-9_.:-]{1,64}$/u;
const OTEL_LOG_ATTRIBUTE_KEY_RE = /^[A-Za-z0-9_.:-]{1,96}$/u;
const BLOCKED_OTEL_LOG_ATTRIBUTE_KEYS = new Set(["__proto__", "prototype", "constructor"]);
// Name of the environment variable that signals an OpenTelemetry SDK is already
// registered by the host; read by hasPreloadedOtelSdk(), which treats only the
// exact value "1" as enabled.
const PRELOADED_OTEL_SDK_ENV = "OPENCLAW_OTEL_PRELOADED";
type OtelContentCapturePolicy = {
inputMessages: boolean;
@@ -164,6 +165,10 @@ function resolveContentCapturePolicy(value: unknown): OtelContentCapturePolicy {
};
}
/**
 * Reports whether the host has declared that an OpenTelemetry SDK is already
 * registered.
 *
 * @returns `true` only when the environment variable named by
 * `PRELOADED_OTEL_SDK_ENV` is exactly the string `"1"`; any other value, or an
 * unset variable, yields `false`.
 */
function hasPreloadedOtelSdk(): boolean {
  const flag = process.env[PRELOADED_OTEL_SDK_ENV];
  return flag === "1";
}
function normalizeOtelContentValue(value: unknown): string | undefined {
if (typeof value === "string") {
return normalizeOtelLogString(value, MAX_OTEL_CONTENT_ATTRIBUTE_CHARS);
@@ -400,38 +405,39 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
if (!tracesEnabled && !metricsEnabled && !logsEnabled) {
return;
}
const sdkPreloaded = hasPreloadedOtelSdk();
const resource = resourceFromAttributes({
[ATTR_SERVICE_NAME]: serviceName,
});
const traceUrl = resolveOtelUrl(endpoint, "v1/traces");
const metricUrl = resolveOtelUrl(endpoint, "v1/metrics");
const logUrl = resolveOtelUrl(endpoint, "v1/logs");
const traceExporter = tracesEnabled
? new OTLPTraceExporter({
...(traceUrl ? { url: traceUrl } : {}),
...(headers ? { headers } : {}),
})
: undefined;
if (!sdkPreloaded && (tracesEnabled || metricsEnabled)) {
const traceUrl = resolveOtelUrl(endpoint, "v1/traces");
const metricUrl = resolveOtelUrl(endpoint, "v1/metrics");
const traceExporter = tracesEnabled
? new OTLPTraceExporter({
...(traceUrl ? { url: traceUrl } : {}),
...(headers ? { headers } : {}),
})
: undefined;
const metricExporter = metricsEnabled
? new OTLPMetricExporter({
...(metricUrl ? { url: metricUrl } : {}),
...(headers ? { headers } : {}),
})
: undefined;
const metricExporter = metricsEnabled
? new OTLPMetricExporter({
...(metricUrl ? { url: metricUrl } : {}),
...(headers ? { headers } : {}),
})
: undefined;
const metricReader = metricExporter
? new PeriodicExportingMetricReader({
exporter: metricExporter,
...(typeof otel.flushIntervalMs === "number"
? { exportIntervalMillis: Math.max(1000, otel.flushIntervalMs) }
: {}),
})
: undefined;
const metricReader = metricExporter
? new PeriodicExportingMetricReader({
exporter: metricExporter,
...(typeof otel.flushIntervalMs === "number"
? { exportIntervalMillis: Math.max(1000, otel.flushIntervalMs) }
: {}),
})
: undefined;
if (tracesEnabled || metricsEnabled) {
sdk = new NodeSDK({
resource,
...(traceExporter ? { traceExporter } : {}),
@@ -452,6 +458,8 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
ctx.logger.error(`diagnostics-otel: failed to start SDK: ${formatError(err)}`);
throw err;
}
} else if (sdkPreloaded && (tracesEnabled || metricsEnabled)) {
ctx.logger.info("diagnostics-otel: using preloaded OpenTelemetry SDK");
}
const logSeverityMap: Record<string, SeverityNumber> = {